diff mbox series

[v8,01/11] timekeeping: move multigrain timestamp floor handling into timekeeper

Message ID 20240914-mgtime-v8-1-5bd872330bed@kernel.org (mailing list archive)
State New
Headers show
Series fs: multigrain timestamp redux | expand

Commit Message

Jeff Layton Sept. 14, 2024, 5:07 p.m. UTC
For multigrain timestamps, we must keep track of the latest timestamp
that has ever been handed out, and never hand out a coarse time below
that value.

Add a static singleton atomic64_t into timekeeper.c that we can use to
keep track of the latest fine-grained time ever handed out. This is
tracked as a monotonic ktime_t value to ensure that it isn't affected by
clock jumps.

Add two new public interfaces:

- ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
  coarse-grained clock and the floor time

- ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
  to swap it into the floor. A timespec64 is filled with the result.

Since the floor is global, we take great pains to avoid updating it
unless it's absolutely necessary. If we do the cmpxchg and find that the
value has been updated since we fetched it, then we discard the
fine-grained time that was fetched in favor of the recent update.

To maximize the window of this occurring when multiple tasks are racing
to update the floor, ktime_get_coarse_real_ts64_mg returns a cookie
value that represents the state of the floor tracking word, and
ktime_get_real_ts64_mg accepts a cookie value that it uses as the "old"
value when calling cmpxchg().

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/timekeeping.h |  4 +++
 kernel/time/timekeeping.c   | 82 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

Comments

John Stultz Sept. 14, 2024, 8:10 p.m. UTC | #1
On Sat, Sep 14, 2024 at 10:07 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> For multigrain timestamps, we must keep track of the latest timestamp
> that has ever been handed out, and never hand out a coarse time below
> that value.
>
> Add a static singleton atomic64_t into timekeeper.c that we can use to
> keep track of the latest fine-grained time ever handed out. This is
> tracked as a monotonic ktime_t value to ensure that it isn't affected by
> clock jumps.
>
> Add two new public interfaces:
>
> - ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
>   coarse-grained clock and the floor time
>
> - ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
>   to swap it into the floor. A timespec64 is filled with the result.
>
> Since the floor is global, we take great pains to avoid updating it
> unless it's absolutely necessary. If we do the cmpxchg and find that the
> value has been updated since we fetched it, then we discard the
> fine-grained time that was fetched in favor of the recent update.
>
> To maximize the window of this occurring when multiple tasks are racing
> to update the floor, ktime_get_coarse_real_ts64_mg returns a cookie
> value that represents the state of the floor tracking word, and
> ktime_get_real_ts64_mg accepts a cookie value that it uses as the "old"
> value when calling cmpxchg().

This last bit seems out of date.

> ---
>  include/linux/timekeeping.h |  4 +++
>  kernel/time/timekeeping.c   | 82 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 86 insertions(+)
>
> diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
> index fc12a9ba2c88..7aa85246c183 100644
> --- a/include/linux/timekeeping.h
> +++ b/include/linux/timekeeping.h
> @@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv);
>  extern void ktime_get_coarse_ts64(struct timespec64 *ts);
>  extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
>
> +/* Multigrain timestamp interfaces */
> +extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
> +extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
> +
>  void getboottime64(struct timespec64 *ts);
>
>  /*
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index 5391e4167d60..16937242b904 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -114,6 +114,13 @@ static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
>         .base[1] = FAST_TK_INIT,
>  };
>
> +/*
> + * This represents the latest fine-grained time that we have handed out as a
> + * timestamp on the system. Tracked as a monotonic ktime_t, and converted to the
> + * realtime clock on an as-needed basis.
> + */
> +static __cacheline_aligned_in_smp atomic64_t mg_floor;
> +
>  static inline void tk_normalize_xtime(struct timekeeper *tk)
>  {
>         while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
> @@ -2394,6 +2401,81 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
>  }
>  EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
>
> +/**
> + * ktime_get_coarse_real_ts64_mg - get later of coarse grained time or floor
> + * @ts: timespec64 to be filled
> + *
> + * Adjust floor to realtime and compare it to the coarse time. Fill
> + * @ts with the latest one. Note that this is a filesystem-specific
> + * interface and should be avoided outside of that context.
> + */
> +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
> +{
> +       struct timekeeper *tk = &tk_core.timekeeper;
> +       u64 floor = atomic64_read(&mg_floor);
> +       ktime_t f_real, offset, coarse;
> +       unsigned int seq;
> +
> +       WARN_ON(timekeeping_suspended);
> +
> +       do {
> +               seq = read_seqcount_begin(&tk_core.seq);
> +               *ts = tk_xtime(tk);
> +               offset = *offsets[TK_OFFS_REAL];
> +       } while (read_seqcount_retry(&tk_core.seq, seq));
> +
> +       coarse = timespec64_to_ktime(*ts);
> +       f_real = ktime_add(floor, offset);
> +       if (ktime_after(f_real, coarse))
> +               *ts = ktime_to_timespec64(f_real);
> +}
> +EXPORT_SYMBOL_GPL(ktime_get_coarse_real_ts64_mg);
> +
> +/**
> + * ktime_get_real_ts64_mg - attempt to update floor value and return result
> + * @ts:                pointer to the timespec to be set
> + *
> + * Get a current monotonic fine-grained time value and attempt to swap
> + * it into the floor. @ts will be filled with the resulting floor value,
> + * regardless of the outcome of the swap. Note that this is a filesystem
> + * specific interface and should be avoided outside of that context.
> + */
> +void ktime_get_real_ts64_mg(struct timespec64 *ts, u64 cookie)

Still passing a cookie. It doesn't match the header definition, so I'm
surprised this builds.

> +{
> +       struct timekeeper *tk = &tk_core.timekeeper;
> +       ktime_t old = atomic64_read(&mg_floor);
> +       ktime_t offset, mono;
> +       unsigned int seq;
> +       u64 nsecs;
> +
> +       WARN_ON(timekeeping_suspended);
> +
> +       do {
> +               seq = read_seqcount_begin(&tk_core.seq);
> +
> +               ts->tv_sec = tk->xtime_sec;
> +               mono = tk->tkr_mono.base;
> +               nsecs = timekeeping_get_ns(&tk->tkr_mono);
> +               offset = *offsets[TK_OFFS_REAL];
> +       } while (read_seqcount_retry(&tk_core.seq, seq));
> +
> +       mono = ktime_add_ns(mono, nsecs);
> +
> +       if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
> +               ts->tv_nsec = 0;
> +               timespec64_add_ns(ts, nsecs);
> +       } else {
> +               /*
> +                * Something has changed mg_floor since "old" was
> +                * fetched. "old" has now been updated with the
> +                * current value of mg_floor, so use that to return
> +                * the current coarse floor value.
> +                */
> +               *ts = ktime_to_timespec64(ktime_add(old, offset));
> +       }
> +}
> +EXPORT_SYMBOL_GPL(ktime_get_real_ts64_mg);

Other than those issues, I'm ok with it. Thanks again for working
through my concerns!

Since I'm traveling for LPC soon, to save the next cycle, once the
fixes above are sorted:
 Acked-by: John Stultz <jstultz@google.com>

thanks
-john
Jeff Layton Sept. 14, 2024, 11:14 p.m. UTC | #2
On Sat, 2024-09-14 at 13:10 -0700, John Stultz wrote:
> On Sat, Sep 14, 2024 at 10:07 AM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > For multigrain timestamps, we must keep track of the latest timestamp
> > that has ever been handed out, and never hand out a coarse time below
> > that value.
> > 
> > Add a static singleton atomic64_t into timekeeper.c that we can use to
> > keep track of the latest fine-grained time ever handed out. This is
> > tracked as a monotonic ktime_t value to ensure that it isn't affected by
> > clock jumps.
> > 
> > Add two new public interfaces:
> > 
> > - ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
> >   coarse-grained clock and the floor time
> > 
> > - ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
> >   to swap it into the floor. A timespec64 is filled with the result.
> > 
> > Since the floor is global, we take great pains to avoid updating it
> > unless it's absolutely necessary. If we do the cmpxchg and find that the
> > value has been updated since we fetched it, then we discard the
> > fine-grained time that was fetched in favor of the recent update.
> > 
> > To maximize the window of this occurring when multiple tasks are racing
> > to update the floor, ktime_get_coarse_real_ts64_mg returns a cookie
> > value that represents the state of the floor tracking word, and
> > ktime_get_real_ts64_mg accepts a cookie value that it uses as the "old"
> > value when calling cmpxchg().
> 
> This last bit seems out of date.
> 

Thanks. Dropped the last paragraph.

> > ---
> >  include/linux/timekeeping.h |  4 +++
> >  kernel/time/timekeeping.c   | 82 +++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 86 insertions(+)
> > 
> > diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
> > index fc12a9ba2c88..7aa85246c183 100644
> > --- a/include/linux/timekeeping.h
> > +++ b/include/linux/timekeeping.h
> > @@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv);
> >  extern void ktime_get_coarse_ts64(struct timespec64 *ts);
> >  extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
> > 
> > +/* Multigrain timestamp interfaces */
> > +extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
> > +extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
> > +
> >  void getboottime64(struct timespec64 *ts);
> > 
> >  /*
> > diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> > index 5391e4167d60..16937242b904 100644
> > --- a/kernel/time/timekeeping.c
> > +++ b/kernel/time/timekeeping.c
> > @@ -114,6 +114,13 @@ static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
> >         .base[1] = FAST_TK_INIT,
> >  };
> > 
> > +/*
> > + * This represents the latest fine-grained time that we have handed out as a
> > + * timestamp on the system. Tracked as a monotonic ktime_t, and converted to the
> > + * realtime clock on an as-needed basis.
> > + */
> > +static __cacheline_aligned_in_smp atomic64_t mg_floor;
> > +
> >  static inline void tk_normalize_xtime(struct timekeeper *tk)
> >  {
> >         while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
> > @@ -2394,6 +2401,81 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
> >  }
> >  EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
> > 
> > +/**
> > + * ktime_get_coarse_real_ts64_mg - get later of coarse grained time or floor
> > + * @ts: timespec64 to be filled
> > + *
> > + * Adjust floor to realtime and compare it to the coarse time. Fill
> > + * @ts with the latest one. Note that this is a filesystem-specific
> > + * interface and should be avoided outside of that context.
> > + */
> > +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
> > +{
> > +       struct timekeeper *tk = &tk_core.timekeeper;
> > +       u64 floor = atomic64_read(&mg_floor);
> > +       ktime_t f_real, offset, coarse;
> > +       unsigned int seq;
> > +
> > +       WARN_ON(timekeeping_suspended);
> > +
> > +       do {
> > +               seq = read_seqcount_begin(&tk_core.seq);
> > +               *ts = tk_xtime(tk);
> > +               offset = *offsets[TK_OFFS_REAL];
> > +       } while (read_seqcount_retry(&tk_core.seq, seq));
> > +
> > +       coarse = timespec64_to_ktime(*ts);
> > +       f_real = ktime_add(floor, offset);
> > +       if (ktime_after(f_real, coarse))
> > +               *ts = ktime_to_timespec64(f_real);
> > +}
> > +EXPORT_SYMBOL_GPL(ktime_get_coarse_real_ts64_mg);
> > +
> > +/**
> > + * ktime_get_real_ts64_mg - attempt to update floor value and return result
> > + * @ts:                pointer to the timespec to be set
> > + *
> > + * Get a current monotonic fine-grained time value and attempt to swap
> > + * it into the floor. @ts will be filled with the resulting floor value,
> > + * regardless of the outcome of the swap. Note that this is a filesystem
> > + * specific interface and should be avoided outside of that context.
> > + */
> > +void ktime_get_real_ts64_mg(struct timespec64 *ts, u64 cookie)
> 
> Still passing a cookie. It doesn't match the header definition, so I'm
> surprised this builds.
> 

Yeah, I didn't see a warning when I built it. Luckily the extra
parameter is ignored anyway, so it no harm. I've fixed that up in my
tree.

> > +{
> > +       struct timekeeper *tk = &tk_core.timekeeper;
> > +       ktime_t old = atomic64_read(&mg_floor);
> > +       ktime_t offset, mono;
> > +       unsigned int seq;
> > +       u64 nsecs;
> > +
> > +       WARN_ON(timekeeping_suspended);
> > +
> > +       do {
> > +               seq = read_seqcount_begin(&tk_core.seq);
> > +
> > +               ts->tv_sec = tk->xtime_sec;
> > +               mono = tk->tkr_mono.base;
> > +               nsecs = timekeeping_get_ns(&tk->tkr_mono);
> > +               offset = *offsets[TK_OFFS_REAL];
> > +       } while (read_seqcount_retry(&tk_core.seq, seq));
> > +
> > +       mono = ktime_add_ns(mono, nsecs);
> > +
> > +       if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
> > +               ts->tv_nsec = 0;
> > +               timespec64_add_ns(ts, nsecs);
> > +       } else {
> > +               /*
> > +                * Something has changed mg_floor since "old" was
> > +                * fetched. "old" has now been updated with the
> > +                * current value of mg_floor, so use that to return
> > +                * the current coarse floor value.
> > +                */
> > +               *ts = ktime_to_timespec64(ktime_add(old, offset));
> > +       }
> > +}
> > +EXPORT_SYMBOL_GPL(ktime_get_real_ts64_mg);
> 
> Other than those issues, I'm ok with it. Thanks again for working
> through my concerns!
> 
> Since I'm traveling for LPC soon, to save the next cycle, once the
> fixes above are sorted:
>  Acked-by: John Stultz <jstultz@google.com>
> 

Thanks for the review!
Thomas Gleixner Sept. 16, 2024, 10:12 a.m. UTC | #3
On Sat, Sep 14 2024 at 13:07, Jeff Layton wrote:
> For multigrain timestamps, we must keep track of the latest timestamp

What is a multgrain timestamp? Can you please describe the concept
behind it? I'm not going to chase random documentation or whatever
because change logs have to self contained.

And again 'we' do nothing. Describe the problem in technical terms and
do not impersonate code.

> To maximize the window of this occurring when multiple tasks are racing
> to update the floor, ktime_get_coarse_real_ts64_mg returns a cookie
> value that represents the state of the floor tracking word, and
> ktime_get_real_ts64_mg accepts a cookie value that it uses as the "old"
> value when calling cmpxchg().

Clearly:

> +void ktime_get_real_ts64_mg(struct timespec64 *ts, u64 cookie)

Can you please get your act together?

> +/**
> + * ktime_get_coarse_real_ts64_mg - get later of coarse grained time or floor
> + * @ts: timespec64 to be filled
> + *
> + * Adjust floor to realtime and compare it to the coarse time. Fill
> + * @ts with the latest one.

This explains nothing.

>      Note that this is a filesystem-specific
> + * interface and should be avoided outside of that context.
> + */
> +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
> +{
> +	struct timekeeper *tk = &tk_core.timekeeper;
> +	u64 floor = atomic64_read(&mg_floor);
> +	ktime_t f_real, offset, coarse;
> +	unsigned int seq;
> +
> +	WARN_ON(timekeeping_suspended);
> +
> +	do {
> +		seq = read_seqcount_begin(&tk_core.seq);
> +		*ts = tk_xtime(tk);
> +		offset = *offsets[TK_OFFS_REAL];

Why this indirection? What's wrong with using
tk_core.timekeeper.offs_real directly?

> +	} while (read_seqcount_retry(&tk_core.seq, seq));
> +
> +	coarse = timespec64_to_ktime(*ts);
> +	f_real = ktime_add(floor, offset);

How is any of this synchronized against concurrent updates of the floor
value or the offset? I'm failing to see anything which keeps this
consistent. If this is magically consistent then it wants a big fat
comment in the code which explains it.

> +void ktime_get_real_ts64_mg(struct timespec64 *ts, u64 cookie)

What is this cookie argument for and how does that match the
declaration?

> +extern void ktime_get_real_ts64_mg(struct timespec64 *ts);

This does not even build.

> +{
> +	struct timekeeper *tk = &tk_core.timekeeper;
> +	ktime_t old = atomic64_read(&mg_floor);
> +	ktime_t offset, mono;
> +	unsigned int seq;
> +	u64 nsecs;
> +
> +	WARN_ON(timekeeping_suspended);

WARN_ON_ONCE() if at all.

> +	do {
> +		seq = read_seqcount_begin(&tk_core.seq);
> +
> +		ts->tv_sec = tk->xtime_sec;
> +		mono = tk->tkr_mono.base;
> +		nsecs = timekeeping_get_ns(&tk->tkr_mono);
> +		offset = *offsets[TK_OFFS_REAL];
> +	} while (read_seqcount_retry(&tk_core.seq, seq));
> +
> +	mono = ktime_add_ns(mono, nsecs);
> +
> +	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
> +		ts->tv_nsec = 0;
> +		timespec64_add_ns(ts, nsecs);
> +	} else {
> +		/*
> +		 * Something has changed mg_floor since "old" was
> +		 * fetched. "old" has now been updated with the
> +		 * current value of mg_floor, so use that to return
> +		 * the current coarse floor value.

'Something has changed' is a truly understandable technical
explanation.

I'm not going to accept this voodoo which makes everyone scratch his
head who wasn't involved in this.

Thanks,

        tglx
Thomas Gleixner Sept. 16, 2024, 10:32 a.m. UTC | #4
On Mon, Sep 16 2024 at 12:12, Thomas Gleixner wrote:
> On Sat, Sep 14 2024 at 13:07, Jeff Layton wrote:
>> +	do {
>> +		seq = read_seqcount_begin(&tk_core.seq);
>> +
>> +		ts->tv_sec = tk->xtime_sec;
>> +		mono = tk->tkr_mono.base;
>> +		nsecs = timekeeping_get_ns(&tk->tkr_mono);
>> +		offset = *offsets[TK_OFFS_REAL];
>> +	} while (read_seqcount_retry(&tk_core.seq, seq));
>> +
>> +	mono = ktime_add_ns(mono, nsecs);
>> +
>> +	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
>> +		ts->tv_nsec = 0;
>> +		timespec64_add_ns(ts, nsecs);
>> +	} else {
>> +		/*
>> +		 * Something has changed mg_floor since "old" was
>> +		 * fetched. "old" has now been updated with the
>> +		 * current value of mg_floor, so use that to return
>> +		 * the current coarse floor value.
>
> 'Something has changed' is a truly understandable technical
> explanation.

     old = mg_floor
                                mono = T1;
                                mg_floor = mono
preemption

     do {
        mono = T2;
     }

     cmpxchg fails and the function returns a value based on T1

No?

Thanks,

        tglx
Jeff Layton Sept. 16, 2024, 10:57 a.m. UTC | #5
On Mon, 2024-09-16 at 12:32 +0200, Thomas Gleixner wrote:
> On Mon, Sep 16 2024 at 12:12, Thomas Gleixner wrote:
> > On Sat, Sep 14 2024 at 13:07, Jeff Layton wrote:
> > > +	do {
> > > +		seq = read_seqcount_begin(&tk_core.seq);
> > > +
> > > +		ts->tv_sec = tk->xtime_sec;
> > > +		mono = tk->tkr_mono.base;
> > > +		nsecs = timekeeping_get_ns(&tk->tkr_mono);
> > > +		offset = *offsets[TK_OFFS_REAL];
> > > +	} while (read_seqcount_retry(&tk_core.seq, seq));
> > > +
> > > +	mono = ktime_add_ns(mono, nsecs);
> > > +
> > > +	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
> > > +		ts->tv_nsec = 0;
> > > +		timespec64_add_ns(ts, nsecs);
> > > +	} else {
> > > +		/*
> > > +		 * Something has changed mg_floor since "old" was
> > > +		 * fetched. "old" has now been updated with the
> > > +		 * current value of mg_floor, so use that to return
> > > +		 * the current coarse floor value.
> > 
> > 'Something has changed' is a truly understandable technical
> > explanation.
> 
>      old = mg_floor
>                                 mono = T1;
>                                 mg_floor = mono
> preemption
> 
>      do {
>         mono = T2;
>      }
> 
>      cmpxchg fails and the function returns a value based on T1
> 
> No?
> 
> 

Packing for LPC, so I can't respond to all of these just now, but I
will later. You're correct, but either outcome is OK.

The requirement is that we don't hand out any values that were below
the floor at the time that the task entered the kernel. Since the time
changed while the task was already inside the kernel, either T1 or T2
would be valid timestamps.
Jeff Layton Sept. 19, 2024, 4:50 p.m. UTC | #6
On Mon, 2024-09-16 at 12:12 +0200, Thomas Gleixner wrote:
> On Sat, Sep 14 2024 at 13:07, Jeff Layton wrote:
> > For multigrain timestamps, we must keep track of the latest timestamp
> 
> What is a multgrain timestamp? Can you please describe the concept
> behind it? I'm not going to chase random documentation or whatever
> because change logs have to self contained.
> 
> And again 'we' do nothing. Describe the problem in technical terms and
> do not impersonate code.
> 

Hi Thomas!

Sorry for the delay in responding. I'll try to summarize below, but
I'll also note that patch #7 in the v8 series adds a file to
Documentation/ that explains this in a bit more depth:

Currently the kernel always stamps files (mtime, ctime, etc.) using the
coarse-grained clock. This is usually a good thing, since it reduces
the number of metadata updates, but means that you can't reliably use
file timestamps to detect whether there have been changes to the file
since it was last checked. This is particularly a problem for NFSv3
clients, which use timestamps to know when to invalidate their
pagecache for an inode [1].

The idea is to allow the kernel to use fine-grained timestamps (mtime
and ctime) on files when they are under direct observation. When a task
does a ->getattr against an inode for STATX_MTIME or STATX_CTIME, a
flag is set in the inode that tells the kernel to use the fine-grained
clock for the timestamp update iff the current coarse-grained clock
value would not cause a change to the mtime/ctime.

This works, but there is a problem:

It's possible for one inode to get a fine-grained timestamp, and then
another to get a coarse-grained timestamp. If this happens within a
single coarse-grained timer tick, then the files may appear to have
been modified in reverse order, which breaks POSIX guarantees (and
obscure programs like "make").

The fix for this is to establish a floor value for the coarse-grained
clock. When stamping a file with a fine-grained timestamp, we update
the floor value with the current monotonic time (using cmpxchg). Then
later, when a coarse-grained timestamp is requested, check whether the
floor is later than the current coarse-grained time. If it is, then the
kernel will return the floor value (converted to realtime) instead of
the current coarse-grained clock. That allows us to maintain the
ordering guarantees.

My original implementation of this tracked the floor value in
fs/inode.c (also using cmpxchg), but that caused a performance
regression, mostly due to multiple calls into the timekeeper functions
with seqcount loops. By adding the floor to the timekeeper we can get
that back down to 1 seqcount loop.

Let me know if you have more questions about this, or suggestions about
how to do this better. The timekeeping code is not my area of expertise
(obviously) so I'm open to doing this a better way if there is one.

Thanks for the review so far!

[1]: NFSv4 mandates an opaque change attribute (usually using
inode->i_version), but only some filesystems have a proper
implementation of it (XFS being the notable exception). For the others,
we end up using the ctime to generate a change attribute, which means
that NFSv4 has the same problem on those filesystems. i_version also
doesn't help NFSv3 clients and servers.
diff mbox series

Patch

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fc12a9ba2c88..7aa85246c183 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -45,6 +45,10 @@  extern void ktime_get_real_ts64(struct timespec64 *tv);
 extern void ktime_get_coarse_ts64(struct timespec64 *ts);
 extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
 
+/* Multigrain timestamp interfaces */
+extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
+extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
+
 void getboottime64(struct timespec64 *ts);
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5391e4167d60..16937242b904 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -114,6 +114,13 @@  static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
 	.base[1] = FAST_TK_INIT,
 };
 
+/*
+ * This represents the latest fine-grained time that we have handed out as a
+ * timestamp on the system. Tracked as a monotonic ktime_t, and converted to the
+ * realtime clock on an as-needed basis.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
 static inline void tk_normalize_xtime(struct timekeeper *tk)
 {
 	while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -2394,6 +2401,81 @@  void ktime_get_coarse_real_ts64(struct timespec64 *ts)
 }
 EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
 
+/**
+ * ktime_get_coarse_real_ts64_mg - get later of coarse grained time or floor
+ * @ts: timespec64 to be filled
+ *
+ * Adjust floor to realtime and compare it to the coarse time. Fill
+ * @ts with the latest one. Note that this is a filesystem-specific
+ * interface and should be avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	u64 floor = atomic64_read(&mg_floor);
+	ktime_t f_real, offset, coarse;
+	unsigned int seq;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		*ts = tk_xtime(tk);
+		offset = *offsets[TK_OFFS_REAL];
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	coarse = timespec64_to_ktime(*ts);
+	f_real = ktime_add(floor, offset);
+	if (ktime_after(f_real, coarse))
+		*ts = ktime_to_timespec64(f_real);
+}
+EXPORT_SYMBOL_GPL(ktime_get_coarse_real_ts64_mg);
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts:		pointer to the timespec to be set
+ *
+ * Get a current monotonic fine-grained time value and attempt to swap
+ * it into the floor. @ts will be filled with the resulting floor value,
+ * regardless of the outcome of the swap. Note that this is a filesystem
+ * specific interface and should be avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts, u64 cookie)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	ktime_t old = atomic64_read(&mg_floor);
+	ktime_t offset, mono;
+	unsigned int seq;
+	u64 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+
+		ts->tv_sec = tk->xtime_sec;
+		mono = tk->tkr_mono.base;
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
+		offset = *offsets[TK_OFFS_REAL];
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	mono = ktime_add_ns(mono, nsecs);
+
+	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+		ts->tv_nsec = 0;
+		timespec64_add_ns(ts, nsecs);
+	} else {
+		/*
+		 * Something has changed mg_floor since "old" was
+		 * fetched. "old" has now been updated with the
+		 * current value of mg_floor, so use that to return
+		 * the current coarse floor value.
+		 */
+		*ts = ktime_to_timespec64(ktime_add(old, offset));
+	}
+}
+EXPORT_SYMBOL_GPL(ktime_get_real_ts64_mg);
+
 void ktime_get_coarse_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;