diff mbox series

[v4] fs: Fix data race in inode_set_ctime_to_ts

Message ID 20241124094253.565643-1-zhenghaoran@buaa.edu.cn (mailing list archive)
State New
Headers show
Series [v4] fs: Fix data race in inode_set_ctime_to_ts | expand

Commit Message

Hao-ran Zheng Nov. 24, 2024, 9:42 a.m. UTC
A data race may occur when the function `inode_set_ctime_to_ts()` and
the function `inode_get_ctime_sec()` are executed concurrently. When
two threads call `aio_read` and `aio_write` respectively, they will
be distributed to the read and write functions of the corresponding
file system respectively. Taking the btrfs file system as an example,
the `btrfs_file_read_iter` and `btrfs_file_write_iter` functions are
finally called. These two functions created a data race when they
finally called `inode_get_ctime_sec()` and `inode_set_ctime_to_ns()`.
The specific call stack that appears during testing is as follows:

============DATA_RACE============
btrfs_delayed_update_inode+0x1f61/0x7ce0 [btrfs]
btrfs_update_inode+0x45e/0xbb0 [btrfs]
btrfs_dirty_inode+0x2b8/0x530 [btrfs]
btrfs_update_time+0x1ad/0x230 [btrfs]
touch_atime+0x211/0x440
filemap_read+0x90f/0xa20
btrfs_file_read_iter+0xeb/0x580 [btrfs]
aio_read+0x275/0x3a0
io_submit_one+0xd22/0x1ce0
__se_sys_io_submit+0xb3/0x250
do_syscall_64+0xc1/0x190
entry_SYSCALL_64_after_hwframe+0x77/0x7f
============OTHER_INFO============
btrfs_write_check+0xa15/0x1390 [btrfs]
btrfs_buffered_write+0x52f/0x29d0 [btrfs]
btrfs_do_write_iter+0x53d/0x1590 [btrfs]
btrfs_file_write_iter+0x41/0x60 [btrfs]
aio_write+0x41e/0x5f0
io_submit_one+0xd42/0x1ce0
__se_sys_io_submit+0xb3/0x250
do_syscall_64+0xc1/0x190
entry_SYSCALL_64_after_hwframe+0x77/0x7f

To address this issue, it is recommended to add WRITE_ONCE
and READ_ONCE when reading or writing the `inode->i_ctime_sec`
and `inode->i_ctime_nsec` variable.

Signed-off-by: Hao-ran Zheng <zhenghaoran@buaa.edu.cn>
---
V3 -> V4: Fixed patch for latest code
V2 -> V3: Added READ_ONCE in inode_get_ctime_nsec() and addressed review comments
V1 -> V2: Added READ_ONCE in inode_get_ctime_sec()
---
 fs/inode.c | 16 ++++++++--------
 fs/stat.c  |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

Comments

Darrick J. Wong Nov. 24, 2024, 5:44 p.m. UTC | #1
On Sun, Nov 24, 2024 at 05:42:53PM +0800, Hao-ran Zheng wrote:
> A data race may occur when the function `inode_set_ctime_to_ts()` and
> the function `inode_get_ctime_sec()` are executed concurrently. When
> two threads call `aio_read` and `aio_write` respectively, they will
> be distributed to the read and write functions of the corresponding
> file system respectively. Taking the btrfs file system as an example,
> the `btrfs_file_read_iter` and `btrfs_file_write_iter` functions are
> finally called. These two functions created a data race when they
> finally called `inode_get_ctime_sec()` and `inode_set_ctime_to_ns()`.
> The specific call stack that appears during testing is as follows:
> 
> ============DATA_RACE============
> btrfs_delayed_update_inode+0x1f61/0x7ce0 [btrfs]
> btrfs_update_inode+0x45e/0xbb0 [btrfs]
> btrfs_dirty_inode+0x2b8/0x530 [btrfs]
> btrfs_update_time+0x1ad/0x230 [btrfs]
> touch_atime+0x211/0x440
> filemap_read+0x90f/0xa20
> btrfs_file_read_iter+0xeb/0x580 [btrfs]
> aio_read+0x275/0x3a0
> io_submit_one+0xd22/0x1ce0
> __se_sys_io_submit+0xb3/0x250
> do_syscall_64+0xc1/0x190
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> ============OTHER_INFO============
> btrfs_write_check+0xa15/0x1390 [btrfs]
> btrfs_buffered_write+0x52f/0x29d0 [btrfs]
> btrfs_do_write_iter+0x53d/0x1590 [btrfs]
> btrfs_file_write_iter+0x41/0x60 [btrfs]
> aio_write+0x41e/0x5f0
> io_submit_one+0xd42/0x1ce0
> __se_sys_io_submit+0xb3/0x250
> do_syscall_64+0xc1/0x190
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> 
> To address this issue, it is recommended to add WRITE_ONCE
> and READ_ONCE when reading or writing the `inode->i_ctime_sec`
> and `inode->i_ctime_nsec` variable.

Excuse my ignorance, but how exactly does this annotation fix the race?
Does it prevent reordering of the memory accesses or something?  "it is
recommended" is not enough for dunces such as myself to figure out why
this fixes the problem. :(

--D

> Signed-off-by: Hao-ran Zheng <zhenghaoran@buaa.edu.cn>
> ---
> V3 -> V4: Fixed patch for latest code
> V2 -> V3: Added READ_ONCE in inode_get_ctime_nsec() and addressed review comments
> V1 -> V2: Added READ_ONCE in inode_get_ctime_sec()
> ---
>  fs/inode.c | 16 ++++++++--------
>  fs/stat.c  |  2 +-
>  2 files changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index b13b778257ae..bfab370c8622 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -2713,8 +2713,8 @@ struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 t
>  {
>  	trace_inode_set_ctime_to_ts(inode, &ts);
>  	set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
> -	inode->i_ctime_sec = ts.tv_sec;
> -	inode->i_ctime_nsec = ts.tv_nsec;
> +	WRITE_ONCE(inode->i_ctime_sec, ts.tv_sec);
> +	WRITE_ONCE(inode->i_ctime_nsec, ts.tv_nsec);
>  	return ts;
>  }
>  EXPORT_SYMBOL(inode_set_ctime_to_ts);
> @@ -2788,7 +2788,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
>  	 */
>  	cns = smp_load_acquire(&inode->i_ctime_nsec);
>  	if (cns & I_CTIME_QUERIED) {
> -		struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
> +		struct timespec64 ctime = { .tv_sec = READ_ONCE(inode->i_ctime_sec),
>  					    .tv_nsec = cns & ~I_CTIME_QUERIED };
>  
>  		if (timespec64_compare(&now, &ctime) <= 0) {
> @@ -2809,7 +2809,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
>  	/* Try to swap the nsec value into place. */
>  	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
>  		/* If swap occurred, then we're (mostly) done */
> -		inode->i_ctime_sec = now.tv_sec;
> +		WRITE_ONCE(inode->i_ctime_sec, now.tv_sec);
>  		trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
>  		mgtime_counter_inc(mg_ctime_swaps);
>  	} else {
> @@ -2824,7 +2824,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
>  			goto retry;
>  		}
>  		/* Otherwise, keep the existing ctime */
> -		now.tv_sec = inode->i_ctime_sec;
> +		now.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  		now.tv_nsec = cur & ~I_CTIME_QUERIED;
>  	}
>  out:
> @@ -2857,7 +2857,7 @@ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
>  	/* pairs with try_cmpxchg below */
>  	cur = smp_load_acquire(&inode->i_ctime_nsec);
>  	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
> -	cur_ts.tv_sec = inode->i_ctime_sec;
> +	cur_ts.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  
>  	/* If the update is older than the existing value, skip it. */
>  	if (timespec64_compare(&update, &cur_ts) <= 0)
> @@ -2883,7 +2883,7 @@ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
>  retry:
>  	old = cur;
>  	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
> -		inode->i_ctime_sec = update.tv_sec;
> +		WRITE_ONCE(inode->i_ctime_sec, update.tv_sec);
>  		mgtime_counter_inc(mg_ctime_swaps);
>  		return update;
>  	}
> @@ -2899,7 +2899,7 @@ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
>  		goto retry;
>  
>  	/* Otherwise, it was a new timestamp. */
> -	cur_ts.tv_sec = inode->i_ctime_sec;
> +	cur_ts.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
>  	return cur_ts;
>  }
> diff --git a/fs/stat.c b/fs/stat.c
> index 0870e969a8a0..e39c78cd62f3 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -53,7 +53,7 @@ void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
>  	}
>  
>  	stat->mtime = inode_get_mtime(inode);
> -	stat->ctime.tv_sec = inode->i_ctime_sec;
> +	stat->ctime.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  	stat->ctime.tv_nsec = (u32)atomic_read(pcn);
>  	if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
>  		stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));
> -- 
> 2.34.1
> 
>
Mateusz Guzik Nov. 24, 2024, 5:56 p.m. UTC | #2
On Sun, Nov 24, 2024 at 09:44:35AM -0800, Darrick J. Wong wrote:
> On Sun, Nov 24, 2024 at 05:42:53PM +0800, Hao-ran Zheng wrote:
> > A data race may occur when the function `inode_set_ctime_to_ts()` and
> > the function `inode_get_ctime_sec()` are executed concurrently. When
> > two threads call `aio_read` and `aio_write` respectively, they will
> > be distributed to the read and write functions of the corresponding
> > file system respectively. Taking the btrfs file system as an example,
> > the `btrfs_file_read_iter` and `btrfs_file_write_iter` functions are
> > finally called. These two functions created a data race when they
> > finally called `inode_get_ctime_sec()` and `inode_set_ctime_to_ns()`.
> > The specific call stack that appears during testing is as follows:
> > 
> > ============DATA_RACE============
> > btrfs_delayed_update_inode+0x1f61/0x7ce0 [btrfs]
> > btrfs_update_inode+0x45e/0xbb0 [btrfs]
> > btrfs_dirty_inode+0x2b8/0x530 [btrfs]
> > btrfs_update_time+0x1ad/0x230 [btrfs]
> > touch_atime+0x211/0x440
> > filemap_read+0x90f/0xa20
> > btrfs_file_read_iter+0xeb/0x580 [btrfs]
> > aio_read+0x275/0x3a0
> > io_submit_one+0xd22/0x1ce0
> > __se_sys_io_submit+0xb3/0x250
> > do_syscall_64+0xc1/0x190
> > entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > ============OTHER_INFO============
> > btrfs_write_check+0xa15/0x1390 [btrfs]
> > btrfs_buffered_write+0x52f/0x29d0 [btrfs]
> > btrfs_do_write_iter+0x53d/0x1590 [btrfs]
> > btrfs_file_write_iter+0x41/0x60 [btrfs]
> > aio_write+0x41e/0x5f0
> > io_submit_one+0xd42/0x1ce0
> > __se_sys_io_submit+0xb3/0x250
> > do_syscall_64+0xc1/0x190
> > entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > 
> > To address this issue, it is recommended to add WRITE_ONCE
> > and READ_ONCE when reading or writing the `inode->i_ctime_sec`
> > and `inode->i_ctime_nsec` variable.
> 
> Excuse my ignorance, but how exactly does this annotation fix the race?
> Does it prevent reordering of the memory accesses or something?  "it is
> recommended" is not enough for dunces such as myself to figure out why
> this fixes the problem. :(
> 

It prevents the compiler from getting ideas. One hypothetical is
splitting the load from one asm op to several, possibly resulting a
corrupted value when racing against an update

A not hypothethical concerns some like this:
	time64_t sec = inode_get_ctime_sec(inode);
	/* do stuff with sec */
	/* do other stuff */
	/* do more stuff sec */

The compiler might have decided to throw away the read sec value and do
the load again later. Thus if an update happened in the meantime then
the code ends up operating on 2 different values, even though the
programmer clearly intended it to be stable.

However, since both sec and nsec are updated separately and there is no
synchro, reading *both* can still result in values from 2 different
updates which is a bug not addressed by any of the above. To my
underestanding of the vfs folk take on it this is considered tolerable.
Darrick J. Wong Nov. 24, 2024, 6:34 p.m. UTC | #3
On Sun, Nov 24, 2024 at 06:56:57PM +0100, Mateusz Guzik wrote:
> On Sun, Nov 24, 2024 at 09:44:35AM -0800, Darrick J. Wong wrote:
> > On Sun, Nov 24, 2024 at 05:42:53PM +0800, Hao-ran Zheng wrote:
> > > A data race may occur when the function `inode_set_ctime_to_ts()` and
> > > the function `inode_get_ctime_sec()` are executed concurrently. When
> > > two threads call `aio_read` and `aio_write` respectively, they will
> > > be distributed to the read and write functions of the corresponding
> > > file system respectively. Taking the btrfs file system as an example,
> > > the `btrfs_file_read_iter` and `btrfs_file_write_iter` functions are
> > > finally called. These two functions created a data race when they
> > > finally called `inode_get_ctime_sec()` and `inode_set_ctime_to_ns()`.
> > > The specific call stack that appears during testing is as follows:
> > > 
> > > ============DATA_RACE============
> > > btrfs_delayed_update_inode+0x1f61/0x7ce0 [btrfs]
> > > btrfs_update_inode+0x45e/0xbb0 [btrfs]
> > > btrfs_dirty_inode+0x2b8/0x530 [btrfs]
> > > btrfs_update_time+0x1ad/0x230 [btrfs]
> > > touch_atime+0x211/0x440
> > > filemap_read+0x90f/0xa20
> > > btrfs_file_read_iter+0xeb/0x580 [btrfs]
> > > aio_read+0x275/0x3a0
> > > io_submit_one+0xd22/0x1ce0
> > > __se_sys_io_submit+0xb3/0x250
> > > do_syscall_64+0xc1/0x190
> > > entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > > ============OTHER_INFO============
> > > btrfs_write_check+0xa15/0x1390 [btrfs]
> > > btrfs_buffered_write+0x52f/0x29d0 [btrfs]
> > > btrfs_do_write_iter+0x53d/0x1590 [btrfs]
> > > btrfs_file_write_iter+0x41/0x60 [btrfs]
> > > aio_write+0x41e/0x5f0
> > > io_submit_one+0xd42/0x1ce0
> > > __se_sys_io_submit+0xb3/0x250
> > > do_syscall_64+0xc1/0x190
> > > entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > > 
> > > To address this issue, it is recommended to add WRITE_ONCE
> > > and READ_ONCE when reading or writing the `inode->i_ctime_sec`
> > > and `inode->i_ctime_nsec` variable.
> > 
> > Excuse my ignorance, but how exactly does this annotation fix the race?
> > Does it prevent reordering of the memory accesses or something?  "it is
> > recommended" is not enough for dunces such as myself to figure out why
> > this fixes the problem. :(
> > 
> 
> It prevents the compiler from getting ideas. One hypothetical is
> splitting the load from one asm op to several, possibly resulting a
> corrupted value when racing against an update
> 
> A not hypothethical concerns some like this:
> 	time64_t sec = inode_get_ctime_sec(inode);
> 	/* do stuff with sec */
> 	/* do other stuff */
> 	/* do more stuff sec */
> 
> The compiler might have decided to throw away the read sec value and do
> the load again later. Thus if an update happened in the meantime then
> the code ends up operating on 2 different values, even though the
> programmer clearly intended it to be stable.

<nod> I figured as much, but I wanted the commit message to spell that
out for everybody, e.g. "Use READ_ONCE to force compilers to sample the
ctime values one time only, and WRITE_ONCE to prevent the compiler from
turning one line of code into multiple stores to the same address."

> However, since both sec and nsec are updated separately and there is no
> synchro, reading *both* can still result in values from 2 different
> updates which is a bug not addressed by any of the above. To my
> underestanding of the vfs folk take on it this is considered tolerable.

<nod> This is the other thing -- the commit message refers to preventing
a data race, but there is indeed nothing about access ordering that
prevents /that/ race.  Someone not an fs developer could interpret that
as "the patch does not fully fix all possible problems" whereas the
community has decided there's an acceptable tradeoff with not taking
i_rwsem.

--D
Al Viro Nov. 24, 2024, 9:50 p.m. UTC | #4
[Linus Cc'd]
On Sun, Nov 24, 2024 at 06:56:57PM +0100, Mateusz Guzik wrote:

> However, since both sec and nsec are updated separately and there is no
> synchro, reading *both* can still result in values from 2 different
> updates which is a bug not addressed by any of the above. To my
> underestanding of the vfs folk take on it this is considered tolerable.

Well...   You have a timestamp changing.  A reader might get the value
before change, the value after change *or* one of those with nanoseconds
from another.  It's really hard to see the scenario where that would
be a problem - theoretically something might get confused seeing something
like
	Jan 14 1995 12:34:49.214 ->
	Jan 14 1995 12:34:49.137 ->
	Nov 23 2024 14:09:17.137
but... what would that something be?

We could add a seqcount, but stat(2) and friends already cost more than
they should, IMO...

Linus, do you see any good reasons to bother with that kind of stuff?
It's not the first time such metadata update vs. read atomicity comes
up, maybe we ought to settle that for good and document the decision
and reasons for it.

This time it's about timestamp (seconds vs. nanoseconds), but there'd
been mode vs. uid vs. gid mentioned in earlier threads.
Linus Torvalds Nov. 24, 2024, 10:10 p.m. UTC | #5
On Sun, 24 Nov 2024 at 13:50, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Linus, do you see any good reasons to bother with that kind of stuff?
> It's not the first time such metadata update vs. read atomicity comes
> up, maybe we ought to settle that for good and document the decision
> and reasons for it.
>
> This time it's about timestamp (seconds vs. nanoseconds), but there'd
> been mode vs. uid vs. gid mentioned in earlier threads.

I think the only one we ended up really caring about was i_size, which
had the 32-bit tearing problem and we do i_size_read() as a result.

I *do* think that we could perhaps extend (and rename) the
inode->i_size_seqcount to just cover all of the core inode metadata
stuff.

And then - exactly like we already do in practice with
inode->i_size_seqcount - some places might just choose to ignore it
anyway.

But at least using a sequence count shouldn't make things like stat()
any worse in practice.

That said, I don't think this is a real problem in practice. The race
window is too small, and the race effects are too insignificant.

Yes, getting the nanoseconds out of sync with the seconds is a bug,
but when it effectively never happens, and when it *does* happen it
likely has no real downsides, I suspect it's also not something we
should worry over-much about.

So I mention the "rename and extend i_size_seqcount" as a solution
that I suspect might be acceptable if somebody has the motivation and
energy, but honestly I also think "nobody can be bothered" is
acceptable in practice.

              Linus
Dr. David Alan Gilbert Nov. 24, 2024, 10:10 p.m. UTC | #6
* Al Viro (viro@zeniv.linux.org.uk) wrote:
> [Linus Cc'd]
> On Sun, Nov 24, 2024 at 06:56:57PM +0100, Mateusz Guzik wrote:
> 
> > However, since both sec and nsec are updated separately and there is no
> > synchro, reading *both* can still result in values from 2 different
> > updates which is a bug not addressed by any of the above. To my
> > underestanding of the vfs folk take on it this is considered tolerable.
> 
> Well...   You have a timestamp changing.  A reader might get the value
> before change, the value after change *or* one of those with nanoseconds
> from another.  It's really hard to see the scenario where that would
> be a problem - theoretically something might get confused seeing something
> like
> 	Jan 14 1995 12:34:49.214 ->
> 	Jan 14 1995 12:34:49.137 ->
> 	Nov 23 2024 14:09:17.137
> but... what would that something be?

make?
i.e. if the change was from:
 a) mmm dd yyyy hh::MM::00:950 ->
 b) mmm dd yyyy hh::MM::01:950 ->
 c) mmm dd yyyy hh::MM::01:200 ->
   
If you read (b) then you'd think that the file was 750ms newer
than it really was; which is a long time these days.

Dave

> We could add a seqcount, but stat(2) and friends already cost more than
> they should, IMO...
> 
> Linus, do you see any good reasons to bother with that kind of stuff?
> It's not the first time such metadata update vs. read atomicity comes
> up, maybe we ought to settle that for good and document the decision
> and reasons for it.
> 
> This time it's about timestamp (seconds vs. nanoseconds), but there'd
> been mode vs. uid vs. gid mentioned in earlier threads.
>
Matthew Wilcox (Oracle) Nov. 24, 2024, 10:19 p.m. UTC | #7
On Sun, Nov 24, 2024 at 10:10:31PM +0000, Dr. David Alan Gilbert wrote:
> * Al Viro (viro@zeniv.linux.org.uk) wrote:
> > [Linus Cc'd]
> > On Sun, Nov 24, 2024 at 06:56:57PM +0100, Mateusz Guzik wrote:
> > 
> > > However, since both sec and nsec are updated separately and there is no
> > > synchro, reading *both* can still result in values from 2 different
> > > updates which is a bug not addressed by any of the above. To my
> > > underestanding of the vfs folk take on it this is considered tolerable.
> > 
> > Well...   You have a timestamp changing.  A reader might get the value
> > before change, the value after change *or* one of those with nanoseconds
> > from another.  It's really hard to see the scenario where that would
> > be a problem - theoretically something might get confused seeing something
> > like
> > 	Jan 14 1995 12:34:49.214 ->
> > 	Jan 14 1995 12:34:49.137 ->
> > 	Nov 23 2024 14:09:17.137
> > but... what would that something be?
> 
> make?
> i.e. if the change was from:
>  a) mmm dd yyyy hh::MM::00:950 ->
>  b) mmm dd yyyy hh::MM::01:950 ->
>  c) mmm dd yyyy hh::MM::01:200 ->
>    
> If you read (b) then you'd think that the file was 750ms newer
> than it really was; which is a long time these days.

... and file fs/inode.c might have a timestamp of :01:717 so inode.o
doesn't get rebuilt when it ought to have been?
Al Viro Nov. 24, 2024, 10:24 p.m. UTC | #8
On Sun, Nov 24, 2024 at 02:10:30PM -0800, Linus Torvalds wrote:

> I *do* think that we could perhaps extend (and rename) the
> inode->i_size_seqcount to just cover all of the core inode metadata
> stuff.

That would bring ->i_size_seqcount to 64bit architectures and
potentially extend the time it's being held quite a bit even
on 32bit...

> And then - exactly like we already do in practice with
> inode->i_size_seqcount - some places might just choose to ignore it
> anyway.
> 
> But at least using a sequence count shouldn't make things like stat()
> any worse in practice.
>
> That said, I don't think this is a real problem in practice. The race
> window is too small, and the race effects are too insignificant.
> 
> Yes, getting the nanoseconds out of sync with the seconds is a bug,
> but when it effectively never happens, and when it *does* happen it
> likely has no real downsides, I suspect it's also not something we
> should worry over-much about.
> 
> So I mention the "rename and extend i_size_seqcount" as a solution
> that I suspect might be acceptable if somebody has the motivation and
> energy, but honestly I also think "nobody can be bothered" is
> acceptable in practice.

*nod*
Matthew Wilcox (Oracle) Nov. 24, 2024, 10:34 p.m. UTC | #9
On Sun, Nov 24, 2024 at 10:24:50PM +0000, Al Viro wrote:
> On Sun, Nov 24, 2024 at 02:10:30PM -0800, Linus Torvalds wrote:
> > I *do* think that we could perhaps extend (and rename) the
> > inode->i_size_seqcount to just cover all of the core inode metadata
> > stuff.
> 
> That would bring ->i_size_seqcount to 64bit architectures and
> potentially extend the time it's being held quite a bit even
> on 32bit...

Could we just do:

again:
	nsec = READ_ONCE(inode->nsec)
	sec = READ_ONCE(inode->sec)
	if (READ_ONCE(inode->nsec) != nsec)
		goto again;

?
Linus Torvalds Nov. 24, 2024, 10:40 p.m. UTC | #10
On Sun, 24 Nov 2024 at 14:24, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Sun, Nov 24, 2024 at 02:10:30PM -0800, Linus Torvalds wrote:
>
> > I *do* think that we could perhaps extend (and rename) the
> > inode->i_size_seqcount to just cover all of the core inode metadata
> > stuff.
>
> That would bring ->i_size_seqcount to 64bit architectures and
> potentially extend the time it's being held quite a bit even
> on 32bit...

Yes. But it seems fairly benign.

*IF* somebody really cares, that is.

         Linus
Linus Torvalds Nov. 24, 2024, 10:43 p.m. UTC | #11
On Sun, 24 Nov 2024 at 14:34, Matthew Wilcox <willy@infradead.org> wrote:
>
> Could we just do:
>
> again:
>         nsec = READ_ONCE(inode->nsec)
>         sec = READ_ONCE(inode->sec)
>         if (READ_ONCE(inode->nsec) != nsec)
>                 goto again;

No. You would need to use the right memory ordering barriers.

And make sure the writes are in the right order.

And even then it wouldn't protect against the race in theory, since
two (separate) time writes could make that nsec check work, even when
the 'sec' read wouldn't necessarily match *either* of the matching
nsec cases.

So it might catch some case of value tearing, and make a "this happens
in once in a blue moon" turn into a "this happens once in five blue
moons" situation instead.

But anybody who really cares about this case would presumably still
care about the "once in five blue moons" case too.

             Linus
Mateusz Guzik Nov. 24, 2024, 11:05 p.m. UTC | #12
On Sun, Nov 24, 2024 at 11:10 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> So I mention the "rename and extend i_size_seqcount" as a solution
> that I suspect might be acceptable if somebody has the motivation and
> energy, but honestly I also think "nobody can be bothered" is
> acceptable in practice.
>

So happens recently the metadata ordeal also came up around getattr
where a submitter wanted to lock the inode around it.

Looks like this is a recurring topic?

Until the day comes when someone has way too much time on their hands
and patches it up (even that may encounter resistance though), I do
think it would make sense to nicely write it down somewhere so for
easy reference -- maybe as a comment above getattr and note around
other places like the timespec helpers to read that.
Mateusz Guzik Nov. 24, 2024, 11:19 p.m. UTC | #13
On Mon, Nov 25, 2024 at 12:05 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
> Looks like this is a recurring topic?
>
> Until the day comes when someone has way too much time on their hands
> and patches it up (even that may encounter resistance though), I do
> think it would make sense to nicely write it down somewhere so for
> easy reference -- maybe as a comment above getattr and note around
> other places like the timespec helpers to read that.
>

I'll add personally I was concerned with uid:gid pairs vs chown --
after all you can read an incorrect pair after getting unlucky enough.
Again one has to really try to run into it though.

So how about something like this:
diff --git a/fs/stat.c b/fs/stat.c
index 0870e969a8a0..302586d6afae 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -78,6 +78,11 @@ EXPORT_SYMBOL(fill_mg_cmtime);
  * take care to map the inode according to @idmap before filling in the
  * uid and gid filds. On non-idmapped mounts or if permission checking is to be
  * performed on the raw inode simply pass @nop_mnt_idmap.
+ *
+ * DONTFIXME: no effort is put into ensuring a consistent snapshot of the
+ * metadata read below. For example a call racing against parallel setattr()
+ * can end up with a mixture of old and new attributes. This is not considered
+ * enough to warrant fixing.
  */
 void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
                      struct inode *inode, struct kstat *stat)

not an actual patch submission, any party is free to take the comment
and tweak in whatever capacity without credit.

What I am after here is preventing more people from spotting the
problem and thinking it is new.
Al Viro Nov. 24, 2024, 11:38 p.m. UTC | #14
On Mon, Nov 25, 2024 at 12:05:29AM +0100, Mateusz Guzik wrote:
> On Sun, Nov 24, 2024 at 11:10 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > So I mention the "rename and extend i_size_seqcount" as a solution
> > that I suspect might be acceptable if somebody has the motivation and
> > energy, but honestly I also think "nobody can be bothered" is
> > acceptable in practice.
> >
> 
> So happens recently the metadata ordeal also came up around getattr
> where a submitter wanted to lock the inode around it.

The posting Linus had been replying to:
https://lore.kernel.org/all/20241124215014.GA3387508@ZenIV/

> Until the day comes when someone has way too much time on their hands
> and patches it up (even that may encounter resistance though), I do
> think it would make sense to nicely write it down somewhere so for
> easy reference -- maybe as a comment above getattr and note around
> other places like the timespec helpers to read that.

See above.

For those who'd missed the getattr thread - the approach proposed and
NAKed there was to take ->i_rwsem (shared) in stat(2).  A non-starter
for obvious reasons, IMO.  Seqcount avoids those, but it would need to
be a pair of primitives used around the stores, with i_size_write()
*not* usable inside such scope.  Potential problems would be the
amount of time spent inside those scopes and amount of spinning it
would cause on the stat(2) side + the inode bloat.

All of that is modulo usefulness of such atomicity - nothing mentioned
so far seems to be a good reason to bother with all of that in the
first place...
Al Viro Nov. 24, 2024, 11:41 p.m. UTC | #15
On Mon, Nov 25, 2024 at 12:19:44AM +0100, Mateusz Guzik wrote:

> + *
> + * DONTFIXME: no effort is put into ensuring a consistent snapshot of the
> + * metadata read below. For example a call racing against parallel setattr()
> + * can end up with a mixture of old and new attributes. This is not considered
> + * enough to warrant fixing.
>   */
>  void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
>                       struct inode *inode, struct kstat *stat)
> 
> not an actual patch submission, any party is free to take the comment
> and tweak in whatever capacity without credit.
> 
> What I am after here is preventing more people from spotting the
> problem and thinking it is new.

getattr() is not the only reader for those - permission() is *much*
hotter; for uid/gid issues, if somebody can think of any scenario
where it's a real problem, permission() would be the main source
of headache.
Matthew Wilcox (Oracle) Nov. 24, 2024, 11:53 p.m. UTC | #16
On Sun, Nov 24, 2024 at 02:43:58PM -0800, Linus Torvalds wrote:
> On Sun, 24 Nov 2024 at 14:34, Matthew Wilcox <willy@infradead.org> wrote:
> >
> > Could we just do:
> >
> > again:
> >         nsec = READ_ONCE(inode->nsec)
> >         sec = READ_ONCE(inode->sec)
> >         if (READ_ONCE(inode->nsec) != nsec)
> >                 goto again;
> 
> No. You would need to use the right memory ordering barriers.
> 
> And make sure the writes are in the right order.
> 
> And even then it wouldn't protect against the race in theory, since
> two (separate) time writes could make that nsec check work, even when
> the 'sec' read wouldn't necessarily match *either* of the matching
> nsec cases.

But if we assume that time only goes forwards (ie nobody's calling
utime()), I don't think there's a sequence of updates which let you see
a file time which is newer than the actual time of the file.  I tried
to construct an example, and I couldn't.  eg:

A:	WRITE_ONCE(inode->sec, 5)
A:	WRITE_ONCE(inode->nsec, 950)
A:	WRITE_ONCE(inode->sec, 6)
B:	READ_ONCE(inode->nsec)
B:	READ_ONCE(inode->sec)
A:	WRITE_ONCE(inode->sec, 170)
A:	WRITE_ONCE(inode->sec, 7)
A:	WRITE_ONCE(inode->sec, 950)
B:	READ_ONCE(inode->nsec)

Now we have a time of 6:950 which is never a time that this file had,
but it's intermediate in time between two times that the file _did_
have, so it won't break make.

Or did I not try hard enough to construct a counterexample that
would break make?

(assume the appropriate read/write barriers are in there)
Linus Torvalds Nov. 25, 2024, 12:53 a.m. UTC | #17
On Sun, 24 Nov 2024 at 15:53, Matthew Wilcox <willy@infradead.org> wrote:
>
> a file time which is newer than the actual time of the file.  I tried
> to construct an example, and I couldn't.  eg:
>
> A:      WRITE_ONCE(inode->sec, 5)
> A:      WRITE_ONCE(inode->nsec, 950)
> A:      WRITE_ONCE(inode->sec, 6)
> B:      READ_ONCE(inode->nsec)
> B:      READ_ONCE(inode->sec)
> A:      WRITE_ONCE(inode->sec, 170)
> A:      WRITE_ONCE(inode->sec, 7)
> A:      WRITE_ONCE(inode->sec, 950)
> B:      READ_ONCE(inode->nsec)

I assume those WRITE_ONCE(170/190) should be nsec.

Also note that as long as you insist on using READ_ONCE, your example
is completely bogus due to memory ordering issues. It happens to work
on x86 where all reads are ordered, but on other architectures you'd
literally re-order the reads in question completely.

So assuming we make the read_once be "smp_read_acquire()" to make them
ordered wrt each other, and make the writes ordered with smp_wmb() or
smp_store_release(), I think it still can fail.

Look, let's write 5.000950, 6.000150 and 7.000950, while there is a
single reader (and let's assume these are all properly ordered reads
and writes):

  W1.s 5
  W1.ns 950
  W2.s 6
  R.ns (950)
  R.s (6)
  W2.ns 150
  W3.s 7
  W3.ns 950
  R.ns (950)

and look how the reader is happy, because it got the same nanoseconds
twice. But the reader thinks it had a time of 6.000950, and AT NO
POINT was that actually a valid time.

                    Linus
Linus Torvalds Nov. 25, 2024, 1:02 a.m. UTC | #18
On Sun, 24 Nov 2024 at 16:53, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> and look how the reader is happy, because it got the same nanoseconds
> twice. But the reader thinks it had a time of 6.000950, and AT NO
> POINT was that actually a valid time.

And let me clarify again that I don't actually personally think we
should care deeply about any of this.

I think the race is entirely theoretical and doesn't happen in
practice (regardless of the "re-read nsec", which I don't think
works), and I don't think anybody cares that deeply about nanoseconds
anyway.

The "worst" that would happen is likely that some cache that depended
on timestamps would get invalidated, there really aren't a ton of
things that depend on exact time outside of that.

Also, fixing it with the sequence count should be fairly low-cost, but
is not entirely free.

I wouldn't worry about the writer side, but the reader side ends up
with a smp_read_acquire() on the first sequence count read, and a
smp_rmb() before the final sequence count read.

That's free on x86 where all reads are ordered (well, "free" - you
still need to actually do the sequence count read, but it is hopefully
in the same cacheline as the hot data), but smp_rmb() can be fairly
expensive on other architectures.

              Linus
Matthew Wilcox (Oracle) Nov. 25, 2024, 1:15 a.m. UTC | #19
On Sun, Nov 24, 2024 at 04:53:39PM -0800, Linus Torvalds wrote:
> On Sun, 24 Nov 2024 at 15:53, Matthew Wilcox <willy@infradead.org> wrote:
> Look, let's write 5.000950, 6.000150 and 7.000950, while there is a
> single reader (and let's assume these are all properly ordered reads
> and writes):
> 
>   W1.s 5
>   W1.ns 950
>   W2.s 6
>   R.ns (950)
>   R.s (6)
>   W2.ns 150
>   W3.s 7
>   W3.ns 950
>   R.ns (950)
> 
> and look how the reader is happy, because it got the same nanoseconds
> twice. But the reader thinks it had a time of 6.000950, and AT NO
> POINT was that actually a valid time.

I literally said that.

"Now we have a time of 6:950 which is never a time that this file had,
but it's intermediate in time between two times that the file _did_
have, so it won't break make."
Linus Torvalds Nov. 25, 2024, 1:26 a.m. UTC | #20
On Sun, 24 Nov 2024 at 17:15, Matthew Wilcox <willy@infradead.org> wrote:
>
> I literally said that.

Bah, I misunderstood you.

Then yes, if all the writers are always in order, and you don't
actually care about exact time matching but only ordering, I guess it
might work.

But since you need all the same barriers that you would need for just
doing it right with a seqcount, it's not much of an advantage, and it
doesn't give you consistency for any other kind of action.

             Linus
Christian Brauner Nov. 25, 2024, 12:20 p.m. UTC | #21
> So I mention the "rename and extend i_size_seqcount" as a solution
> that I suspect might be acceptable if somebody has the motivation and
> energy, but honestly I also think "nobody can be bothered" is
> acceptable in practice.

I've said it before but I'm strongly on the "let's not care" side of
this rather than complicating this code unnecessarily for some weird
corner case. So I'd be pretty reluctant to exited about patches for
this...
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index b13b778257ae..bfab370c8622 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2713,8 +2713,8 @@  struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 t
 {
 	trace_inode_set_ctime_to_ts(inode, &ts);
 	set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
-	inode->i_ctime_sec = ts.tv_sec;
-	inode->i_ctime_nsec = ts.tv_nsec;
+	WRITE_ONCE(inode->i_ctime_sec, ts.tv_sec);
+	WRITE_ONCE(inode->i_ctime_nsec, ts.tv_nsec);
 	return ts;
 }
 EXPORT_SYMBOL(inode_set_ctime_to_ts);
@@ -2788,7 +2788,7 @@  struct timespec64 inode_set_ctime_current(struct inode *inode)
 	 */
 	cns = smp_load_acquire(&inode->i_ctime_nsec);
 	if (cns & I_CTIME_QUERIED) {
-		struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
+		struct timespec64 ctime = { .tv_sec = READ_ONCE(inode->i_ctime_sec),
 					    .tv_nsec = cns & ~I_CTIME_QUERIED };
 
 		if (timespec64_compare(&now, &ctime) <= 0) {
@@ -2809,7 +2809,7 @@  struct timespec64 inode_set_ctime_current(struct inode *inode)
 	/* Try to swap the nsec value into place. */
 	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
 		/* If swap occurred, then we're (mostly) done */
-		inode->i_ctime_sec = now.tv_sec;
+		WRITE_ONCE(inode->i_ctime_sec, now.tv_sec);
 		trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
 		mgtime_counter_inc(mg_ctime_swaps);
 	} else {
@@ -2824,7 +2824,7 @@  struct timespec64 inode_set_ctime_current(struct inode *inode)
 			goto retry;
 		}
 		/* Otherwise, keep the existing ctime */
-		now.tv_sec = inode->i_ctime_sec;
+		now.tv_sec = READ_ONCE(inode->i_ctime_sec);
 		now.tv_nsec = cur & ~I_CTIME_QUERIED;
 	}
 out:
@@ -2857,7 +2857,7 @@  struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
 	/* pairs with try_cmpxchg below */
 	cur = smp_load_acquire(&inode->i_ctime_nsec);
 	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
-	cur_ts.tv_sec = inode->i_ctime_sec;
+	cur_ts.tv_sec = READ_ONCE(inode->i_ctime_sec);
 
 	/* If the update is older than the existing value, skip it. */
 	if (timespec64_compare(&update, &cur_ts) <= 0)
@@ -2883,7 +2883,7 @@  struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
 retry:
 	old = cur;
 	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
-		inode->i_ctime_sec = update.tv_sec;
+		WRITE_ONCE(inode->i_ctime_sec, update.tv_sec);
 		mgtime_counter_inc(mg_ctime_swaps);
 		return update;
 	}
@@ -2899,7 +2899,7 @@  struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
 		goto retry;
 
 	/* Otherwise, it was a new timestamp. */
-	cur_ts.tv_sec = inode->i_ctime_sec;
+	cur_ts.tv_sec = READ_ONCE(inode->i_ctime_sec);
 	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
 	return cur_ts;
 }
diff --git a/fs/stat.c b/fs/stat.c
index 0870e969a8a0..e39c78cd62f3 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -53,7 +53,7 @@  void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
 	}
 
 	stat->mtime = inode_get_mtime(inode);
-	stat->ctime.tv_sec = inode->i_ctime_sec;
+	stat->ctime.tv_sec = READ_ONCE(inode->i_ctime_sec);
 	stat->ctime.tv_nsec = (u32)atomic_read(pcn);
 	if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
 		stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));