diff mbox series

fs/direct-io: avoid data race on ->s_dio_done_wq

Message ID 20200713033330.205104-1-ebiggers@kernel.org (mailing list archive)
State New, archived
Headers show
Series fs/direct-io: avoid data race on ->s_dio_done_wq | expand

Commit Message

Eric Biggers July 13, 2020, 3:33 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

Fix the preliminary checks for ->s_dio_done_wq to use READ_ONCE(), since
it's a data race, and technically the behavior is undefined without
READ_ONCE().  Also, on one CPU architecture (Alpha), the data read
dependency barrier included in READ_ONCE() is needed to guarantee that
the pointed-to struct is seen as fully initialized.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/direct-io.c       | 8 +++-----
 fs/internal.h        | 9 ++++++++-
 fs/iomap/direct-io.c | 3 +--
 3 files changed, 12 insertions(+), 8 deletions(-)

Comments

Dave Chinner July 15, 2020, 1:30 a.m. UTC | #1
On Sun, Jul 12, 2020 at 08:33:30PM -0700, Eric Biggers wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Fix the preliminary checks for ->s_dio_done_wq to use READ_ONCE(), since
> it's a data race, and technically the behavior is undefined without
> READ_ONCE().  Also, on one CPU architecture (Alpha), the data read
> dependency barrier included in READ_ONCE() is needed to guarantee that
> the pointed-to struct is seen as fully initialized.
> 
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
>  fs/direct-io.c       | 8 +++-----
>  fs/internal.h        | 9 ++++++++-
>  fs/iomap/direct-io.c | 3 +--
>  3 files changed, 12 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 6d5370eac2a8..26221ae24156 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -590,7 +590,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
>   * filesystems that don't need it and also allows us to create the workqueue
>   * late enough so the we can include s_id in the name of the workqueue.
>   */
> -int sb_init_dio_done_wq(struct super_block *sb)
> +int __sb_init_dio_done_wq(struct super_block *sb)
>  {
>  	struct workqueue_struct *old;
>  	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
> @@ -615,9 +615,7 @@ static int dio_set_defer_completion(struct dio *dio)
>  	if (dio->defer_completion)
>  		return 0;
>  	dio->defer_completion = true;
> -	if (!sb->s_dio_done_wq)
> -		return sb_init_dio_done_wq(sb);
> -	return 0;
> +	return sb_init_dio_done_wq(sb);
>  }
>  
>  /*
> @@ -1250,7 +1248,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  		retval = 0;
>  		if (iocb->ki_flags & IOCB_DSYNC)
>  			retval = dio_set_defer_completion(dio);
> -		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +		else {
>  			/*
>  			 * In case of AIO write racing with buffered read we
>  			 * need to defer completion. We can't decide this now,
> diff --git a/fs/internal.h b/fs/internal.h
> index 9b863a7bd708..6736c9eee978 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -178,7 +178,14 @@ extern void mnt_pin_kill(struct mount *m);
>  extern const struct dentry_operations ns_dentry_operations;
>  
>  /* direct-io.c: */
> -int sb_init_dio_done_wq(struct super_block *sb);
> +int __sb_init_dio_done_wq(struct super_block *sb);
> +static inline int sb_init_dio_done_wq(struct super_block *sb)
> +{
> +	/* pairs with cmpxchg() in __sb_init_dio_done_wq() */
> +	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> +		return 0;
> +	return __sb_init_dio_done_wq(sb);
> +}

Ummm, why don't you just add this check in sb_init_dio_done_wq(). I
don't see any need for adding another level of function call
abstraction in the source code?

Also, you need to explain the reason for the READ_ONCE() existing
rather than just saying "it pairs with <some other operation>".
Knowing what operation it pairs with doesn't explain why the pairing
is necessary in the first place, and that leads to nobody reading
the code being able to understand what this is protecting against.

Cheers,

Dave.
Eric Biggers July 15, 2020, 2:37 a.m. UTC | #2
On Wed, Jul 15, 2020 at 11:30:08AM +1000, Dave Chinner wrote:
> On Sun, Jul 12, 2020 at 08:33:30PM -0700, Eric Biggers wrote:
> > From: Eric Biggers <ebiggers@google.com>
> > 
> > Fix the preliminary checks for ->s_dio_done_wq to use READ_ONCE(), since
> > it's a data race, and technically the behavior is undefined without
> > READ_ONCE().  Also, on one CPU architecture (Alpha), the data read
> > dependency barrier included in READ_ONCE() is needed to guarantee that
> > the pointed-to struct is seen as fully initialized.
> > 
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
> > ---
> >  fs/direct-io.c       | 8 +++-----
> >  fs/internal.h        | 9 ++++++++-
> >  fs/iomap/direct-io.c | 3 +--
> >  3 files changed, 12 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/direct-io.c b/fs/direct-io.c
> > index 6d5370eac2a8..26221ae24156 100644
> > --- a/fs/direct-io.c
> > +++ b/fs/direct-io.c
> > @@ -590,7 +590,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
> >   * filesystems that don't need it and also allows us to create the workqueue
> >   * late enough so the we can include s_id in the name of the workqueue.
> >   */
> > -int sb_init_dio_done_wq(struct super_block *sb)
> > +int __sb_init_dio_done_wq(struct super_block *sb)
> >  {
> >  	struct workqueue_struct *old;
> >  	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
> > @@ -615,9 +615,7 @@ static int dio_set_defer_completion(struct dio *dio)
> >  	if (dio->defer_completion)
> >  		return 0;
> >  	dio->defer_completion = true;
> > -	if (!sb->s_dio_done_wq)
> > -		return sb_init_dio_done_wq(sb);
> > -	return 0;
> > +	return sb_init_dio_done_wq(sb);
> >  }
> >  
> >  /*
> > @@ -1250,7 +1248,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
> >  		retval = 0;
> >  		if (iocb->ki_flags & IOCB_DSYNC)
> >  			retval = dio_set_defer_completion(dio);
> > -		else if (!dio->inode->i_sb->s_dio_done_wq) {
> > +		else {
> >  			/*
> >  			 * In case of AIO write racing with buffered read we
> >  			 * need to defer completion. We can't decide this now,
> > diff --git a/fs/internal.h b/fs/internal.h
> > index 9b863a7bd708..6736c9eee978 100644
> > --- a/fs/internal.h
> > +++ b/fs/internal.h
> > @@ -178,7 +178,14 @@ extern void mnt_pin_kill(struct mount *m);
> >  extern const struct dentry_operations ns_dentry_operations;
> >  
> >  /* direct-io.c: */
> > -int sb_init_dio_done_wq(struct super_block *sb);
> > +int __sb_init_dio_done_wq(struct super_block *sb);
> > +static inline int sb_init_dio_done_wq(struct super_block *sb)
> > +{
> > +	/* pairs with cmpxchg() in __sb_init_dio_done_wq() */
> > +	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > +		return 0;
> > +	return __sb_init_dio_done_wq(sb);
> > +}
> 
> Ummm, why don't you just add this check in sb_init_dio_done_wq(). I
> don't see any need for adding another level of function call
> abstraction in the source code?

This keeps the fast path doing no function calls and one fewer branch, as it was
before.  People care a lot about minimizing direct I/O overhead, so it seems
desirable to keep this simple optimization.  Would you rather it be removed?

> 
> Also, you need to explain the reason for the READ_ONCE() existing
> rather than just saying "it pairs with <some other operation>".
> Knowing what operation it pairs with doesn't explain why the pairing
> is necessary in the first place, and that leads to nobody reading
> the code being able to understand what this is protecting against.
> 

How about this?

	/*
	 * Nothing to do if ->s_dio_done_wq is already set.  But since another
	 * process may set it concurrently, we need to use READ_ONCE() rather
	 * than a plain read to avoid a data race (undefined behavior) and to
	 * ensure we observe the pointed-to struct to be fully initialized.
	 */
	if (likely(READ_ONCE(sb->s_dio_done_wq)))
		return 0;
Dave Chinner July 15, 2020, 8:01 a.m. UTC | #3
On Tue, Jul 14, 2020 at 07:37:14PM -0700, Eric Biggers wrote:
> On Wed, Jul 15, 2020 at 11:30:08AM +1000, Dave Chinner wrote:
> > On Sun, Jul 12, 2020 at 08:33:30PM -0700, Eric Biggers wrote:
> > > From: Eric Biggers <ebiggers@google.com>
> > > 
> > > Fix the preliminary checks for ->s_dio_done_wq to use READ_ONCE(), since
> > > it's a data race, and technically the behavior is undefined without
> > > READ_ONCE().  Also, on one CPU architecture (Alpha), the data read
> > > dependency barrier included in READ_ONCE() is needed to guarantee that
> > > the pointed-to struct is seen as fully initialized.
> > > 
> > > Signed-off-by: Eric Biggers <ebiggers@google.com>
> > > ---
> > >  fs/direct-io.c       | 8 +++-----
> > >  fs/internal.h        | 9 ++++++++-
> > >  fs/iomap/direct-io.c | 3 +--
> > >  3 files changed, 12 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/fs/direct-io.c b/fs/direct-io.c
> > > index 6d5370eac2a8..26221ae24156 100644
> > > --- a/fs/direct-io.c
> > > +++ b/fs/direct-io.c
> > > @@ -590,7 +590,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
> > >   * filesystems that don't need it and also allows us to create the workqueue
> > >   * late enough so the we can include s_id in the name of the workqueue.
> > >   */
> > > -int sb_init_dio_done_wq(struct super_block *sb)
> > > +int __sb_init_dio_done_wq(struct super_block *sb)
> > >  {
> > >  	struct workqueue_struct *old;
> > >  	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
> > > @@ -615,9 +615,7 @@ static int dio_set_defer_completion(struct dio *dio)
> > >  	if (dio->defer_completion)
> > >  		return 0;
> > >  	dio->defer_completion = true;
> > > -	if (!sb->s_dio_done_wq)
> > > -		return sb_init_dio_done_wq(sb);
> > > -	return 0;
> > > +	return sb_init_dio_done_wq(sb);
> > >  }
> > >  
> > >  /*
> > > @@ -1250,7 +1248,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
> > >  		retval = 0;
> > >  		if (iocb->ki_flags & IOCB_DSYNC)
> > >  			retval = dio_set_defer_completion(dio);
> > > -		else if (!dio->inode->i_sb->s_dio_done_wq) {
> > > +		else {
> > >  			/*
> > >  			 * In case of AIO write racing with buffered read we
> > >  			 * need to defer completion. We can't decide this now,
> > > diff --git a/fs/internal.h b/fs/internal.h
> > > index 9b863a7bd708..6736c9eee978 100644
> > > --- a/fs/internal.h
> > > +++ b/fs/internal.h
> > > @@ -178,7 +178,14 @@ extern void mnt_pin_kill(struct mount *m);
> > >  extern const struct dentry_operations ns_dentry_operations;
> > >  
> > >  /* direct-io.c: */
> > > -int sb_init_dio_done_wq(struct super_block *sb);
> > > +int __sb_init_dio_done_wq(struct super_block *sb);
> > > +static inline int sb_init_dio_done_wq(struct super_block *sb)
> > > +{
> > > +	/* pairs with cmpxchg() in __sb_init_dio_done_wq() */
> > > +	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > > +		return 0;
> > > +	return __sb_init_dio_done_wq(sb);
> > > +}
> > 
> > Ummm, why don't you just add this check in sb_init_dio_done_wq(). I
> > don't see any need for adding another level of function call
> > abstraction in the source code?
> 
> This keeps the fast path doing no function calls and one fewer branch, as it was
> before.  People care a lot about minimizing direct I/O overhead, so it seems
> desirable to keep this simple optimization.  Would you rather it be removed?

No.

What I'm trying to say is that I'd prefer fast path checks don't get
hidden away in a static inline function wrappers that require the
reader to go look up code in a different file to understand that
code in yet another different file is conditionally executed.

Going from obvious, easy to read fast path code to spreading the
fast path logic over functions in 3 different files is not an
improvement in the code - it is how we turn good code into an
unmaintainable mess...

> > Also, you need to explain the reason for the READ_ONCE() existing
> > rather than just saying "it pairs with <some other operation>".
> > Knowing what operation it pairs with doesn't explain why the pairing
> > is necessary in the first place, and that leads to nobody reading
> > the code being able to understand what this is protecting against.
> > 
> 
> How about this?
> 
> 	/*
> 	 * Nothing to do if ->s_dio_done_wq is already set.  But since another
> 	 * process may set it concurrently, we need to use READ_ONCE() rather
> 	 * than a plain read to avoid a data race (undefined behavior) and to
> 	 * ensure we observe the pointed-to struct to be fully initialized.
> 	 */
> 	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> 		return 0;

You still need to document what it pairs with, as "data race" doesn't
describe the actual dependency we are synchronising against is.

AFAICT from your description, the data race is not on
sb->s_dio_done_wq itself, but on seeing the contents of the
structure being pointed to incorrectly. i.e. we need to ensure that
writes done before the cmpxchg are ordered correctly against
reads done after the pointer can be seen here.

If so, can't we just treat this as a normal
store-release/load-acquire ordering pattern and hence use more
relaxed memory barriers instead of have to patch up what we have now
to specifically make ancient platforms that nobody actually uses
with weird and unusual memory models work correctly?

Cheers,

Dave.
Eric Biggers July 15, 2020, 4:13 p.m. UTC | #4
On Wed, Jul 15, 2020 at 06:01:44PM +1000, Dave Chinner wrote:
> > > >  /* direct-io.c: */
> > > > -int sb_init_dio_done_wq(struct super_block *sb);
> > > > +int __sb_init_dio_done_wq(struct super_block *sb);
> > > > +static inline int sb_init_dio_done_wq(struct super_block *sb)
> > > > +{
> > > > +	/* pairs with cmpxchg() in __sb_init_dio_done_wq() */
> > > > +	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > > > +		return 0;
> > > > +	return __sb_init_dio_done_wq(sb);
> > > > +}
> > > 
> > > Ummm, why don't you just add this check in sb_init_dio_done_wq(). I
> > > don't see any need for adding another level of function call
> > > abstraction in the source code?
> > 
> > This keeps the fast path doing no function calls and one fewer branch, as it was
> > before.  People care a lot about minimizing direct I/O overhead, so it seems
> > desirable to keep this simple optimization.  Would you rather it be removed?
> 
> No.
> 
> What I'm trying to say is that I'd prefer fast path checks don't get
> hidden away in a static inline function wrappers that require the
> reader to go look up code in a different file to understand that
> code in yet another different file is conditionally executed.
> 
> Going from obvious, easy to read fast path code to spreading the
> fast path logic over functions in 3 different files is not an
> improvement in the code - it is how we turn good code into an
> unmaintainable mess...

The alternative would be to duplicate the READ_ONCE() at all 3 call sites --
including the explanatory comment.  That seems strictly worse.

And the code before was broken, so I disagree it was "obvious" or "good".

> 
> > > Also, you need to explain the reason for the READ_ONCE() existing
> > > rather than just saying "it pairs with <some other operation>".
> > > Knowing what operation it pairs with doesn't explain why the pairing
> > > is necessary in the first place, and that leads to nobody reading
> > > the code being able to understand what this is protecting against.
> > > 
> > 
> > How about this?
> > 
> > 	/*
> > 	 * Nothing to do if ->s_dio_done_wq is already set.  But since another
> > 	 * process may set it concurrently, we need to use READ_ONCE() rather
> > 	 * than a plain read to avoid a data race (undefined behavior) and to
> > 	 * ensure we observe the pointed-to struct to be fully initialized.
> > 	 */
> > 	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > 		return 0;
> 
> You still need to document what it pairs with, as "data race" doesn't
> describe the actual dependency we are synchronising against is.
> 
> AFAICT from your description, the data race is not on
> sb->s_dio_done_wq itself, but on seeing the contents of the
> structure being pointed to incorrectly. i.e. we need to ensure that
> writes done before the cmpxchg are ordered correctly against
> reads done after the pointer can be seen here.
> 

No, the data race is on ->s_dio_done_wq itself.  How about this:

        /*
         * Nothing to do if ->s_dio_done_wq is already set.  The READ_ONCE()
         * here pairs with the cmpxchg() in __sb_init_dio_done_wq().  Since the
         * cmpxchg() may set ->s_dio_done_wq concurrently, a plain load would be
         * a data race (undefined behavior), so READ_ONCE() is needed.
         * READ_ONCE() also includes any needed read data dependency barrier to
         * ensure that the pointed-to struct is seen to be fully initialized.
         */

FWIW, long-term we really need to get developers to understand these sorts of
issues, so that the code is written correctly in the first place and we don't
need to annotate common patterns like one-time-init with a long essay and have a
long discussion.  Recently KCSAN was merged upstream
(https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/dev-tools/kcsan.rst)
and the memory model documentation was improved
(https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/explanation.txt?h=v5.8-rc5#n1922),
so hopefully that will raise awareness...

> If so, can't we just treat this as a normal
> store-release/load-acquire ordering pattern and hence use more
> relaxed memory barriers instead of have to patch up what we have now
> to specifically make ancient platforms that nobody actually uses
> with weird and unusual memory models work correctly?

READ_ONCE() is already as relaxed as it can get, as it includes a read data
dependency barrier only (which is no-op on everything other than Alpha).

If anything it should be upgraded to smp_load_acquire(), which handles control
dependencies too.  I didn't see anything obvious in the workqueue code that
would need that (i.e. accesses to some global structure that isn't transitively
reachable via the workqueue_struct itself).  But we could use it to be safe if
we're okay with any performance implications of the additional memory barrier it
would add.

- Eric
Darrick J. Wong July 15, 2020, 4:41 p.m. UTC | #5
On Wed, Jul 15, 2020 at 09:13:42AM -0700, Eric Biggers wrote:
> On Wed, Jul 15, 2020 at 06:01:44PM +1000, Dave Chinner wrote:
> > > > >  /* direct-io.c: */
> > > > > -int sb_init_dio_done_wq(struct super_block *sb);
> > > > > +int __sb_init_dio_done_wq(struct super_block *sb);
> > > > > +static inline int sb_init_dio_done_wq(struct super_block *sb)
> > > > > +{
> > > > > +	/* pairs with cmpxchg() in __sb_init_dio_done_wq() */
> > > > > +	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > > > > +		return 0;
> > > > > +	return __sb_init_dio_done_wq(sb);
> > > > > +}
> > > > 
> > > > Ummm, why don't you just add this check in sb_init_dio_done_wq(). I
> > > > don't see any need for adding another level of function call
> > > > abstraction in the source code?
> > > 
> > > This keeps the fast path doing no function calls and one fewer branch, as it was
> > > before.  People care a lot about minimizing direct I/O overhead, so it seems
> > > desirable to keep this simple optimization.  Would you rather it be removed?
> > 
> > No.
> > 
> > What I'm trying to say is that I'd prefer fast path checks don't get
> > hidden away in a static inline function wrappers that require the
> > reader to go look up code in a different file to understand that
> > code in yet another different file is conditionally executed.
> > 
> > Going from obvious, easy to read fast path code to spreading the
> > fast path logic over functions in 3 different files is not an
> > improvement in the code - it is how we turn good code into an
> > unmaintainable mess...
> 
> The alternative would be to duplicate the READ_ONCE() at all 3 call sites --
> including the explanatory comment.  That seems strictly worse.
> 
> And the code before was broken, so I disagree it was "obvious" or "good".
> 
> > 
> > > > Also, you need to explain the reason for the READ_ONCE() existing
> > > > rather than just saying "it pairs with <some other operation>".
> > > > Knowing what operation it pairs with doesn't explain why the pairing
> > > > is necessary in the first place, and that leads to nobody reading
> > > > the code being able to understand what this is protecting against.
> > > > 
> > > 
> > > How about this?
> > > 
> > > 	/*
> > > 	 * Nothing to do if ->s_dio_done_wq is already set.  But since another
> > > 	 * process may set it concurrently, we need to use READ_ONCE() rather
> > > 	 * than a plain read to avoid a data race (undefined behavior) and to
> > > 	 * ensure we observe the pointed-to struct to be fully initialized.
> > > 	 */
> > > 	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > > 		return 0;
> > 
> > You still need to document what it pairs with, as "data race" doesn't
> > describe the actual dependency we are synchronising against is.
> > 
> > AFAICT from your description, the data race is not on
> > sb->s_dio_done_wq itself, but on seeing the contents of the
> > structure being pointed to incorrectly. i.e. we need to ensure that
> > writes done before the cmpxchg are ordered correctly against
> > reads done after the pointer can be seen here.
> > 
> 
> No, the data race is on ->s_dio_done_wq itself.  How about this:
> 
>         /*
>          * Nothing to do if ->s_dio_done_wq is already set.  The READ_ONCE()
>          * here pairs with the cmpxchg() in __sb_init_dio_done_wq().  Since the
>          * cmpxchg() may set ->s_dio_done_wq concurrently, a plain load would be
>          * a data race (undefined behavior), so READ_ONCE() is needed.
>          * READ_ONCE() also includes any needed read data dependency barrier to
>          * ensure that the pointed-to struct is seen to be fully initialized.
>          */
> 
> FWIW, long-term we really need to get developers to understand these sorts of
> issues, so that the code is written correctly in the first place and we don't
> need to annotate common patterns like one-time-init with a long essay and have a
> long discussion.  Recently KCSAN was merged upstream
> (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/dev-tools/kcsan.rst)
> and the memory model documentation was improved
> (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/explanation.txt?h=v5.8-rc5#n1922),
> so hopefully that will raise awareness...

I tried to understand that, but TBH this whole topic area seems very
complex and difficult to understand.  I more or less understand what
READ_ONCE and WRITE_ONCE do wrt restricting compiler optimizations, but
I wouldn't say that I understand all that machinery.

Granted, my winning strategy so far is to write a simple version with
big dumb locks and let the rest of you argue over slick optimizations.
:P

<shrug> If using READ_ONCE and cmpxchg for pointer initialization (or I
guess smp_store_release and smp_load_acquire?) are a commonly used
paradigm, then maybe that should get its own section in
tools/memory-model/Documentation/recipes.txt and then any code that uses
it can point readers at that?

--D

> > If so, can't we just treat this as a normal
> > store-release/load-acquire ordering pattern and hence use more
> > relaxed memory barriers instead of have to patch up what we have now
> > to specifically make ancient platforms that nobody actually uses
> > with weird and unusual memory models work correctly?
> 
> READ_ONCE() is already as relaxed as it can get, as it includes a read data
> dependency barrier only (which is no-op on everything other than Alpha).
> 
> If anything it should be upgraded to smp_load_acquire(), which handles control
> dependencies too.  I didn't see anything obvious in the workqueue code that
> would need that (i.e. accesses to some global structure that isn't transitively
> reachable via the workqueue_struct itself).  But we could use it to be safe if
> we're okay with any performance implications of the additional memory barrier it
> would add.
> 
> - Eric
Dave Chinner July 16, 2020, 1:46 a.m. UTC | #6
On Wed, Jul 15, 2020 at 09:13:42AM -0700, Eric Biggers wrote:
> On Wed, Jul 15, 2020 at 06:01:44PM +1000, Dave Chinner wrote:
> > > > >  /* direct-io.c: */
> > > > > -int sb_init_dio_done_wq(struct super_block *sb);
> > > > > +int __sb_init_dio_done_wq(struct super_block *sb);
> > > > > +static inline int sb_init_dio_done_wq(struct super_block *sb)
> > > > > +{
> > > > > +	/* pairs with cmpxchg() in __sb_init_dio_done_wq() */
> > > > > +	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > > > > +		return 0;
> > > > > +	return __sb_init_dio_done_wq(sb);
> > > > > +}
> > > > 
> > > > Ummm, why don't you just add this check in sb_init_dio_done_wq(). I
> > > > don't see any need for adding another level of function call
> > > > abstraction in the source code?
> > > 
> > > This keeps the fast path doing no function calls and one fewer branch, as it was
> > > before.  People care a lot about minimizing direct I/O overhead, so it seems
> > > desirable to keep this simple optimization.  Would you rather it be removed?
> > 
> > No.
> > 
> > What I'm trying to say is that I'd prefer fast path checks don't get
> > hidden away in a static inline function wrappers that require the
> > reader to go look up code in a different file to understand that
> > code in yet another different file is conditionally executed.
> > 
> > Going from obvious, easy to read fast path code to spreading the
> > fast path logic over functions in 3 different files is not an
> > improvement in the code - it is how we turn good code into an
> > unmaintainable mess...
> 
> The alternative would be to duplicate the READ_ONCE() at all 3 call sites --
> including the explanatory comment.  That seems strictly worse.
> 
> And the code before was broken, so I disagree it was "obvious" or "good".

It's only "broken" on anything but an ancient platform that
has a unique, one-off, never to be repeated, whacky
memory model.

And why should we compromise performance on hundreds of millions of
modern systems to fix an extremely rare race on an extremely rare
platform that maybe only a hundred people world-wide might still
use?

Indeed, if you are going to use "fast path performance" as the
justification for READ_ONCE() over a more robust but slightly slower
release/acquire barrier pair, then I'd suggest that the check and
dynamic allocation of the workqueue should be removed entirely and
it should be allocated at mount time by filesytems that require it. 

IOWs, If we are truly concerned with fast path performance, then
we'd elide the entire branch from the fast path altogether, thereby
getting rid of the cmpxchg code and the data race at the same
time....

I'm talking from self interest here: I need to be able to understand
and debug this code, and if I struggle to understand what the memory
ordering relationship is and have to work it out from first
principles every time I have to look at the code, then *that is bad
code*.

I don't care about speed if I can't understand or easily modify the
code without it breaking unexpectedly. IO performance comes from the
algorithms, not micro-optimisation of CPU usage. If one CPU is not
enough, I've got another thousand I can also use. Make the code
simple, stupid, easy to maintain, and easy to test - performance
will come from smart, highly concurrent algorithms, not single CPU
usage micro-optimisations.

> > > > Also, you need to explain the reason for the READ_ONCE() existing
> > > > rather than just saying "it pairs with <some other operation>".
> > > > Knowing what operation it pairs with doesn't explain why the pairing
> > > > is necessary in the first place, and that leads to nobody reading
> > > > the code being able to understand what this is protecting against.
> > > > 
> > > 
> > > How about this?
> > > 
> > > 	/*
> > > 	 * Nothing to do if ->s_dio_done_wq is already set.  But since another
> > > 	 * process may set it concurrently, we need to use READ_ONCE() rather
> > > 	 * than a plain read to avoid a data race (undefined behavior) and to
> > > 	 * ensure we observe the pointed-to struct to be fully initialized.
> > > 	 */
> > > 	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > > 		return 0;
> > 
> > You still need to document what it pairs with, as "data race" doesn't
> > describe the actual dependency we are synchronising against is.
> > 
> > AFAICT from your description, the data race is not on
> > sb->s_dio_done_wq itself, but on seeing the contents of the
> > structure being pointed to incorrectly. i.e. we need to ensure that
> > writes done before the cmpxchg are ordered correctly against
> > reads done after the pointer can be seen here.
> > 
> 
> No, the data race is on ->s_dio_done_wq itself.  How about this:

Then we -just don't care- that it races because false negatives call
into sb_init_dio_done_wq() and it does the right thing. And given
that -false positives- cannot occur (requires time travel to see the
variable set before it has actually been set by the cmpxchg) there
is nothing wrong with this check being racy.

>         /*
>          * Nothing to do if ->s_dio_done_wq is already set.  The READ_ONCE()
>          * here pairs with the cmpxchg() in __sb_init_dio_done_wq().  Since the
>          * cmpxchg() may set ->s_dio_done_wq concurrently, a plain load would be
>          * a data race (undefined behavior),

No, it is *not undefined*. If we know what is racing, then it is
trivial to determine what the outcome of the race will be. And that
-defines- the behaviour that will occur, and as per above, once
we've defined the behaviour we realise that *we just don't care
about races on setting/reading ->s_dio_done_wq because the code
will *always do the right thing*.

>          so READ_ONCE() is needed.
>          * READ_ONCE() also includes any needed read data dependency barrier to
>          * ensure that the pointed-to struct is seen to be fully initialized.
>          */

IOWs, the only problem started here is the read data dependency
that results from not explicitly ordering loads from the memory that
->s_dio_done_wq points to before it is made visible via the cmpxchg.

Most people will be able to understand this as a
store-release/load-acquire data relationship.  Yes, I know that
because the related write operations that need ordering occur in the
same structure the atomic ops publish via a pointer to that
structure, it can be reduced to just a pure read data dependency.
But understanding that from the code and reasoning that the
dependency is unchanged by future modifications is complex and
beyond the capability of many people, likely including *me*.

If we leave the code as a simple store-release/load_acquire pattern,
then we just don't have to care about hidden or subtle data/control
dependencies. The code will work regardless of whether they exist or
not, so it will be much more robust against future changes.

> FWIW, long-term we really need to get developers to understand these sorts of
> issues, so that the code is written correctly in the first place and we don't
> need to annotate common patterns like one-time-init with a long essay and have a
> long discussion.

Well, yes, but slamming READ_ONCE/WRITE_ONCE all through the code
-does not acheive that-. Go back and look at the recent discussion
I had with PeterZ over the memory ordering in ttwu(). [1] You can
document the memory ordering, but that *does not make the code
understandable*. Memory ordering requires a -lot- of knowledge to
understand the context in which the ordering is necessary, and
nobody can understand that from a terse comment like "pairs with
<op> in <func>".

[1] https://lore.kernel.org/linux-xfs/20200626223254.GH2005@dread.disaster.area/

> Recently KCSAN was merged upstream
> (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/dev-tools/kcsan.rst)
> and the memory model documentation was improved
> (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/explanation.txt?h=v5.8-rc5#n1922),
> so hopefully that will raise awareness...

The majority of the _good_ programmers I know run away screaming
from this stuff. As was said many, many years ago - understanding
memory-barriers.txt is an -extremely high bar- to set as a basic
requirement for being a kernel developer.

This needs to be simplified down into a small set of common
patterns. store-release/load-acquire is a simple pattern, especially
for people who understand that unlock-to-lock provides a memory
barrier for the critical section inside the lock. When you explain
that this is a store-release/load-acquire memory ordering pattern,
it's much easier to form a working understanding of the ordering
of operations.

> > If so, can't we just treat this as a normal
> > store-release/load-acquire ordering pattern and hence use more
> > relaxed memory barriers instead of have to patch up what we have now
> > to specifically make ancient platforms that nobody actually uses
> > with weird and unusual memory models work correctly?
> 
> READ_ONCE() is already as relaxed as it can get, as it includes a read data
> dependency barrier only (which is no-op on everything other than Alpha).
> 
> If anything it should be upgraded to smp_load_acquire(), which handles control
> dependencies too.  I didn't see anything obvious in the workqueue code that
> would need that (i.e. accesses to some global structure that isn't transitively
> reachable via the workqueue_struct itself).  But we could use it to be safe if
> we're okay with any performance implications of the additional memory barrier it
> would add.

That's exactly the point I've been trying to make - if people cannot
reason why the code is correct/safe from reading it, then it doesn't
matter how fast it is.

Cheers,

Dave.
Eric Biggers July 16, 2020, 2:39 a.m. UTC | #7
On Thu, Jul 16, 2020 at 11:46:56AM +1000, Dave Chinner wrote:
> > > > > Also, you need to explain the reason for the READ_ONCE() existing
> > > > > rather than just saying "it pairs with <some other operation>".
> > > > > Knowing what operation it pairs with doesn't explain why the pairing
> > > > > is necessary in the first place, and that leads to nobody reading
> > > > > the code being able to understand what this is protecting against.
> > > > > 
> > > > 
> > > > How about this?
> > > > 
> > > > 	/*
> > > > 	 * Nothing to do if ->s_dio_done_wq is already set.  But since another
> > > > 	 * process may set it concurrently, we need to use READ_ONCE() rather
> > > > 	 * than a plain read to avoid a data race (undefined behavior) and to
> > > > 	 * ensure we observe the pointed-to struct to be fully initialized.
> > > > 	 */
> > > > 	if (likely(READ_ONCE(sb->s_dio_done_wq)))
> > > > 		return 0;
> > > 
> > > You still need to document what it pairs with, as "data race" doesn't
> > > describe the actual dependency we are synchronising against is.
> > > 
> > > AFAICT from your description, the data race is not on
> > > sb->s_dio_done_wq itself, but on seeing the contents of the
> > > structure being pointed to incorrectly. i.e. we need to ensure that
> > > writes done before the cmpxchg are ordered correctly against
> > > reads done after the pointer can be seen here.
> > > 
> > 
> > No, the data race is on ->s_dio_done_wq itself.  How about this:
> 
> Then we -just don't care- that it races because false negatives call
> into sb_init_dio_done_wq() and it does the right thing. And given
> that -false positives- cannot occur (requires time travel to see the
> variable set before it has actually been set by the cmpxchg) there
> is nothing wrong with this check being racy.
> 
> >         /*
> >          * Nothing to do if ->s_dio_done_wq is already set.  The READ_ONCE()
> >          * here pairs with the cmpxchg() in __sb_init_dio_done_wq().  Since the
> >          * cmpxchg() may set ->s_dio_done_wq concurrently, a plain load would be
> >          * a data race (undefined behavior),
> 
> No, it is *not undefined*. If we know what is racing, then it is
> trivial to determine what the outcome of the race will be. And that
> -defines- the behaviour that will occur, and as per above, once
> we've defined the behaviour we realise that *we just don't care
> about races on setting/reading ->s_dio_done_wq because the code
> will *always do the right thing*.

You're just wrong here.  The C standard allows compilers to assume that
variables accessed by plain accesses aren't concurrently changed by other CPUs.
It's undefined behavior if other CPUs do in fact change the variable.

For example, it's perfectly allowed for the compiler to load the variable twice,
do the NULL check on the second copy, then actually use the first copy --- since
it can assume the two copies will have the same value.  But here the first copy
could be NULL and the second copy could be non-NULL.

See https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE for a
discussion of the many other reasons to properly annotate data races as well.

I understand that many kernel developers are perfectly happy to rely on the
compiler being "reasonable" (with everyone having their own opinion of what is
"reasonable").  I think that's not enough.  Whenever feasible, we should write
unambiguously correct code.

Anyway, do you have a concrete suggestion for this patch itself?  You've
suggested about four different things and not clearly said what you prefer.
As I see it, the options are:

- Do nothing.  I.e. retain undefined behavior, and also retain brokenness on a
  supported Linux architecture.

- Use READ_ONCE() (which you apparently want duplicated in 3 places?)

- Use smp_load_acquire() (which part of your email suggested you prefer, but
  would also affect performance slightly whereas another part of your email
  complained about compromising performance)

- Make filesystems allocate the workqueue at mount time.  Note that in many
  cases this workqueue will never be used, and the comment above
  sb_init_dio_done_wq() specifically calls out that the workqueue is
  intentionally not allocated when unneeded.  So we'd be changing that.

Perhaps it would help if we waited on this patch until the one-time init pattern
is properly documented in tools/memory-model/Documentation/recipes.txt?

- Eric
Matthew Wilcox (Oracle) July 16, 2020, 2:47 a.m. UTC | #8
On Thu, Jul 16, 2020 at 11:46:56AM +1000, Dave Chinner wrote:
> And why should we compromise performance on hundreds of millions of
> modern systems to fix an extremely rare race on an extremely rare
> platform that maybe only a hundred people world-wide might still
> use?

I thought that wasn't the argument here.  It was that some future
compiler might choose to do something absolutely awful that no current
compiler does, and that rather than disable the stupid "optimisation",
we'd be glad that we'd already stuffed the source code up so that it
lay within some tortuous reading of the C spec.

The memory model is just too complicated.  Look at the recent exchange
between myself & Dan Williams.  I spent literally _hours_ trying to
figure out what rules to follow.

https://lore.kernel.org/linux-mm/CAPcyv4jgjoLqsV+aHGJwGXbCSwbTnWLmog5-rxD2i31vZ2rDNQ@mail.gmail.com/
https://lore.kernel.org/linux-mm/CAPcyv4j2+7XiJ9BXQ4mj_XN0N+rCyxch5QkuZ6UsOBsOO1+2Vg@mail.gmail.com/

Neither Dan nor I are exactly "new" to Linux kernel development.  As Dave
is saying here, having to understand the memory model is too high a bar.

Hell, I don't know if what we ended up with for v4 is actually correct.
It lokos good to me, but *shrug*

https://lore.kernel.org/linux-mm/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com/
Eric Biggers July 16, 2020, 3:19 a.m. UTC | #9
On Thu, Jul 16, 2020 at 03:47:17AM +0100, Matthew Wilcox wrote:
> On Thu, Jul 16, 2020 at 11:46:56AM +1000, Dave Chinner wrote:
> > And why should we compromise performance on hundreds of millions of
> > modern systems to fix an extremely rare race on an extremely rare
> > platform that maybe only a hundred people world-wide might still
> > use?
> 
> I thought that wasn't the argument here.  It was that some future
> compiler might choose to do something absolutely awful that no current
> compiler does, and that rather than disable the stupid "optimisation",
> we'd be glad that we'd already stuffed the source code up so that it
> lay within some tortuous reading of the C spec.

There are actually many reasons to avoid data races; see
https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE

> 
> The memory model is just too complicated.  Look at the recent exchange
> between myself & Dan Williams.  I spent literally _hours_ trying to
> figure out what rules to follow.
> 
> https://lore.kernel.org/linux-mm/CAPcyv4jgjoLqsV+aHGJwGXbCSwbTnWLmog5-rxD2i31vZ2rDNQ@mail.gmail.com/
> https://lore.kernel.org/linux-mm/CAPcyv4j2+7XiJ9BXQ4mj_XN0N+rCyxch5QkuZ6UsOBsOO1+2Vg@mail.gmail.com/
> 
> Neither Dan nor I are exactly "new" to Linux kernel development.  As Dave
> is saying here, having to understand the memory model is too high a bar.
> 
> Hell, I don't know if what we ended up with for v4 is actually correct.
> It lokos good to me, but *shrug*
> 
> https://lore.kernel.org/linux-mm/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com/

Yes, it's too complicated.  I'm not sure there's much of a solution, though.

Of course, we also have easy-to-use synchronization primitives like mutex,
spinlock, rw_semaphore, etc.  The problems arise when people think they know
better and try to write something more "optimized".  We need to have a higher
bar for accepting changes where the memory model is a concern at all.

- Eric
Eric Biggers July 16, 2020, 5:33 a.m. UTC | #10
On Thu, Jul 16, 2020 at 03:47:17AM +0100, Matthew Wilcox wrote:
> On Thu, Jul 16, 2020 at 11:46:56AM +1000, Dave Chinner wrote:
> > And why should we compromise performance on hundreds of millions of
> > modern systems to fix an extremely rare race on an extremely rare
> > platform that maybe only a hundred people world-wide might still
> > use?
> 
> I thought that wasn't the argument here.  It was that some future
> compiler might choose to do something absolutely awful that no current
> compiler does, and that rather than disable the stupid "optimisation",
> we'd be glad that we'd already stuffed the source code up so that it
> lay within some tortuous reading of the C spec.
> 
> The memory model is just too complicated.  Look at the recent exchange
> between myself & Dan Williams.  I spent literally _hours_ trying to
> figure out what rules to follow.
> 
> https://lore.kernel.org/linux-mm/CAPcyv4jgjoLqsV+aHGJwGXbCSwbTnWLmog5-rxD2i31vZ2rDNQ@mail.gmail.com/
> https://lore.kernel.org/linux-mm/CAPcyv4j2+7XiJ9BXQ4mj_XN0N+rCyxch5QkuZ6UsOBsOO1+2Vg@mail.gmail.com/
> 
> Neither Dan nor I are exactly "new" to Linux kernel development.  As Dave
> is saying here, having to understand the memory model is too high a bar.
> 
> Hell, I don't know if what we ended up with for v4 is actually correct.
> It lokos good to me, but *shrug*
> 
> https://lore.kernel.org/linux-mm/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com/

Looks like you still got it wrong :-(  It needs:

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 934c92dcb9ab..9a95fbe86e15 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -1029,7 +1029,7 @@ static int devmem_init_inode(void)
        }

        /* publish /dev/mem initialized */
-       WRITE_ONCE(devmem_inode, inode);
+       smp_store_release(&devmem_inode, inode);

        return 0;
 }

It seems one source of confusion is that READ_ONCE() and WRITE_ONCE() don't
actually pair with each other, unless no memory barriers are needed at all.

Instead, READ_ONCE() pairs with a primitive that has "release" semantics, e.g.
smp_store_release() or cmpxchg_release().  But READ_ONCE() is only correct if
there's no control flow dependency; if there is, it needs to be upgraded to a
primitive with "acquire" semantics, e.g. smp_load_acquire().

The best approach might be to just say that the READ_ONCE() + "release" pairing
should be avoided, and we should stick to "acquire" + "release".  (And I think
Dave may be saying he'd prefer that for ->s_dio_done_wq?)

- Eric
Dave Chinner July 16, 2020, 8:16 a.m. UTC | #11
On Wed, Jul 15, 2020 at 10:33:32PM -0700, Eric Biggers wrote:
> On Thu, Jul 16, 2020 at 03:47:17AM +0100, Matthew Wilcox wrote:
> > On Thu, Jul 16, 2020 at 11:46:56AM +1000, Dave Chinner wrote:
> > > And why should we compromise performance on hundreds of millions of
> > > modern systems to fix an extremely rare race on an extremely rare
> > > platform that maybe only a hundred people world-wide might still
> > > use?
> > 
> > I thought that wasn't the argument here.  It was that some future
> > compiler might choose to do something absolutely awful that no current
> > compiler does, and that rather than disable the stupid "optimisation",
> > we'd be glad that we'd already stuffed the source code up so that it
> > lay within some tortuous reading of the C spec.
> > 
> > The memory model is just too complicated.  Look at the recent exchange
> > between myself & Dan Williams.  I spent literally _hours_ trying to
> > figure out what rules to follow.
> > 
> > https://lore.kernel.org/linux-mm/CAPcyv4jgjoLqsV+aHGJwGXbCSwbTnWLmog5-rxD2i31vZ2rDNQ@mail.gmail.com/
> > https://lore.kernel.org/linux-mm/CAPcyv4j2+7XiJ9BXQ4mj_XN0N+rCyxch5QkuZ6UsOBsOO1+2Vg@mail.gmail.com/
> > 
> > Neither Dan nor I are exactly "new" to Linux kernel development.  As Dave
> > is saying here, having to understand the memory model is too high a bar.
> > 
> > Hell, I don't know if what we ended up with for v4 is actually correct.
> > It lokos good to me, but *shrug*
> > 
> > https://lore.kernel.org/linux-mm/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com/
> 
> Looks like you still got it wrong :-(  It needs:
> 
> diff --git a/drivers/char/mem.c b/drivers/char/mem.c
> index 934c92dcb9ab..9a95fbe86e15 100644
> --- a/drivers/char/mem.c
> +++ b/drivers/char/mem.c
> @@ -1029,7 +1029,7 @@ static int devmem_init_inode(void)
>         }
> 
>         /* publish /dev/mem initialized */
> -       WRITE_ONCE(devmem_inode, inode);
> +       smp_store_release(&devmem_inode, inode);
> 
>         return 0;
>  }
> 
> It seems one source of confusion is that READ_ONCE() and WRITE_ONCE() don't
> actually pair with each other, unless no memory barriers are needed at all.
> 
> Instead, READ_ONCE() pairs with a primitive that has "release" semantics, e.g.
> smp_store_release() or cmpxchg_release(). But READ_ONCE() is only correct if
> there's no control flow dependency; if there is, it needs to be upgraded to a
> primitive with "acquire" semantics, e.g. smp_load_acquire().

You lost your audience at "control flow dependency". i.e. you're
trying to explain memory ordering requirements by using terms that
only people who deeply grok memory ordering understand.

I can't tell you what a control flow dependency is off the top
of my head - I'd have to look up the documentation to remind myself
what it means and why it might be important. Then I'll realise yet
again that if I just use release+acquire message passing constructs,
I just don't have to care about them. And so I promptly forget about
them again.

My point is that the average programmer does not need to know what a
control flow or data depedency is to use memory ordering semantics
correctly. If you want to optimise your code down to the Nth degree
then you *may* need to know that, but almost nobody in the kernel
needs to optimise their code to that extent.

> The best approach might be to just say that the READ_ONCE() + "release" pairing
> should be avoided, and we should stick to "acquire" + "release".  (And I think
> Dave may be saying he'd prefer that for ->s_dio_done_wq?)

Pretty much.

We need to stop thinking of these synchronisation primitives as
"memory ordering" or "memory barriers" or "atomic access" and
instead think of them as tools to pass data safely between concurrent
threads.

We need to give people a simple mental model and/or pattern for
passing data safely between two racing threads, not hit them over
the head with the LKMM documentation. People are much more likely to
understand the ordering and *much* less likely to make mistakes
given clear, simple examples to follow. And that will stick if you
can relate those examples back to the locking constructs they
already understand and have been using for years.

e.g. basic message passing. foo = 0 is the initial message state, y
is the mail box flag, initially 0/locked:

			locking:	ordered code:

write message:		foo = 1		foo = 1
post message:		spin_unlock(y)	smp_store_release(y, 1)

check message box:	spin_lock(y)	if (smp_load_acquire(y) == 1)
got message:		print(foo)		print(foo)

And in both cases we will always get "foo = 1" as the output of
both sets of code. i.e. foo is the message, y is the object that
guarantees delivery of the message.

This makes the code Willy linked to obvious. The message to be
delivered is the inode and it's contents, and it is posted via
the devmem_inode. Hence:

write message:		inode = alloc_anon_inode()
post message:		smp_store_release(&devmem_inode, inode);

check message box:	inode = smp_load_acquire(&devmem_inode);
got message?		if (!inode)
    no				<fail>
   yes			<message contents guaranteed to be seen>

To someone familiar with message passing patterns, this code almost
doesn't need comments to explain it.

Using memory ordering code becomes much simpler when we think of it
as a release+acquire pattern rather than "READ_ONCE/WRITE_ONCE plus
(some set of) memory barriers" because it explicitly lays out the
ordering requirements of the code on both sides of the pattern.
Once a developer creates a mental association between the
release+acquire message passing mechanism and critical section
ordering defined by unlock->lock operations, ordering becomes much
less challenging to think and reason about.

Part of the problem is that RO/WO are now such overloaded operators
it's hard to understand all the things they do or discriminate
between which function a specific piece of code is relying on.

IMO, top level kernel developers need to stop telling people "you
need to understand the lkmm and/or memory_barriers.txt" like it's
the only way to write safe concurrent code. The reality is that most
devs don't need to understand it at all.  We'll make much more
progress on fixing broken code and having new code being written
correctly by teaching people simple patterns that are easy to
explain, easy to learn, *hard to get wrong* and easy to review. And
then they'll use them in places where they'd previously not care
about data races because they have been taught "this is the way we
should write concurrent code". They'll never learn that from being
told to read the LKMM documentation....

So if you find yourself using the words "LKMM", "control flow",
"data dependency", "compiler optimisation" or other intricate
details or the memory model, then you've already lost your
audience....

Cheers,

Dave.
Darrick J. Wong July 17, 2020, 1:36 a.m. UTC | #12
On Thu, Jul 16, 2020 at 06:16:16PM +1000, Dave Chinner wrote:
> On Wed, Jul 15, 2020 at 10:33:32PM -0700, Eric Biggers wrote:
> > On Thu, Jul 16, 2020 at 03:47:17AM +0100, Matthew Wilcox wrote:
> > > On Thu, Jul 16, 2020 at 11:46:56AM +1000, Dave Chinner wrote:
> > > > And why should we compromise performance on hundreds of millions of
> > > > modern systems to fix an extremely rare race on an extremely rare
> > > > platform that maybe only a hundred people world-wide might still
> > > > use?
> > > 
> > > I thought that wasn't the argument here.  It was that some future
> > > compiler might choose to do something absolutely awful that no current
> > > compiler does, and that rather than disable the stupid "optimisation",
> > > we'd be glad that we'd already stuffed the source code up so that it
> > > lay within some tortuous reading of the C spec.
> > > 
> > > The memory model is just too complicated.  Look at the recent exchange
> > > between myself & Dan Williams.  I spent literally _hours_ trying to
> > > figure out what rules to follow.
> > > 
> > > https://lore.kernel.org/linux-mm/CAPcyv4jgjoLqsV+aHGJwGXbCSwbTnWLmog5-rxD2i31vZ2rDNQ@mail.gmail.com/
> > > https://lore.kernel.org/linux-mm/CAPcyv4j2+7XiJ9BXQ4mj_XN0N+rCyxch5QkuZ6UsOBsOO1+2Vg@mail.gmail.com/
> > > 
> > > Neither Dan nor I are exactly "new" to Linux kernel development.  As Dave
> > > is saying here, having to understand the memory model is too high a bar.
> > > 
> > > Hell, I don't know if what we ended up with for v4 is actually correct.
> > > It lokos good to me, but *shrug*
> > > 
> > > https://lore.kernel.org/linux-mm/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com/
> > 
> > Looks like you still got it wrong :-(  It needs:
> > 
> > diff --git a/drivers/char/mem.c b/drivers/char/mem.c
> > index 934c92dcb9ab..9a95fbe86e15 100644
> > --- a/drivers/char/mem.c
> > +++ b/drivers/char/mem.c
> > @@ -1029,7 +1029,7 @@ static int devmem_init_inode(void)
> >         }
> > 
> >         /* publish /dev/mem initialized */
> > -       WRITE_ONCE(devmem_inode, inode);
> > +       smp_store_release(&devmem_inode, inode);
> > 
> >         return 0;
> >  }
> > 
> > It seems one source of confusion is that READ_ONCE() and WRITE_ONCE() don't
> > actually pair with each other, unless no memory barriers are needed at all.
> > 
> > Instead, READ_ONCE() pairs with a primitive that has "release" semantics, e.g.
> > smp_store_release() or cmpxchg_release(). But READ_ONCE() is only correct if
> > there's no control flow dependency; if there is, it needs to be upgraded to a
> > primitive with "acquire" semantics, e.g. smp_load_acquire().
> 
> You lost your audience at "control flow dependency". i.e. you're
> trying to explain memory ordering requirements by using terms that
> only people who deeply grok memory ordering understand.
> 
> I can't tell you what a control flow dependency is off the top
> of my head - I'd have to look up the documentation to remind myself
> what it means and why it might be important. Then I'll realise yet
> again that if I just use release+acquire message passing constructs,
> I just don't have to care about them. And so I promptly forget about
> them again.
> 
> My point is that the average programmer does not need to know what a
> control flow or data depedency is to use memory ordering semantics
> correctly. If you want to optimise your code down to the Nth degree
> then you *may* need to know that, but almost nobody in the kernel
> needs to optimise their code to that extent.
> 
> > The best approach might be to just say that the READ_ONCE() + "release" pairing
> > should be avoided, and we should stick to "acquire" + "release".  (And I think
> > Dave may be saying he'd prefer that for ->s_dio_done_wq?)
> 
> Pretty much.
> 
> We need to stop thinking of these synchronisation primitives as
> "memory ordering" or "memory barriers" or "atomic access" and
> instead think of them as tools to pass data safely between concurrent
> threads.
> 
> We need to give people a simple mental model and/or pattern for
> passing data safely between two racing threads, not hit them over
> the head with the LKMM documentation. People are much more likely to
> understand the ordering and *much* less likely to make mistakes
> given clear, simple examples to follow. And that will stick if you
> can relate those examples back to the locking constructs they
> already understand and have been using for years.
> 
> e.g. basic message passing. foo = 0 is the initial message state, y
> is the mail box flag, initially 0/locked:
> 
> 			locking:	ordered code:
> 
> write message:		foo = 1		foo = 1
> post message:		spin_unlock(y)	smp_store_release(y, 1)
> 
> check message box:	spin_lock(y)	if (smp_load_acquire(y) == 1)
> got message:		print(foo)		print(foo)
> 
> And in both cases we will always get "foo = 1" as the output of
> both sets of code. i.e. foo is the message, y is the object that
> guarantees delivery of the message.
> 
> This makes the code Willy linked to obvious. The message to be
> delivered is the inode and it's contents, and it is posted via
> the devmem_inode. Hence:
> 
> write message:		inode = alloc_anon_inode()
> post message:		smp_store_release(&devmem_inode, inode);
> 
> check message box:	inode = smp_load_acquire(&devmem_inode);
> got message?		if (!inode)
>     no				<fail>
>    yes			<message contents guaranteed to be seen>
> 
> To someone familiar with message passing patterns, this code almost
> doesn't need comments to explain it.

Yes.  This!!!

I want a nice one-pager in the recipes file telling me how to do
opportunistic pointer initialization initialization locklessly.  I want
the people who specialize in understanding memory models to verify that
yes, this samplecode is correct.

I want to concentrate my attention on the higher level things that I
specialize in, namely XFS, which means that I want to build XFS things
out of high quality lower level pieces that are built by people who are
experts at those lower level things so I don't have to make a huge
detour understanding another highly complex thing.  All that will do is
exercise my brain's paging algorithms.

The alternative is that I'll just use a spinlock/rwsem/mutex because
that's what I know, and they're easy to think about.  Right now I have
barely any idea what's really the difference between
READ_ONCE/smp_rmb/smp_load_acquire, and I'd rather frame the discussion
as "here's my higher level problem, how do I solve this?"

(end crazy maintainer rambling)

--D

> Using memory ordering code becomes much simpler when we think of it
> as a release+acquire pattern rather than "READ_ONCE/WRITE_ONCE plus
> (some set of) memory barriers" because it explicitly lays out the
> ordering requirements of the code on both sides of the pattern.
> Once a developer creates a mental association between the
> release+acquire message passing mechanism and critical section
> ordering defined by unlock->lock operations, ordering becomes much
> less challenging to think and reason about.
> 
> Part of the problem is that RO/WO are now such overloaded operators
> it's hard to understand all the things they do or discriminate
> between which function a specific piece of code is relying on.
> 
> IMO, top level kernel developers need to stop telling people "you
> need to understand the lkmm and/or memory_barriers.txt" like it's
> the only way to write safe concurrent code. The reality is that most
> devs don't need to understand it at all.  We'll make much more
> progress on fixing broken code and having new code being written
> correctly by teaching people simple patterns that are easy to
> explain, easy to learn, *hard to get wrong* and easy to review. And
> then they'll use them in places where they'd previously not care
> about data races because they have been taught "this is the way we
> should write concurrent code". They'll never learn that from being
> told to read the LKMM documentation....
> 
> So if you find yourself using the words "LKMM", "control flow",
> "data dependency", "compiler optimisation" or other intricate
> details or the memory model, then you've already lost your
> audience....
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 6d5370eac2a8..26221ae24156 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -590,7 +590,7 @@  static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
  * filesystems that don't need it and also allows us to create the workqueue
  * late enough so the we can include s_id in the name of the workqueue.
  */
-int sb_init_dio_done_wq(struct super_block *sb)
+int __sb_init_dio_done_wq(struct super_block *sb)
 {
 	struct workqueue_struct *old;
 	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
@@ -615,9 +615,7 @@  static int dio_set_defer_completion(struct dio *dio)
 	if (dio->defer_completion)
 		return 0;
 	dio->defer_completion = true;
-	if (!sb->s_dio_done_wq)
-		return sb_init_dio_done_wq(sb);
-	return 0;
+	return sb_init_dio_done_wq(sb);
 }
 
 /*
@@ -1250,7 +1248,7 @@  do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		retval = 0;
 		if (iocb->ki_flags & IOCB_DSYNC)
 			retval = dio_set_defer_completion(dio);
-		else if (!dio->inode->i_sb->s_dio_done_wq) {
+		else {
 			/*
 			 * In case of AIO write racing with buffered read we
 			 * need to defer completion. We can't decide this now,
diff --git a/fs/internal.h b/fs/internal.h
index 9b863a7bd708..6736c9eee978 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -178,7 +178,14 @@  extern void mnt_pin_kill(struct mount *m);
 extern const struct dentry_operations ns_dentry_operations;
 
 /* direct-io.c: */
-int sb_init_dio_done_wq(struct super_block *sb);
+int __sb_init_dio_done_wq(struct super_block *sb);
+static inline int sb_init_dio_done_wq(struct super_block *sb)
+{
+	/* pairs with cmpxchg() in __sb_init_dio_done_wq() */
+	if (likely(READ_ONCE(sb->s_dio_done_wq)))
+		return 0;
+	return __sb_init_dio_done_wq(sb);
+}
 
 /*
  * fs/stat.c:
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ec7b78e6feca..dc7fe898dab8 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -487,8 +487,7 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		dio_warn_stale_pagecache(iocb->ki_filp);
 	ret = 0;
 
-	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
-	    !inode->i_sb->s_dio_done_wq) {
+	if (iov_iter_rw(iter) == WRITE && !wait_for_completion) {
 		ret = sb_init_dio_done_wq(inode->i_sb);
 		if (ret < 0)
 			goto out_free_dio;