
[v2] fs/direct-io: fix one-time init of ->s_dio_done_wq

Message ID 20200717050510.95832-1-ebiggers@kernel.org (mailing list archive)
State New, archived

Commit Message

Eric Biggers July 17, 2020, 5:05 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

Correctly implement the "one-time" init pattern for ->s_dio_done_wq.
This fixes the following issues:

- The LKMM doesn't guarantee that the workqueue will be seen initialized
  before being used, if another CPU allocated it.  With regard to
  specific CPU architectures, this is true on at least Alpha, but it may
  be true on other architectures too if the internal implementation of
  workqueues causes use of the workqueue to involve a control
  dependency.  (There doesn't appear to be a control dependency
  currently, but it's hard to tell and it could change in the future.)

- The preliminary checks for sb->s_dio_done_wq are a data race, since
  they do a plain load of a concurrently modified variable.  According
  to the C standard, this is undefined behavior.  In practice, the
  kernel does sometimes make assumptions that certain data races are
  okay, but these rules are undocumented and not uniformly agreed upon,
  so it's best to avoid cases where they might come into play.

Following the guidance for one-time init I've proposed at
https://lkml.kernel.org/r/20200717044427.68747-1-ebiggers@kernel.org,
replace it with the simplest implementation that is guaranteed to be
correct while still achieving the following properties:

    - Doesn't make direct I/O users contend on a mutex in the fast path.

    - Doesn't allocate the workqueue when it will never be used.

Fixes: 7b7a8665edd8 ("direct-io: Implement generic deferred AIO completions")
Signed-off-by: Eric Biggers <ebiggers@google.com>
---

v2: new implementation using smp_load_acquire() + smp_store_release()
    and a mutex.

 fs/direct-io.c       | 42 ++++++++++++++++++++++++------------------
 fs/iomap/direct-io.c |  3 +--
 2 files changed, 25 insertions(+), 20 deletions(-)
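
For context, the pattern being replaced is condensed below from the lines this
patch removes, with comments tying it to the two issues above.  This is an
annotated reconstruction for illustration only, not code added by the patch:

int sb_init_dio_done_wq(struct super_block *sb)
{
	struct workqueue_struct *old;
	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
						      WQ_MEM_RECLAIM, 0,
						      sb->s_id);
	if (!wq)
		return -ENOMEM;
	/*
	 * The winner's cmpxchg() is fully ordered, but nothing orders the
	 * readers, which observe ->s_dio_done_wq through plain loads.
	 */
	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
	/* Someone created the workqueue before us? Free ours... */
	if (old)
		destroy_workqueue(wq);
	return 0;
}

static int dio_set_defer_completion(struct dio *dio)
{
	struct super_block *sb = dio->inode->i_sb;

	if (dio->defer_completion)
		return 0;
	dio->defer_completion = true;
	/*
	 * Preliminary check: a plain load of a concurrently modified
	 * pointer (the data race above), and a reader that sees non-NULL
	 * gets no ACQUIRE ordering before the workqueue is used.
	 */
	if (!sb->s_dio_done_wq)
		return sb_init_dio_done_wq(sb);
	return 0;
}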

Comments

Sedat Dilek July 17, 2020, 6:02 a.m. UTC | #1
On Fri, Jul 17, 2020 at 7:08 AM Eric Biggers <ebiggers@kernel.org> wrote:
> - The preliminary checks for sb->s_dio_done_wq are a data race, since
>   they do a plain load of a concurrently modified variable.  According
>   to the C standard, this undefined behavior.  In practice, the kernel
>   does sometimes makes assumptions about data races might be okay in

Some small typos:
...this *is* undefined behavior...
...does sometimes make* assumptions about...

- Sedat -
Darrick J. Wong July 17, 2020, 9:04 p.m. UTC | #2
On Thu, Jul 16, 2020 at 10:05:10PM -0700, Eric Biggers wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Correctly implement the "one-time" init pattern for ->s_dio_done_wq.
> This fixes the following issues:
> 
> - The LKMM doesn't guarantee that the workqueue will be seen initialized
>   before being used, if another CPU allocated it.  With regards to
>   specific CPU architectures, this is true on at least Alpha, but it may
>   be true on other architectures too if the internal implementation of
>   workqueues causes use of the workqueue to involve a control
>   dependency.  (There doesn't appear to be a control dependency
>   currently, but it's hard to tell and it could change in the future.)
> 
> - The preliminary checks for sb->s_dio_done_wq are a data race, since
>   they do a plain load of a concurrently modified variable.  According
>   to the C standard, this undefined behavior.  In practice, the kernel
>   does sometimes makes assumptions about data races might be okay in
>   practice, but these rules are undocumented and not uniformly agreed
>   upon, so it's best to avoid cases where they might come into play.
> 
> Following the guidance for one-time init I've proposed at
> https://lkml.kernel.org/r/20200717044427.68747-1-ebiggers@kernel.org,

It might be a good idea to combine these two patches into a series so
that we can leave a breadcrumb in sb_init_dio_done_wq explaining why it
does what it does.

> replace it with the simplest implementation that is guaranteed to be
> correct while still achieving the following properties:
> 
>     - Doesn't make direct I/O users contend on a mutex in the fast path.
> 
>     - Doesn't allocate the workqueue when it will never be used.
> 
> Fixes: 7b7a8665edd8 ("direct-io: Implement generic deferred AIO completions")
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
> 
> v2: new implementation using smp_load_acquire() + smp_store_release()
>     and a mutex.
> 
>  fs/direct-io.c       | 42 ++++++++++++++++++++++++------------------
>  fs/iomap/direct-io.c |  3 +--
>  2 files changed, 25 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 6d5370eac2a8..c03c2204aadf 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -592,20 +592,28 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
>   */
>  int sb_init_dio_done_wq(struct super_block *sb)
>  {
> -	struct workqueue_struct *old;
> -	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
> -						      WQ_MEM_RECLAIM, 0,
> -						      sb->s_id);
> -	if (!wq)
> -		return -ENOMEM;
> -	/*
> -	 * This has to be atomic as more DIOs can race to create the workqueue
> -	 */
> -	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
> -	/* Someone created workqueue before us? Free ours... */
> -	if (old)
> -		destroy_workqueue(wq);
> -	return 0;
> +	static DEFINE_MUTEX(sb_init_dio_done_mutex);
> +	struct workqueue_struct *wq;
> +	int err = 0;
> +
> +	/* Pairs with the smp_store_release() below */
> +	if (smp_load_acquire(&sb->s_dio_done_wq))
> +		return 0;
> +
> +	mutex_lock(&sb_init_dio_done_mutex);
> +	if (sb->s_dio_done_wq)
> +		goto out;
> +
> +	wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
> +	if (!wq) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +	/* Pairs with the smp_load_acquire() above */
> +	smp_store_release(&sb->s_dio_done_wq, wq);

Why not use cmpxchg_release() here?  Is the mutex actually required,
or is this merely following the "don't complicate it up" guidelines in
the "One-Time Init" recipe that say not to use cmpxchg_release() unless
you have a strong justification for it?  (A rough sketch of such a
variant follows at the end of this message.)

The code changes look ok to me, fwiw.

--D

> +out:
> +	mutex_unlock(&sb_init_dio_done_mutex);
> +	return err;
>  }
>  
>  static int dio_set_defer_completion(struct dio *dio)
> @@ -615,9 +623,7 @@ static int dio_set_defer_completion(struct dio *dio)
>  	if (dio->defer_completion)
>  		return 0;
>  	dio->defer_completion = true;
> -	if (!sb->s_dio_done_wq)
> -		return sb_init_dio_done_wq(sb);
> -	return 0;
> +	return sb_init_dio_done_wq(sb);
>  }
>  
>  /*
> @@ -1250,7 +1256,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  		retval = 0;
>  		if (iocb->ki_flags & IOCB_DSYNC)
>  			retval = dio_set_defer_completion(dio);
> -		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +		else {
>  			/*
>  			 * In case of AIO write racing with buffered read we
>  			 * need to defer completion. We can't decide this now,
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index ec7b78e6feca..dc7fe898dab8 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -487,8 +487,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		dio_warn_stale_pagecache(iocb->ki_filp);
>  	ret = 0;
>  
> -	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
> -	    !inode->i_sb->s_dio_done_wq) {
> +	if (iov_iter_rw(iter) == WRITE && !wait_for_completion) {
>  		ret = sb_init_dio_done_wq(inode->i_sb);
>  		if (ret < 0)
>  			goto out_free_dio;
> -- 
> 2.27.0
>
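
For reference, the cmpxchg_release()-based variant Darrick asks about above
might look roughly like the sketch below.  It is purely illustrative: the
function name is invented and none of this is in the posted patch.  The
barrier in the failure path is needed because of the point Eric makes later
in the thread -- a failed cmpxchg() provides no ordering -- which is the
subtlety the mutex version sidesteps.

int sb_init_dio_done_wq_cmpxchg(struct super_block *sb)
{
	struct workqueue_struct *old, *wq;

	/* Fast path: pairs with the release operation below. */
	if (smp_load_acquire(&sb->s_dio_done_wq))
		return 0;

	wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
	if (!wq)
		return -ENOMEM;

	/* Publish the fully-initialized workqueue, or lose to a racer. */
	old = cmpxchg_release(&sb->s_dio_done_wq, NULL, wq);
	if (old) {
		/* Someone else installed a workqueue first; free ours. */
		destroy_workqueue(wq);
		/*
		 * A failed cmpxchg() gives no ordering, so a barrier is
		 * still needed before this CPU may use the winner's
		 * workqueue.  A full barrier is used here for simplicity.
		 */
		smp_mb();
	}
	return 0;
}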
Dave Chinner July 18, 2020, 12:15 a.m. UTC | #3
On Thu, Jul 16, 2020 at 10:05:10PM -0700, Eric Biggers wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Correctly implement the "one-time" init pattern for ->s_dio_done_wq.
> This fixes the following issues:
> 
> - The LKMM doesn't guarantee that the workqueue will be seen initialized
>   before being used, if another CPU allocated it.  With regards to
>   specific CPU architectures, this is true on at least Alpha, but it may
>   be true on other architectures too if the internal implementation of
>   workqueues causes use of the workqueue to involve a control
>   dependency.  (There doesn't appear to be a control dependency
>   currently, but it's hard to tell and it could change in the future.)
> 
> - The preliminary checks for sb->s_dio_done_wq are a data race, since
>   they do a plain load of a concurrently modified variable.  According
>   to the C standard, this undefined behavior.  In practice, the kernel
>   does sometimes makes assumptions about data races might be okay in
>   practice, but these rules are undocumented and not uniformly agreed
>   upon, so it's best to avoid cases where they might come into play.
> 
> Following the guidance for one-time init I've proposed at
> https://lkml.kernel.org/r/20200717044427.68747-1-ebiggers@kernel.org,
> replace it with the simplest implementation that is guaranteed to be
> correct while still achieving the following properties:
> 
>     - Doesn't make direct I/O users contend on a mutex in the fast path.
> 
>     - Doesn't allocate the workqueue when it will never be used.
> 
> Fixes: 7b7a8665edd8 ("direct-io: Implement generic deferred AIO completions")
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
> 
> v2: new implementation using smp_load_acquire() + smp_store_release()
>     and a mutex.

A mutex?

That's over-engineered premature optimisation - the allocation path
is a slow path that will only ever be hit on the first few
direct IOs, if an app manages to synchronise its first ever
concurrent DIOs to different files perfectly. There is zero need to
"optimise" the code like this.

I've already suggested that we get this whole dynamic
initialisation code out of the direct IO path altogether, for good
reason: all of this goes away and we don't have to care about
optimising it for performance at all.

We have two options as I see it: always allocate the workqueue on
direct IO capable filesystems in their ->fill_super() method, or
allocate it on the first open(O_DIRECT) where we check if O_DIRECT
is supported by the filesystem.

i.e. do_dentry_open() does this:

        /* NB: we're sure to have correct a_ops only after f_op->open */
        if (f->f_flags & O_DIRECT) {
                if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
                        return -EINVAL;
        }

Allocate the workqueue there, and we don't need to care about how
fast or slow setting it up is, so there is zero need
to optimise it for speed.

Cheers,

Dave.
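
To make Dave's first option concrete, a hypothetical sketch (the filesystem
and function names are invented, and none of this is in the posted patch)
would allocate the workqueue unconditionally at mount time, presumably
leaving teardown to the existing unmount-time cleanup of ->s_dio_done_wq:

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* Allocate up front so the direct IO path never has to. */
	sb->s_dio_done_wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0,
					    sb->s_id);
	if (!sb->s_dio_done_wq)
		return -ENOMEM;

	/* ... the filesystem's normal fill_super work would follow ... */
	return 0;
}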
Eric Biggers July 18, 2020, 12:42 a.m. UTC | #4
Hi Dave,

On Sat, Jul 18, 2020 at 10:15:36AM +1000, Dave Chinner wrote:
> On Thu, Jul 16, 2020 at 10:05:10PM -0700, Eric Biggers wrote:
> > From: Eric Biggers <ebiggers@google.com>
> > 
> > Correctly implement the "one-time" init pattern for ->s_dio_done_wq.
> > This fixes the following issues:
> > 
> > - The LKMM doesn't guarantee that the workqueue will be seen initialized
> >   before being used, if another CPU allocated it.  With regards to
> >   specific CPU architectures, this is true on at least Alpha, but it may
> >   be true on other architectures too if the internal implementation of
> >   workqueues causes use of the workqueue to involve a control
> >   dependency.  (There doesn't appear to be a control dependency
> >   currently, but it's hard to tell and it could change in the future.)
> > 
> > - The preliminary checks for sb->s_dio_done_wq are a data race, since
> >   they do a plain load of a concurrently modified variable.  According
> >   to the C standard, this undefined behavior.  In practice, the kernel
> >   does sometimes makes assumptions about data races might be okay in
> >   practice, but these rules are undocumented and not uniformly agreed
> >   upon, so it's best to avoid cases where they might come into play.
> > 
> > Following the guidance for one-time init I've proposed at
> > https://lkml.kernel.org/r/20200717044427.68747-1-ebiggers@kernel.org,
> > replace it with the simplest implementation that is guaranteed to be
> > correct while still achieving the following properties:
> > 
> >     - Doesn't make direct I/O users contend on a mutex in the fast path.
> > 
> >     - Doesn't allocate the workqueue when it will never be used.
> > 
> > Fixes: 7b7a8665edd8 ("direct-io: Implement generic deferred AIO completions")
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
> > ---
> > 
> > v2: new implementation using smp_load_acquire() + smp_store_release()
> >     and a mutex.
> 
> A mutex?
> 
> That's over-engineered premature optimisation - the allocation path
> is a slow path that will only ever be hit only on the first few
> direct IOs if an app manages to synchronise it's first ever
> concurrent DIOs to different files perfectly. There is zero need to
> "optimise" the code like this.

You're completely misunderstanding the point of this change.  The mutex version
is actually simpler and easier to get right than the cmpxchg() version (which
is what I'm replacing) -- see the tools/memory-model/Documentation/ patch I've
proposed, which explains this.  In fact, the existing use of cmpxchg() is wrong,
since cmpxchg() doesn't guarantee an ACQUIRE barrier on failure.

> 
> I've already suggested that we get rid of this whole dynamic
> initialisation code out of the direct IO path altogether for good
> reason: all of this goes away and we don't have to care about
> optimising it for performance at all.
> 
> We have two options as I see it: always allocate the workqueue on
> direct IO capable filesytsems in their ->fill_super() method, or
> allocate it on the first open(O_DIRECT) where we check if O_DIRECT
> is supported by the filesystem.
> 
> i.e. do_dentry_open() does this:
> 
>         /* NB: we're sure to have correct a_ops only after f_op->open */
>         if (f->f_flags & O_DIRECT) {
>                 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
>                         return -EINVAL;
>         }
> 
> Allocate the work queue there, and we don't need to care about how
> fast or slow setting up the workqueue is and so there is zero need
> to optimise it for speed.

You also suggested about 4 other different things, so I don't know which one you
actually want.  Now you're still suggesting multiple different things.

Not having to add filesystem-specific code to nearly every filesystem and not
allocating the workqueue when it won't be used are desirable properties to have,
and I think worth using one-time-init for.

Multiple threads can execute do_dentry_open() concurrently on the same
filesystem, so we'd still have to use one-time init for that.  Is your proposal
to do that and use the implementation where the mutex is unconditionally taken?

- Eric
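
For Dave's second option, extending the do_dentry_open() check he quotes
might look roughly like the fragment below.  This is hypothetical and not
part of this patch; "f" and "error" are assumed to be do_dentry_open()'s
locals.  Eric's point is visible here too: nothing serializes concurrent
opens of the same filesystem, so sb_init_dio_done_wq() still has to be a
correct one-time-init helper -- only its call site moves out of the I/O
submission path.

	/* NB: we're sure to have correct a_ops only after f_op->open */
	if (f->f_flags & O_DIRECT) {
		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
			return -EINVAL;
		/* Hypothetical: first open(O_DIRECT) sets up the workqueue. */
		error = sb_init_dio_done_wq(file_inode(f)->i_sb);
		if (error)
			return error;
	}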

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 6d5370eac2a8..c03c2204aadf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -592,20 +592,28 @@  static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
  */
 int sb_init_dio_done_wq(struct super_block *sb)
 {
-	struct workqueue_struct *old;
-	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
-						      WQ_MEM_RECLAIM, 0,
-						      sb->s_id);
-	if (!wq)
-		return -ENOMEM;
-	/*
-	 * This has to be atomic as more DIOs can race to create the workqueue
-	 */
-	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
-	/* Someone created workqueue before us? Free ours... */
-	if (old)
-		destroy_workqueue(wq);
-	return 0;
+	static DEFINE_MUTEX(sb_init_dio_done_mutex);
+	struct workqueue_struct *wq;
+	int err = 0;
+
+	/* Pairs with the smp_store_release() below */
+	if (smp_load_acquire(&sb->s_dio_done_wq))
+		return 0;
+
+	mutex_lock(&sb_init_dio_done_mutex);
+	if (sb->s_dio_done_wq)
+		goto out;
+
+	wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
+	if (!wq) {
+		err = -ENOMEM;
+		goto out;
+	}
+	/* Pairs with the smp_load_acquire() above */
+	smp_store_release(&sb->s_dio_done_wq, wq);
+out:
+	mutex_unlock(&sb_init_dio_done_mutex);
+	return err;
 }
 
 static int dio_set_defer_completion(struct dio *dio)
@@ -615,9 +623,7 @@  static int dio_set_defer_completion(struct dio *dio)
 	if (dio->defer_completion)
 		return 0;
 	dio->defer_completion = true;
-	if (!sb->s_dio_done_wq)
-		return sb_init_dio_done_wq(sb);
-	return 0;
+	return sb_init_dio_done_wq(sb);
 }
 
 /*
@@ -1250,7 +1256,7 @@  do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		retval = 0;
 		if (iocb->ki_flags & IOCB_DSYNC)
 			retval = dio_set_defer_completion(dio);
-		else if (!dio->inode->i_sb->s_dio_done_wq) {
+		else {
 			/*
 			 * In case of AIO write racing with buffered read we
 			 * need to defer completion. We can't decide this now,
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ec7b78e6feca..dc7fe898dab8 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -487,8 +487,7 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		dio_warn_stale_pagecache(iocb->ki_filp);
 	ret = 0;
 
-	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
-	    !inode->i_sb->s_dio_done_wq) {
+	if (iov_iter_rw(iter) == WRITE && !wait_for_completion) {
 		ret = sb_init_dio_done_wq(inode->i_sb);
 		if (ret < 0)
 			goto out_free_dio;