ceph: add buffered/direct exclusionary locking for reads and writes
diff mbox series

Message ID 20190805200501.17905-1-jlayton@kernel.org
State New
Headers show
Series
  • ceph: add buffered/direct exclusionary locking for reads and writes
Related show

Commit Message

Jeff Layton Aug. 5, 2019, 8:05 p.m. UTC
xfstest generic/451 intermittently fails. The test does O_DIRECT writes
to a file, and then reads back the result using buffered I/O, while
running a separate set of tasks that are also doing buffered reads.

The client will invalidate the cache prior to a direct write, but it's
easy for one of the other readers' replies to race in and reinstantiate
the invalidated range with stale data.

To fix this, we must serialize direct I/O writes and buffered reads.
We could just sprinkle in some shared locks on the i_rwsem for reads,
and increase the exclusive footprint on the write side, but that would
cause O_DIRECT writes to end up serialized vs. other direct requests.

Instead, borrow the scheme used by nfs.ko. Buffered writes take the
i_rwsem exclusively, but buffered reads and all O_DIRECT requests
take a shared lock, allowing them to run in parallel.

To fix the bug though, we need buffered reads and O_DIRECT I/O to not
run in parallel. A flag on the ceph_inode_info is used to indicate
whether it's in direct or buffered I/O mode. When a conflicting request
is submitted, it will block until the inode can flip to the necessary
mode.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/Makefile |   2 +-
 fs/ceph/file.c   |  28 ++++++--
 fs/ceph/io.c     | 163 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/io.h     |  12 ++++
 fs/ceph/super.h  |   2 +-
 5 files changed, 198 insertions(+), 9 deletions(-)
 create mode 100644 fs/ceph/io.c
 create mode 100644 fs/ceph/io.h

Comments

Yan, Zheng Aug. 6, 2019, 3:21 a.m. UTC | #1
On Tue, Aug 6, 2019 at 4:05 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> to a file, and then reads back the result using buffered I/O, while
> running a separate set of tasks that are also doing buffered reads.
>
> The client will invalidate the cache prior to a direct write, but it's
> easy for one of the other readers' replies to race in and reinstantiate
> the invalidated range with stale data.
>
> To fix this, we must serialize direct I/O writes and buffered reads.
> We could just sprinkle in some shared locks on the i_rwsem for reads,
> and increase the exclusive footprint on the write side, but that would
> cause O_DIRECT writes to end up serialized vs. other direct requests.
>
> Instead, borrow the scheme used by nfs.ko. Buffered writes take the
> i_rwsem exclusively, but buffered reads and all O_DIRECT requests
> take a shared lock, allowing them to run in parallel.
>
> To fix the bug though, we need buffered reads and O_DIRECT I/O to not
> run in parallel. A flag on the ceph_inode_info is used to indicate
> whether it's in direct or buffered I/O mode. When a conflicting request
> is submitted, it will block until the inode can flip to the necessary
> mode.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/Makefile |   2 +-
>  fs/ceph/file.c   |  28 ++++++--
>  fs/ceph/io.c     | 163 +++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/io.h     |  12 ++++
>  fs/ceph/super.h  |   2 +-
>  5 files changed, 198 insertions(+), 9 deletions(-)
>  create mode 100644 fs/ceph/io.c
>  create mode 100644 fs/ceph/io.h
>
> diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
> index a699e320393f..c1da294418d1 100644
> --- a/fs/ceph/Makefile
> +++ b/fs/ceph/Makefile
> @@ -6,7 +6,7 @@
>  obj-$(CONFIG_CEPH_FS) += ceph.o
>
>  ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
> -       export.o caps.o snap.o xattr.o quota.o \
> +       export.o caps.o snap.o xattr.o quota.o io.o \
>         mds_client.o mdsmap.o strings.o ceph_frag.o \
>         debugfs.o
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 5182e1a49d6f..d7d264e05303 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -15,6 +15,7 @@
>  #include "super.h"
>  #include "mds_client.h"
>  #include "cache.h"
> +#include "io.h"
>
>  static __le32 ceph_flags_sys2wire(u32 flags)
>  {
> @@ -1284,12 +1285,16 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
>                 if (ci->i_inline_version == CEPH_INLINE_NONE) {
>                         if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
> +                               ceph_start_io_direct(inode);
>                                 ret = ceph_direct_read_write(iocb, to,
>                                                              NULL, NULL);
> +                               ceph_end_io_direct(inode);
>                                 if (ret >= 0 && ret < len)
>                                         retry_op = CHECK_EOF;
>                         } else {
> +                               ceph_start_io_read(inode);
>                                 ret = ceph_sync_read(iocb, to, &retry_op);
> +                               ceph_end_io_read(inode);
>                         }
>                 } else {
>                         retry_op = READ_INLINE;
> @@ -1300,7 +1305,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>                      inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
>                      ceph_cap_string(got));
>                 ceph_add_rw_context(fi, &rw_ctx);
> +               ceph_start_io_read(inode);
>                 ret = generic_file_read_iter(iocb, to);
> +               ceph_end_io_read(inode);
>                 ceph_del_rw_context(fi, &rw_ctx);
>         }
>         dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
> @@ -1409,7 +1416,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 return -ENOMEM;
>
>  retry_snap:
> -       inode_lock(inode);
> +       if (iocb->ki_flags & IOCB_DIRECT)
> +               ceph_start_io_direct(inode);
> +       else
> +               ceph_start_io_write(inode);
>
>         /* We can write back this queue in page reclaim */
>         current->backing_dev_info = inode_to_bdi(inode);
> @@ -1480,7 +1490,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>             (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
>                 struct ceph_snap_context *snapc;
>                 struct iov_iter data;
> -               inode_unlock(inode);
>
>                 spin_lock(&ci->i_ceph_lock);
>                 if (__ceph_have_pending_cap_snap(ci)) {
> @@ -1497,11 +1506,14 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>
>                 /* we might need to revert back to that point */
>                 data = *from;
> -               if (iocb->ki_flags & IOCB_DIRECT)
> +               if (iocb->ki_flags & IOCB_DIRECT) {
>                         written = ceph_direct_read_write(iocb, &data, snapc,
>                                                          &prealloc_cf);
> -               else
> +                       ceph_end_io_direct(inode);
> +               } else {
>                         written = ceph_sync_write(iocb, &data, pos, snapc);
> +                       ceph_end_io_write(inode);
> +               }
>                 if (written > 0)
>                         iov_iter_advance(from, written);
>                 ceph_put_snap_context(snapc);
> @@ -1516,7 +1528,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 written = generic_perform_write(file, from, pos);
>                 if (likely(written >= 0))
>                         iocb->ki_pos = pos + written;
> -               inode_unlock(inode);
> +               ceph_end_io_write(inode);
>         }
>
>         if (written >= 0) {
> @@ -1551,9 +1563,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         }
>
>         goto out_unlocked;
> -
>  out:
> -       inode_unlock(inode);
> +       if (iocb->ki_flags & IOCB_DIRECT)
> +               ceph_end_io_direct(inode);
> +       else
> +               ceph_end_io_write(inode);
>  out_unlocked:
>         ceph_free_cap_flush(prealloc_cf);
>         current->backing_dev_info = NULL;
> diff --git a/fs/ceph/io.c b/fs/ceph/io.c
> new file mode 100644
> index 000000000000..8367cbcc81e8
> --- /dev/null
> +++ b/fs/ceph/io.c
> @@ -0,0 +1,163 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2016 Trond Myklebust
> + * Copyright (c) 2019 Jeff Layton
> + *
> + * I/O and data path helper functionality.
> + *
> + * Heavily borrowed from equivalent code in fs/nfs/io.c
> + */
> +
> +#include <linux/ceph/ceph_debug.h>
> +
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/rwsem.h>
> +#include <linux/fs.h>
> +
> +#include "super.h"
> +#include "io.h"
> +
> +/* Call with exclusively locked inode->i_rwsem */
> +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
> +{
> +       lockdep_assert_held_write(&inode->i_rwsem);
> +
> +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
> +               spin_lock(&ci->i_ceph_lock);
> +               ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
> +               spin_unlock(&ci->i_ceph_lock);
> +               inode_dio_wait(inode);
> +       }
> +}
> +
> +/**
> + * ceph_start_io_read - declare the file is being used for buffered reads
> + * @inode: file inode
> + *
> + * Declare that a buffered read operation is about to start, and ensure
> + * that we block all direct I/O.
> + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
> + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> + * cannot be changed.
> + * In practice, this means that buffered read operations are allowed to
> + * execute in parallel, thanks to the shared lock, whereas direct I/O
> + * operations need to wait to grab an exclusive lock in order to set
> + * CEPH_I_ODIRECT.
> + * Note that buffered writes and truncates both take a write lock on
> + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
> + */
> +void
> +ceph_start_io_read(struct inode *inode)
> +{
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       /* Be an optimist! */
> +       down_read(&inode->i_rwsem);
> +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
> +               return;
> +       up_read(&inode->i_rwsem);
> +       /* Slow path.... */
> +       down_write(&inode->i_rwsem);
> +       ceph_block_o_direct(ci, inode);
> +       downgrade_write(&inode->i_rwsem);
> +}
> +
> +/**
> + * ceph_end_io_read - declare that the buffered read operation is done
> + * @inode: file inode
> + *
> + * Declare that a buffered read operation is done, and release the shared
> + * lock on inode->i_rwsem.
> + */
> +void
> +ceph_end_io_read(struct inode *inode)
> +{
> +       up_read(&inode->i_rwsem);
> +}
> +
> +/**
> + * ceph_start_io_write - declare the file is being used for buffered writes
> + * @inode: file inode
> + *
> > + * Declare that a buffered write operation is about to start, and ensure
> + * that we block all direct I/O.
> + */
> +void
> +ceph_start_io_write(struct inode *inode)
> +{
> +       down_write(&inode->i_rwsem);
> +       ceph_block_o_direct(ceph_inode(inode), inode);
> +}
> +
> +/**
> + * ceph_end_io_write - declare that the buffered write operation is done
> + * @inode: file inode
> + *
> + * Declare that a buffered write operation is done, and release the
> + * lock on inode->i_rwsem.
> + */
> +void
> +ceph_end_io_write(struct inode *inode)
> +{
> +       up_write(&inode->i_rwsem);
> +}
> +
> +/* Call with exclusively locked inode->i_rwsem */
> +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
> +{
> +       lockdep_assert_held_write(&inode->i_rwsem);
> +
> +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
> +               spin_lock(&ci->i_ceph_lock);
> +               ci->i_ceph_flags |= CEPH_I_ODIRECT;
> +               spin_unlock(&ci->i_ceph_lock);
> +               /* FIXME: unmap_mapping_range? */
> +               filemap_write_and_wait(inode->i_mapping);
> +       }
> +}
> +
> +/**
> > + * ceph_start_io_direct - declare the file is being used for direct i/o
> + * @inode: file inode
> + *
> + * Declare that a direct I/O operation is about to start, and ensure
> + * that we block all buffered I/O.
> + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
> + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> + * cannot be changed.
> + * In practice, this means that direct I/O operations are allowed to
> + * execute in parallel, thanks to the shared lock, whereas buffered I/O
> + * operations need to wait to grab an exclusive lock in order to clear
> + * CEPH_I_ODIRECT.
> + * Note that buffered writes and truncates both take a write lock on
> + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
> + */
> +void
> +ceph_start_io_direct(struct inode *inode)
> +{
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       /* Be an optimist! */
> +       down_read(&inode->i_rwsem);
> +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
> +               return;
> +       up_read(&inode->i_rwsem);
> +       /* Slow path.... */
> +       down_write(&inode->i_rwsem);
> +       ceph_block_buffered(ci, inode);
> +       downgrade_write(&inode->i_rwsem);
> +}
> +
> +/**
> + * ceph_end_io_direct - declare that the direct i/o operation is done
> + * @inode: file inode
> + *
> + * Declare that a direct I/O operation is done, and release the shared
> + * lock on inode->i_rwsem.
> + */
> +void
> +ceph_end_io_direct(struct inode *inode)
> +{
> +       up_read(&inode->i_rwsem);
> +}
> diff --git a/fs/ceph/io.h b/fs/ceph/io.h
> new file mode 100644
> index 000000000000..fa594cd77348
> --- /dev/null
> +++ b/fs/ceph/io.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _FS_CEPH_IO_H
> +#define _FS_CEPH_IO_H
> +
> +void ceph_start_io_read(struct inode *inode);
> +void ceph_end_io_read(struct inode *inode);
> +void ceph_start_io_write(struct inode *inode);
> +void ceph_end_io_write(struct inode *inode);
> +void ceph_start_io_direct(struct inode *inode);
> +void ceph_end_io_direct(struct inode *inode);
> +
> > +#endif /* _FS_CEPH_IO_H */
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 03e4828c7635..46a64293a3f7 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -516,7 +516,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>  #define CEPH_I_FLUSH_SNAPS     (1 << 9) /* need flush snapss */
>  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
>  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> -
> +#define CEPH_I_ODIRECT         (1 << 12) /* active direct I/O in progress */
>
>  /*
>   * Masks of ceph inode work.
> --
> 2.21.0
>
Reviewed-by
Sage Weil Aug. 6, 2019, 3:27 a.m. UTC | #2
On Mon, 5 Aug 2019, Jeff Layton wrote:
> xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> to a file, and then reads back the result using buffered I/O, while
> running a separate set of tasks that are also doing buffered reads.
> 
> The client will invalidate the cache prior to a direct write, but it's
> easy for one of the other readers' replies to race in and reinstantiate
> the invalidated range with stale data.

Maybe a silly question, but: what if the write path did the invalidation 
after the write instead of before?  Then any racing read will see the new 
data on disk.

sage
Christoph Hellwig Aug. 6, 2019, 8:25 a.m. UTC | #3
On Mon, Aug 05, 2019 at 04:05:01PM -0400, Jeff Layton wrote:
> Instead, borrow the scheme used by nfs.ko. Buffered writes take the
> i_rwsem exclusively, but buffered reads and all O_DIRECT requests
> take a shared lock, allowing them to run in parallel.

Note that you'll still need an exclusive lock to guard against cache
invalidation for direct writes.  And instead of adding a new lock you
might want to look at the i_rwsem based scheme in XFS (which also
happens to be where O_DIRECT originally came from).
Jeff Layton Aug. 6, 2019, 10:35 a.m. UTC | #4
On Tue, 2019-08-06 at 03:27 +0000, Sage Weil wrote:
> On Mon, 5 Aug 2019, Jeff Layton wrote:
> > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > to a file, and then reads back the result using buffered I/O, while
> > running a separate set of tasks that are also doing buffered reads.
> > 
> > The client will invalidate the cache prior to a direct write, but it's
> > easy for one of the other readers' replies to race in and reinstantiate
> > the invalidated range with stale data.
> 
> Maybe a silly question, but: what if the write path did the invalidation 
> after the write instead of before?  Then any racing read will see the new 
> data on disk.
> 

I tried that originally. It reduces the race window somewhat, but it's
still present since a reply to a concurrent read can get in just after
the invalidation occurs. You really do have to serialize them to fix
this, AFAICT.
Jeff Layton Aug. 6, 2019, 10:54 a.m. UTC | #5
On Tue, 2019-08-06 at 01:25 -0700, Christoph Hellwig wrote:
> On Mon, Aug 05, 2019 at 04:05:01PM -0400, Jeff Layton wrote:
> > Instead, borrow the scheme used by nfs.ko. Buffered writes take the
> > i_rwsem exclusively, but buffered reads and all O_DIRECT requests
> > take a shared lock, allowing them to run in parallel.
> 
> Note that you'll still need an exclusive lock to guard against cache
> invalidation for direct writes.  And instead of adding a new lock you
> might want to look at the i_rwsem based scheme in XFS (whіch also
> happens to be where O_DIRECT originally came from).

Thanks, Christoph.

That part of the patch description is unclear. I'll fix that up, but
this patch does ensure that no buffered I/O can take place while any
direct I/O is in flight. Only operations of the same "flavor" can run
in parallel. Note that we _do_ use the i_rwsem here, but there is also
an additional per-inode flag that handles the buffered read/direct I/O
exclusion.

I did take a look at the xfs_ilock* code this morning. That's quite a
bit more complex than this. It's possible that ceph doesn't serialize
mmap I/O and O_DIRECT properly. I'll have to look over xfstests and see
whether there is a good test for that as well and whether it passes.

Cheers,
Sage Weil Aug. 6, 2019, 1:25 p.m. UTC | #6
On Tue, 6 Aug 2019, Jeff Layton wrote:
> On Tue, 2019-08-06 at 03:27 +0000, Sage Weil wrote:
> > On Mon, 5 Aug 2019, Jeff Layton wrote:
> > > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > > to a file, and then reads back the result using buffered I/O, while
> > > running a separate set of tasks that are also doing buffered reads.
> > > 
> > > The client will invalidate the cache prior to a direct write, but it's
> > > easy for one of the other readers' replies to race in and reinstantiate
> > > the invalidated range with stale data.
> > 
> > Maybe a silly question, but: what if the write path did the invalidation 
> > after the write instead of before?  Then any racing read will see the new 
> > data on disk.
> > 
> 
> I tried that originally. It reduces the race window somewhat, but it's
> still present since a reply to a concurrent read can get in just after
> the invalidation occurs. You really do have to serialize them to fix
> this, AFAICT.

I've always assumed that viewing the ordering for concurrent operations as 
non-deterministic is the only sane approach.  If the read initiates before 
the write completes you have no obligation to reflect the result of the 
write.

Is that what you're trying to do?

sage
Jeff Layton Aug. 6, 2019, 1:42 p.m. UTC | #7
On Tue, 2019-08-06 at 13:25 +0000, Sage Weil wrote:
> On Tue, 6 Aug 2019, Jeff Layton wrote:
> > On Tue, 2019-08-06 at 03:27 +0000, Sage Weil wrote:
> > > On Mon, 5 Aug 2019, Jeff Layton wrote:
> > > > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > > > to a file, and then reads back the result using buffered I/O, while
> > > > running a separate set of tasks that are also doing buffered reads.
> > > > 
> > > > The client will invalidate the cache prior to a direct write, but it's
> > > > easy for one of the other readers' replies to race in and reinstantiate
> > > > the invalidated range with stale data.
> > > 
> > > Maybe a silly question, but: what if the write path did the invalidation 
> > > after the write instead of before?  Then any racing read will see the new 
> > > data on disk.
> > > 
> > 
> > I tried that originally. It reduces the race window somewhat, but it's
> > still present since a reply to a concurrent read can get in just after
> > the invalidation occurs. You really do have to serialize them to fix
> > this, AFAICT.
> 
> I've always assumed that viewing the ordering for concurrent operations as 
> non-deterministic is the only sane approach.  If the read initiates before 
> the write completes you have no obligation to reflect the result of the 
> write.
> 
> Is that what you're trying to do?
> 

Not exactly.

In this testcase, we have one thread that is alternating between DIO
writes and buffered reads. Logically, the buffered read should always
reflect the result of the DIO write...and indeed if we run that program
in isolation it always does, since the cache is invalidated just prior
to the DIO write.

The issue occurs when other tasks are doing buffered reads at the same
time. Sometimes the reply to one of those will come in after the
invalidation but before the subsequent buffered read by the writing
task. If that OSD read occurs before the OSD write then it'll populate
the pagecache with stale data. The subsequent read by the writing thread
then ends up reading that stale data out of the cache.

Doing the invalidation after the DIO write reduces this race window to
some degree, but doesn't fully close it. You have to serialize things
such that buffered read requests aren't dispatched until all of the DIO
write replies have come in.
Jeff Layton Aug. 6, 2019, 2:27 p.m. UTC | #8
On Tue, 2019-08-06 at 11:21 +0800, Yan, Zheng wrote:
> On Tue, Aug 6, 2019 at 4:05 AM Jeff Layton <jlayton@kernel.org> wrote:
> > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > to a file, and then reads back the result using buffered I/O, while
> > running a separate set of tasks that are also doing buffered reads.
> > 
> > The client will invalidate the cache prior to a direct write, but it's
> > easy for one of the other readers' replies to race in and reinstantiate
> > the invalidated range with stale data.
> > 
> > > To fix this, we must serialize direct I/O writes and buffered reads.
> > We could just sprinkle in some shared locks on the i_rwsem for reads,
> > and increase the exclusive footprint on the write side, but that would
> > cause O_DIRECT writes to end up serialized vs. other direct requests.
> > 
> > Instead, borrow the scheme used by nfs.ko. Buffered writes take the
> > i_rwsem exclusively, but buffered reads and all O_DIRECT requests
> > take a shared lock, allowing them to run in parallel.
> > 
> > To fix the bug though, we need buffered reads and O_DIRECT I/O to not
> > run in parallel. A flag on the ceph_inode_info is used to indicate
> > whether it's in direct or buffered I/O mode. When a conflicting request
> > is submitted, it will block until the inode can flip to the necessary
> > mode.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/Makefile |   2 +-
> >  fs/ceph/file.c   |  28 ++++++--
> >  fs/ceph/io.c     | 163 +++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ceph/io.h     |  12 ++++
> >  fs/ceph/super.h  |   2 +-
> >  5 files changed, 198 insertions(+), 9 deletions(-)
> >  create mode 100644 fs/ceph/io.c
> >  create mode 100644 fs/ceph/io.h
> > 
> > diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
> > index a699e320393f..c1da294418d1 100644
> > --- a/fs/ceph/Makefile
> > +++ b/fs/ceph/Makefile
> > @@ -6,7 +6,7 @@
> >  obj-$(CONFIG_CEPH_FS) += ceph.o
> > 
> >  ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
> > -       export.o caps.o snap.o xattr.o quota.o \
> > +       export.o caps.o snap.o xattr.o quota.o io.o \
> >         mds_client.o mdsmap.o strings.o ceph_frag.o \
> >         debugfs.o
> > 
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 5182e1a49d6f..d7d264e05303 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -15,6 +15,7 @@
> >  #include "super.h"
> >  #include "mds_client.h"
> >  #include "cache.h"
> > +#include "io.h"
> > 
> >  static __le32 ceph_flags_sys2wire(u32 flags)
> >  {
> > @@ -1284,12 +1285,16 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > 
> >                 if (ci->i_inline_version == CEPH_INLINE_NONE) {
> >                         if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
> > +                               ceph_start_io_direct(inode);
> >                                 ret = ceph_direct_read_write(iocb, to,
> >                                                              NULL, NULL);
> > +                               ceph_end_io_direct(inode);
> >                                 if (ret >= 0 && ret < len)
> >                                         retry_op = CHECK_EOF;
> >                         } else {
> > +                               ceph_start_io_read(inode);
> >                                 ret = ceph_sync_read(iocb, to, &retry_op);
> > +                               ceph_end_io_read(inode);
> >                         }
> >                 } else {
> >                         retry_op = READ_INLINE;
> > @@ -1300,7 +1305,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >                      inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
> >                      ceph_cap_string(got));
> >                 ceph_add_rw_context(fi, &rw_ctx);
> > +               ceph_start_io_read(inode);
> >                 ret = generic_file_read_iter(iocb, to);
> > +               ceph_end_io_read(inode);
> >                 ceph_del_rw_context(fi, &rw_ctx);
> >         }
> >         dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
> > @@ -1409,7 +1416,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >                 return -ENOMEM;
> > 
> >  retry_snap:
> > -       inode_lock(inode);
> > +       if (iocb->ki_flags & IOCB_DIRECT)
> > +               ceph_start_io_direct(inode);
> > +       else
> > +               ceph_start_io_write(inode);
> > 
> >         /* We can write back this queue in page reclaim */
> >         current->backing_dev_info = inode_to_bdi(inode);
> > @@ -1480,7 +1490,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >             (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
> >                 struct ceph_snap_context *snapc;
> >                 struct iov_iter data;
> > -               inode_unlock(inode);
> > 
> >                 spin_lock(&ci->i_ceph_lock);
> >                 if (__ceph_have_pending_cap_snap(ci)) {
> > @@ -1497,11 +1506,14 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > 
> >                 /* we might need to revert back to that point */
> >                 data = *from;
> > -               if (iocb->ki_flags & IOCB_DIRECT)
> > +               if (iocb->ki_flags & IOCB_DIRECT) {
> >                         written = ceph_direct_read_write(iocb, &data, snapc,
> >                                                          &prealloc_cf);
> > -               else
> > +                       ceph_end_io_direct(inode);
> > +               } else {
> >                         written = ceph_sync_write(iocb, &data, pos, snapc);
> > +                       ceph_end_io_write(inode);
> > +               }
> >                 if (written > 0)
> >                         iov_iter_advance(from, written);
> >                 ceph_put_snap_context(snapc);
> > @@ -1516,7 +1528,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >                 written = generic_perform_write(file, from, pos);
> >                 if (likely(written >= 0))
> >                         iocb->ki_pos = pos + written;
> > -               inode_unlock(inode);
> > +               ceph_end_io_write(inode);
> >         }
> > 
> >         if (written >= 0) {
> > @@ -1551,9 +1563,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >         }
> > 
> >         goto out_unlocked;
> > -
> >  out:
> > -       inode_unlock(inode);
> > +       if (iocb->ki_flags & IOCB_DIRECT)
> > +               ceph_end_io_direct(inode);
> > +       else
> > +               ceph_end_io_write(inode);
> >  out_unlocked:
> >         ceph_free_cap_flush(prealloc_cf);
> >         current->backing_dev_info = NULL;
> > diff --git a/fs/ceph/io.c b/fs/ceph/io.c
> > new file mode 100644
> > index 000000000000..8367cbcc81e8
> > --- /dev/null
> > +++ b/fs/ceph/io.c
> > @@ -0,0 +1,163 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (c) 2016 Trond Myklebust
> > + * Copyright (c) 2019 Jeff Layton
> > + *
> > + * I/O and data path helper functionality.
> > + *
> > + * Heavily borrowed from equivalent code in fs/nfs/io.c
> > + */
> > +
> > +#include <linux/ceph/ceph_debug.h>
> > +
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/rwsem.h>
> > +#include <linux/fs.h>
> > +
> > +#include "super.h"
> > +#include "io.h"
> > +
> > +/* Call with exclusively locked inode->i_rwsem */
> > +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
> > +{
> > +       lockdep_assert_held_write(&inode->i_rwsem);
> > +
> > +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
> > +               spin_lock(&ci->i_ceph_lock);
> > +               ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
> > +               spin_unlock(&ci->i_ceph_lock);
> > +               inode_dio_wait(inode);
> > +       }
> > +}
> > +
> > +/**
> > + * ceph_start_io_read - declare the file is being used for buffered reads
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered read operation is about to start, and ensure
> > + * that we block all direct I/O.
> > + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
> > + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> > + * cannot be changed.
> > + * In practice, this means that buffered read operations are allowed to
> > + * execute in parallel, thanks to the shared lock, whereas direct I/O
> > + * operations need to wait to grab an exclusive lock in order to set
> > + * CEPH_I_ODIRECT.
> > + * Note that buffered writes and truncates both take a write lock on
> > + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
> > + */
> > +void
> > +ceph_start_io_read(struct inode *inode)
> > +{
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +       /* Be an optimist! */
> > +       down_read(&inode->i_rwsem);
> > +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
> > +               return;
> > +       up_read(&inode->i_rwsem);
> > +       /* Slow path.... */
> > +       down_write(&inode->i_rwsem);
> > +       ceph_block_o_direct(ci, inode);
> > +       downgrade_write(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_end_io_read - declare that the buffered read operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered read operation is done, and release the shared
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_read(struct inode *inode)
> > +{
> > +       up_read(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_start_io_write - declare the file is being used for buffered writes
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered write operation is about to start, and ensure
> > + * that we block all direct I/O.
> > + */
> > +void
> > +ceph_start_io_write(struct inode *inode)
> > +{
> > +       down_write(&inode->i_rwsem);
> > +       ceph_block_o_direct(ceph_inode(inode), inode);
> > +}
> > +
> > +/**
> > + * ceph_end_io_write - declare that the buffered write operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a buffered write operation is done, and release the
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_write(struct inode *inode)
> > +{
> > +       up_write(&inode->i_rwsem);
> > +}
> > +
> > +/* Call with exclusively locked inode->i_rwsem */
> > +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
> > +{
> > +       lockdep_assert_held_write(&inode->i_rwsem);
> > +
> > +       if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
> > +               spin_lock(&ci->i_ceph_lock);
> > +               ci->i_ceph_flags |= CEPH_I_ODIRECT;
> > +               spin_unlock(&ci->i_ceph_lock);
> > +               /* FIXME: unmap_mapping_range? */
> > +               filemap_write_and_wait(inode->i_mapping);
> > +       }
> > +}
> > +
> > +/**
> > + * ceph_start_io_direct - declare the file is being used for direct i/o
> > + * @inode: file inode
> > + *
> > + * Declare that a direct I/O operation is about to start, and ensure
> > + * that we block all buffered I/O.
> > + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
> > + * and holds a shared lock on inode->i_rwsem to ensure that the flag
> > + * cannot be changed.
> > + * In practice, this means that direct I/O operations are allowed to
> > + * execute in parallel, thanks to the shared lock, whereas buffered I/O
> > + * operations need to wait to grab an exclusive lock in order to clear
> > + * CEPH_I_ODIRECT.
> > + * Note that buffered writes and truncates both take a write lock on
> > + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
> > + */
> > +void
> > +ceph_start_io_direct(struct inode *inode)
> > +{
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +       /* Be an optimist! */
> > +       down_read(&inode->i_rwsem);
> > +       if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
> > +               return;
> > +       up_read(&inode->i_rwsem);
> > +       /* Slow path.... */
> > +       down_write(&inode->i_rwsem);
> > +       ceph_block_buffered(ci, inode);
> > +       downgrade_write(&inode->i_rwsem);
> > +}
> > +
> > +/**
> > + * ceph_end_io_direct - declare that the direct i/o operation is done
> > + * @inode: file inode
> > + *
> > + * Declare that a direct I/O operation is done, and release the shared
> > + * lock on inode->i_rwsem.
> > + */
> > +void
> > +ceph_end_io_direct(struct inode *inode)
> > +{
> > +       up_read(&inode->i_rwsem);
> > +}
> > diff --git a/fs/ceph/io.h b/fs/ceph/io.h
> > new file mode 100644
> > index 000000000000..fa594cd77348
> > --- /dev/null
> > +++ b/fs/ceph/io.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _FS_CEPH_IO_H
> > +#define _FS_CEPH_IO_H
> > +
> > +void ceph_start_io_read(struct inode *inode);
> > +void ceph_end_io_read(struct inode *inode);
> > +void ceph_start_io_write(struct inode *inode);
> > +void ceph_end_io_write(struct inode *inode);
> > +void ceph_start_io_direct(struct inode *inode);
> > +void ceph_end_io_direct(struct inode *inode);
> > +
> > +#endif /* _FS_CEPH_IO_H */
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 03e4828c7635..46a64293a3f7 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -516,7 +516,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> >  #define CEPH_I_FLUSH_SNAPS     (1 << 9) /* need flush snapss */
> >  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
> >  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> > -
> > +#define CEPH_I_ODIRECT         (1 << 12) /* active direct I/O in progress */
> > 
> >  /*
> >   * Masks of ceph inode work.
> > --
> > 2.21.0
> > 
Reviewed-by

Thanks, Zheng.

I cleaned up the comments and changelog, and added another delta to
remove the filemap_write_and_wait_range() call at the top of
ceph_direct_read_write() since it's no longer needed.

I've pushed the result to ceph-client/testing. Let me know if you hear
of any regressions with this (performance regressions in particular).
Sage Weil Aug. 6, 2019, 2:39 p.m. UTC | #9
On Tue, 6 Aug 2019, Jeff Layton wrote:
> On Tue, 2019-08-06 at 13:25 +0000, Sage Weil wrote:
> > On Tue, 6 Aug 2019, Jeff Layton wrote:
> > > On Tue, 2019-08-06 at 03:27 +0000, Sage Weil wrote:
> > > > On Mon, 5 Aug 2019, Jeff Layton wrote:
> > > > > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > > > > to a file, and then reads back the result using buffered I/O, while
> > > > > running a separate set of tasks that are also doing buffered reads.
> > > > > 
> > > > > The client will invalidate the cache prior to a direct write, but it's
> > > > > easy for one of the other readers' replies to race in and reinstantiate
> > > > > the invalidated range with stale data.
> > > > 
> > > > Maybe a silly question, but: what if the write path did the invalidation 
> > > > after the write instead of before?  Then any racing read will see the new 
> > > > data on disk.
> > > > 
> > > 
> > > I tried that originally. It reduces the race window somewhat, but it's
> > > still present since a reply to a concurrent read can get in just after
> > > the invalidation occurs. You really do have to serialize them to fix
> > > this, AFAICT.
> > 
> > I've always assumed that viewing the ordering for concurrent operations as 
> > non-deterministic is the only sane approach.  If the read initiates before 
> > the write completes you have no obligation to reflect the result of the 
> > write.
> > 
> > Is that what you're trying to do?
> > 
> 
> Not exactly.
> 
> In this testcase, we have one thread that is alternating between DIO
> writes and buffered reads. Logically, the buffered read should always
> reflect the result of the DIO write...and indeed if we run that program
> in isolation it always does, since the cache is invalidated just prior
> to the DIO write.
> 
> The issue occurs when other tasks are doing buffered reads at the same
> time. Sometimes the reply to one of those will come in after the
> invalidation but before the subsequent buffered read by the writing
> task. If that OSD read occurs before the OSD write then it'll populate

This part confuses me.  Let's say the DIO thread does 'write a', 'read 
(a)', 'write b', then 'read (b)'.  And the interfering buffered reader 
thread does a 'read (a)'.  In order for that to pollute the cache, the 
reply has to arrive *after* the 'write b' ack is received and the 
invalidation happens.  However, I don't think it's possible for a read ack 
to arrive *after* a write and not reflect that write, since the reply 
delivery from the osd to client is ordered.

Am I misunderstanding the sequence of events?

1: write a
1: get write ack, invalidate
1: read (sees a)
1: write b
2: start buffered read
1: get write ack, invalidate
2: get buffered read reply, cache 'b'
1: read (sees cached b)

If the buffered read starts earlier, then we would get the reply before 
the write ack, and the cached result would get invalidated when the direct 
write completes.

It's sometimes possible for reads that race with writes to complete 
*sooner* (i.e., return pre-write value, because they don't wait for things 
degraded objects to be repaired, which writes do wait for), but then the 
read completes first.  There is a flag you can set to force reads to be 
ordered like writes (RWORDERED), which means they will take at least as 
long as racing writes. But IIRC reads never take longer than writes.

sage


> the pagecache with stale data. The subsequent read by the writing thread
> then ends up reading that stale data out of the cache.
> 
> Doing the invalidation after the DIO write reduces this race window to
> some degree, but doesn't fully close it. You have to serialize things
> such that buffered read requests aren't dispatched until all of the DIO
> write replies have come in.
> -- 
> Jeff Layton <jlayton@kernel.org>
> 
>
Jeff Layton Aug. 6, 2019, 3:21 p.m. UTC | #10
On Tue, 2019-08-06 at 14:39 +0000, Sage Weil wrote:
> On Tue, 6 Aug 2019, Jeff Layton wrote:
> > On Tue, 2019-08-06 at 13:25 +0000, Sage Weil wrote:
> > > On Tue, 6 Aug 2019, Jeff Layton wrote:
> > > > On Tue, 2019-08-06 at 03:27 +0000, Sage Weil wrote:
> > > > > On Mon, 5 Aug 2019, Jeff Layton wrote:
> > > > > > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > > > > > to a file, and then reads back the result using buffered I/O, while
> > > > > > running a separate set of tasks that are also doing buffered reads.
> > > > > > 
> > > > > > The client will invalidate the cache prior to a direct write, but it's
> > > > > > easy for one of the other readers' replies to race in and reinstantiate
> > > > > > the invalidated range with stale data.
> > > > > 
> > > > > Maybe a silly question, but: what if the write path did the invalidation 
> > > > > after the write instead of before?  Then any racing read will see the new 
> > > > > data on disk.
> > > > > 
> > > > 
> > > > I tried that originally. It reduces the race window somewhat, but it's
> > > > still present since a reply to a concurrent read can get in just after
> > > > the invalidation occurs. You really do have to serialize them to fix
> > > > this, AFAICT.
> > > 
> > > I've always assumed that viewing the ordering for concurrent operations as 
> > > non-deterministic is the only sane approach.  If the read initiates before 
> > > the write completes you have no obligation to reflect the result of the 
> > > write.
> > > 
> > > Is that what you're trying to do?
> > > 
> > 
> > Not exactly.
> > 
> > In this testcase, we have one thread that is alternating between DIO
> > writes and buffered reads. Logically, the buffered read should always
> > reflect the result of the DIO write...and indeed if we run that program
> > in isolation it always does, since the cache is invalidated just prior
> > to the DIO write.
> > 
> > The issue occurs when other tasks are doing buffered reads at the same
> > time. Sometimes the reply to one of those will come in after the
> > invalidation but before the subsequent buffered read by the writing
> > task. If that OSD read occurs before the OSD write then it'll populate
> 
> This part confuses me.  Let's say the DIO thread does 'write a', 'read 
> (a)', 'write b', then 'read (b)'.  And the interfering buffered reader 
> thread does a 'read (a)'.  In order for that to pollute the cache, the 
> reply has to arrive *after* the 'write b' ack is received and the 
> invalidation happens.  However, I don't think it's possible for a read ack 
> to arrive *after* a write and not reflect that write, since the reply 
> delivery from the osd to client is ordered.
> 
> Am I misunderstanding the sequence of events?
> 
> 1: write a
> 1: get write ack, invalidate
> 1: read (sees a)
> 1: write b
> 2: start buffered read
> 1: get write ack, invalidate
> 2: get buffered read reply, cache 'b'
> 1: read (sees cached b)
>
> If the buffered read starts earlier, then we would get the reply before 
> the write ack, and the cached result would get invalidated when the direct 
> write completes.
> 

The problem here though (IIUC) is that we don't have a strict guarantee
on the ordering of the reply processing, in particular between buffered
and direct I/O. We issue requests to libceph's OSD infrastructure, but
the readpages machinery is mostly done in the context of the task that
initiated it.

When we get the reply from the OSD, the pagevec is populated and then
the initiating task is awoken, but nothing ensures that the initiating
tasks doing the postprocessing end up serialized in the "correct" order.

IOW, even though the read reply came in before the write reply, the
writing task ended up getting scheduled first when they were both
awoken.

> It's sometimes possible for reads that race with writes to complete 
> *sooner* (i.e., return pre-write value, because they don't wait for things 
> degraded objects to be repaired, which writes do wait for), but then the 
> read completes first.  There is a flag you can set to force reads to be 
> ordered like writes (RWORDERED), which means they will take at least as 
> long as racing writes. But IIRC reads never take longer than writes.
> 
>
> > the pagecache with stale data. The subsequent read by the writing thread
> > then ends up reading that stale data out of the cache.
> > 
> > Doing the invalidation after the DIO write reduces this race window to
> > some degree, but doesn't fully close it. You have to serialize things
> > such that buffered read requests aren't dispatched until all of the DIO
> > write replies have come in.
> > -- 
> > Jeff Layton <jlayton@kernel.org>
> > 
> >
Sage Weil Aug. 6, 2019, 3:27 p.m. UTC | #11
On Tue, 6 Aug 2019, Jeff Layton wrote:
> On Tue, 2019-08-06 at 14:39 +0000, Sage Weil wrote:
> > On Tue, 6 Aug 2019, Jeff Layton wrote:
> > > On Tue, 2019-08-06 at 13:25 +0000, Sage Weil wrote:
> > > > On Tue, 6 Aug 2019, Jeff Layton wrote:
> > > > > On Tue, 2019-08-06 at 03:27 +0000, Sage Weil wrote:
> > > > > > On Mon, 5 Aug 2019, Jeff Layton wrote:
> > > > > > > xfstest generic/451 intermittently fails. The test does O_DIRECT writes
> > > > > > > to a file, and then reads back the result using buffered I/O, while
> > > > > > > running a separate set of tasks that are also doing buffered reads.
> > > > > > > 
> > > > > > > The client will invalidate the cache prior to a direct write, but it's
> > > > > > > easy for one of the other readers' replies to race in and reinstantiate
> > > > > > > the invalidated range with stale data.
> > > > > > 
> > > > > > Maybe a silly question, but: what if the write path did the invalidation 
> > > > > > after the write instead of before?  Then any racing read will see the new 
> > > > > > data on disk.
> > > > > > 
> > > > > 
> > > > > I tried that originally. It reduces the race window somewhat, but it's
> > > > > still present since a reply to a concurrent read can get in just after
> > > > > the invalidation occurs. You really do have to serialize them to fix
> > > > > this, AFAICT.
> > > > 
> > > > I've always assumed that viewing the ordering for concurrent operations as 
> > > > non-deterministic is the only sane approach.  If the read initiates before 
> > > > the write completes you have no obligation to reflect the result of the 
> > > > write.
> > > > 
> > > > Is that what you're trying to do?
> > > > 
> > > 
> > > Not exactly.
> > > 
> > > In this testcase, we have one thread that is alternating between DIO
> > > writes and buffered reads. Logically, the buffered read should always
> > > reflect the result of the DIO write...and indeed if we run that program
> > > in isolation it always does, since the cache is invalidated just prior
> > > to the DIO write.
> > > 
> > > The issue occurs when other tasks are doing buffered reads at the same
> > > time. Sometimes the reply to one of those will come in after the
> > > invalidation but before the subsequent buffered read by the writing
> > > task. If that OSD read occurs before the OSD write then it'll populate
> > 
> > This part confuses me.  Let's say the DIO thread does 'write a', 'read 
> > (a)', 'write b', then 'read (b)'.  And the interfering buffered reader 
> > thread does a 'read (a)'.  In order for that to pollute the cache, the 
> > reply has to arrive *after* the 'write b' ack is received and the 
> > invalidation happens.  However, I don't think it's possible for a read ack 
> > to arrive *after* a write and not reflect that write, since the reply 
> > delivery from the osd to client is ordered.
> > 
> > Am I misunderstanding the sequence of events?
> > 
> > 1: write a
> > 1: get write ack, invalidate
> > 1: read (sees a)
> > 1: write b
> > 2: start buffered read
> > 1: get write ack, invalidate
> > 2: get buffered read reply, cache 'b'
> > 1: read (sees cached b)
> >
> > If the buffered read starts earlier, then we would get the reply before 
> > the write ack, and the cached result would get invalidated when the direct 
> > write completes.
> > 
> 
> The problem here though (IIUC) is that we don't have a strict guarantee
> on the ordering of the reply processing, in particular between buffered
> and direct I/O. We issue requests to libceph's OSD infrastructure, but
> the readpages machinery is mostly done in the context of the task that
> initiated it.
> 
> When we get the reply from the OSD, the pagevec is populated and then
> the initiating task is awoken, but nothing ensures that the initiating
> tasks doing the postprocessing end up serialized in the "correct" order.
> 
> IOW, even though the read reply came in before the write reply, the
> writing task ended up getting scheduled first when they were both
> awoken.

Aha, I understand now.

Thanks!
sage


> 
> > It's sometimes possible for reads that race with writes to complete 
> > *sooner* (i.e., return pre-write value, because they don't wait for things 
> > degraded objects to be repaired, which writes do wait for), but then the 
> > read completes first.  There is a flag you can set to force reads to be 
> > ordered like writes (RWORDERED), which means they will take at least as 
> > long as racing writes. But IIRC reads never take longer than writes.
> > 
> >
> > > the pagecache with stale data. The subsequent read by the writing thread
> > > then ends up reading that stale data out of the cache.
> > > 
> > > Doing the invalidation after the DIO write reduces this race window to
> > > some degree, but doesn't fully close it. You have to serialize things
> > > such that buffered read requests aren't dispatched until all of the DIO
> > > write replies have come in.
> > > -- 
> > > Jeff Layton <jlayton@kernel.org>
> > > 
> > > 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>
> 
>
Christoph Hellwig Aug. 8, 2019, 7:46 a.m. UTC | #12
On Tue, Aug 06, 2019 at 06:54:17AM -0400, Jeff Layton wrote:
> That part of the patch description is unclear. I'll fix that up, but
> this patch does ensure that no buffered I/O can take place while any
> direct I/O is in flight. Only operations of the same "flavor" can run
> in parallel. Note that we _do_ use the i_rwsem here, but there is also
> an additional per-inode flag that handles the buffered read/direct I/O
> exclusion.
> 
> I did take a look at the xfs_ilock* code this morning. That's quite a
> bit more complex than this. It's possible that ceph doesn't serialize
> mmap I/O and O_DIRECT properly. I'll have to look over xfstests and see
> whether there is a good test for that as well and whether it passes.

Note that the mmap bits aren't really related to direct vs buffer
locking but for hole punching vs page faults.  As ceph supports hole
punching you'll probably need it too, though.

Patch
diff mbox series

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index a699e320393f..c1da294418d1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@ 
 obj-$(CONFIG_CEPH_FS) += ceph.o
 
 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
-	export.o caps.o snap.o xattr.o quota.o \
+	export.o caps.o snap.o xattr.o quota.o io.o \
 	mds_client.o mdsmap.o strings.o ceph_frag.o \
 	debugfs.o
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5182e1a49d6f..d7d264e05303 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -15,6 +15,7 @@ 
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "io.h"
 
 static __le32 ceph_flags_sys2wire(u32 flags)
 {
@@ -1284,12 +1285,16 @@  static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 		if (ci->i_inline_version == CEPH_INLINE_NONE) {
 			if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+				ceph_start_io_direct(inode);
 				ret = ceph_direct_read_write(iocb, to,
 							     NULL, NULL);
+				ceph_end_io_direct(inode);
 				if (ret >= 0 && ret < len)
 					retry_op = CHECK_EOF;
 			} else {
+				ceph_start_io_read(inode);
 				ret = ceph_sync_read(iocb, to, &retry_op);
+				ceph_end_io_read(inode);
 			}
 		} else {
 			retry_op = READ_INLINE;
@@ -1300,7 +1305,9 @@  static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
 		     ceph_cap_string(got));
 		ceph_add_rw_context(fi, &rw_ctx);
+		ceph_start_io_read(inode);
 		ret = generic_file_read_iter(iocb, to);
+		ceph_end_io_read(inode);
 		ceph_del_rw_context(fi, &rw_ctx);
 	}
 	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@@ -1409,7 +1416,10 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return -ENOMEM;
 
 retry_snap:
-	inode_lock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_start_io_direct(inode);
+	else
+		ceph_start_io_write(inode);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
@@ -1480,7 +1490,6 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	    (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
 		struct ceph_snap_context *snapc;
 		struct iov_iter data;
-		inode_unlock(inode);
 
 		spin_lock(&ci->i_ceph_lock);
 		if (__ceph_have_pending_cap_snap(ci)) {
@@ -1497,11 +1506,14 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 		/* we might need to revert back to that point */
 		data = *from;
-		if (iocb->ki_flags & IOCB_DIRECT)
+		if (iocb->ki_flags & IOCB_DIRECT) {
 			written = ceph_direct_read_write(iocb, &data, snapc,
 							 &prealloc_cf);
-		else
+			ceph_end_io_direct(inode);
+		} else {
 			written = ceph_sync_write(iocb, &data, pos, snapc);
+			ceph_end_io_write(inode);
+		}
 		if (written > 0)
 			iov_iter_advance(from, written);
 		ceph_put_snap_context(snapc);
@@ -1516,7 +1528,7 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = generic_perform_write(file, from, pos);
 		if (likely(written >= 0))
 			iocb->ki_pos = pos + written;
-		inode_unlock(inode);
+		ceph_end_io_write(inode);
 	}
 
 	if (written >= 0) {
@@ -1551,9 +1563,11 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	goto out_unlocked;
-
 out:
-	inode_unlock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_end_io_direct(inode);
+	else
+		ceph_end_io_write(inode);
 out_unlocked:
 	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
new file mode 100644
index 000000000000..8367cbcc81e8
--- /dev/null
+++ b/fs/ceph/io.c
@@ -0,0 +1,163 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ * Copyright (c) 2019 Jeff Layton
+ *
+ * I/O and data path helper functionality.
+ *
+ * Heavily borrowed from equivalent code in fs/nfs/io.c
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+
+#include "super.h"
+#include "io.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
+{
+	lockdep_assert_held_write(&inode->i_rwsem);
+
+	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
+		spin_unlock(&ci->i_ceph_lock);
+		inode_dio_wait(inode);
+	}
+}
+
+/**
+ * ceph_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+ceph_start_io_read(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	/* Be an optimist! */
+	down_read(&inode->i_rwsem);
+	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
+		return;
+	up_read(&inode->i_rwsem);
+	/* Slow path.... */
+	down_write(&inode->i_rwsem);
+	ceph_block_o_direct(ci, inode);
+	downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * ceph_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_read(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
+}
+
+/**
+ * ceph_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+ceph_start_io_write(struct inode *inode)
+{
+	down_write(&inode->i_rwsem);
+	ceph_block_o_direct(ceph_inode(inode), inode);
+}
+
+/**
+ * ceph_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_write(struct inode *inode)
+{
+	up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
+{
+	lockdep_assert_held_write(&inode->i_rwsem);
+
+	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags |= CEPH_I_ODIRECT;
+		spin_unlock(&ci->i_ceph_lock);
+		/* FIXME: unmap_mapping_range? */
+		filemap_write_and_wait(inode->i_mapping);
+	}
+}
+
+/**
+ * ceph_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+ceph_start_io_direct(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	/* Be an optimist! */
+	down_read(&inode->i_rwsem);
+	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
+		return;
+	up_read(&inode->i_rwsem);
+	/* Slow path.... */
+	down_write(&inode->i_rwsem);
+	ceph_block_buffered(ci, inode);
+	downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * ceph_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_direct(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
+}
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
new file mode 100644
index 000000000000..fa594cd77348
--- /dev/null
+++ b/fs/ceph/io.h
@@ -0,0 +1,12 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_IO_H
+#define _FS_CEPH_IO_H
+
+void ceph_start_io_read(struct inode *inode);
+void ceph_end_io_read(struct inode *inode);
+void ceph_start_io_write(struct inode *inode);
+void ceph_end_io_write(struct inode *inode);
+void ceph_start_io_direct(struct inode *inode);
+void ceph_end_io_direct(struct inode *inode);
+
+#endif /* _FS_CEPH_IO_H */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 03e4828c7635..46a64293a3f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -516,7 +516,7 @@  static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_FLUSH_SNAPS	(1 << 9) /* need flush snapss */
 #define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
 #define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
-
+#define CEPH_I_ODIRECT		(1 << 12) /* active direct I/O in progress */
 
 /*
  * Masks of ceph inode work.