diff mbox series

[RFC] ceph: add new "nopagecache" option

Message ID 20220105132645.72282-1-jlayton@kernel.org (mailing list archive)
State New, archived
Headers show
Series [RFC] ceph: add new "nopagecache" option | expand

Commit Message

Jeff Layton Jan. 5, 2022, 1:26 p.m. UTC
CephFS is a bit unlike most other filesystems in that it only
conditionally does buffered I/O based on the caps that it gets from the
MDS. In most cases, unless there is contended access for an inode, the
MDS does give Fbc caps to the client, so the unbuffered codepaths are
only infrequently traveled and are difficult to test.

At one time, the "-o sync" mount option would give you this behavior,
but that was removed in commit 7ab9b3807097 ("ceph: Don't use
ceph-sync-mode for synchronous-fs.").

Add a new mount option to tell the client to ignore Fbc caps when doing
I/O, and to use the synchronous codepaths exclusively, even on
non-O_DIRECT file descriptors. We already have an ioctl that forces this
behavior on a per-inode basis, so we can just always set the CEPH_F_SYNC
flag on such mounts.

Additionally, this patch also changes the client to not request Fbc when
doing direct I/O. We aren't using the cache with O_DIRECT so we don't
have any need for those caps.

Cc: majianpeng <majianpeng@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/file.c  | 24 +++++++++++++++---------
 fs/ceph/super.c | 11 +++++++++++
 fs/ceph/super.h |  1 +
 3 files changed, 27 insertions(+), 9 deletions(-)

I've been working with this patch in order to test the synchronous
codepaths for content encryption. I think though that this might make
sense to take into mainline, as it could be helpful for troubleshooting,
and ensuring that these codepaths are regularly tested.

Either way, I think the cap handling changes probably make sense on
their own. I can split that out if we don't want to take the mount
option in as well.

Thoughts?

Comments

Gregory Farnum Jan. 6, 2022, 1:41 p.m. UTC | #1
This seems fine to me. Do you have a teuthology fragment to exercise this?

Acked-by: Greg Farnum <gfarnum@redhat.com>

On Wed, Jan 5, 2022 at 5:27 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> CephFS is a bit unlike most other filesystems in that it only
> conditionally does buffered I/O based on the caps that it gets from the
> MDS. In most cases, unless there is contended access for an inode the
> MDS does give Fbc caps to the client, so the unbuffered codepaths are
> only infrequently traveled and are difficult to test.
>
> At one time, the "-o sync" mount option would give you this behavior,
> but that was removed in commit 7ab9b3807097 ("ceph: Don't use
> ceph-sync-mode for synchronous-fs.").
>
> Add a new mount option to tell the client to ignore Fbc caps when doing
> I/O, and to use the synchronous codepaths exclusively, even on
> non-O_DIRECT file descriptors. We already have an ioctl that forces this
> behavior on a per-inode basis, so we can just always set the CEPH_F_SYNC
> flag on such mounts.
>
> Additionally, this patch also changes the client to not request Fbc when
> doing direct I/O. We aren't using the cache with O_DIRECT so we don't
> have any need for those caps.
>
> Cc: majianpeng <majianpeng@gmail.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/file.c  | 24 +++++++++++++++---------
>  fs/ceph/super.c | 11 +++++++++++
>  fs/ceph/super.h |  1 +
>  3 files changed, 27 insertions(+), 9 deletions(-)
>
> I've been working with this patch in order to test the synchronous
> codepaths for content encryption. I think though that this might make
> sense to take into mainline, as it could be helpful for troubleshooting,
> and ensuring that these codepaths are regularly tested.
>
> Either way, I think the cap handling changes probably make sense on
> their own. I can split that out if we don't want to take the mount
> option in as well.
>
> Thoughts?
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index c138e8126286..7de5db51c3d0 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>                                         int fmode, bool isdir)
>  {
>         struct ceph_inode_info *ci = ceph_inode(inode);
> +       struct ceph_mount_options *opt =
> +               ceph_inode_to_client(&ci->vfs_inode)->mount_options;
>         struct ceph_file_info *fi;
>
>         dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
> @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>                 if (!fi)
>                         return -ENOMEM;
>
> +               if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> +                       fi->flags |= CEPH_F_SYNC;
> +
>                 file->private_data = fi;
>         }
>
> @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>         struct ceph_inode_info *ci = ceph_inode(inode);
>         bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
>         ssize_t ret;
> -       int want, got = 0;
> +       int want = 0, got = 0;
>         int retry_op = 0, read = 0;
>
>  again:
> @@ -1551,13 +1556,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>         else
>                 ceph_start_io_read(inode);
>
> +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> +               want |= CEPH_CAP_FILE_CACHE;
>         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> -               want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
> -       else
> -               want = CEPH_CAP_FILE_CACHE;
> +               want |= CEPH_CAP_FILE_LAZYIO;
> +
>         ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
>         if (ret < 0) {
> -               if (iocb->ki_flags & IOCB_DIRECT)
> +               if (direct_lock)
>                         ceph_end_io_direct(inode);
>                 else
>                         ceph_end_io_read(inode);
> @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         struct ceph_osd_client *osdc = &fsc->client->osdc;
>         struct ceph_cap_flush *prealloc_cf;
>         ssize_t count, written = 0;
> -       int err, want, got;
> +       int err, want = 0, got;
>         bool direct_lock = false;
>         u32 map_flags;
>         u64 pool_flags;
> @@ -1766,10 +1772,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>
>         dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
>              inode, ceph_vinop(inode), pos, count, i_size_read(inode));
> +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> +               want |= CEPH_CAP_FILE_BUFFER;
>         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> -               want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
> -       else
> -               want = CEPH_CAP_FILE_BUFFER;
> +               want |= CEPH_CAP_FILE_LAZYIO;
>         got = 0;
>         err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
>         if (err < 0)
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 8d6daea351f6..d7e604a56fd9 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -160,6 +160,7 @@ enum {
>         Opt_quotadf,
>         Opt_copyfrom,
>         Opt_wsync,
> +       Opt_pagecache,
>  };
>
>  enum ceph_recover_session_mode {
> @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
>         fsparam_string  ("mon_addr",                    Opt_mon_addr),
>         fsparam_u32     ("wsize",                       Opt_wsize),
>         fsparam_flag_no ("wsync",                       Opt_wsync),
> +       fsparam_flag_no ("pagecache",                   Opt_pagecache),
>         {}
>  };
>
> @@ -567,6 +569,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
>                 else
>                         fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
>                 break;
> +       case Opt_pagecache:
> +               if (result.negated)
> +                       fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
> +               else
> +                       fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
> +               break;
>         default:
>                 BUG();
>         }
> @@ -702,6 +710,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
>         if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
>                 seq_puts(m, ",wsync");
>
> +       if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> +               seq_puts(m, ",nopagecache");
> +
>         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
>                 seq_printf(m, ",wsize=%u", fsopt->wsize);
>         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index f9b1bbf26c1b..5c6d9384b7be 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -46,6 +46,7 @@
>  #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
>  #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
>  #define CEPH_MOUNT_OPT_ASYNC_DIROPS    (1<<15) /* allow async directory ops */
> +#define CEPH_MOUNT_OPT_NOPAGECACHE     (1<<16) /* bypass pagecache altogether */
>
>  #define CEPH_MOUNT_OPT_DEFAULT                 \
>         (CEPH_MOUNT_OPT_DCACHE |                \
> --
> 2.33.1
>
Jeff Layton Jan. 6, 2022, 1:54 p.m. UTC | #2
No, I've only used this for local testing so far. Perhaps you can
handhold me through adding such a patch to teuthology for it?

Note that this _really_ slows down I/O, especially if you're doing a lot
of small reads or writes, so it may not be appropriate for throughput-
heavy testing.

On Thu, 2022-01-06 at 05:41 -0800, Gregory Farnum wrote:
> This seems fine to me. Do you have a teuthology fragment to exercise this?
> 
> Acked-by: Greg Farnum <gfarnum@redhat.com>
> 
> On Wed, Jan 5, 2022 at 5:27 AM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > CephFS is a bit unlike most other filesystems in that it only
> > conditionally does buffered I/O based on the caps that it gets from the
> > MDS. In most cases, unless there is contended access for an inode the
> > MDS does give Fbc caps to the client, so the unbuffered codepaths are
> > only infrequently traveled and are difficult to test.
> > 
> > At one time, the "-o sync" mount option would give you this behavior,
> > but that was removed in commit 7ab9b3807097 ("ceph: Don't use
> > ceph-sync-mode for synchronous-fs.").
> > 
> > Add a new mount option to tell the client to ignore Fbc caps when doing
> > I/O, and to use the synchronous codepaths exclusively, even on
> > non-O_DIRECT file descriptors. We already have an ioctl that forces this
> > behavior on a per-inode basis, so we can just always set the CEPH_F_SYNC
> > flag on such mounts.
> > 
> > Additionally, this patch also changes the client to not request Fbc when
> > doing direct I/O. We aren't using the cache with O_DIRECT so we don't
> > have any need for those caps.
> > 
> > Cc: majianpeng <majianpeng@gmail.com>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/file.c  | 24 +++++++++++++++---------
> >  fs/ceph/super.c | 11 +++++++++++
> >  fs/ceph/super.h |  1 +
> >  3 files changed, 27 insertions(+), 9 deletions(-)
> > 
> > I've been working with this patch in order to test the synchronous
> > codepaths for content encryption. I think though that this might make
> > sense to take into mainline, as it could be helpful for troubleshooting,
> > and ensuring that these codepaths are regularly tested.
> > 
> > Either way, I think the cap handling changes probably make sense on
> > their own. I can split that out if we don't want to take the mount
> > option in as well.
> > 
> > Thoughts?
> > 
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index c138e8126286..7de5db51c3d0 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >                                         int fmode, bool isdir)
> >  {
> >         struct ceph_inode_info *ci = ceph_inode(inode);
> > +       struct ceph_mount_options *opt =
> > +               ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> >         struct ceph_file_info *fi;
> > 
> >         dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
> > @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >                 if (!fi)
> >                         return -ENOMEM;
> > 
> > +               if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> > +                       fi->flags |= CEPH_F_SYNC;
> > +
> >                 file->private_data = fi;
> >         }
> > 
> > @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >         struct ceph_inode_info *ci = ceph_inode(inode);
> >         bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
> >         ssize_t ret;
> > -       int want, got = 0;
> > +       int want = 0, got = 0;
> >         int retry_op = 0, read = 0;
> > 
> >  again:
> > @@ -1551,13 +1556,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >         else
> >                 ceph_start_io_read(inode);
> > 
> > +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> > +               want |= CEPH_CAP_FILE_CACHE;
> >         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > -               want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
> > -       else
> > -               want = CEPH_CAP_FILE_CACHE;
> > +               want |= CEPH_CAP_FILE_LAZYIO;
> > +
> >         ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
> >         if (ret < 0) {
> > -               if (iocb->ki_flags & IOCB_DIRECT)
> > +               if (direct_lock)
> >                         ceph_end_io_direct(inode);
> >                 else
> >                         ceph_end_io_read(inode);
> > @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >         struct ceph_osd_client *osdc = &fsc->client->osdc;
> >         struct ceph_cap_flush *prealloc_cf;
> >         ssize_t count, written = 0;
> > -       int err, want, got;
> > +       int err, want = 0, got;
> >         bool direct_lock = false;
> >         u32 map_flags;
> >         u64 pool_flags;
> > @@ -1766,10 +1772,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > 
> >         dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
> >              inode, ceph_vinop(inode), pos, count, i_size_read(inode));
> > +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> > +               want |= CEPH_CAP_FILE_BUFFER;
> >         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > -               want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
> > -       else
> > -               want = CEPH_CAP_FILE_BUFFER;
> > +               want |= CEPH_CAP_FILE_LAZYIO;
> >         got = 0;
> >         err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
> >         if (err < 0)
> > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > index 8d6daea351f6..d7e604a56fd9 100644
> > --- a/fs/ceph/super.c
> > +++ b/fs/ceph/super.c
> > @@ -160,6 +160,7 @@ enum {
> >         Opt_quotadf,
> >         Opt_copyfrom,
> >         Opt_wsync,
> > +       Opt_pagecache,
> >  };
> > 
> >  enum ceph_recover_session_mode {
> > @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
> >         fsparam_string  ("mon_addr",                    Opt_mon_addr),
> >         fsparam_u32     ("wsize",                       Opt_wsize),
> >         fsparam_flag_no ("wsync",                       Opt_wsync),
> > +       fsparam_flag_no ("pagecache",                   Opt_pagecache),
> >         {}
> >  };
> > 
> > @@ -567,6 +569,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
> >                 else
> >                         fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
> >                 break;
> > +       case Opt_pagecache:
> > +               if (result.negated)
> > +                       fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
> > +               else
> > +                       fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
> > +               break;
> >         default:
> >                 BUG();
> >         }
> > @@ -702,6 +710,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
> >         if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
> >                 seq_puts(m, ",wsync");
> > 
> > +       if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> > +               seq_puts(m, ",nopagecache");
> > +
> >         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
> >                 seq_printf(m, ",wsize=%u", fsopt->wsize);
> >         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index f9b1bbf26c1b..5c6d9384b7be 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -46,6 +46,7 @@
> >  #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
> >  #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
> >  #define CEPH_MOUNT_OPT_ASYNC_DIROPS    (1<<15) /* allow async directory ops */
> > +#define CEPH_MOUNT_OPT_NOPAGECACHE     (1<<16) /* bypass pagecache altogether */
> > 
> >  #define CEPH_MOUNT_OPT_DEFAULT                 \
> >         (CEPH_MOUNT_OPT_DCACHE |                \
> > --
> > 2.33.1
> > 
>
Gregory Farnum Jan. 6, 2022, 2:13 p.m. UTC | #3
On Thu, Jan 6, 2022 at 5:54 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> No, I've only used this for local testing so far. Perhaps you can
> handhold me through adding such a patch to teuthology for it?

Well, we mount with the "kclient" task:
https://github.com/ceph/ceph/blob/master/qa/tasks/kclient.py

And it looks like that already supports passing arbitrary mount
options: https://github.com/ceph/ceph/blob/master/qa/tasks/kclient.py#L51

So I think that wherever we use the kclient, we look at whether that
subsuite/test is appropriate to test without the page cache. If it is,
split up the kclient yaml fragment to specify both with and without
the new mount option.

Right now it looks like the tree is organized so we just include
https://github.com/ceph/ceph/tree/master/qa/cephfs/mount anywhere we
use CephFS mounts, and that will generate tests against both ceph-fuse
and the kclient. So we could update that by replacing the single
"kclient/mount.yaml" file with a folder "kclient_mount" that contains
the existing mount.yaml, and a no_page_cache_mount.yaml that looks
like

tasks:
- kclient:
  - mntopts: ["nopagecache"]

>
> Note that this _really_ slows down I/O, especially if you're doing a lot
> of small reads or writes, so it may not be appropriate for throughput-
> heavy testing.
>
> On Thu, 2022-01-06 at 05:41 -0800, Gregory Farnum wrote:
> > This seems fine to me. Do you have a teuthology fragment to exercise this?
> >
> > Acked-by: Greg Farnum <gfarnum@redhat.com>
> >
> > On Wed, Jan 5, 2022 at 5:27 AM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > CephFS is a bit unlike most other filesystems in that it only
> > > conditionally does buffered I/O based on the caps that it gets from the
> > > MDS. In most cases, unless there is contended access for an inode the
> > > MDS does give Fbc caps to the client, so the unbuffered codepaths are
> > > only infrequently traveled and are difficult to test.
> > >
> > > At one time, the "-o sync" mount option would give you this behavior,
> > > but that was removed in commit 7ab9b3807097 ("ceph: Don't use
> > > ceph-sync-mode for synchronous-fs.").
> > >
> > > Add a new mount option to tell the client to ignore Fbc caps when doing
> > > I/O, and to use the synchronous codepaths exclusively, even on
> > > non-O_DIRECT file descriptors. We already have an ioctl that forces this
> > > behavior on a per-inode basis, so we can just always set the CEPH_F_SYNC
> > > flag on such mounts.
> > >
> > > Additionally, this patch also changes the client to not request Fbc when
> > > doing direct I/O. We aren't using the cache with O_DIRECT so we don't
> > > have any need for those caps.
> > >
> > > Cc: majianpeng <majianpeng@gmail.com>
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/file.c  | 24 +++++++++++++++---------
> > >  fs/ceph/super.c | 11 +++++++++++
> > >  fs/ceph/super.h |  1 +
> > >  3 files changed, 27 insertions(+), 9 deletions(-)
> > >
> > > I've been working with this patch in order to test the synchronous
> > > codepaths for content encryption. I think though that this might make
> > > sense to take into mainline, as it could be helpful for troubleshooting,
> > > and ensuring that these codepaths are regularly tested.
> > >
> > > Either way, I think the cap handling changes probably make sense on
> > > their own. I can split that out if we don't want to take the mount
> > > option in as well.
> > >
> > > Thoughts?
> > >
> > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > index c138e8126286..7de5db51c3d0 100644
> > > --- a/fs/ceph/file.c
> > > +++ b/fs/ceph/file.c
> > > @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > >                                         int fmode, bool isdir)
> > >  {
> > >         struct ceph_inode_info *ci = ceph_inode(inode);
> > > +       struct ceph_mount_options *opt =
> > > +               ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> > >         struct ceph_file_info *fi;
> > >
> > >         dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
> > > @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > >                 if (!fi)
> > >                         return -ENOMEM;
> > >
> > > +               if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> > > +                       fi->flags |= CEPH_F_SYNC;
> > > +
> > >                 file->private_data = fi;
> > >         }
> > >
> > > @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >         struct ceph_inode_info *ci = ceph_inode(inode);
> > >         bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
> > >         ssize_t ret;
> > > -       int want, got = 0;
> > > +       int want = 0, got = 0;
> > >         int retry_op = 0, read = 0;
> > >
> > >  again:
> > > @@ -1551,13 +1556,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >         else
> > >                 ceph_start_io_read(inode);
> > >
> > > +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> > > +               want |= CEPH_CAP_FILE_CACHE;
> > >         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > > -               want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
> > > -       else
> > > -               want = CEPH_CAP_FILE_CACHE;
> > > +               want |= CEPH_CAP_FILE_LAZYIO;
> > > +
> > >         ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
> > >         if (ret < 0) {
> > > -               if (iocb->ki_flags & IOCB_DIRECT)
> > > +               if (direct_lock)
> > >                         ceph_end_io_direct(inode);
> > >                 else
> > >                         ceph_end_io_read(inode);
> > > @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > >         struct ceph_osd_client *osdc = &fsc->client->osdc;
> > >         struct ceph_cap_flush *prealloc_cf;
> > >         ssize_t count, written = 0;
> > > -       int err, want, got;
> > > +       int err, want = 0, got;
> > >         bool direct_lock = false;
> > >         u32 map_flags;
> > >         u64 pool_flags;
> > > @@ -1766,10 +1772,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > >
> > >         dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
> > >              inode, ceph_vinop(inode), pos, count, i_size_read(inode));
> > > +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> > > +               want |= CEPH_CAP_FILE_BUFFER;
> > >         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > > -               want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
> > > -       else
> > > -               want = CEPH_CAP_FILE_BUFFER;
> > > +               want |= CEPH_CAP_FILE_LAZYIO;
> > >         got = 0;
> > >         err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
> > >         if (err < 0)
> > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > index 8d6daea351f6..d7e604a56fd9 100644
> > > --- a/fs/ceph/super.c
> > > +++ b/fs/ceph/super.c
> > > @@ -160,6 +160,7 @@ enum {
> > >         Opt_quotadf,
> > >         Opt_copyfrom,
> > >         Opt_wsync,
> > > +       Opt_pagecache,
> > >  };
> > >
> > >  enum ceph_recover_session_mode {
> > > @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
> > >         fsparam_string  ("mon_addr",                    Opt_mon_addr),
> > >         fsparam_u32     ("wsize",                       Opt_wsize),
> > >         fsparam_flag_no ("wsync",                       Opt_wsync),
> > > +       fsparam_flag_no ("pagecache",                   Opt_pagecache),
> > >         {}
> > >  };
> > >
> > > @@ -567,6 +569,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
> > >                 else
> > >                         fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
> > >                 break;
> > > +       case Opt_pagecache:
> > > +               if (result.negated)
> > > +                       fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
> > > +               else
> > > +                       fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
> > > +               break;
> > >         default:
> > >                 BUG();
> > >         }
> > > @@ -702,6 +710,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
> > >         if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
> > >                 seq_puts(m, ",wsync");
> > >
> > > +       if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> > > +               seq_puts(m, ",nopagecache");
> > > +
> > >         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
> > >                 seq_printf(m, ",wsize=%u", fsopt->wsize);
> > >         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index f9b1bbf26c1b..5c6d9384b7be 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -46,6 +46,7 @@
> > >  #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
> > >  #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
> > >  #define CEPH_MOUNT_OPT_ASYNC_DIROPS    (1<<15) /* allow async directory ops */
> > > +#define CEPH_MOUNT_OPT_NOPAGECACHE     (1<<16) /* bypass pagecache altogether */
> > >
> > >  #define CEPH_MOUNT_OPT_DEFAULT                 \
> > >         (CEPH_MOUNT_OPT_DCACHE |                \
> > > --
> > > 2.33.1
> > >
> >
>
> --
> Jeff Layton <jlayton@kernel.org>
>
Jeff Layton Jan. 6, 2022, 2:44 p.m. UTC | #4
On Thu, 2022-01-06 at 06:13 -0800, Gregory Farnum wrote:
> On Thu, Jan 6, 2022 at 5:54 AM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > No, I've only used this for local testing so far. Perhaps you can
> > handhold me through adding such a patch to teuthology for it?
> 
> Well, we mount with the "kclient" task:
> https://github.com/ceph/ceph/blob/master/qa/tasks/kclient.py
> 
> And it looks like that already supports passing arbitrary mount
> options: https://github.com/ceph/ceph/blob/master/qa/tasks/kclient.py#L51
> 
> So I think that wherever we use the kclient, we look at if that
> subsuite/test is appropriate to test without the page cache. If it is,
> split up the kclient yaml fragment to specify both with and without
> the new mount option.
> 
> Right now it looks like the tree is organized so we just include
> https://github.com/ceph/ceph/tree/master/qa/cephfs/mount anywhere we
> use CephFS mounts, and that will generate tests against both ceph-fuse
> and the kclient. So we could update that by replacing the single
> "kclient/mount.yaml" file with a folder "kclient_mount" that contains
> the existing mount.yaml, and a no_page_cache_mount.yaml that looks
> like
> 
> tasks:
> - kclient:
>   - mntopts: ["nopagecache"]
> 

Do I also need one of the magic %, +, or $ files in this directory?

Also, what will be the effect of doing this? Will the new mount option be
used randomly, or will I need to specify a test run that includes this
file?


> > 
> > Note that this _really_ slows down I/O, especially if you're doing a lot
> > of small reads or writes, so it may not be appropriate for throughput-
> > heavy testing.
> > 
> > On Thu, 2022-01-06 at 05:41 -0800, Gregory Farnum wrote:
> > > This seems fine to me. Do you have a teuthology fragment to exercise this?
> > > 
> > > Acked-by: Greg Farnum <gfarnum@redhat.com>
> > > 
> > > On Wed, Jan 5, 2022 at 5:27 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > 
> > > > CephFS is a bit unlike most other filesystems in that it only
> > > > conditionally does buffered I/O based on the caps that it gets from the
> > > > MDS. In most cases, unless there is contended access for an inode the
> > > > MDS does give Fbc caps to the client, so the unbuffered codepaths are
> > > > only infrequently traveled and are difficult to test.
> > > > 
> > > > At one time, the "-o sync" mount option would give you this behavior,
> > > > but that was removed in commit 7ab9b3807097 ("ceph: Don't use
> > > > ceph-sync-mode for synchronous-fs.").
> > > > 
> > > > Add a new mount option to tell the client to ignore Fbc caps when doing
> > > > I/O, and to use the synchronous codepaths exclusively, even on
> > > > non-O_DIRECT file descriptors. We already have an ioctl that forces this
> > > > behavior on a per-inode basis, so we can just always set the CEPH_F_SYNC
> > > > flag on such mounts.
> > > > 
> > > > Additionally, this patch also changes the client to not request Fbc when
> > > > doing direct I/O. We aren't using the cache with O_DIRECT so we don't
> > > > have any need for those caps.
> > > > 
> > > > Cc: majianpeng <majianpeng@gmail.com>
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/ceph/file.c  | 24 +++++++++++++++---------
> > > >  fs/ceph/super.c | 11 +++++++++++
> > > >  fs/ceph/super.h |  1 +
> > > >  3 files changed, 27 insertions(+), 9 deletions(-)
> > > > 
> > > > I've been working with this patch in order to test the synchronous
> > > > codepaths for content encryption. I think though that this might make
> > > > sense to take into mainline, as it could be helpful for troubleshooting,
> > > > and ensuring that these codepaths are regularly tested.
> > > > 
> > > > Either way, I think the cap handling changes probably make sense on
> > > > their own. I can split that out if we don't want to take the mount
> > > > option in as well.
> > > > 
> > > > Thoughts?
> > > > 
> > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > index c138e8126286..7de5db51c3d0 100644
> > > > --- a/fs/ceph/file.c
> > > > +++ b/fs/ceph/file.c
> > > > @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > > >                                         int fmode, bool isdir)
> > > >  {
> > > >         struct ceph_inode_info *ci = ceph_inode(inode);
> > > > +       struct ceph_mount_options *opt =
> > > > +               ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> > > >         struct ceph_file_info *fi;
> > > > 
> > > >         dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
> > > > @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > > >                 if (!fi)
> > > >                         return -ENOMEM;
> > > > 
> > > > +               if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> > > > +                       fi->flags |= CEPH_F_SYNC;
> > > > +
> > > >                 file->private_data = fi;
> > > >         }
> > > > 
> > > > @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > >         struct ceph_inode_info *ci = ceph_inode(inode);
> > > >         bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
> > > >         ssize_t ret;
> > > > -       int want, got = 0;
> > > > +       int want = 0, got = 0;
> > > >         int retry_op = 0, read = 0;
> > > > 
> > > >  again:
> > > > @@ -1551,13 +1556,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > >         else
> > > >                 ceph_start_io_read(inode);
> > > > 
> > > > +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> > > > +               want |= CEPH_CAP_FILE_CACHE;
> > > >         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > > > -               want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
> > > > -       else
> > > > -               want = CEPH_CAP_FILE_CACHE;
> > > > +               want |= CEPH_CAP_FILE_LAZYIO;
> > > > +
> > > >         ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
> > > >         if (ret < 0) {
> > > > -               if (iocb->ki_flags & IOCB_DIRECT)
> > > > +               if (direct_lock)
> > > >                         ceph_end_io_direct(inode);
> > > >                 else
> > > >                         ceph_end_io_read(inode);
> > > > @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > > >         struct ceph_osd_client *osdc = &fsc->client->osdc;
> > > >         struct ceph_cap_flush *prealloc_cf;
> > > >         ssize_t count, written = 0;
> > > > -       int err, want, got;
> > > > +       int err, want = 0, got;
> > > >         bool direct_lock = false;
> > > >         u32 map_flags;
> > > >         u64 pool_flags;
> > > > @@ -1766,10 +1772,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > > > 
> > > >         dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
> > > >              inode, ceph_vinop(inode), pos, count, i_size_read(inode));
> > > > +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> > > > +               want |= CEPH_CAP_FILE_BUFFER;
> > > >         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > > > -               want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
> > > > -       else
> > > > -               want = CEPH_CAP_FILE_BUFFER;
> > > > +               want |= CEPH_CAP_FILE_LAZYIO;
> > > >         got = 0;
> > > >         err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
> > > >         if (err < 0)
> > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > index 8d6daea351f6..d7e604a56fd9 100644
> > > > --- a/fs/ceph/super.c
> > > > +++ b/fs/ceph/super.c
> > > > @@ -160,6 +160,7 @@ enum {
> > > >         Opt_quotadf,
> > > >         Opt_copyfrom,
> > > >         Opt_wsync,
> > > > +       Opt_pagecache,
> > > >  };
> > > > 
> > > >  enum ceph_recover_session_mode {
> > > > @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
> > > >         fsparam_string  ("mon_addr",                    Opt_mon_addr),
> > > >         fsparam_u32     ("wsize",                       Opt_wsize),
> > > >         fsparam_flag_no ("wsync",                       Opt_wsync),
> > > > +       fsparam_flag_no ("pagecache",                   Opt_pagecache),
> > > >         {}
> > > >  };
> > > > 
> > > > @@ -567,6 +569,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
> > > >                 else
> > > >                         fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
> > > >                 break;
> > > > +       case Opt_pagecache:
> > > > +               if (result.negated)
> > > > +                       fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
> > > > +               else
> > > > +                       fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
> > > > +               break;
> > > >         default:
> > > >                 BUG();
> > > >         }
> > > > @@ -702,6 +710,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
> > > >         if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
> > > >                 seq_puts(m, ",wsync");
> > > > 
> > > > +       if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> > > > +               seq_puts(m, ",nopagecache");
> > > > +
> > > >         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
> > > >                 seq_printf(m, ",wsize=%u", fsopt->wsize);
> > > >         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
> > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > index f9b1bbf26c1b..5c6d9384b7be 100644
> > > > --- a/fs/ceph/super.h
> > > > +++ b/fs/ceph/super.h
> > > > @@ -46,6 +46,7 @@
> > > >  #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
> > > >  #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
> > > >  #define CEPH_MOUNT_OPT_ASYNC_DIROPS    (1<<15) /* allow async directory ops */
> > > > +#define CEPH_MOUNT_OPT_NOPAGECACHE     (1<<16) /* bypass pagecache altogether */
> > > > 
> > > >  #define CEPH_MOUNT_OPT_DEFAULT                 \
> > > >         (CEPH_MOUNT_OPT_DCACHE |                \
> > > > --
> > > > 2.33.1
> > > > 
> > > 
> > 
> > --
> > Jeff Layton <jlayton@kernel.org>
> > 
>
Venky Shankar Jan. 7, 2022, 9:12 a.m. UTC | #5
On Wed, Jan 5, 2022 at 6:57 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> CephFS is a bit unlike most other filesystems in that it only
> conditionally does buffered I/O based on the caps that it gets from the
> MDS. In most cases, unless there is contended access for an inode, the
> MDS does give Fbc caps to the client, so the unbuffered codepaths are
> only infrequently traveled and are difficult to test.
>
> At one time, the "-o sync" mount option would give you this behavior,
> but that was removed in commit 7ab9b3807097 ("ceph: Don't use
> ceph-sync-mode for synchronous-fs.").
>
> Add a new mount option to tell the client to ignore Fbc caps when doing
> I/O, and to use the synchronous codepaths exclusively, even on
> non-O_DIRECT file descriptors. We already have an ioctl that forces this
> behavior on a per-inode basis, so we can just always set the CEPH_F_SYNC
> flag on such mounts.
>
> Additionally, this patch changes the client to not request Fbc when
> doing direct I/O. We aren't using the cache with O_DIRECT so we don't
> have any need for those caps.

This looks good.

Reviewed-by: Venky Shankar <vshankar@redhat.com>

>
> Cc: majianpeng <majianpeng@gmail.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/file.c  | 24 +++++++++++++++---------
>  fs/ceph/super.c | 11 +++++++++++
>  fs/ceph/super.h |  1 +
>  3 files changed, 27 insertions(+), 9 deletions(-)
>
> I've been working with this patch in order to test the synchronous
> codepaths for content encryption. I think, though, that this might make
> sense to take into mainline, as it could be helpful for troubleshooting,
> and ensuring that these codepaths are regularly tested.
>
> Either way, I think the cap handling changes probably make sense on
> their own. I can split that out if we don't want to take the mount
> option in as well.
>
> Thoughts?
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index c138e8126286..7de5db51c3d0 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>                                         int fmode, bool isdir)
>  {
>         struct ceph_inode_info *ci = ceph_inode(inode);
> +       struct ceph_mount_options *opt =
> +               ceph_inode_to_client(&ci->vfs_inode)->mount_options;
>         struct ceph_file_info *fi;
>
>         dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
> @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>                 if (!fi)
>                         return -ENOMEM;
>
> +               if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> +                       fi->flags |= CEPH_F_SYNC;
> +
>                 file->private_data = fi;
>         }
>
> @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>         struct ceph_inode_info *ci = ceph_inode(inode);
>         bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
>         ssize_t ret;
> -       int want, got = 0;
> +       int want = 0, got = 0;
>         int retry_op = 0, read = 0;
>
>  again:
> @@ -1551,13 +1556,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
>         else
>                 ceph_start_io_read(inode);
>
> +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> +               want |= CEPH_CAP_FILE_CACHE;
>         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> -               want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
> -       else
> -               want = CEPH_CAP_FILE_CACHE;
> +               want |= CEPH_CAP_FILE_LAZYIO;
> +
>         ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
>         if (ret < 0) {
> -               if (iocb->ki_flags & IOCB_DIRECT)
> +               if (direct_lock)
>                         ceph_end_io_direct(inode);
>                 else
>                         ceph_end_io_read(inode);
> @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         struct ceph_osd_client *osdc = &fsc->client->osdc;
>         struct ceph_cap_flush *prealloc_cf;
>         ssize_t count, written = 0;
> -       int err, want, got;
> +       int err, want = 0, got;
>         bool direct_lock = false;
>         u32 map_flags;
>         u64 pool_flags;
> @@ -1766,10 +1772,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>
>         dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
>              inode, ceph_vinop(inode), pos, count, i_size_read(inode));
> +       if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
> +               want |= CEPH_CAP_FILE_BUFFER;
>         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> -               want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
> -       else
> -               want = CEPH_CAP_FILE_BUFFER;
> +               want |= CEPH_CAP_FILE_LAZYIO;
>         got = 0;
>         err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
>         if (err < 0)
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 8d6daea351f6..d7e604a56fd9 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -160,6 +160,7 @@ enum {
>         Opt_quotadf,
>         Opt_copyfrom,
>         Opt_wsync,
> +       Opt_pagecache,
>  };
>
>  enum ceph_recover_session_mode {
> @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
>         fsparam_string  ("mon_addr",                    Opt_mon_addr),
>         fsparam_u32     ("wsize",                       Opt_wsize),
>         fsparam_flag_no ("wsync",                       Opt_wsync),
> +       fsparam_flag_no ("pagecache",                   Opt_pagecache),
>         {}
>  };
>
> @@ -567,6 +569,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
>                 else
>                         fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
>                 break;
> +       case Opt_pagecache:
> +               if (result.negated)
> +                       fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
> +               else
> +                       fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
> +               break;
>         default:
>                 BUG();
>         }
> @@ -702,6 +710,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
>         if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
>                 seq_puts(m, ",wsync");
>
> +       if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
> +               seq_puts(m, ",nopagecache");
> +
>         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
>                 seq_printf(m, ",wsize=%u", fsopt->wsize);
>         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index f9b1bbf26c1b..5c6d9384b7be 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -46,6 +46,7 @@
>  #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
>  #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
>  #define CEPH_MOUNT_OPT_ASYNC_DIROPS    (1<<15) /* allow async directory ops */
> +#define CEPH_MOUNT_OPT_NOPAGECACHE     (1<<16) /* bypass pagecache altogether */
>
>  #define CEPH_MOUNT_OPT_DEFAULT                 \
>         (CEPH_MOUNT_OPT_DCACHE |                \
> --
> 2.33.1
>
diff mbox series

Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c138e8126286..7de5db51c3d0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -204,6 +204,8 @@  static int ceph_init_file_info(struct inode *inode, struct file *file,
 					int fmode, bool isdir)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mount_options *opt =
+		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
 	struct ceph_file_info *fi;
 
 	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
@@ -225,6 +227,9 @@  static int ceph_init_file_info(struct inode *inode, struct file *file,
 		if (!fi)
 			return -ENOMEM;
 
+		if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+			fi->flags |= CEPH_F_SYNC;
+
 		file->private_data = fi;
 	}
 
@@ -1536,7 +1541,7 @@  static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
 	ssize_t ret;
-	int want, got = 0;
+	int want = 0, got = 0;
 	int retry_op = 0, read = 0;
 
 again:
@@ -1551,13 +1556,14 @@  static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	else
 		ceph_start_io_read(inode);
 
+	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+		want |= CEPH_CAP_FILE_CACHE;
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
-		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
-	else
-		want = CEPH_CAP_FILE_CACHE;
+		want |= CEPH_CAP_FILE_LAZYIO;
+
 	ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
 	if (ret < 0) {
-		if (iocb->ki_flags & IOCB_DIRECT)
+		if (direct_lock)
 			ceph_end_io_direct(inode);
 		else
 			ceph_end_io_read(inode);
@@ -1691,7 +1697,7 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct ceph_osd_client *osdc = &fsc->client->osdc;
 	struct ceph_cap_flush *prealloc_cf;
 	ssize_t count, written = 0;
-	int err, want, got;
+	int err, want = 0, got;
 	bool direct_lock = false;
 	u32 map_flags;
 	u64 pool_flags;
@@ -1766,10 +1772,10 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
 	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+		want |= CEPH_CAP_FILE_BUFFER;
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
-		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
-	else
-		want = CEPH_CAP_FILE_BUFFER;
+		want |= CEPH_CAP_FILE_LAZYIO;
 	got = 0;
 	err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
 	if (err < 0)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8d6daea351f6..d7e604a56fd9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -160,6 +160,7 @@  enum {
 	Opt_quotadf,
 	Opt_copyfrom,
 	Opt_wsync,
+	Opt_pagecache,
 };
 
 enum ceph_recover_session_mode {
@@ -201,6 +202,7 @@  static const struct fs_parameter_spec ceph_mount_parameters[] = {
 	fsparam_string	("mon_addr",			Opt_mon_addr),
 	fsparam_u32	("wsize",			Opt_wsize),
 	fsparam_flag_no	("wsync",			Opt_wsync),
+	fsparam_flag_no	("pagecache",			Opt_pagecache),
 	{}
 };
 
@@ -567,6 +569,12 @@  static int ceph_parse_mount_param(struct fs_context *fc,
 		else
 			fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
 		break;
+	case Opt_pagecache:
+		if (result.negated)
+			fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
+		else
+			fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
+		break;
 	default:
 		BUG();
 	}
@@ -702,6 +710,9 @@  static int ceph_show_options(struct seq_file *m, struct dentry *root)
 	if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
 		seq_puts(m, ",wsync");
 
+	if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+		seq_puts(m, ",nopagecache");
+
 	if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
 		seq_printf(m, ",wsize=%u", fsopt->wsize);
 	if (fsopt->rsize != CEPH_MAX_READ_SIZE)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f9b1bbf26c1b..5c6d9384b7be 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -46,6 +46,7 @@ 
 #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
 #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
 #define CEPH_MOUNT_OPT_ASYNC_DIROPS    (1<<15) /* allow async directory ops */
+#define CEPH_MOUNT_OPT_NOPAGECACHE     (1<<16) /* bypass pagecache altogether */
 
 #define CEPH_MOUNT_OPT_DEFAULT			\
 	(CEPH_MOUNT_OPT_DCACHE |		\