diff mbox series

[v2] virtiofs: propagate sync() to file server

Message ID 20210426151011.840459-1-groug@kaod.org (mailing list archive)
State New, archived
Headers show
Series [v2] virtiofs: propagate sync() to file server | expand

Commit Message

Greg Kurz April 26, 2021, 3:10 p.m. UTC
Even if POSIX doesn't mandate it, Linux users legitimately expect
sync() to flush all data and metadata to physical storage when it
is located on the same system. This isn't happening with virtiofs
though: sync() inside the guest returns right away even though
data still needs to be flushed from the host page cache.

This is easily demonstrated by doing the following in the guest:

$ dd if=/dev/zero of=/mnt/foo bs=1M count=5K ; strace -T -e sync sync
5120+0 records in
5120+0 records out
5368709120 bytes (5.4 GB, 5.0 GiB) copied, 5.22224 s, 1.0 GB/s
sync()                                  = 0 <0.024068>
+++ exited with 0 +++

and start the following in the host when the 'dd' command completes
in the guest:

$ strace -T -e fsync /usr/bin/sync virtiofs/foo
fsync(3)                                = 0 <10.371640>
+++ exited with 0 +++

There is no good reason not to honor the expected behavior of
sync(): the current behavior gives the unrealistic impression that
virtiofs is super fast and that data has safely landed on HW, which
obviously isn't the case.

Implement a ->sync_fs() superblock operation that sends a new
FUSE_SYNCFS request type for this purpose. Provision a 64-bit
flags field for possible future extensions. Since the file
server cannot handle the wait == 0 case, we skip it to avoid a
gratuitous roundtrip.

Like with FUSE_FSYNC and FUSE_FSYNCDIR, lack of support for
FUSE_SYNCFS in the file server is treated as permanent success.
This ensures compatibility with older file servers: the client
will get the current behavior of sync() not being propagated to
the file server.

Note that such an operation allows the file server to DoS sync().
Since a typical FUSE file server is an untrusted piece of software
running in userspace, this is disabled by default.  Only enable it
with virtiofs for now since virtiofsd is supposedly trusted by the
guest kernel.

Reported-by: Robert Krawitz <rlk@redhat.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
---

v2: - clarify compatibility with older servers in changelog (Vivek)
    - ignore the wait == 0 case (Miklos)
    - 64-bit aligned argument structure (Vivek, Miklos)

 fs/fuse/fuse_i.h          |  3 +++
 fs/fuse/inode.c           | 35 +++++++++++++++++++++++++++++++++++
 fs/fuse/virtio_fs.c       |  1 +
 include/uapi/linux/fuse.h | 10 +++++++++-
 4 files changed, 48 insertions(+), 1 deletion(-)

Comments

Vivek Goyal April 27, 2021, 5:12 p.m. UTC | #1
On Mon, Apr 26, 2021 at 05:10:11PM +0200, Greg Kurz wrote:
> Even if POSIX doesn't mandate it, linux users legitimately expect
> sync() to flush all data and metadata to physical storage when it
> is located on the same system. This isn't happening with virtiofs
> though : sync() inside the guest returns right away even though
> data still needs to be flushed from the host page cache.
> 
> This is easily demonstrated by doing the following in the guest:
> 
> $ dd if=/dev/zero of=/mnt/foo bs=1M count=5K ; strace -T -e sync sync
> 5120+0 records in
> 5120+0 records out
> 5368709120 bytes (5.4 GB, 5.0 GiB) copied, 5.22224 s, 1.0 GB/s
> sync()                                  = 0 <0.024068>
> +++ exited with 0 +++
> 
> and start the following in the host when the 'dd' command completes
> in the guest:
> 
> $ strace -T -e fsync /usr/bin/sync virtiofs/foo
> fsync(3)                                = 0 <10.371640>
> +++ exited with 0 +++
> 
> There are no good reasons not to honor the expected behavior of
> sync() actually : it gives an unrealistic impression that virtiofs
> is super fast and that data has safely landed on HW, which isn't
> the case obviously.
> 
> Implement a ->sync_fs() superblock operation that sends a new
> FUSE_SYNC request type for this purpose. Provision a 64-bit
> flags field for possible future extensions. Since the file
> server cannot handle the wait == 0 case, we skip it to avoid a
> gratuitous roundtrip.
> 
> Like with FUSE_FSYNC and FUSE_FSYNCDIR, lack of support for
> FUSE_SYNC in the file server is treated as permanent success.
> This ensures compatibility with older file servers : the client
> will get the current behavior of sync() not being propagated to
> the file server.
> 
> Note that such an operation allows the file server to DoS sync().
> Since a typical FUSE file server is an untrusted piece of software
> running in userspace, this is disabled by default.  Only enable it
> with virtiofs for now since virtiofsd is supposedly trusted by the
> guest kernel.
> 
> Reported-by: Robert Krawitz <rlk@redhat.com>
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---
> 
> v2: - clarify compatibility with older servers in changelog (Vivek)
>     - ignore the wait == 0 case (Miklos)
>     - 64-bit aligned argument structure (Vivek, Miklos)
> 
>  fs/fuse/fuse_i.h          |  3 +++
>  fs/fuse/inode.c           | 35 +++++++++++++++++++++++++++++++++++
>  fs/fuse/virtio_fs.c       |  1 +
>  include/uapi/linux/fuse.h | 10 +++++++++-
>  4 files changed, 48 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 63d97a15ffde..68e9ae96cbd4 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -755,6 +755,9 @@ struct fuse_conn {
>  	/* Auto-mount submounts announced by the server */
>  	unsigned int auto_submounts:1;
>  
> +	/* Propagate syncfs() to server */
> +	unsigned int sync_fs:1;
> +
>  	/** The number of requests waiting for completion */
>  	atomic_t num_waiting;
>  
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index b0e18b470e91..ac184069b40f 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -506,6 +506,40 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
>  	return err;
>  }
>  
> +static int fuse_sync_fs(struct super_block *sb, int wait)
> +{
> +	struct fuse_mount *fm = get_fuse_mount_super(sb);
> +	struct fuse_conn *fc = fm->fc;
> +	struct fuse_syncfs_in inarg;
> +	FUSE_ARGS(args);
> +	int err;
> +
> +	/*
> +	 * Userspace cannot handle the wait == 0 case. Avoid a
> +	 * gratuitous roundtrip.
> +	 */
> +	if (!wait)
> +		return 0;
> +
> +	if (!fc->sync_fs)
> +		return 0;
> +
> +	memset(&inarg, 0, sizeof(inarg));
> +	args.in_numargs = 1;
> +	args.in_args[0].size = sizeof(inarg);
> +	args.in_args[0].value = &inarg;
> +	args.opcode = FUSE_SYNCFS;
> +	args.out_numargs = 0;
> +
> +	err = fuse_simple_request(fm, &args);
> +	if (err == -ENOSYS) {
> +		fc->sync_fs = 0;
> +		err = 0;
> +	}
> +
> +	return err;
> +}
> +
>  enum {
>  	OPT_SOURCE,
>  	OPT_SUBTYPE,
> @@ -909,6 +943,7 @@ static const struct super_operations fuse_super_operations = {
>  	.put_super	= fuse_put_super,
>  	.umount_begin	= fuse_umount_begin,
>  	.statfs		= fuse_statfs,
> +	.sync_fs	= fuse_sync_fs,
>  	.show_options	= fuse_show_options,
>  };
>  
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 4ee6f734ba83..a3c025308743 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -1441,6 +1441,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
>  	fc->release = fuse_free_conn;
>  	fc->delete_stale = true;
>  	fc->auto_submounts = true;
> +	fc->sync_fs = true;
>  
>  	fsc->s_fs_info = fm;
>  	sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 54442612c48b..1265ca17620c 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -179,6 +179,9 @@
>   *  7.33
>   *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
>   *  - add FUSE_OPEN_KILL_SUIDGID
> + *
> + *  7.34
> + *  - add FUSE_SYNCFS
>   */
>  
>  #ifndef _LINUX_FUSE_H
> @@ -214,7 +217,7 @@
>  #define FUSE_KERNEL_VERSION 7
>  
>  /** Minor version number of this interface */
> -#define FUSE_KERNEL_MINOR_VERSION 33
> +#define FUSE_KERNEL_MINOR_VERSION 34
>  
>  /** The node ID of the root inode */
>  #define FUSE_ROOT_ID 1
> @@ -499,6 +502,7 @@ enum fuse_opcode {
>  	FUSE_COPY_FILE_RANGE	= 47,
>  	FUSE_SETUPMAPPING	= 48,
>  	FUSE_REMOVEMAPPING	= 49,
> +	FUSE_SYNCFS		= 50,
>  
>  	/* CUSE specific operations */
>  	CUSE_INIT		= 4096,
> @@ -957,4 +961,8 @@ struct fuse_removemapping_one {
>  #define FUSE_REMOVEMAPPING_MAX_ENTRY   \
>  		(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
>  
> +struct fuse_syncfs_in {
> +	uint64_t flags;
> +};
> +

Hi Greg,

Would it be better if 32 bits were used for flags and the remaining
32 bits were padding that can be used in whatever manner?

struct fuse_syncfs_in {
	uint32_t flags;
	uint32_t padding;
};

This will increase the flexibility if we were to send more information
in future.

I already see a bunch of structures where flags are 32 bits and the rest
are padding bits: fuse_read_in, fuse_write_in, fuse_rename2_in, etc.

Thanks
Vivek

>  #endif /* _LINUX_FUSE_H */
> -- 
> 2.26.3
>
Greg Kurz April 27, 2021, 7:09 p.m. UTC | #2
On Tue, 27 Apr 2021 13:12:06 -0400
Vivek Goyal <vgoyal@redhat.com> wrote:

> On Mon, Apr 26, 2021 at 05:10:11PM +0200, Greg Kurz wrote:
> > Even if POSIX doesn't mandate it, linux users legitimately expect
> > sync() to flush all data and metadata to physical storage when it
> > is located on the same system. This isn't happening with virtiofs
> > though : sync() inside the guest returns right away even though
> > data still needs to be flushed from the host page cache.
> > 
> > This is easily demonstrated by doing the following in the guest:
> > 
> > $ dd if=/dev/zero of=/mnt/foo bs=1M count=5K ; strace -T -e sync sync
> > 5120+0 records in
> > 5120+0 records out
> > 5368709120 bytes (5.4 GB, 5.0 GiB) copied, 5.22224 s, 1.0 GB/s
> > sync()                                  = 0 <0.024068>
> > +++ exited with 0 +++
> > 
> > and start the following in the host when the 'dd' command completes
> > in the guest:
> > 
> > $ strace -T -e fsync /usr/bin/sync virtiofs/foo
> > fsync(3)                                = 0 <10.371640>
> > +++ exited with 0 +++
> > 
> > There are no good reasons not to honor the expected behavior of
> > sync() actually : it gives an unrealistic impression that virtiofs
> > is super fast and that data has safely landed on HW, which isn't
> > the case obviously.
> > 
> > Implement a ->sync_fs() superblock operation that sends a new
> > FUSE_SYNC request type for this purpose. Provision a 64-bit
> > flags field for possible future extensions. Since the file
> > server cannot handle the wait == 0 case, we skip it to avoid a
> > gratuitous roundtrip.
> > 
> > Like with FUSE_FSYNC and FUSE_FSYNCDIR, lack of support for
> > FUSE_SYNC in the file server is treated as permanent success.
> > This ensures compatibility with older file servers : the client
> > will get the current behavior of sync() not being propagated to
> > the file server.
> > 
> > Note that such an operation allows the file server to DoS sync().
> > Since a typical FUSE file server is an untrusted piece of software
> > running in userspace, this is disabled by default.  Only enable it
> > with virtiofs for now since virtiofsd is supposedly trusted by the
> > guest kernel.
> > 
> > Reported-by: Robert Krawitz <rlk@redhat.com>
> > Signed-off-by: Greg Kurz <groug@kaod.org>
> > ---
> > 
> > v2: - clarify compatibility with older servers in changelog (Vivek)
> >     - ignore the wait == 0 case (Miklos)
> >     - 64-bit aligned argument structure (Vivek, Miklos)
> > 
> >  fs/fuse/fuse_i.h          |  3 +++
> >  fs/fuse/inode.c           | 35 +++++++++++++++++++++++++++++++++++
> >  fs/fuse/virtio_fs.c       |  1 +
> >  include/uapi/linux/fuse.h | 10 +++++++++-
> >  4 files changed, 48 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 63d97a15ffde..68e9ae96cbd4 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -755,6 +755,9 @@ struct fuse_conn {
> >  	/* Auto-mount submounts announced by the server */
> >  	unsigned int auto_submounts:1;
> >  
> > +	/* Propagate syncfs() to server */
> > +	unsigned int sync_fs:1;
> > +
> >  	/** The number of requests waiting for completion */
> >  	atomic_t num_waiting;
> >  
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index b0e18b470e91..ac184069b40f 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -506,6 +506,40 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
> >  	return err;
> >  }
> >  
> > +static int fuse_sync_fs(struct super_block *sb, int wait)
> > +{
> > +	struct fuse_mount *fm = get_fuse_mount_super(sb);
> > +	struct fuse_conn *fc = fm->fc;
> > +	struct fuse_syncfs_in inarg;
> > +	FUSE_ARGS(args);
> > +	int err;
> > +
> > +	/*
> > +	 * Userspace cannot handle the wait == 0 case. Avoid a
> > +	 * gratuitous roundtrip.
> > +	 */
> > +	if (!wait)
> > +		return 0;
> > +
> > +	if (!fc->sync_fs)
> > +		return 0;
> > +
> > +	memset(&inarg, 0, sizeof(inarg));
> > +	args.in_numargs = 1;
> > +	args.in_args[0].size = sizeof(inarg);
> > +	args.in_args[0].value = &inarg;
> > +	args.opcode = FUSE_SYNCFS;
> > +	args.out_numargs = 0;
> > +
> > +	err = fuse_simple_request(fm, &args);
> > +	if (err == -ENOSYS) {
> > +		fc->sync_fs = 0;
> > +		err = 0;
> > +	}
> > +
> > +	return err;
> > +}
> > +
> >  enum {
> >  	OPT_SOURCE,
> >  	OPT_SUBTYPE,
> > @@ -909,6 +943,7 @@ static const struct super_operations fuse_super_operations = {
> >  	.put_super	= fuse_put_super,
> >  	.umount_begin	= fuse_umount_begin,
> >  	.statfs		= fuse_statfs,
> > +	.sync_fs	= fuse_sync_fs,
> >  	.show_options	= fuse_show_options,
> >  };
> >  
> > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> > index 4ee6f734ba83..a3c025308743 100644
> > --- a/fs/fuse/virtio_fs.c
> > +++ b/fs/fuse/virtio_fs.c
> > @@ -1441,6 +1441,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
> >  	fc->release = fuse_free_conn;
> >  	fc->delete_stale = true;
> >  	fc->auto_submounts = true;
> > +	fc->sync_fs = true;
> >  
> >  	fsc->s_fs_info = fm;
> >  	sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
> > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > index 54442612c48b..1265ca17620c 100644
> > --- a/include/uapi/linux/fuse.h
> > +++ b/include/uapi/linux/fuse.h
> > @@ -179,6 +179,9 @@
> >   *  7.33
> >   *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
> >   *  - add FUSE_OPEN_KILL_SUIDGID
> > + *
> > + *  7.34
> > + *  - add FUSE_SYNCFS
> >   */
> >  
> >  #ifndef _LINUX_FUSE_H
> > @@ -214,7 +217,7 @@
> >  #define FUSE_KERNEL_VERSION 7
> >  
> >  /** Minor version number of this interface */
> > -#define FUSE_KERNEL_MINOR_VERSION 33
> > +#define FUSE_KERNEL_MINOR_VERSION 34
> >  
> >  /** The node ID of the root inode */
> >  #define FUSE_ROOT_ID 1
> > @@ -499,6 +502,7 @@ enum fuse_opcode {
> >  	FUSE_COPY_FILE_RANGE	= 47,
> >  	FUSE_SETUPMAPPING	= 48,
> >  	FUSE_REMOVEMAPPING	= 49,
> > +	FUSE_SYNCFS		= 50,
> >  
> >  	/* CUSE specific operations */
> >  	CUSE_INIT		= 4096,
> > @@ -957,4 +961,8 @@ struct fuse_removemapping_one {
> >  #define FUSE_REMOVEMAPPING_MAX_ENTRY   \
> >  		(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
> >  
> > +struct fuse_syncfs_in {
> > +	uint64_t flags;
> > +};
> > +
> 
> Hi Greg,
> 
> Will it be better if 32bits are for flags and reset 32 are
> padding and can be used in whatever manner.
> 
> struct fuse_syncfs_in {
> 	uint32_t flags;
> 	uint32_t padding;
> };
> 
> This will increase the flexibility if we were to send more information
> in future.
> 
> I already see bunch of structures where flags are 32 bit and reset
> are padding bits. fuse_read_in, fuse_write_in, fuse_rename2_in etc.
> 
> Thanks
> Vivek
> 

Yes, it makes sense. I'll wait a few more days and roll out a v3.

Thanks !

--
Greg

> >  #endif /* _LINUX_FUSE_H */
> > -- 
> > 2.26.3
> > 
>
Vivek Goyal April 30, 2021, 12:17 p.m. UTC | #3
On Tue, Apr 27, 2021 at 09:09:21PM +0200, Greg Kurz wrote:
[..]
> > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > > index 54442612c48b..1265ca17620c 100644
> > > --- a/include/uapi/linux/fuse.h
> > > +++ b/include/uapi/linux/fuse.h
> > > @@ -179,6 +179,9 @@
> > >   *  7.33
> > >   *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
> > >   *  - add FUSE_OPEN_KILL_SUIDGID
> > > + *
> > > + *  7.34
> > > + *  - add FUSE_SYNCFS
> > >   */
> > >  
> > >  #ifndef _LINUX_FUSE_H
> > > @@ -214,7 +217,7 @@
> > >  #define FUSE_KERNEL_VERSION 7
> > >  
> > >  /** Minor version number of this interface */
> > > -#define FUSE_KERNEL_MINOR_VERSION 33
> > > +#define FUSE_KERNEL_MINOR_VERSION 34
> > >  
> > >  /** The node ID of the root inode */
> > >  #define FUSE_ROOT_ID 1
> > > @@ -499,6 +502,7 @@ enum fuse_opcode {
> > >  	FUSE_COPY_FILE_RANGE	= 47,
> > >  	FUSE_SETUPMAPPING	= 48,
> > >  	FUSE_REMOVEMAPPING	= 49,
> > > +	FUSE_SYNCFS		= 50,
> > >  
> > >  	/* CUSE specific operations */
> > >  	CUSE_INIT		= 4096,
> > > @@ -957,4 +961,8 @@ struct fuse_removemapping_one {
> > >  #define FUSE_REMOVEMAPPING_MAX_ENTRY   \
> > >  		(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
> > >  
> > > +struct fuse_syncfs_in {
> > > +	uint64_t flags;
> > > +};
> > > +
> > 
> > Hi Greg,
> > 
> > Will it be better if 32bits are for flags and reset 32 are
> > padding and can be used in whatever manner.
> > 
> > struct fuse_syncfs_in {
> > 	uint32_t flags;
> > 	uint32_t padding;
> > };
> > 
> > This will increase the flexibility if we were to send more information
> > in future.
> > 
> > I already see bunch of structures where flags are 32 bit and reset
> > are padding bits. fuse_read_in, fuse_write_in, fuse_rename2_in etc.
> > 
> > Thanks
> > Vivek
> > 
> 
> Yes, it makes sense. I'll wait a few more days and roll out a v3.

Thinking more about it: we are not using any of the fields of this
structure right now. So maybe all of it can be padding, and there is
no need to add "flags".

struct fuse_syncfs_in {
	uint64_t padding;
};

Essentially what you have already done  :-). Just rename flags to
padding/unused to make it clear it's unused for now.

Vivek
Greg Kurz April 30, 2021, 12:32 p.m. UTC | #4
On Fri, 30 Apr 2021 08:17:57 -0400
Vivek Goyal <vgoyal@redhat.com> wrote:

> On Tue, Apr 27, 2021 at 09:09:21PM +0200, Greg Kurz wrote:
> [..]
> > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > > > index 54442612c48b..1265ca17620c 100644
> > > > --- a/include/uapi/linux/fuse.h
> > > > +++ b/include/uapi/linux/fuse.h
> > > > @@ -179,6 +179,9 @@
> > > >   *  7.33
> > > >   *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
> > > >   *  - add FUSE_OPEN_KILL_SUIDGID
> > > > + *
> > > > + *  7.34
> > > > + *  - add FUSE_SYNCFS
> > > >   */
> > > >  
> > > >  #ifndef _LINUX_FUSE_H
> > > > @@ -214,7 +217,7 @@
> > > >  #define FUSE_KERNEL_VERSION 7
> > > >  
> > > >  /** Minor version number of this interface */
> > > > -#define FUSE_KERNEL_MINOR_VERSION 33
> > > > +#define FUSE_KERNEL_MINOR_VERSION 34
> > > >  
> > > >  /** The node ID of the root inode */
> > > >  #define FUSE_ROOT_ID 1
> > > > @@ -499,6 +502,7 @@ enum fuse_opcode {
> > > >  	FUSE_COPY_FILE_RANGE	= 47,
> > > >  	FUSE_SETUPMAPPING	= 48,
> > > >  	FUSE_REMOVEMAPPING	= 49,
> > > > +	FUSE_SYNCFS		= 50,
> > > >  
> > > >  	/* CUSE specific operations */
> > > >  	CUSE_INIT		= 4096,
> > > > @@ -957,4 +961,8 @@ struct fuse_removemapping_one {
> > > >  #define FUSE_REMOVEMAPPING_MAX_ENTRY   \
> > > >  		(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
> > > >  
> > > > +struct fuse_syncfs_in {
> > > > +	uint64_t flags;
> > > > +};
> > > > +
> > > 
> > > Hi Greg,
> > > 
> > > Will it be better if 32bits are for flags and reset 32 are
> > > padding and can be used in whatever manner.
> > > 
> > > struct fuse_syncfs_in {
> > > 	uint32_t flags;
> > > 	uint32_t padding;
> > > };
> > > 
> > > This will increase the flexibility if we were to send more information
> > > in future.
> > > 
> > > I already see bunch of structures where flags are 32 bit and reset
> > > are padding bits. fuse_read_in, fuse_write_in, fuse_rename2_in etc.
> > > 
> > > Thanks
> > > Vivek
> > > 
> > 
> > Yes, it makes sense. I'll wait a few more days and roll out a v3.
> 
> Thinking more about it. We are not using any of the fields of this
> structure right now. So may be all of it can be padding and no need
> to add "flags".
> 
> struct fuse_syncfs_in {
> 	uint64_t padding;
> };
> 
> Essentially what you have already done  :-). Just rename flags to
> padding/unused to make it clear its unused for now.
> 

Yeah, and this would allow us to get rid of the assert() on non-zero flags
on the virtiofsd side, which was looking a bit awkward :-)

> Vivek
>
diff mbox series

Patch

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 63d97a15ffde..68e9ae96cbd4 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -755,6 +755,9 @@  struct fuse_conn {
 	/* Auto-mount submounts announced by the server */
 	unsigned int auto_submounts:1;
 
+	/* Propagate syncfs() to server */
+	unsigned int sync_fs:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b0e18b470e91..ac184069b40f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -506,6 +506,40 @@  static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return err;
 }
 
+static int fuse_sync_fs(struct super_block *sb, int wait)
+{
+	struct fuse_mount *fm = get_fuse_mount_super(sb);
+	struct fuse_conn *fc = fm->fc;
+	struct fuse_syncfs_in inarg;
+	FUSE_ARGS(args);
+	int err;
+
+	/*
+	 * Userspace cannot handle the wait == 0 case. Avoid a
+	 * gratuitous roundtrip.
+	 */
+	if (!wait)
+		return 0;
+
+	if (!fc->sync_fs)
+		return 0;
+
+	memset(&inarg, 0, sizeof(inarg));
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.opcode = FUSE_SYNCFS;
+	args.out_numargs = 0;
+
+	err = fuse_simple_request(fm, &args);
+	if (err == -ENOSYS) {
+		fc->sync_fs = 0;
+		err = 0;
+	}
+
+	return err;
+}
+
 enum {
 	OPT_SOURCE,
 	OPT_SUBTYPE,
@@ -909,6 +943,7 @@  static const struct super_operations fuse_super_operations = {
 	.put_super	= fuse_put_super,
 	.umount_begin	= fuse_umount_begin,
 	.statfs		= fuse_statfs,
+	.sync_fs	= fuse_sync_fs,
 	.show_options	= fuse_show_options,
 };
 
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 4ee6f734ba83..a3c025308743 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1441,6 +1441,7 @@  static int virtio_fs_get_tree(struct fs_context *fsc)
 	fc->release = fuse_free_conn;
 	fc->delete_stale = true;
 	fc->auto_submounts = true;
+	fc->sync_fs = true;
 
 	fsc->s_fs_info = fm;
 	sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 54442612c48b..1265ca17620c 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -179,6 +179,9 @@ 
  *  7.33
  *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
  *  - add FUSE_OPEN_KILL_SUIDGID
+ *
+ *  7.34
+ *  - add FUSE_SYNCFS
  */
 
 #ifndef _LINUX_FUSE_H
@@ -214,7 +217,7 @@ 
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 33
+#define FUSE_KERNEL_MINOR_VERSION 34
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -499,6 +502,7 @@  enum fuse_opcode {
 	FUSE_COPY_FILE_RANGE	= 47,
 	FUSE_SETUPMAPPING	= 48,
 	FUSE_REMOVEMAPPING	= 49,
+	FUSE_SYNCFS		= 50,
 
 	/* CUSE specific operations */
 	CUSE_INIT		= 4096,
@@ -957,4 +961,8 @@  struct fuse_removemapping_one {
 #define FUSE_REMOVEMAPPING_MAX_ENTRY   \
 		(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
 
+struct fuse_syncfs_in {
+	uint64_t flags;
+};
+
 #endif /* _LINUX_FUSE_H */