diff mbox series

[RESEND,V12,3/8] fuse: Definitions and ioctl for passthrough

Message ID 20210125153057.3623715-4-balsini@android.com (mailing list archive)
State New, archived
Headers show
Series fuse: Add support for passthrough read/write | expand

Commit Message

Alessio Balsini Jan. 25, 2021, 3:30 p.m. UTC
Expose the FUSE_PASSTHROUGH interface to user space and declare all the
basic data structures and functions as the skeleton on top of which the
FUSE passthrough functionality will be built.

As part of this, introduce the new FUSE passthrough ioctl, which allows
the FUSE daemon to specify a direct connection between a FUSE file and a
lower file system file. Such ioctl requires user space to pass the file
descriptor of one of its opened files through the fuse_passthrough_out
data structure introduced in this patch. This structure includes extra
fields for possible future extensions.
Also, add the passthrough functions for the set-up and tear-down of the
data structures and locks that will be used both when fuse_conns and
fuse_files are created/deleted.

Signed-off-by: Alessio Balsini <balsini@android.com>
---
 fs/fuse/Makefile          |  1 +
 fs/fuse/dev.c             | 12 ++++++++++++
 fs/fuse/dir.c             |  2 ++
 fs/fuse/file.c            |  4 +++-
 fs/fuse/fuse_i.h          | 27 +++++++++++++++++++++++++++
 fs/fuse/inode.c           | 17 ++++++++++++++++-
 fs/fuse/passthrough.c     | 21 +++++++++++++++++++++
 include/uapi/linux/fuse.h | 11 ++++++++++-
 8 files changed, 92 insertions(+), 3 deletions(-)
 create mode 100644 fs/fuse/passthrough.c

Comments

Miklos Szeredi Feb. 17, 2021, 1:41 p.m. UTC | #1
On Mon, Jan 25, 2021 at 4:31 PM Alessio Balsini <balsini@android.com> wrote:
>
> Expose the FUSE_PASSTHROUGH interface to user space and declare all the
> basic data structures and functions as the skeleton on top of which the
> FUSE passthrough functionality will be built.
>
> As part of this, introduce the new FUSE passthrough ioctl, which allows
> the FUSE daemon to specify a direct connection between a FUSE file and a
> lower file system file. Such ioctl requires user space to pass the file
> descriptor of one of its opened files through the fuse_passthrough_out
> data structure introduced in this patch. This structure includes extra
> fields for possible future extensions.
> Also, add the passthrough functions for the set-up and tear-down of the
> data structures and locks that will be used both when fuse_conns and
> fuse_files are created/deleted.
>
> Signed-off-by: Alessio Balsini <balsini@android.com>

[...]

> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 54442612c48b..9d7685ce0acd 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -360,6 +360,7 @@ struct fuse_file_lock {
>  #define FUSE_MAP_ALIGNMENT     (1 << 26)
>  #define FUSE_SUBMOUNTS         (1 << 27)
>  #define FUSE_HANDLE_KILLPRIV_V2        (1 << 28)
> +#define FUSE_PASSTHROUGH       (1 << 29)

This header has a version and a changelog.  Please update those as well.

>
>  /**
>   * CUSE INIT request/reply flags
> @@ -625,7 +626,7 @@ struct fuse_create_in {
>  struct fuse_open_out {
>         uint64_t        fh;
>         uint32_t        open_flags;
> -       uint32_t        padding;
> +       uint32_t        passthrough_fh;

I think it would be cleaner to add a FOPEN_PASSTHROUGH flag to
explicitly request passthrough instead of just passing a non-null
value to passthrough_fh.

>  };
>
>  struct fuse_release_in {
> @@ -828,6 +829,13 @@ struct fuse_in_header {
>         uint32_t        padding;
>  };
>
> +struct fuse_passthrough_out {
> +       uint32_t        fd;
> +       /* For future implementation */
> +       uint32_t        len;
> +       void            *vec;
> +};

I don't see why we'd need these extensions.  The ioctl just needs to
establish an ID-to-open-file mapping that can be referenced in the
regular protocol, i.e. it just needs to be passed an open file
descriptor and return a unique ID.

Mapping the fuse file's data to the underlying file's data is a
different matter.  That can be an identity mapping established at open
time (this is what this series does) or it can be an arbitrary extent
mapping to one or more underlying open files, established at open time
or on demand.  All of these can be done in band using the fuse
protocol, no need to involve the ioctl mechanism.

So I think we can just get rid of "struct fuse_passthrough_out"
completely and use "uint32_t *" as the ioctl argument.

What I think would be useful is to have an explicit
FUSE_DEV_IOC_PASSTHROUGH_CLOSE ioctl, that would need to be called
once the fuse server no longer needs this ID.   If this turns out to
be a performance problem, we could still add the auto-close behavior
with an explicit FOPEN_PASSTHROUGH_AUTOCLOSE flag later.

Thanks,
Miklos
Peng Tao Feb. 19, 2021, 7:05 a.m. UTC | #2
On Wed, Feb 17, 2021 at 9:41 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Mon, Jan 25, 2021 at 4:31 PM Alessio Balsini <balsini@android.com> wrote:
> >
> > Expose the FUSE_PASSTHROUGH interface to user space and declare all the
> > basic data structures and functions as the skeleton on top of which the
> > FUSE passthrough functionality will be built.
> >
> > As part of this, introduce the new FUSE passthrough ioctl, which allows
> > the FUSE daemon to specify a direct connection between a FUSE file and a
> > lower file system file. Such ioctl requires user space to pass the file
> > descriptor of one of its opened files through the fuse_passthrough_out
> > data structure introduced in this patch. This structure includes extra
> > fields for possible future extensions.
> > Also, add the passthrough functions for the set-up and tear-down of the
> > data structures and locks that will be used both when fuse_conns and
> > fuse_files are created/deleted.
> >
> > Signed-off-by: Alessio Balsini <balsini@android.com>
>
> [...]
>
> > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > index 54442612c48b..9d7685ce0acd 100644
> > --- a/include/uapi/linux/fuse.h
> > +++ b/include/uapi/linux/fuse.h
> > @@ -360,6 +360,7 @@ struct fuse_file_lock {
> >  #define FUSE_MAP_ALIGNMENT     (1 << 26)
> >  #define FUSE_SUBMOUNTS         (1 << 27)
> >  #define FUSE_HANDLE_KILLPRIV_V2        (1 << 28)
> > +#define FUSE_PASSTHROUGH       (1 << 29)
>
> This header has a version and a changelog.  Please update those as well.
>
> >
> >  /**
> >   * CUSE INIT request/reply flags
> > @@ -625,7 +626,7 @@ struct fuse_create_in {
> >  struct fuse_open_out {
> >         uint64_t        fh;
> >         uint32_t        open_flags;
> > -       uint32_t        padding;
> > +       uint32_t        passthrough_fh;
>
> I think it would be cleaner to add a FOPEN_PASSTHROUGH flag to
> explicitly request passthrough instead of just passing a non-null
> value to passthrough_fh.
>
> >  };
> >
> >  struct fuse_release_in {
> > @@ -828,6 +829,13 @@ struct fuse_in_header {
> >         uint32_t        padding;
> >  };
> >
> > +struct fuse_passthrough_out {
> > +       uint32_t        fd;
> > +       /* For future implementation */
> > +       uint32_t        len;
> > +       void            *vec;
> > +};
>
> I don't see why we'd need these extensions.    The ioctl just needs to
> establish an ID to open file mapping that can be referenced on the
> regular protocol, i.e. it just needs to be passed an open file
> descriptor and return an unique ID.
>
> Mapping the fuse file's data to the underlying file's data is a
> different matter.  That can be an identity mapping established at open
> time (this is what this series does) or it can be an arbitrary extent
> mapping to one or more underlying open files, established at open time
> or on demand.  All of these can be done in band using the fuse
> protocol, no need to involve the ioctl mechanism.
>
> So I think we can just get rid of "struct fuse_passthrough_out"
> completely and use "uint32_t *" as the ioctl argument.
>
> What I think would be useful is to have an explicit
> FUSE_DEV_IOC_PASSTHROUGH_CLOSE ioctl, that would need to be called
> once the fuse server no longer needs this ID.   If this turns out to
> be a performance problem, we could still add the auto-close behavior
> with an explicit FOPEN_PASSTHROUGH_AUTOCLOSE flag later.
Hi Miklos,

W/o auto closing, what happens if user space daemon forgets to call
FUSE_DEV_IOC_PASSTHROUGH_CLOSE? Do we keep the ID alive somewhere?

Thanks,
Tao
Miklos Szeredi Feb. 19, 2021, 8:40 a.m. UTC | #3
On Fri, Feb 19, 2021 at 8:05 AM Peng Tao <bergwolf@gmail.com> wrote:
>
> On Wed, Feb 17, 2021 at 9:41 PM Miklos Szeredi <miklos@szeredi.hu> wrote:

> > What I think would be useful is to have an explicit
> > FUSE_DEV_IOC_PASSTHROUGH_CLOSE ioctl, that would need to be called
> > once the fuse server no longer needs this ID.   If this turns out to
> > be a performance problem, we could still add the auto-close behavior
> > with an explicit FOPEN_PASSTHROUGH_AUTOCLOSE flag later.
> Hi Miklos,
>
> W/o auto closing, what happens if user space daemon forgets to call
> FUSE_DEV_IOC_PASSTHROUGH_CLOSE? Do we keep the ID alive somewhere?

Kernel would keep the ID open until explicit close or fuse connection
is released.

There should be some limit on the max open files referenced through
ID's, though.   E.g. inherit RLIMIT_NOFILE from mounting task.

Thanks,
Miklos
Alessio Balsini March 1, 2021, 5:05 p.m. UTC | #4
On Fri, Feb 19, 2021 at 09:40:21AM +0100, Miklos Szeredi wrote:
> On Fri, Feb 19, 2021 at 8:05 AM Peng Tao <bergwolf@gmail.com> wrote:
> >
> > On Wed, Feb 17, 2021 at 9:41 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> 
> > > What I think would be useful is to have an explicit
> > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE ioctl, that would need to be called
> > > once the fuse server no longer needs this ID.   If this turns out to
> > > be a performance problem, we could still add the auto-close behavior
> > > with an explicit FOPEN_PASSTHROUGH_AUTOCLOSE flag later.
> > Hi Miklos,
> >
> > W/o auto closing, what happens if user space daemon forgets to call
> > FUSE_DEV_IOC_PASSTHROUGH_CLOSE? Do we keep the ID alive somewhere?
> 
> Kernel would keep the ID open until explicit close or fuse connection
> is released.
> 
> There should be some limit on the max open files referenced through
> ID's, though.   E.g. inherit RLIMIT_NOFILE from mounting task.
> 
> Thanks,
> Miklos

I like the idea of FUSE_DEV_IOC_PASSTHROUGH_CLOSE to revoke the
passthrough access, that is something I was already working on. What I
had in mind was simply to break that 1:1 connection between fuse_file
and the lower filp by setting a specific fuse_file::passthrough::filp to
NULL, but this is slightly different from what you mentioned.

AFAIU you are suggesting to allocate one ID for each lower fs file
opened with passthrough within a connection, and maybe using idr_find at
every read/write/mmap operation to check if passthrough is enabled on
that file. Something similar to fuse2_map_get().
This way the fuse server can pass the same ID to one or more
fuse_file(s).
FUSE_DEV_IOC_PASSTHROUGH_CLOSE would idr_remove the ID, so idr_find
would fail, preventing the use of passthrough on that ID. CMIIW.

After FUSE_DEV_IOC_PASSTHROUGH_CLOSE(ID), if some fuse_file(s) storing
that ID are still open and the same ID is reclaimed by a new idr_alloc,
our fuse_file(s) would end up using a mismatching lower fs filp.  So the
ID stored in the fuse_file(s) must also be invalidated to prevent future
uses of deallocated IDs.

Would it make sense to have a list of fuse_files using the same ID, that
must be traversed at FUSE_DEV_IOC_PASSTHROUGH_CLOSE time?
Negative values (maybe -ENOENT) might be used to mark IDs as invalid,
and tested before idr_find at read/write/mmap to avoid the idr_find
complexity in case passthrough is disabled for that file.

What do you think?


I agree with all the above comments to this patch, i.e., add
FOPEN_PASSTHROUGH, drop fuse_passthrough_out, header version+changelog,
that will be fixed in V13.


Thanks,
Alessio
Amir Goldstein Sept. 8, 2022, 3:36 p.m. UTC | #5
Hi Alessio and Miklos,

Some time has passed.. and I was thinking of picking up these patches.

On Mon, Mar 1, 2021 at 7:05 PM Alessio Balsini <balsini@android.com> wrote:
>
> On Fri, Feb 19, 2021 at 09:40:21AM +0100, Miklos Szeredi wrote:
> > On Fri, Feb 19, 2021 at 8:05 AM Peng Tao <bergwolf@gmail.com> wrote:
> > >
> > > On Wed, Feb 17, 2021 at 9:41 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > > > What I think would be useful is to have an explicit
> > > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE ioctl, that would need to be called
> > > > once the fuse server no longer needs this ID.   If this turns out to
> > > > be a performance problem, we could still add the auto-close behavior
> > > > with an explicit FOPEN_PASSTHROUGH_AUTOCLOSE flag later.
> > > Hi Miklos,
> > >
> > > W/o auto closing, what happens if user space daemon forgets to call
> > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE? Do we keep the ID alive somewhere?
> >
> > Kernel would keep the ID open until explicit close or fuse connection
> > is released.
> >
> > There should be some limit on the max open files referenced through
> > ID's, though.   E.g. inherit RLIMIT_NOFILE from mounting task.
> >
> > Thanks,
> > Miklos
>
> I like the idea of FUSE_DEV_IOC_PASSTHROUGH_CLOSE to revoke the
> passthrough access, that is something I was already working on. What I
> had in mind was simply to break that 1:1 connection between fuse_file
> and lower filp setting a specific fuse_file::passthrough::filp to NULL,
> but this is slightly different from what you mentioned.
>

I don't like the idea of switching between passthrough and server mid-life
of an open file.

There are consequences related to syncing the attribute cache of the kernel
and the server that I don't even want to think about.

> AFAIU you are suggesting to allocate one ID for each lower fs file
> opened with passthrough within a connection, and maybe using idr_find at
> every read/write/mmap operation to check if passthrough is enabled on
> that file. Something similar to fuse2_map_get().
> This way the fuse server can pass the same ID to one or more
> fuse_file(s).
> FUSE_DEV_IOC_PASSTHROUGH_CLOSE would idr_remove the ID, so idr_find
> would fail, preventing the use of passthrough on that ID. CMIIW.
>

I don't think that FUSE_DEV_IOC_PASSTHROUGH_CLOSE should remove the ID.
We can use a refcount for the mapping and FUSE_DEV_IOC_PASSTHROUGH_CLOSE
just drops the initial server's refcount.

Implementing revoke for an existing mapping is something completely different.
It can be done, not even so hard, but I don't think it should be part of this
series and in any case revoke will not remove the ID.

> After FUSE_DEV_IOC_PASSTHROUGH_CLOSE(ID) it may happen that if some
> fuse_file(s) storing that ID are still open and the same ID is reclaimed
> in a new idr_alloc, this would lead to mismatching lower fs filp being
> used by our fuse_file(s).  So also the ID stored in the fuse_file(s)
> must be invalidated to prevent future uses of deallocated IDs.

Obtaining a refcount on FOPEN_PASSTHROUGH will solve that.

>
> Would it make sense to have a list of fuse_files using the same ID, that
> must be traversed at FUSE_DEV_IOC_PASSTHROUGH_CLOSE time?
> Negative values (maybe -ENOENT) might be used to mark IDs as invalid,
> and tested before idr_find at read/write/mmap to avoid the idr_find
> complexity in case passthrough is disabled for that file.
>
> What do you think?
>

As I wrote above, this sounds unnecessarily complicated.

Miklos,

Do you agree with my interpretation of
FUSE_DEV_IOC_PASSTHROUGH_CLOSE?

Thanks,
Amir.
Miklos Szeredi Sept. 9, 2022, 7:07 p.m. UTC | #6
On Thu, 8 Sept 2022 at 17:36, Amir Goldstein <amir73il@gmail.com> wrote:
>
> Hi Alessio and Miklos,
>
> Some time has passed.. and I was thinking of picking up these patches.
>
> On Mon, Mar 1, 2021 at 7:05 PM Alessio Balsini <balsini@android.com> wrote:
> >
> > On Fri, Feb 19, 2021 at 09:40:21AM +0100, Miklos Szeredi wrote:
> > > On Fri, Feb 19, 2021 at 8:05 AM Peng Tao <bergwolf@gmail.com> wrote:
> > > >
> > > > On Wed, Feb 17, 2021 at 9:41 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > >
> > > > > What I think would be useful is to have an explicit
> > > > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE ioctl, that would need to be called
> > > > > once the fuse server no longer needs this ID.   If this turns out to
> > > > > be a performance problem, we could still add the auto-close behavior
> > > > > with an explicit FOPEN_PASSTHROUGH_AUTOCLOSE flag later.
> > > > Hi Miklos,
> > > >
> > > > W/o auto closing, what happens if user space daemon forgets to call
> > > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE? Do we keep the ID alive somewhere?
> > >
> > > Kernel would keep the ID open until explicit close or fuse connection
> > > is released.
> > >
> > > There should be some limit on the max open files referenced through
> > > ID's, though.   E.g. inherit RLIMIT_NOFILE from mounting task.
> > >
> > > Thanks,
> > > Miklos
> >
> > I like the idea of FUSE_DEV_IOC_PASSTHROUGH_CLOSE to revoke the
> > passthrough access, that is something I was already working on. What I
> > had in mind was simply to break that 1:1 connection between fuse_file
> > and lower filp setting a specific fuse_file::passthrough::filp to NULL,
> > but this is slightly different from what you mentioned.
> >
>
> I don't like the idea of switching between passthrough and server mid-life
> of an open file.
>
> There are consequences related to syncing the attribute cache of the kernel
> and the server that I don't even want to think about.
>
> > AFAIU you are suggesting to allocate one ID for each lower fs file
> > opened with passthrough within a connection, and maybe using idr_find at
> > every read/write/mmap operation to check if passthrough is enabled on
> > that file. Something similar to fuse2_map_get().
> > This way the fuse server can pass the same ID to one or more
> > fuse_file(s).
> > FUSE_DEV_IOC_PASSTHROUGH_CLOSE would idr_remove the ID, so idr_find
> > would fail, preventing the use of passthrough on that ID. CMIIW.
> >
>
> I don't think that FUSE_DEV_IOC_PASSTHROUGH_CLOSE should remove the ID.
> We can use a refcount for the mapping and FUSE_DEV_IOC_PASSTHROUGH_CLOSE
> just drops the initial server's refcount.
>
> Implementing revoke for an existing mapping is something completely different.
> It can be done, not even so hard, but I don't think it should be part of this
> series and in any case revoke will not remove the ID.
>
> > After FUSE_DEV_IOC_PASSTHROUGH_CLOSE(ID) it may happen that if some
> > fuse_file(s) storing that ID are still open and the same ID is reclaimed
> > in a new idr_alloc, this would lead to mismatching lower fs filp being
> > used by our fuse_file(s).  So also the ID stored in the fuse_file(s)
> > must be invalidated to prevent future uses of deallocated IDs.
>
> Obtaining a refcount on FOPEN_PASSTHROUGH will solve that.
>
> >
> > Would it make sense to have a list of fuse_files using the same ID, that
> > must be traversed at FUSE_DEV_IOC_PASSTHROUGH_CLOSE time?
> > Negative values (maybe -ENOENT) might be used to mark IDs as invalid,
> > and tested before idr_find at read/write/mmap to avoid the idr_find
> > complexity in case passthrough is disabled for that file.
> >
> > What do you think?
> >
>
> As I wrote above, this sounds unnecessarily complicated.
>
> Miklos,
>
> Do you agree with my interpretation of
> FUSE_DEV_IOC_PASSTHROUGH_CLOSE?

We need to deal with the case of too many open files.   The server
could manage this, but then we do need to handle the case when a
cached mapping disappears, i.e:

 client opens file
 [time passes]
 cached passthrough fd gets evicted to make room for other passthrough I/O
 [time passes]
 new I/O request comes in
 need to reestablish passthrough fd before finishing I/O

The way I think of this is that a passthrough mapping is assigned at
open time and is cached (its lifetime may be longer than that of the
open file, but it may be shorter as well).  When
FUSE_DEV_IOC_PASSTHROUGH_CLOSE is called and there are cached mappings
referring to this particular handle, then those mappings need to be
purged.   On a new I/O request, the mapping will need to be
reestablished by sending a FUSE_MAP request, which triggers
FUSE_DEV_IOC_PASSTHROUGH_OPEN.

One other question that's nagging me is how to "unhide" these pseudo-fds.

Could we create a kernel thread for each fuse sb which has a normal
file table for these?  This would allow inspecting state through
/proc/$KTHREADID/fd, etc.

Thanks,
Miklos
Amir Goldstein Sept. 10, 2022, 8:52 a.m. UTC | #7
On Fri, Sep 9, 2022 at 10:07 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Thu, 8 Sept 2022 at 17:36, Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > Hi Alessio and Miklos,
> >
> > Some time has passed.. and I was thinking of picking up these patches.
> >
> > On Mon, Mar 1, 2021 at 7:05 PM Alessio Balsini <balsini@android.com> wrote:
> > >
> > > On Fri, Feb 19, 2021 at 09:40:21AM +0100, Miklos Szeredi wrote:
> > > > On Fri, Feb 19, 2021 at 8:05 AM Peng Tao <bergwolf@gmail.com> wrote:
> > > > >
> > > > > On Wed, Feb 17, 2021 at 9:41 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > > >
> > > > > > What I think would be useful is to have an explicit
> > > > > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE ioctl, that would need to be called
> > > > > > once the fuse server no longer needs this ID.   If this turns out to
> > > > > > be a performance problem, we could still add the auto-close behavior
> > > > > > with an explicit FOPEN_PASSTHROUGH_AUTOCLOSE flag later.
> > > > > Hi Miklos,
> > > > >
> > > > > W/o auto closing, what happens if user space daemon forgets to call
> > > > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE? Do we keep the ID alive somewhere?
> > > >
> > > > Kernel would keep the ID open until explicit close or fuse connection
> > > > is released.
> > > >
> > > > There should be some limit on the max open files referenced through
> > > > ID's, though.   E.g. inherit RLIMIT_NOFILE from mounting task.
> > > >
> > > > Thanks,
> > > > Miklos
> > >
> > > I like the idea of FUSE_DEV_IOC_PASSTHROUGH_CLOSE to revoke the
> > > passthrough access, that is something I was already working on. What I
> > > had in mind was simply to break that 1:1 connection between fuse_file
> > > and lower filp setting a specific fuse_file::passthrough::filp to NULL,
> > > but this is slightly different from what you mentioned.
> > >
> >
> > I don't like the idea of switching between passthrough and server mid-life
> > of an open file.
> >
> > There are consequences related to syncing the attribute cache of the kernel
> > and the server that I don't even want to think about.
> >
> > > AFAIU you are suggesting to allocate one ID for each lower fs file
> > > opened with passthrough within a connection, and maybe using idr_find at
> > > every read/write/mmap operation to check if passthrough is enabled on
> > > that file. Something similar to fuse2_map_get().
> > > This way the fuse server can pass the same ID to one or more
> > > fuse_file(s).
> > > FUSE_DEV_IOC_PASSTHROUGH_CLOSE would idr_remove the ID, so idr_find
> > > would fail, preventing the use of passthrough on that ID. CMIIW.
> > >
> >
> > I don't think that FUSE_DEV_IOC_PASSTHROUGH_CLOSE should remove the ID.
> > We can use a refcount for the mapping and FUSE_DEV_IOC_PASSTHROUGH_CLOSE
> > just drops the initial server's refcount.
> >
> > Implementing revoke for an existing mapping is something completely different.
> > It can be done, not even so hard, but I don't think it should be part of this
> > series and in any case revoke will not remove the ID.
> >
> > > After FUSE_DEV_IOC_PASSTHROUGH_CLOSE(ID) it may happen that if some
> > > fuse_file(s) storing that ID are still open and the same ID is reclaimed
> > > in a new idr_alloc, this would lead to mismatching lower fs filp being
> > > used by our fuse_file(s).  So also the ID stored in the fuse_file(s)
> > > must be invalidated to prevent future uses of deallocated IDs.
> >
> > Obtaining a refcount on FOPEN_PASSTHROUGH will solve that.
> >
> > >
> > > Would it make sense to have a list of fuse_files using the same ID, that
> > > must be traversed at FUSE_DEV_IOC_PASSTHROUGH_CLOSE time?
> > > Negative values (maybe -ENOENT) might be used to mark IDs as invalid,
> > > and tested before idr_find at read/write/mmap to avoid the idr_find
> > > complexity in case passthrough is disabled for that file.
> > >
> > > What do you think?
> > >
> >
> > As I wrote above, this sounds unnecessarily complicated.
> >
> > Miklos,
> >
> > Do you agree with my interpretation of
> > FUSE_DEV_IOC_PASSTHROUGH_CLOSE?
>
> We need to deal with the case of too many open files.   The server
> could manage this, but then we do need to handle the case when a
> cached mapping disappears, i.e:
>
>  client opens file
>  [time passes]
>  cached passthrough fd gets evicted to make room for other passthrough I/O
>  [time passes]
>  new I/O request comes in
>  need to reestablish passthrough fd before finishing I/O
>
> The way I think of this is that a passthrough mapping is assigned at
> open time, which is cached (which may have the lifetime longer than
> the open file, but shorter as well).  When
> FUSE_DEV_IOC_PASSTHROUGH_CLOSE and there are cached mapping referring
> to this particular handle, then those mappings need to be purged.   On
> a new I/O request, the mapping will need to be reestablished by
> sending a FUSE_MAP request, which triggers
> FUSE_DEV_IOC_PASSTHROUGH_OPEN.
>

Do we really need all this complication?

I mean, if we do this then the server may end up thrashing this
passthrough cache when the client has many open files.

I think we should accept the fact that just as any current FUSE
passthrough (in userspace) implementation is limited to max number of
open files as the server's process limitation, kernel passthrough implementation
will be limited by inheriting the mounter's process limitation.

There is no reason that the server should need to keep more
passthrough fd's open than client open fds.

If we only support FOPEN_PASSTHROUGH_AUTOCLOSE as v12
patches implicitly do, then the memory overhead is not much different
than the extra overlayfs pseudo realfiles.

So IMO, we can start with a refcounted mapping implementation
and only if there is interest in server managed mappings eviction
we could implement FUSE_DEV_IOC_PASSTHROUGH_FORGET.

> One other question that's nagging me is how to "unhide" these pseudo-fds.
>
> Could we create a kernel thread for each fuse sb which has normal
> file-table for these?  This would would allow inspecting state through
> /proc/$KTHREDID/fd, etc..
>

Yeah that sounds like a good idea.
As I mentioned elsewhere in the thread, io_uring also has a mechanism
to register open files with the kernel to perform IO on them later.
I assume those files are also visible via some /proc/$KTHREADID/fd,
but I'll need to check.

BTW, I see that the Android team is presenting eBPF-FUSE on LPC
coming Tuesday [1].

There are affordable and free options to attend virtually [2].

I wonder when patches will be available ;)

Thanks,
Amir.

[1] https://lpc.events/event/16/contributions/1339/
[2] https://lpc.events/event/16/page/181-attend
Bernd Schubert Sept. 10, 2022, 1:03 p.m. UTC | #8
> 
> BTW, I see that the Android team is presenting eBPF-FUSE on LPC
> coming Tuesday [1].
> 
> There are affordable and free options to attend virtually [2].
> 
> I wonder when patches will be available ;)

Oh interesting. Btw, I'm currently working on uring communication, 
similar to what ublk is doing, I hope to have a very first version by 
next week.

Bernd
Miklos Szeredi Sept. 12, 2022, 9:29 a.m. UTC | #9
On Sat, 10 Sept 2022 at 10:52, Amir Goldstein <amir73il@gmail.com> wrote:

> I think we should accept the fact that just as any current FUSE
> passthrough (in userspace) implementation is limited to max number of
> open files as the server's process limitation, kernel passthrough implementation
> will be limited by inheriting the mounter's process limitation.
>
> There is no reason that the server should need to keep more
> passthrough fd's open than client open fds.

Maybe you're right.

> If we only support FOPEN_PASSTHROUGH_AUTOCLOSE as v12
> patches implicitly do, then the memory overhead is not much different
> than the extra overlayfs pseudo realfiles.

How exactly would this work?

ioctl(F_D_I_P_OPEN) - create passthrough fd with ref 1
open/FOPEN_PASSTHROUGH -  inc refcount in passthrough fd
release - put refcount in passthrough fd
ioctl(F_D_I_P_CLOSE) - put ref in passthrough fd

Due to being refcounted the F_D_I_P_CLOSE can come at any point past
the finished open request.

Or did you have something else in mind?

> > One other question that's nagging me is how to "unhide" these pseudo-fds.
> >
> > Could we create a kernel thread for each fuse sb which has normal
> > file-table for these?  This would would allow inspecting state through
> > /proc/$KTHREDID/fd, etc..
> >
>
> Yeah that sounds like a good idea.
> As I mentioned elsewhere in the thread, io_uring also has a mechanism
> to register open files with the kernel to perform IO on them later.
> I assume those files are also visible via some /proc/$KTHREDID/fd,
> but I'll need to check.
>
> BTW, I see that the Android team is presenting eBPF-FUSE on LPC
> coming Tuesday [1].

At first glance it looks like a filtered kernel-only passthrough +
fuse fallback, where filtering is provided by eBPF scripts and only
falls back to userspace access on more complex cases.  Maybe it's a
good direction, we'll see.  Apparently the passthrough case is
important enough for various use cases.

Miklos
Amir Goldstein Sept. 12, 2022, 12:29 p.m. UTC | #10
On Mon, Sep 12, 2022 at 12:29 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Sat, 10 Sept 2022 at 10:52, Amir Goldstein <amir73il@gmail.com> wrote:
>
> > I think we should accept the fact that just as any current FUSE
> > passthrough (in userspace) implementation is limited to max number of
> > open files as the server's process limitation, kernel passthrough implementation
> > will be limited by inheriting the mounter's process limitation.
> >
> > There is no reason that the server should need to keep more
> > passthrough fd's open than client open fds.
>
> Maybe you're right.
>
> > If we only support FOPEN_PASSTHROUGH_AUTOCLOSE as v12
> > patches implicitly do, then the memory overhead is not much different
> > than the extra overlayfs pseudo realfiles.
>
> How exactly would this work?
>
> ioctl(F_D_I_P_OPEN) - create passthrough fd with ref 1
> open/FOPEN_PASSTHOUGH -  inc refcount in passthrough fd
> release - put refcount in passthrough fd
> ioctl(F_D_I_P_CLOSE) - put ref in passthrough fd
>
> Due to being refcounted the F_D_I_P_CLOSE can come at any point past
> the finished open request.
>
> Or did you have something else in mind?
>

What I had in mind is that FOPEN_PASSTHROUGH_AUTOCLOSE
"transfers" the server's refcount to the kernel and server does
not need to call explicit F_D_I_P_CLOSE.

This is useful for servers that don't care about reusing mappings.

> > > One other question that's nagging me is how to "unhide" these pseudo-fds.
> > >
> > > Could we create a kernel thread for each fuse sb which has normal
> > > file-table for these?  This would would allow inspecting state through
> > > /proc/$KTHREDID/fd, etc..
> > >
> >
> > Yeah that sounds like a good idea.
> > As I mentioned elsewhere in the thread, io_uring also has a mechanism
> > to register open files with the kernel to perform IO on them later.
> > I assume those files are also visible via some /proc/$KTHREDID/fd,
> > but I'll need to check.
> >
> > BTW, I see that the Android team is presenting eBPF-FUSE on LPC
> > coming Tuesday [1].
>
> At first glance it looks like a filtered kernel-only passthrough +
> fuse fallback, where filtering is provided by eBPF scripts and only
> falls back to userspace access on more complex cases.  Maybe it's a
> good direction, we'll see.

Yeh, we'll see.

> Apparently the passthrough case is
> important enough for various use cases.
>

Indeed.
My use case is HSM and I think that using FUSE for HSM is becoming
more and more common these days.

One of the things that bothers me is that both this FUSE_PASSTHROUGH
patch set and any future eBPF-FUSE passthrough implementation are
bound to duplicate a lot of code and know-how from overlayfs
(along with the bugs).

We could try to factor out some common bits into a kernel fs passthrough
library.

Another option to consider is not to add any passthrough logic
to FUSE at all.

Instead, implement a "switch" fs to choose between passthrough
to one of several underlying fs "branches", where one of the branches
could be local fs and another a FUSE fs (i.e. for the complex cases).

A similar design was described at:
https://github.com/github/libprojfs/blob/master/docs/design.md#phase-2--hybrid

This "switch" fs is not that much different from overlayfs, when
removing the "merge dir" logic and replacing the "is_upper" logic
with a generic eBPF "choose_branch" logic.

Food for thought.

Thanks,
Amir.
Miklos Szeredi Sept. 12, 2022, 1:03 p.m. UTC | #11
On Mon, 12 Sept 2022 at 14:29, Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Sep 12, 2022 at 12:29 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > On Sat, 10 Sept 2022 at 10:52, Amir Goldstein <amir73il@gmail.com> wrote:

> > > BTW, I see that the Android team is presenting eBPF-FUSE on LPC
> > > coming Tuesday [1].
> >
> > At first glance it looks like a filtered kernel-only passthrough +
> > fuse fallback, where filtering is provided by eBPF scripts and only
> > falls back to userspace access on more complex cases.  Maybe it's a
> > good direction, we'll see.
>
> Yeh, we'll see.
>
> > Apparently the passthrough case is
> > important enough for various use cases.
> >
>
> Indeed.
> My use case is HSM and I think that using FUSE for HSM is becoming
> more and more common these days.

HSM?

>
> One of the things that bothers me is that both this FUSE_PASSTHROUGH
> patch set and any future eBPF-FUSE passthrough implementation is
> bound to duplicate a lot of code and know how from overlayfs
> (along with the bugs).
>
> We could try to factor out some common bits to a kernel fs passthough
> library.

Yeah, although fuse/passthrough might not want all the complexity.
Getting rid of the context switch latency is the easy part.  Getting
rid of dcache duplication is the hard one, though it seems that the
current level of hacks in overlayfs is sufficient and nobody much
cares for the corner cases (or works around them).

>
> Another option to consider is not to add any passthrough logic
> to FUSE at all.
>
> Instead, implement a "switch" fs to choose between passthrough
> to one of several underlying fs "branches", where one of the branches
> could be local fs and another a FUSE fs (i.e. for the complex cases).

st_dev/st_ino management might become a headache (as it is in overlayfs).

Thanks,
Miklos
Miklos Szeredi Sept. 12, 2022, 1:05 p.m. UTC | #12
On Mon, 12 Sept 2022 at 15:03, Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Mon, 12 Sept 2022 at 14:29, Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Mon, Sep 12, 2022 at 12:29 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > >
> > > On Sat, 10 Sept 2022 at 10:52, Amir Goldstein <amir73il@gmail.com> wrote:
>
> > > > BTW, I see that the Android team is presenting eBPF-FUSE on LPC
> > > > coming Tuesday [1].
> > >
> > > At first glance it looks like a filtered kernel-only passthrough +
> > > fuse fallback, where filtering is provided by eBPF scripts and only
> > > falls back to userspace access on more complex cases.  Maybe it's a
> > > good direction, we'll see.
> >
> > Yeh, we'll see.
> >
> > > Apparently the passthrough case is
> > > important enough for various use cases.
> > >
> >
> > Indeed.
> > My use case is HSM and I think that using FUSE for HSM is becoming
> > more and more common these days.
>
> HSM?
>
> >
> > One of the things that bothers me is that both this FUSE_PASSTHROUGH
> > patch set and any future eBPF-FUSE passthrough implementation is
> > bound to duplicate a lot of code and know how from overlayfs
> > (along with the bugs).
> >
> > We could try to factor out some common bits to a kernel fs passthough
> > library.
>
> Yeah, although fuse/passthrough might not want all the complexity.
> Getting rid of the context switch latency is the easy part.  Getting
> rid of dcache duplication is the hard one, though it seems that the

s/dcache/page cache/

> current level of hacks in overlayfs seems sufficient and nobody much
> cares for the corner cases (or works around them).
>
> >
> > Another option to consider is not to add any passthrough logic
> > to FUSE at all.
> >
> > Instead, implement a "switch" fs to choose between passthrough
> > to one of several underlying fs "branches", where one of the branches
> > could be local fs and another a FUSE fs (i.e. for the complex cases).
>
> st_dev/st_ino management might become a headache (as it is in overlayfs).
>
> Thanks,
> Miklos
Amir Goldstein Sept. 12, 2022, 1:26 p.m. UTC | #13
On Mon, Sep 12, 2022 at 4:03 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Mon, 12 Sept 2022 at 14:29, Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Mon, Sep 12, 2022 at 12:29 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > >
> > > On Sat, 10 Sept 2022 at 10:52, Amir Goldstein <amir73il@gmail.com> wrote:
>
> > > > BTW, I see that the Android team is presenting eBPF-FUSE on LPC
> > > > coming Tuesday [1].
> > >
> > > At first glance it looks like a filtered kernel-only passthrough +
> > > fuse fallback, where filtering is provided by eBPF scripts and only
> > > falls back to userspace access on more complex cases.  Maybe it's a
> > > good direction, we'll see.
> >
> > Yeh, we'll see.
> >
> > > Apparently the passthrough case is
> > > important enough for various use cases.
> > >
> >
> > Indeed.
> > My use case is HSM and I think that using FUSE for HSM is becoming
> > more and more common these days.
>
> HSM?
>

Sorry, Hierarchical Storage Management,
such as the product described at:
https://github.com/github/libprojfs/blob/master/docs/design.md#vfsforgit-on-windows

> >
> > One of the things that bothers me is that both this FUSE_PASSTHROUGH
> > patch set and any future eBPF-FUSE passthrough implementation is
> > bound to duplicate a lot of code and know how from overlayfs
> > (along with the bugs).
> >
> > We could try to factor out some common bits to a kernel fs passthough
> > library.
>
> Yeah, although fuse/passthrough might not want all the complexity.
> Getting rid of the context switch latency is the easy part.  Getting
> rid of  page cache duplication is the hard one, though it seems that the
> current level of hacks in overlayfs seems sufficient and nobody much
> cares for the corner cases (or works around them).
>

FWIW duplicate page cache exists in passthrough FUSE whether
passthrough is in kernel or in userspace, but going through yet another
"switch" fs would make things even worse.

I have another completely different solution that I am considering
for HSM that is a little less flexible than FUSE, but does not have many of
the passthrough challenges:

https://lore.kernel.org/linux-fsdevel/CAOQ4uxhrQ7hySTyHM0Atq=uzbNdHyGV5wfadJarhAu1jDFOUTg@mail.gmail.com/

> >
> > Another option to consider is not to add any passthrough logic
> > to FUSE at all.
> >
> > Instead, implement a "switch" fs to choose between passthrough
> > to one of several underlying fs "branches", where one of the branches
> > could be local fs and another a FUSE fs (i.e. for the complex cases).
>
> st_dev/st_ino management might become a headache (as it is in overlayfs).
>

Yeh. It's interesting how passthrough of readdir and lookup/create in
eBPF-FUSE is going to handle those things...

Thanks,
Amir.
Miklos Szeredi Sept. 12, 2022, 2:22 p.m. UTC | #14
On Mon, 12 Sept 2022 at 15:26, Amir Goldstein <amir73il@gmail.com> wrote:

> FWIW duplicate page cache exists in passthough FUSE whether
> passthrough is in kernel or in userspace, but going through yet another
> "switch" fs would make things even worse.

I imagine the "switch" layer for a HSM would be simple enough:

a) if file exists on fastest layer (upper) then take that
b) if not then fall back to fuse layer (lower) .

It's almost like a read-only overlayfs (no copy up) except it would be
read-write and copy-up/down would be performed by the server as
needed. No page cache duplication for upper, and AFAICS no corner
cases that overlayfs has, since all layers are consistent (the fuse
layer would reference the upper if that is currently the up-to-date
one).

readdir would go to the layer which has the complete directory (which
I guess the lower one must always have, but the upper could also).

I'm probably missing lots of details, though...

Thanks,
Miklos
Amir Goldstein Sept. 12, 2022, 3:39 p.m. UTC | #15
On Mon, Sep 12, 2022 at 5:22 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Mon, 12 Sept 2022 at 15:26, Amir Goldstein <amir73il@gmail.com> wrote:
>
> > FWIW duplicate page cache exists in passthough FUSE whether
> > passthrough is in kernel or in userspace, but going through yet another
> > "switch" fs would make things even worse.
>
> I imagine the "switch" layer for a HSM would be simple enough:
>
> a) if file exists on fastest layer (upper) then take that
> b) if not then fall back to fuse layer (lower) .
>
> It's almost like a read-only overlayfs (no copy up) except it would be
> read-write and copy-up/down would be performed by the server as
> needed. No page cache duplication for upper, and AFAICS no corner
> cases that overlayfs has, since all layers are consistent (the fuse
> layer would reference the upper if that is currently the up-to-date
> one).

On recent LSF/MM/BPF, BPF developers asked me about using overlayfs
for something that looks like the above - merging of non overlapping layers
without any copy-up/down, but with write to lower.

I gave them the same solution (overlayfs without copy-up)
but I said I didn't know what you would think about this overlayfs mode
and I also pointed them to the eBPF-FUSE developers as another
possible solution to their use case.

>
> readdir would go to the layer which has the complete directory (which
> I guess the lower one must always have, but the upper could also).
>
> I'm probably missing lots of details, though...
>

That's what I said too :)

Does that mean that you are open to seeing patches for
an overlayfs mode that does not copy-up on write to lower?
I can come up with some semantics for readdir that will
make sense.

Thanks,
Amir.
Hao Luo Sept. 12, 2022, 5:43 p.m. UTC | #16
Sorry, resend, my response was bounced back by mail system due to not
using plain text.

On Mon, Sep 12, 2022 at 8:40 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Sep 12, 2022 at 5:22 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > On Mon, 12 Sept 2022 at 15:26, Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > > FWIW duplicate page cache exists in passthough FUSE whether
> > > passthrough is in kernel or in userspace, but going through yet another
> > > "switch" fs would make things even worse.
> >
> > I imagine the "switch" layer for a HSM would be simple enough:
> >
> > a) if file exists on fastest layer (upper) then take that
> > b) if not then fall back to fuse layer (lower) .
> >
> > It's almost like a read-only overlayfs (no copy up) except it would be
> > read-write and copy-up/down would be performed by the server as
> > needed. No page cache duplication for upper, and AFAICS no corner
> > cases that overlayfs has, since all layers are consistent (the fuse
> > layer would reference the upper if that is currently the up-to-date
> > one).
>
> On recent LSF/MM/BPF, BPF developers asked me about using overlayfs
> for something that looks like the above - merging of non overlapping layers
> without any copy-up/down, but with write to lower.
>
> I gave them the same solution (overlayfs without copy-up)
> but I said I didn't know what you would think about this overlayfs mode
> and I also pointed them to the eBPF-FUSE developers as another
> possible solution to their use case.
>

Thanks Amir for adding me in the thread. This idea is very useful for
BPF use cases.

A bit more context here: we were thinking of overlaying two
filesystems together to create a view that extends the filesystem at
the lower layer. In our design, the lower layer is a pseudo
filesystem, in which one can _not_ create/delete files, but can make
directories _indirectly_, by creating a kernel object; the upper is
a bpf filesystem, in which one can create files. The file's purpose
is to describe the directory in the lower layer, that is, to describe
the kernel object that directory corresponds to.

With the flexibility brought by BPF, it can be a quite flexible
solution to query kernel objects' states.

>
> >
> > readdir would go to the layer which has the complete directory (which
> > I guess the lower one must always have, but the upper could also).
> >
> > I'm probably missing lots of details, though...
> >
>
> That's what I said too :)
>
> Does that mean that you are open to seeing patches for
> an overlayfs mode that does not copy-up on write to lower?
> I can come up with some semantics for readdir that will
> make sense.

I am excited about this. Thanks Amir!

>
> Thanks,
> Amir.
Amir Goldstein Sept. 12, 2022, 6:28 p.m. UTC | #17
[Changing subject, reducing CC list and moving to overlayfs list]

On Mon, Sep 12, 2022 at 8:43 PM Hao Luo <haoluo@google.com> wrote:
>
> Sorry, resend, my response was bounced back by mail system due to not
> using plain text.
>
> On Mon, Sep 12, 2022 at 8:40 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Mon, Sep 12, 2022 at 5:22 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > >
> > > I imagine the "switch" layer for a HSM would be simple enough:
> > >
> > > a) if file exists on fastest layer (upper) then take that
> > > b) if not then fall back to fuse layer (lower) .
> > >
> > > It's almost like a read-only overlayfs (no copy up) except it would be
> > > read-write and copy-up/down would be performed by the server as
> > > needed. No page cache duplication for upper, and AFAICS no corner
> > > cases that overlayfs has, since all layers are consistent (the fuse
> > > layer would reference the upper if that is currently the up-to-date
> > > one).
> >
> > On recent LSF/MM/BPF, BPF developers asked me about using overlayfs
> > for something that looks like the above - merging of non overlapping layers
> > without any copy-up/down, but with write to lower.
> >
> > I gave them the same solution (overlayfs without copy-up)
> > but I said I didn't know what you would think about this overlayfs mode
> > and I also pointed them to the eBPF-FUSE developers as another
> > possible solution to their use case.
> >
>
> Thanks Amir for adding me in the thread. This idea is very useful for
> BPF use cases.

Hi Hao,

Thanks for chiming in.
This thread has long diverged from the FUSE_PASSTHROUGH
patch review so I started a new thread to discuss the overlayfs
option.

Am I to understand that the eBPF-FUSE option currently
does not fit your needs (maybe because it is not ready yet)?

>
> A bit more context here: we were thinking of overlaying two
> filesystems together to create a view that extends the filesystem at
> the lower layer. In our design, the lower layer is a pseudo
> filesystem, which one can _not_ create/delete files, but make
> directories _indirectly_, via creating a kernel object; the upper is
> bpf filesystem, from which, one can create files. The file's purpose
> is to describe the directory in the lower layer, that is, to describe
> the kernel object that directory corresponds to.
>
> With the flexibility brought by BPF, it can be a quite flexible
> solution to query kernel objects' states.
>

Can't say I was able to understand the description, but let me
try to write the requirement in overlayfs terminology.
Please correct me if I am wrong.

1. The "lower" fs (cgroupfs?) is a "remote" fs where directories
    may appear or disappear due to "remote" events

I think there were similar requirements in the past to support changes
to a lower fs which is a network fs.
As long as those directories are strictly lower that should be
possible.

Does upper fs have directories of the same name that need to
be merged with those lower dirs?

2. You did not mention this but IIRC, the lower fs has (pseudo)
    files that you want to be able to write to, and those files never
    exist in the upper fs

That translates to allowing write to lower fs without triggering copy-up

3. The upper fs files are all "pure" - that means those file paths
    do not exist in lower fs

Creating files in the upper fs is normal in overlayfs, so not a problem.

4. Merging upper and lower directory content is usually not needed??

It must be needed for the root dir at least but where else?
In which directories are the upper files created?
Which directories are "pure" (containing either lower or upper files)
and which directories are "merge" of upper and lower children?

If the answer is that some directories exist both in upper and in lower
and that those directories should be merged for the unified view then
this is standard overlayfs behavior.

Once I get the requirement in order I will try to write coherent
semantics for this mode and see if it can meet the needs of other
projects, such as HSM with FUSE as lower fs.

Thanks,
Amir.
Hao Luo Sept. 13, 2022, 6:26 p.m. UTC | #18
On Mon, Sep 12, 2022 at 11:28 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Sep 12, 2022 at 8:43 PM Hao Luo <haoluo@google.com> wrote:
> >
> > On Mon, Sep 12, 2022 at 8:40 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Mon, Sep 12, 2022 at 5:22 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > > >
> > > > I imagine the "switch" layer for a HSM would be simple enough:
> > > >
> > > > a) if file exists on fastest layer (upper) then take that
> > > > b) if not then fall back to fuse layer (lower) .
> > > >
> > > > It's almost like a read-only overlayfs (no copy up) except it would be
> > > > read-write and copy-up/down would be performed by the server as
> > > > needed. No page cache duplication for upper, and AFAICS no corner
> > > > cases that overlayfs has, since all layers are consistent (the fuse
> > > > layer would reference the upper if that is currently the up-to-date
> > > > one).
> > >
> > > On recent LSF/MM/BPF, BPF developers asked me about using overlayfs
> > > for something that looks like the above - merging of non overlapping layers
> > > without any copy-up/down, but with write to lower.
> > >
> > > I gave them the same solution (overlayfs without copy-up)
> > > but I said I didn't know what you would think about this overlayfs mode
> > > and I also pointed them to the eBPF-FUSE developers as another
> > > possible solution to their use case.
> > >
> >
> > Thanks Amir for adding me in the thread. This idea is very useful for
> > BPF use cases.
>
[...]
>
> Am I to understand that the eBPF-FUSE option currently
> does not fit your needs (maybe because it is not ready yet)?
>

Yeah, mostly because eBPF-FUSE is not ready. I talked to Alessio and
his colleague after LSF/MM/BPF. They were distracted from eBPF-FUSE
development at that time and I didn't follow up, working on other BPF
stuff.

> >
[...]
> Can't say I was able to understand the description, but let me
> try to write the requirement in overlayfs terminology.
> Please correct me if I am wrong.
>
> 1. The "lower" fs (cgroupfs?) is a "remote" fs where directories
>     may appear or disappear due to "remote" events
>

Right. Seems we are aligned on this.

> I think there were similar requirements to support changes
> to lower fs which in a network fs in the past.
> As long as those directories are strictly lower that should be
> possible.
>
> Does upper fs have directories of the same name that need to
> be merged with those lower dirs?
>

No, I don't think so. Upper fs should only have files in my use case.

> 2. You did not mention this but IIRC, the lower fs has (pseudo)
>     files that you want to be able to write to, and those files never
>     exist in the upper fs
>
> That translates to allowing write to lower fs without triggering copy-up
>

Writing to lower is not needed right now.

> 3. The upper fs files are all "pure" - that means those file paths
>     do not exist in lower fs
>
> Creating files in the upper fs is normal in overlayfs, so not a problem.
>

Yes. If used in an expected way, the lower fs won't have file paths
that also exist in the upper.

> 4. Merging upper and lower directory content is usually not needed??
>
> It must be needed for the root dir at least but where else?
> In which directories are the upper files created?
> Which directories are "pure" (containing either lower or upper files)
> and which directories are "merge" of upper and lower children?
>

In my use case, if that's doable, all the directories are "pure",
except the root dir, and they are in the lower. The files are either
from upper or from lower, so no merge. This should be sufficient for
the BPF use case.

> If the answer is that some directories exist both in upper and in lower
> and that those directories should be merged for the unified view then
> this is standard overlayfs behavior.
>
> Once I get the requirement in order I will try to write coherent
> semantics for this mode and see if it can meet the needs of other
> projects, such as HSM with FUSE as lower fs.

Thanks Amir. I want to clarify that with my very very limited
knowledge on FS, I can't say what I am thinking right now is
absolutely correct. Please take my request for features with a grain
of salt and feel free to pick the semantics that you see make most
sense.

Hao
Amir Goldstein Sept. 13, 2022, 6:54 p.m. UTC | #19
On Tue, Sep 13, 2022 at 9:26 PM Hao Luo <haoluo@google.com> wrote:
>
> On Mon, Sep 12, 2022 at 11:28 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Mon, Sep 12, 2022 at 8:43 PM Hao Luo <haoluo@google.com> wrote:
> > >
> > > On Mon, Sep 12, 2022 at 8:40 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > On Mon, Sep 12, 2022 at 5:22 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > > > >
> > > > > I imagine the "switch" layer for a HSM would be simple enough:
> > > > >
> > > > > a) if file exists on fastest layer (upper) then take that
> > > > > b) if not then fall back to fuse layer (lower) .
> > > > >
> > > > > It's almost like a read-only overlayfs (no copy up) except it would be
> > > > > read-write and copy-up/down would be performed by the server as
> > > > > needed. No page cache duplication for upper, and AFAICS no corner
> > > > > cases that overlayfs has, since all layers are consistent (the fuse
> > > > > layer would reference the upper if that is currently the up-to-date
> > > > > one).
> > > >
> > > > On recent LSF/MM/BPF, BPF developers asked me about using overlayfs
> > > > for something that looks like the above - merging of non overlapping layers
> > > > without any copy-up/down, but with write to lower.
> > > >
> > > > I gave them the same solution (overlayfs without copy-up)
> > > > but I said I didn't know what you would think about this overlayfs mode
> > > > and I also pointed them to the eBPF-FUSE developers as another
> > > > possible solution to their use case.
> > > >
> > >
> > > Thanks Amir for adding me in the thread. This idea is very useful for
> > > BPF use cases.
> >
> [...]
> >
> > Am I to understand that the eBPF-FUSE option currently
> > does not fit your needs (maybe because it is not ready yet)?
> >
>
> Yeah, mostly because eBPF-FUSE is not ready. I talked to Alessio and
> his colleague after LSF/MM/BPF. They were distracted from eBPF-FUSE
> development at that time and I didn't follow up, working on other BPF
> stuff.
>
> > >
> [...]
> > Can't say I was able to understand the description, but let me
> > try to write the requirement in overlayfs terminology.
> > Please correct me if I am wrong.
> >
> > 1. The "lower" fs (cgroupfs?) is a "remote" fs where directories
> >     may appear or disappear due to "remote" events
> >
>
> Right. Seems we are aligned on this.
>
> > I think there were similar requirements to support changes
> > to lower fs which in a network fs in the past.
> > As long as those directories are strictly lower that should be
> > possible.
> >
> > Does upper fs have directories of the same name that need to
> > be merged with those lower dirs?
> >
>
> No, I don't think so. Upper fs should only have files in my use case.
>
> > 2. You did not mention this but IIRC, the lower fs has (pseudo)
> >     files that you want to be able to write to, and those files never
> >     exist in the upper fs
> >
> > That translates to allowing write to lower fs without triggering copy-up
> >
>
> Writing to lower is not needed right now.
>
> > 3. The upper fs files are all "pure" - that means those file paths
> >     do not exist in lower fs
> >
> > Creating files in the upper fs is normal in overlayfs, so not a problem.
> >
>
> Yes. If used in an expected way, the lower fs won't have file paths
> that also exist in the upper.
>
> > 4. Merging upper and lower directory content is usually not needed??
> >
> > It must be needed for the root dir at least but where else?
> > In which directories are the upper files created?
> > Which directories are "pure" (containing either lower or upper files)
> > and which directories are "merge" of upper and lower children?
> >
>
> In my use case, if that's doable, all the directories are "pure",
> except the root dir, and they are in the lower. The files are either
> from upper or from lower, so no merge. This should be sufficient for
> the BPF use case.
>
> > If the answer is that some directories exist both in upper and in lower
> > and that those directories should be merged for the unified view then
> > this is standard overlayfs behavior.
> >
> > Once I get the requirement in order I will try to write coherent
> > semantics for this mode and see if it can meet the needs of other
> > projects, such as HSM with FUSE as lower fs.
>
> Thanks Amir. I want to clarify that with my very very limited
> knowledge on FS, I can't say what I am thinking right now is
> absolutely correct. Please take my request for features with a grain
> of salt and feel free to pick the semantics that you see make most
> sense.
>

OK. IIUC, you have upper fs files only in the root dir?
And the lower root dir has only subdirs?

If that is all then it sounds pretty simple.
It could be described something like this:
1. merged directories cannot appear/disappear
2. lower pure directories can appear/disappear
3. upper files/dirs can be created inside merge dirs and pure upper dirs

I think I have some patches that could help with #2.

Can you give a small example of an upper, a lower and their
union trees just for the sake of discussion?

Thanks,
Amir.
Hao Luo Sept. 13, 2022, 8:33 p.m. UTC | #20
On Tue, Sep 13, 2022 at 11:54 AM Amir Goldstein <amir73il@gmail.com> wrote:
> OK. IIUC, you have upper fs files only in the root dir?

Sorry, no, the upper fs files need to be in subdir.

> And the lower root dir has only subdirs?

There could be files.

> Can you give a small example of an upper a lower and their
> union trees just for the sake of discussion?
>

For example, assume lower has the following layout:
$ tree lower
.
└── A
    ├── B
    │   └── lower2
    └── lower1

I can't create files in the fs in the lower.
$ touch A/B/file
touch: cannot touch 'A/B/file': Permission denied

The upper is initially empty.

I would like to overlay a writable fs on top of lower, so the union
tree looks like
$ tree union
.
└── A
    ├── B
    │   └── lower2
    └── lower1
$ touch A/B/file
$ tree union
.
└── A
    ├── B
    │   ├── file
    │   └── lower2
    └── lower1

Here, 'file' exists in the upper.

Further, directory B could disappear from lower. When that happens, I
think there are two possible behaviors:
 - make 'file' disappear from union as well;
 - keep 'file' and its directory accessible.

In behavior 1, it will look like
$ tree union
.
└── A
    └── lower1

In behavior 2, it will look like
$ tree union
.
└── A
    ├── B
    │   └── file
    └── lower1

IMHO, behavior 1 works better in my use case. But if the FS experts
think behavior 2 makes more sense, I can work around.

>
> If that is all then it sounds pretty simple.
> It could be described something like this:
> 1. merged directories cannot appear/disappear
> 2. lower pure directories can appear/disappear
> 3. upper files/dirs can be created inside merge dirs and pure upper dirs
>
> I think I have some patches that could help with #2.
>

These three semantics look good to me.

> Thanks,
> Amir.
Amir Goldstein Sept. 14, 2022, 3:46 a.m. UTC | #21
On Tue, Sep 13, 2022 at 11:33 PM Hao Luo <haoluo@google.com> wrote:
>
> On Tue, Sep 13, 2022 at 11:54 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > OK. IIUC, you have upper fs files only in the root dir?
>
> Sorry, no, the upper fs files need to be in subdir.
>
> > And the lower root dir has only subdirs?
>
> There could be files.
>

And assuming that those files are cgroupfs files, why
did you say there is no need to write to those files?

I seem to recall that was an important distinction from
standard overlayfs when you described the problem in LSFMM.

> > Can you give a small example of an upper a lower and their
> > union trees just for the sake of discussion?
> >
>
> For example, assume lower has the following layout:
> $ tree lower
> .
> └── A
>     ├── B
>     │   └── lower
>     └── lower
>
> I can't create files in the fs in the lower.
> $ touch A/B/file
> touch: cannot touch 'A/B/file': Permission denied
>
> The upper is initially empty.
>
> I would like to overlay a writable fs on top of lower, so the union
> tree looks like
> $ tree union
> .
> └── A
>     ├── B
>     │   └── lower
>     └── lower
> $ touch A/B/file
> $ tree union
> .
> └── A
>     ├── B
>     │   ├── file
>     │   └── lower2
>     └── lower1
>
> Here, 'file' exists in the upper.
>

So B is now called a "merged" dir - it is not a "pure" dir
anymore because it contains both upper and lower files.

Normally in overlayfs before creating 'file' in upper,
the hierarchy A/B/ needs to be created in upper fs
to contain the file.

Unless your upper fs automagically has the same
dirs hierarchy as the lower fs?

You should know that overlayfs does more than just
mkdir("A");mkdir("A/B")
it creates tmp dirs, sets xattrs and attrs on them and moves
them into place.
I am not sure if you planned to support all those operations
in your upper fs?

There are probably some other limitations at the moment
related to pseudo filesystems that prevent them from being
used as upper and/or lower fs in overlayfs.

We will need to check what those limitations are and whether
those limitations could be lifted for your specific use case.

> Further, directory B could disappear from lower. When that happens, I
> think there are two possible behaviors:
>  - make 'file' disappear from union as well;
>  - make 'file' and its directory accessible as well.
>
> In behavior 1, it will look like
> $ tree union
> .
> └── A
>     └── lower1
>
> In behavior 2, it will look like
> $ tree union
> .
> └── A
>     ├── B
>     │   └── file
>     └── lower1
>
> IMHO, behavior 1 works better in my use case. But if the FS experts
> think behavior 2 makes more sense, I can work around.
>

Something that I always wanted to try is to get rid of the duplicated
upper fs hierarchy.

It's a bit complicated to explain the details, but if your use case
does not involve any directory renames(?), then the upper path
for the merge directories can be index based and not hierarchical.

IOW:
- union tree lookup/traversal by path is performed on the lower fs
- upper fs is traversed by name only for pure upper dirs
- merged dirs are found by index of lower dir inode

The result will be behavior 1 that you wanted

I have some other use cases that might benefit from this mode.

> >
> > If that is all then it sounds pretty simple.
> > It could be described something like this:
> > 1. merged directories cannot appear/disappear
> > 2. lower pure directories can appear/disappear
> > 3. upper files/dirs can be created inside merge dirs and pure upper dirs
> >
> > I think I have some patches that could help with #2.
> >
>
> These three semantics looks good to me.
>

Except the case about disappearing B that you just described
above breaks rule #1 ;)
so other semantics are needed.

I will need some more clarifications about your use case to
understand if what I have in mind could help your use case.

Thanks,
Amir.
Hao Luo Sept. 14, 2022, 6 p.m. UTC | #22
On Tue, Sep 13, 2022 at 8:46 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Tue, Sep 13, 2022 at 11:33 PM Hao Luo <haoluo@google.com> wrote:
> >
> > On Tue, Sep 13, 2022 at 11:54 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > OK. IIUC, you have upper fs files only in the root dir?
> >
> > Sorry, no, the upper fs files need to be in subdir.
> >
> > > And the lower root dir has only subdirs?
> >
> > There could be files.
> >
>
> And assuming that those files are cgroupfs files, why
> did you say there is no need to write to those files?
>
> I seem to recall that was an important distinction from
> standard overlayfs when you described the problem in LSFMM.

In my last reply, I was assuming all the writes to the cgroupfs files
happen from remote, not from the union. One can read files from union,
or create files that exist in the upper.

The idea is, I can provide two copies of lower to users. One is the
original lower, writable, so any update happens there. And the other
is a union of the lower and the upper, it's a read-only view of the
lower, but extended by the upper.

I actually don't know whether supporting writes to the lower from the
union is better. It probably is, because then I can combine the two
copies into one.

>
>
> > > Can you give a small example of an upper a lower and their
> > > union trees just for the sake of discussion?
> > >
> >
> > For example, assume lower has the following layout:
> > $ tree lower
> > .
> > └── A
> >     ├── B
> >     │   └── lower
> >     └── lower
> >
> > I can't create files in the fs in the lower.
> > $ touch A/B/file
> > touch: cannot touch 'A/B/file': Permission denied
> >
> > The upper is initially empty.
> >
> > I would like to overlay a writable fs on top of lower, so the union
> > tree looks like
> > $ tree union
> > .
> > └── A
> >     ├── B
> >     │   └── lower
> >     └── lower
> > $ touch A/B/file
> > $ tree union
> > .
> > └── A
> >     ├── B
> >     │   ├── file
> >     │   └── lower2
> >     └── lower1
> >
> > Here, 'file' exists in the upper.
> >
>
> So B is now called a "merged" dir - it is not a "pure" dir
> anymore because it contains both upper and lower files.
>
> Normally in overlayfs before creating 'file' in upper,
> the hierarchy A/B/ needs to be created in upper fs
> to contain the file.
>
> Unless your upper fs automagically has the same
> dirs hierarchy as the lower fs?
>
> You should know that overlayfs does more than just
> mkdir("A");mkdir("A/B")
> it creates tmp dirs, sets xattrs and attrs on them and moves
> them into place.
> I am not sure if you planned to support all those operations
> in your upper fs?
>

Yeah. I can add support for tmp dirs, tmp files in my upper fs, that
is, bpffs. I played it a bit back in May, that is totally doable. I
remembered I successfully made bpffs accepted as overlayfs's upper
without xattrs and attrs back then. Maybe I missed something.

> There are probably some other limitations at the moment
> related to pseudo filesystems that prevent them from being
> used as upper and/or lower fs in overlayfs.
>
> We will need to check what those limitations are and whether
> those limitations could be lifted for your specific use case.
>

How can we approach this? Maybe I can send my patch that adds tmp dir,
tmp files and xattr, attr to upstream as RFC, so you can take a look?

> > Further, directory B could disappear from lower. When that happens, I
> > think there are two possible behaviors:
> >  - make 'file' disappear from union as well;
> >  - make 'file' and its directory accessible as well.
> >
> > In behavior 1, it will look like
> > $ tree union
> > .
> > └── A
> >     └── lower1
> >
> > In behavior 2, it will look like
> > $ tree union
> > .
> > └── A
> >     ├── B
> >     │   └── file
> >     └── lower1
> >
> > IMHO, behavior 1 works better in my use case. But if the FS experts
> > think behavior 2 makes more sense, I can work around.
> >
>
> Something that I always wanted to try is to get rid of the duplicated
> upper fs hierarchy.
>
> It's a bit complicated to explain the details, but if your use case
> does not involve any directory renames(?), then the upper path
> for the merge directories can be index based and not hierarchical.
>

Yeah, I don't expect directory renaming. But I can't say if there is
anyone trying to do that by accident, or by bad intention.

> IOW:
> - union tree lookup/traversal by path is performed on the lower fs
> - upper fs is traversed by name only for pure upper dirs
> - merged dirs are found by index of lower dir inode
>
> The result will be behavior 1 that you wanted
>
> I have some other use cases that might benefit from this mode.
>

That will be very cool!

> > >
> > > If that is all then it sounds pretty simple.
> > > It could be described something like this:
> > > 1. merged directories cannot appear/disappear
> > > 2. lower pure directories can appear/disappear
> > > 3. upper files/dirs can be created inside merge dirs and pure upper dirs
> > >
> > > I think I have some patches that could help with #2.
> > >
> >
> > These three semantics looks good to me.
> >
>
> Except the case about disappearing B that you just described
> above breaks rule #1 ;)
> so other semantics are needed.
>

Yes :) Now I understand. Thanks for the explanation on "merged" dir.

> I will need some more clarifications about your use case to
> understand if what I have in mind could help your use case.
>
> Thanks,
> Amir.
Amir Goldstein Sept. 14, 2022, 7:23 p.m. UTC | #23
On Wed, Sep 14, 2022 at 9:00 PM Hao Luo <haoluo@google.com> wrote:
>
> On Tue, Sep 13, 2022 at 8:46 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Tue, Sep 13, 2022 at 11:33 PM Hao Luo <haoluo@google.com> wrote:
> > >
> > > On Tue, Sep 13, 2022 at 11:54 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > > OK. IIUC, you have upper fs files only in the root dir?
> > >
> > > Sorry, no, the upper fs files need to be in subdir.
> > >
> > > > And the lower root dir has only subdirs?
> > >
> > > There could be files.
> > >
> >
> > And assuming that those files are cgroupfs files, why
> > did you say there is no need to write to those files?
> >
> > I seem to recall that was an important distinction from
> > standard overlayfs when you described the problem in LSFMM.
>
> In my last reply, I was assuming all the writes to the cgroupfs files
> happen from remote, not from the union. One can read files from union,
> or create files that exist in the upper.
>
> The idea is, I can provide two copies of lower to users. One is the
> original lower, writable, so any update happens there. And the other
> is a union of the lower and the upper, it's a read-only view of the
> lower, but extended by the upper.
>
> I actually don't know whether supporting writes to the lower from the
> union is better. It probably is, because then I can combine the two
> copies into one.
>

Understood.

> >
> >
> > > > Can you give a small example of an upper a lower and their
> > > > union trees just for the sake of discussion?
> > > >
> > >
> > > For example, assume lower has the following layout:
> > > $ tree lower
> > > .
> > > └── A
> > >     ├── B
> > >     │   └── lower
> > >     └── lower
> > >
> > > I can't create files in the fs in the lower.
> > > $ touch A/B/file
> > > touch: cannot touch 'A/B/file': Permission denied
> > >
> > > The upper is initially empty.
> > >
> > > I would like to overlay a writable fs on top of lower, so the union
> > > tree looks like
> > > $ tree union
> > > .
> > > └── A
> > >     ├── B
> > >     │   └── lower
> > >     └── lower
> > > $ touch A/B/file
> > > $ tree union
> > > .
> > > └── A
> > >     ├── B
> > >     │   ├── file
> > >     │   └── lower2
> > >     └── lower1
> > >
> > > Here, 'file' exists in the upper.
> > >
> >
> > So B is now called a "merged" dir - it is not a "pure" dir
> > anymore because it contains both upper and lower files.
> >
> > Normally in overlayfs before creating 'file' in upper,
> > the hierarchy A/B/ needs to be created in upper fs
> > to contain the file.
> >
> > Unless your upper fs automagically has the same
> > dirs hierarchy as the lower fs?
> >
> > You should know that overlayfs does more than just
> > mkdir("A");mkdir("A/B")
> > it creates tmp dirs, sets xattrs and attrs on them and moves
> > them into place.
> > I am not sure if you planned to support all those operations
> > in your upper fs?
> >
>
> Yeah. I can add support for tmp dirs, tmp files in my upper fs, that
> is, bpffs. I played it a bit back in May, that is totally doable. I
> remembered I successfully made bpffs accepted as overlayfs's upper
> without xattrs and attrs back then. Maybe I missed something.
>

sounds reasonable.
xattr is not a hard requirement but some things (that you do not need)
cannot work without xattr like merged directory rename and rmdir.

> > There are probably some other limitations at the moment
> > related to pseudo filesystems that prevent them from being
> > used as upper and/or lower fs in overlayfs.
> >
> > We will need to check what those limitations are and whether
> > those limitations could be lifted for your specific use case.
> >
>
> How can we approach this? Maybe I can send my patch that adds tmp dir,
> tmp files and xattr, attr to upstream as RFC, so you can take a look?
>

I don't think I need your fs to test.
The only thing special in this setup as far as I can tell is the dynamic
cgroupfs (or cgroup2?) lower dirs.

IIUC, everything worked for you except for oddities related to
lower directories not appearing and not disappearing from the union.
Is that correct? Is that the only thing that you need a fix for?

> > > Further, directory B could disappear from lower. When that happens, I
> > > think there are two possible behaviors:
> > >  - make 'file' disappear from union as well;
> > >  - make 'file' and its directory accessible as well.
> > >
> > > In behavior 1, it will look like
> > > $ tree union
> > > .
> > > └── A
> > >     └── lower1
> > >
> > > In behavior 2, it will look like
> > > $ tree union
> > > .
> > > └── A
> > >     ├── B
> > >     │   └── file
> > >     └── lower1
> > >
> > > IMHO, behavior 1 works better in my use case. But if the FS experts
> > > think behavior 2 makes more sense, I can work around.
> > >
> >
> > Something that I always wanted to try is to get rid of the duplicated
> > upper fs hierarchy.
> >
> > It's a bit complicated to explain the details, but if your use case
> > does not involve any directory renames(?), then the upper path
> > for the merge directories can be index based and not hierarchical.
> >
>
> Yeah, I don't expect directory renaming. But I can't say if there is
> anyone trying to do that by accident, or by bad intention.
>

Your fs will return an error for rename if you did not implement it.

Anyway, if you can accept behavior 2, it is much more simple.
This other idea is very vague and not simple, so better not risk it.

If you confirm that you only need to get an up-to-date view of
lower dirs in union, then I will look for the patches that I have
and see if they can help you.

Thanks,
Amir.
Hao Luo Sept. 14, 2022, 7:33 p.m. UTC | #24
On Wed, Sep 14, 2022 at 12:23 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Wed, Sep 14, 2022 at 9:00 PM Hao Luo <haoluo@google.com> wrote:
> >
> > On Tue, Sep 13, 2022 at 8:46 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Tue, Sep 13, 2022 at 11:33 PM Hao Luo <haoluo@google.com> wrote:
> > > >
> > > > On Tue, Sep 13, 2022 at 11:54 AM Amir Goldstein <amir73il@gmail.com> wrote:
[...]
> > > There are probably some other limitations at the moment
> > > related to pseudo filesystems that prevent them from being
> > > used as upper and/or lower fs in overlayfs.
> > >
> > > We will need to check what those limitations are and whether
> > > those limitations could be lifted for your specific use case.
> > >
> >
> > How can we approach this? Maybe I can send my patch that adds tmp dir,
> > tmp files and xattr, attr to upstream as RFC, so you can take a look?
> >
>
> I don't think I need your fs to test.
> The only thing special in this setup as far as I can tell is the dynamic
> cgroupfs (or cgroup2?) lower dirs.
>
> IIUC, everything worked for you except for oddities related to
> lower directories not appearing and not disappearing from the union.
> Is that correct? is that the only thing that you need a fix for?
>

Yes, that's correct.

> > > > Further, directory B could disappear from lower. When that happens, I
> > > > think there are two possible behaviors:
> > > >  - make 'file' disappear from union as well;
> > > >  - make 'file' and its directory accessible as well.
> > > >
> > > > In behavior 1, it will look like
> > > > $ tree union
> > > > .
> > > > └── A
> > > >     └── lower1
> > > >
> > > > In behavior 2, it will look like
> > > > $ tree union
> > > > .
> > > > └── A
> > > >     ├── B
> > > >     │   └── file
> > > >     └── lower1
> > > >
> > > > IMHO, behavior 1 works better in my use case. But if the FS experts
> > > > think behavior 2 makes more sense, I can work around.
> > > >
> > >
> > > Something that I always wanted to try is to get rid of the duplicated
> > > upper fs hierarchy.
> > >
> > > It's a bit complicated to explain the details, but if your use case
> > > does not involve any directory renames(?), then the upper path
> > > for the merge directories can be index based and not hierarchical.
> > >
> >
> > Yeah, I don't expect directory renaming. But I can't say if there is
> > anyone trying to do that by accident, or by bad intention.
> >
>
> Your fs will return an error for rename if you did not implement it.
>
> Anyway, if you can accept behavior 2, it is much more simple.
> This other idea is very vague and not simple, so better not risk it.
>
> If you confirm that you only need to get uptodate view of
> lower dirs in union, then I will look for the patches that I have
> and see if they can help you.
>

Yes, I acknowledge that behavior 2 works for me.
Amir Goldstein Sept. 15, 2022, 10:54 a.m. UTC | #25
On Wed, Sep 14, 2022 at 10:33 PM Hao Luo <haoluo@google.com> wrote:
>
> On Wed, Sep 14, 2022 at 12:23 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Wed, Sep 14, 2022 at 9:00 PM Hao Luo <haoluo@google.com> wrote:
> > >
> > > On Tue, Sep 13, 2022 at 8:46 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > On Tue, Sep 13, 2022 at 11:33 PM Hao Luo <haoluo@google.com> wrote:
> > > > >
> > > > > On Tue, Sep 13, 2022 at 11:54 AM Amir Goldstein <amir73il@gmail.com> wrote:
> [...]
> > > > There are probably some other limitations at the moment
> > > > related to pseudo filesystems that prevent them from being
> > > > used as upper and/or lower fs in overlayfs.
> > > >
> > > > We will need to check what those limitations are and whether
> > > > those limitations could be lifted for your specific use case.
> > > >
> > >
> > > How can we approach this? Maybe I can send my patch that adds tmp dir,
> > > tmp files and xattr, attr to upstream as RFC, so you can take a look?
> > >
> >
> > I don't think I need your fs to test.
> > The only thing special in this setup as far as I can tell is the dynamic
> > cgroupfs (or cgroup2?) lower dirs.
> >
> > IIUC, everything worked for you except for oddities related to
> > lower directories not appearing and not disappearing from the union.
> > Is that correct? is that the only thing that you need a fix for?
> >
>
> Yes, that's correct.
>
> > > > > Further, directory B could disappear from lower. When that happens, I
> > > > > think there are two possible behaviors:
> > > > >  - make 'file' disappear from union as well;
> > > > >  - make 'file' and its directory accessible as well.
> > > > >
> > > > > In behavior 1, it will look like
> > > > > $ tree union
> > > > > .
> > > > > └── A
> > > > >     └── lower1
> > > > >
> > > > > In behavior 2, it will look like
> > > > > $ tree union
> > > > > .
> > > > > └── A
> > > > >     ├── B
> > > > >     │   └── file
> > > > >     └── lower1
> > > > >
> > > > > IMHO, behavior 1 works better in my use case. But if the FS experts
> > > > > think behavior 2 makes more sense, I can work around.
> > > > >
> > > >
> > > > Something that I always wanted to try is to get rid of the duplicated
> > > > upper fs hierarchy.
> > > >
> > > > It's a bit complicated to explain the details, but if your use case
> > > > does not involve any directory renames(?), then the upper path
> > > > for the merge directories can be index based and not hierarchical.
> > > >
> > >
> > > Yeah, I don't expect directory renaming. But I can't say if there is
> > > anyone trying to do that by accident, or by bad intention.
> > >
> >
> > Your fs will return an error for rename if you did not implement it.
> >
> > Anyway, if you can accept behavior 2, it is much more simple.
> > This other idea is very vague and not simple, so better not risk it.
> >
> > If you confirm that you only need to get uptodate view of
> > lower dirs in union, then I will look for the patches that I have
> > and see if they can help you.
> >
>
> Yes, I acknowledge that behavior 2 works for me.

OK. I took a closer look and there are some challenges.
Nothing that cannot be fixed if you are willing to do the work.
I will try to explain the challenges and possible solutions.

Current overlayfs code assumes in many places that the
lower fs is not being changed at all while overlayfs is mounted.
As overlayfs.rst says:

"Changes to the underlying filesystems while part of a mounted overlay
filesystem are not allowed.  If the underlying filesystem is changed,
the behavior of the overlay is undefined, though it will not result in
a crash or deadlock."

One of the most visible impacts of changes to the lower layer
is that the merge dir cache is not invalidated, which is the
immediate reason that you are seeing the ghost lower dir A/B
in the union even if you did not create file A/B/file.

You can check if this hack fixes your first order problem:

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 78f62cc1797b..4eb6fcf341de 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -326,7 +326,7 @@ static void ovl_dir_reset(struct file *file)
        struct dentry *dentry = file->f_path.dentry;
        bool is_real;

-       if (cache && ovl_dentry_version_get(dentry) != cache->version) {
+       if (cache /*&& ovl_dentry_version_get(dentry) != cache->version*/) {
                ovl_cache_put(od, dentry);
                od->cache = NULL;
                od->cursor = NULL;
---

If it does, it may be acceptable to add that behavior as a mount option.

But it doesn't end here, there is also lookup cache and possibly other
issues as well related to merge dirs with ghosted lower.

If you did create file A/B/file, then the result of trying to list A/B
after A/B has gone from lower fs may depend on the lower fs behavior.
Some of the issues are not related to overlayfs but to cgroupfs.

For "standard" Linux fs, if you keep an open fd to a directory,
that directory can be removed and then if you try to readdir from
the open fd, or use the fd in one of the XXXat() syscalls,
you will get ENOENT, because of the IS_DEADDIR(dir) checks
in the vfs.

Do you get this behavior with an open fd on a cgroupfs dir
that has disappeared? Please check.

I think that ovl_iterate() can be made more tolerant to
ENOENT when iterating a merge dir with ghosted lower dir.
If you run into this error when trying to list A/B, find out
the place in the code that returns the error and I'll see
if that error may be relaxed.

The patches that I have are doing something different.
The idea is that overlayfs can watch for lower fs changes using
fsnotify() callbacks and do "something" proactive when they happen.

My Overlayfs watch [1] patches do "something" else - they
record the changes to lower fs when they happen, but they
demonstrate the basic concept of watching changes in lower fs.

[1] https://github.com/amir73il/overlayfs/wiki/Overlayfs-watch

The "something" that overlayfs could do when a lower dir
is removed is to invalidate the caches of the union dir and
everything under it.

There is one other small problem with this method w.r.t
lower cgroupfs - cgroupfs does not call any fsnotify callbacks when
directories disappear...

cgroupfs is an instance of kernfs.
kernfs is calling the fsnotify_modify() hook when kernel changes
the content of a file:

d911d9874801 kernfs: make kernfs_notify() trigger inotify events too

but it does not call fsnotify_rmdir/mkdir/delete/create() like other pseudo
fs do (debugfs, configfs, tracefs, ...) when directories appear/disappear -
at least I don't think that it does.

Please run inotifywatch on cgroupfs and find out for yourself.

Hope that some of the info here can help you move forward.
Most of it you can probably ignore.

Thanks,
Amir.
Amir Goldstein May 12, 2023, 7:37 p.m. UTC | #26
On Mon, Sep 12, 2022 at 8:29 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Sep 12, 2022 at 12:29 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > On Sat, 10 Sept 2022 at 10:52, Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > > I think we should accept the fact that just as any current FUSE
> > > passthrough (in userspace) implementation is limited to max number of
> > > open files as the server's process limitation, kernel passthrough implementation
> > > will be limited by inheriting the mounter's process limitation.
> > >
> > > There is no reason that the server should need to keep more
> > > passthrough fd's open than client open fds.
> >
> > Maybe you're right.
> >
> > > If we only support FOPEN_PASSTHROUGH_AUTOCLOSE as v12
> > > patches implicitly do, then the memory overhead is not much different
> > > than the extra overlayfs pseudo realfiles.
> >
> > How exactly would this work?
> >
> > ioctl(F_D_I_P_OPEN) - create passthrough fd with ref 1
> > open/FOPEN_PASSTHROUGH -  inc refcount in passthrough fd
> > release - put refcount in passthrough fd
> > ioctl(F_D_I_P_CLOSE) - put ref in passthrough fd
> >
> > Due to being refcounted the F_D_I_P_CLOSE can come at any point past
> > the finished open request.
> >
> > Or did you have something else in mind?
> >
>
> What I had in mind is that FOPEN_PASSTHROUGH_AUTOCLOSE
> "transfers" the server's refcount to the kernel and server does
> not need to call explicit F_D_I_P_CLOSE.
>
> This is useful for servers that don't care about reusing mappings.
>

Hi Daniel,

I was waiting for LSFMM to see if and how FUSE-BPF intends to
address the highest value use case of read/write passthrough.

From what I've seen, you are still taking a very broad approach of
all-or-nothing which still has a lot of core design issues to address,
while these old patches already address the most important use case
of read/write passthrough of fd without any of the core issues
(credentials, hidden fds).

As far as I can tell, this old implementation is mostly independent of your
lookup based approach - they share the low level read/write passthrough
functions but not much more than that, so merging them should not be
a blocker to your efforts in the longer run.
Please correct me if I am wrong.

As things stand, I intend to re-post these old patches with mandatory
FOPEN_PASSTHROUGH_AUTOCLOSE to eliminate the open
questions about managing mappings.

Miklos, please stop me if I missed something and if you do not
think that these two approaches are independent.

Thanks,
Amir.
Miklos Szeredi May 15, 2023, 7:29 a.m. UTC | #27
On Fri, 12 May 2023 at 21:37, Amir Goldstein <amir73il@gmail.com> wrote:

> I was waiting for LSFMM to see if and how FUSE-BPF intends to
> address the highest value use case of read/write passthrough.
>
> From what I've seen, you are still taking a very broad approach of
> all-or-nothing which still has a lot of core design issues to address,
> while these old patches already address the most important use case
> of read/write passthrough of fd without any of the core issues
> (credentials, hidden fds).
>
> As far as I can tell, this old implementation is mostly independent of your
> lookup based approach - they share the low level read/write passthrough
> functions but not much more than that, so merging them should not be
> a blocker to your efforts in the longer run.
> Please correct me if I am wrong.
>
> As things stand, I intend to re-post these old patches with mandatory
> FOPEN_PASSTHROUGH_AUTOCLOSE to eliminate the open
> questions about managing mappings.
>
> Miklos, please stop me if I missed something and if you do not
> think that these two approaches are independent.

Do you mean that the BPF patches should use their own passthrough mechanism?

I think it would be better if we could agree on a common interface for
passthrough (or per Paul's suggestion: backing) mechanism.

Let's see this patchset and then we can discuss how this could be
usable for the BPF case as well.

Thanks,
Miklos
Amir Goldstein May 15, 2023, 2 p.m. UTC | #28
On Mon, May 15, 2023 at 10:29 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Fri, 12 May 2023 at 21:37, Amir Goldstein <amir73il@gmail.com> wrote:
>
> > I was waiting for LSFMM to see if and how FUSE-BPF intends to
> > address the highest value use case of read/write passthrough.
> >
> > From what I've seen, you are still taking a very broad approach of
> > all-or-nothing which still has a lot of core design issues to address,
> > while these old patches already address the most important use case
> > of read/write passthrough of fd without any of the core issues
> > (credentials, hidden fds).
> >
> > As far as I can tell, this old implementation is mostly independent of your
> > lookup based approach - they share the low level read/write passthrough
> > functions but not much more than that, so merging them should not be
> > a blocker to your efforts in the longer run.
> > Please correct me if I am wrong.
> >
> > As things stand, I intend to re-post these old patches with mandatory
> > FOPEN_PASSTHROUGH_AUTOCLOSE to eliminate the open
> > questions about managing mappings.
> >
> > Miklos, please stop me if I missed something and if you do not
> > think that these two approaches are independent.
>
> Do you mean that the BPF patches should use their own passthrough mechanism?
>
> I think it would be better if we could agree on a common interface for
> passthrough (or per Paul's suggestion: backing) mechanism.

Well, not exactly different.
With BPF patches, if you have a backing inode that was established during
LOOKUP with rules to do passthrough for open(), you'd get a backing file and
that backing file would be used to passthrough read/write.

FOPEN_PASSTHROUGH is another way to configure passthrough read/write
to a backing file that is controlled by the server per open fd instead of by BPF
for every open of the backing inode.

Obviously, both methods would use the same backing_file field and
same read/write passthrough methods regardless of how the backing file
was set up.

Obviously, the BPF patches will not use the same ioctl to set up passthrough
(and/or BPF program) to a backing inode, but I don't think that matters much.
When we settle on ioctls for setting up backing inodes, we can also add new
ioctls for setting up backing file with optional BPF program.
I don't see any reason to make the first ioctl more complicated than this:

struct fuse_passthrough_out {
        uint32_t        fd;
        /* For future implementation */
        uint32_t        len;
        void            *vec;
};

One advantage with starting with FOPEN_PASSTHROUGH, besides
dealing with the highest priority performance issue, is how it deals with
resource limits on open files.

While the backing files are not accounted to the server, the server
is very likely to keep an open fd for the backing file until release,
otherwise, the server will not be able to perform other non-passthrough
file operations (e.g. fallocate) on the backing fd, so at least with
FOPEN_PASSTHROUGH_AUTOCLOSE, there should be only
up to 2 times the number of open files, very much the same as overlayfs.

I intend to enforce this heuristically by counting the number of passthrough
fds and restricting new passthrough fd setup to the number of current open fds
by the server, so a malicious or misbehaving server cannot set up an infinite
number of backing fds that it does not also keep open itself.

>
> Let's see this patchset and then we can discuss how this could be
> usable for the BPF case as well.
>

OK. I'll try to dust off these patches and re-submit.

Thanks,
Amir.
Nikolaus Rath May 15, 2023, 8:16 p.m. UTC | #29
On May 15 2023, Amir Goldstein <amir73il@gmail.com> wrote:
> On Mon, May 15, 2023 at 10:29 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
>> On Fri, 12 May 2023 at 21:37, Amir Goldstein <amir73il@gmail.com> wrote:
>>
>> > I was waiting for LSFMM to see if and how FUSE-BPF intends to
>> > address the highest value use case of read/write passthrough.
>> >
>> > From what I've seen, you are still taking a very broad approach of
>> > all-or-nothing which still has a lot of core design issues to address,
>> > while these old patches already address the most important use case
>> > of read/write passthrough of fd without any of the core issues
>> > (credentials, hidden fds).
>> >
>> > As far as I can tell, this old implementation is mostly independent of your
>> > lookup based approach - they share the low level read/write passthrough
>> > functions but not much more than that, so merging them should not be
>> > a blocker to your efforts in the longer run.
>> > Please correct me if I am wrong.
>> >
>> > As things stand, I intend to re-post these old patches with mandatory
>> > FOPEN_PASSTHROUGH_AUTOCLOSE to eliminate the open
>> > questions about managing mappings.
>> >
>> > Miklos, please stop me if I missed something and if you do not
>> > think that these two approaches are independent.
>>
>> Do you mean that the BPF patches should use their own passthrough mechanism?
>>
>> I think it would be better if we could agree on a common interface for
>> passthough (or per Paul's suggestion: backing) mechanism.
>
> Well, not exactly different.
> With BFP patches, if you have a backing inode that was established during
> LOOKUP with rules to do passthrough for open(), you'd get a backing file and
> that backing file would be used to passthrough read/write.
>
> FOPEN_PASSTHROUGH is another way to configure passthrough read/write
> to a backing file that is controlled by the server per open fd instead of by BFP
> for every open of the backing inode.
>
> Obviously, both methods would use the same backing_file field and
> same read/write passthrough methods regardless of how the backing file
> was setup.
>
> Obviously, the BFP patches will not use the same ioctl to setup passthrough
> (and/or BPF program) to a backing inode, but I don't think that matters much.
> When we settle on ioctls for setting up backing inodes, we can also add new
> ioctls for setting up backing file with optional BPF program.

> I don't see any reason to make the first ioctl more complicated than this:
>
> struct fuse_passthrough_out {
>         uint32_t        fd;
>         /* For future implementation */
>         uint32_t        len;
>         void            *vec;
> };
>
> One advantage with starting with FOPEN_PASSTHROUGH, besides
> dealing with the highest priority performance issue, is how it deals with
> resource limits on open files.

One thing that struck me when we discussed FUSE-BPF at LSF was that from
a userspace point of view, FUSE-BPF presents an almost completely
different API than traditional FUSE (at least in its current form).

As long as there is no support for falling back to standard FUSE
callbacks, using FUSE-BPF means that most of the existing API no longer
works, and instead there is a large new API surface that doesn't work in
standard FUSE (the pre-filter and post-filter callbacks for each
operation).

I think this means that FUSE-BPF file systems won't work with FUSE, and
FUSE filesystems won't work with FUSE-BPF.

Would it be worth thinking about FUSE-BPF as a completely separate
approach that stands next to FUSE, as opposed to considering it an
extension?

In that case, we wouldn't need to worry about a FUSE-passthrough
implementation being forward compatible with FUSE-BPF or not.



Best,
-Nikolaus
Bernd Schubert May 15, 2023, 9:11 p.m. UTC | #30
On 5/15/23 22:16, Nikolaus Rath wrote:
> On May 15 2023, Amir Goldstein <amir73il@gmail.com> wrote:
>> On Mon, May 15, 2023 at 10:29 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
>>> On Fri, 12 May 2023 at 21:37, Amir Goldstein <amir73il@gmail.com> wrote:
>>>
>>>> I was waiting for LSFMM to see if and how FUSE-BPF intends to
>>>> address the highest value use case of read/write passthrough.
>>>>
>>>>  From what I've seen, you are still taking a very broad approach of
>>>> all-or-nothing which still has a lot of core design issues to address,
>>>> while these old patches already address the most important use case
>>>> of read/write passthrough of fd without any of the core issues
>>>> (credentials, hidden fds).
>>>>
>>>> As far as I can tell, this old implementation is mostly independent of your
>>>> lookup based approach - they share the low level read/write passthrough
>>>> functions but not much more than that, so merging them should not be
>>>> a blocker to your efforts in the longer run.
>>>> Please correct me if I am wrong.
>>>>
>>>> As things stand, I intend to re-post these old patches with mandatory
>>>> FOPEN_PASSTHROUGH_AUTOCLOSE to eliminate the open
>>>> questions about managing mappings.
>>>>
>>>> Miklos, please stop me if I missed something and if you do not
>>>> think that these two approaches are independent.
>>>
>>> Do you mean that the BPF patches should use their own passthrough mechanism?
>>>
>>> I think it would be better if we could agree on a common interface for
>>> passthough (or per Paul's suggestion: backing) mechanism.
>>
>> Well, not exactly different.
>> With BFP patches, if you have a backing inode that was established during
>> LOOKUP with rules to do passthrough for open(), you'd get a backing file and
>> that backing file would be used to passthrough read/write.
>>
>> FOPEN_PASSTHROUGH is another way to configure passthrough read/write
>> to a backing file that is controlled by the server per open fd instead of by BFP
>> for every open of the backing inode.
>>
>> Obviously, both methods would use the same backing_file field and
>> same read/write passthrough methods regardless of how the backing file
>> was setup.
>>
>> Obviously, the BFP patches will not use the same ioctl to setup passthrough
>> (and/or BPF program) to a backing inode, but I don't think that matters much.
>> When we settle on ioctls for setting up backing inodes, we can also add new
>> ioctls for setting up backing file with optional BPF program.
> 
>> I don't see any reason to make the first ioctl more complicated than this:
>>
>> struct fuse_passthrough_out {
>>          uint32_t        fd;
>>          /* For future implementation */
>>          uint32_t        len;
>>          void            *vec;
>> };
>>
>> One advantage with starting with FOPEN_PASSTHROUGH, besides
>> dealing with the highest priority performance issue, is how it deals with
>> resource limits on open files.
> 
> One thing that struck me when we discussed FUSE-BPF at LSF was that from
> a userspace point of view, FUSE-BPF presents an almost completely
> different API than traditional FUSE (at least in its current form).
> 
> As long as there is no support for falling back to standard FUSE
> callbacks, using FUSE-BPF means that most of the existing API no longer
> works, and instead there is a large new API surface that doesn't work in
> standard FUSE (the pre-filter and post-filter callbacks for each
> operation).
> 
> I think this means that FUSE-BPF file systems won't work with FUSE, and
> FUSE filesystems won't work with FUSE-BPF.

Is that so? I think I found some incompatibilities in the patches (need to 
double check), but doesn't it just do normal fuse operations and then 
replies with an ioctl to do passthrough? BPF is used for additional 
filtering, that would have to be done otherwise in userspace.

Really difficult in the current patch set and data structures is to see 
what is actually BPF and what is passthrough.


Thanks,
Bernd
Paul Lawrence May 15, 2023, 9:45 p.m. UTC | #31
On Mon, May 15, 2023 at 2:11 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
> On 5/15/23 22:16, Nikolaus Rath wrote:
> > On May 15 2023, Amir Goldstein <amir73il@gmail.com> wrote:
> >> On Mon, May 15, 2023 at 10:29 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> >>> On Fri, 12 May 2023 at 21:37, Amir Goldstein <amir73il@gmail.com> wrote:
> >>>
> >>>> I was waiting for LSFMM to see if and how FUSE-BPF intends to
> >>>> address the highest value use case of read/write passthrough.
> >>>>
> >>>>  From what I've seen, you are still taking a very broad approach of
> >>>> all-or-nothing which still has a lot of core design issues to address,
> >>>> while these old patches already address the most important use case
> >>>> of read/write passthrough of fd without any of the core issues
> >>>> (credentials, hidden fds).
> >>>>
> >>>> As far as I can tell, this old implementation is mostly independent of your
> >>>> lookup based approach - they share the low level read/write passthrough
> >>>> functions but not much more than that, so merging them should not be
> >>>> a blocker to your efforts in the longer run.
> >>>> Please correct me if I am wrong.
> >>>>
> >>>> As things stand, I intend to re-post these old patches with mandatory
> >>>> FOPEN_PASSTHROUGH_AUTOCLOSE to eliminate the open
> >>>> questions about managing mappings.
> >>>>
> >>>> Miklos, please stop me if I missed something and if you do not
> >>>> think that these two approaches are independent.
> >>>
> >>> Do you mean that the BPF patches should use their own passthrough mechanism?
> >>>
> >>> I think it would be better if we could agree on a common interface for
> >>> passthough (or per Paul's suggestion: backing) mechanism.
> >>
> >> Well, not exactly different.
> >> With BFP patches, if you have a backing inode that was established during
> >> LOOKUP with rules to do passthrough for open(), you'd get a backing file and
> >> that backing file would be used to passthrough read/write.
> >>
> >> FOPEN_PASSTHROUGH is another way to configure passthrough read/write
> >> to a backing file that is controlled by the server per open fd instead of by BFP
> >> for every open of the backing inode.
> >>
> >> Obviously, both methods would use the same backing_file field and
> >> same read/write passthrough methods regardless of how the backing file
> >> was setup.
> >>
> >> Obviously, the BFP patches will not use the same ioctl to setup passthrough
> >> (and/or BPF program) to a backing inode, but I don't think that matters much.
> >> When we settle on ioctls for setting up backing inodes, we can also add new
> >> ioctls for setting up backing file with optional BPF program.
> >
> >> I don't see any reason to make the first ioctl more complicated than this:
> >>
> >> struct fuse_passthrough_out {
> >>          uint32_t        fd;
> >>          /* For future implementation */
> >>          uint32_t        len;
> >>          void            *vec;
> >> };
> >>
> >> One advantage with starting with FOPEN_PASSTHROUGH, besides
> >> dealing with the highest priority performance issue, is how it deals with
> >> resource limits on open files.
> >
> > One thing that struck me when we discussed FUSE-BPF at LSF was that from
> > a userspace point of view, FUSE-BPF presents an almost completely
> > different API than traditional FUSE (at least in its current form).
> >
> > As long as there is no support for falling back to standard FUSE
> > callbacks, using FUSE-BPF means that most of the existing API no longer
> > works, and instead there is a large new API surface that doesn't work in
> > standard FUSE (the pre-filter and post-filter callbacks for each
> > operation).
> >
> > I think this means that FUSE-BPF file systems won't work with FUSE, and
> > FUSE filesystems won't work with FUSE-BPF.
>
> Is that so? I think found some incompatibilities in the patches (need to
> double check), but doesn't it just do normal fuse operations and then
> replies with an ioctl to do passthrough? BPF is used for additional
> filtering, that would have to be done otherwise in userspace.
>
> Really difficult in the current patch set and data structures is to see
> what is actually BPF and what is passthrough.

I hope that fuse and fuse-bpf play together a little better than that
;) In the current design, you can set a backing file from within
traditional fuse lookups, which moves you to fuse-bpf for that
file/directory, and you can remove the backing file during the
post-filter, moving that node back to fuse. You can also return a
value from the bpf prefilter that tells fuse to use traditional fuse
for that command. I think this is a very useful feature - it's one of
the first ones we used in Android.

If we do find any areas where we can't easily switch between
traditional fuse and fuse-bpf, we would consider that a bug and fix it
as fast as possible.

And yes, we got the feedback from LSFMMBPF that the current patches
are hard to follow, and we will be reordering them and resending them
as three patchsets. One will add backing files, one will add backing
directories, and the final will add bpf filters to both. Hopefully
that will make them easier to understand.

Paul
Miklos Szeredi May 16, 2023, 8:43 a.m. UTC | #32
On Mon, 15 May 2023 at 23:45, Paul Lawrence <paullawrence@google.com> wrote:
>
> On Mon, May 15, 2023 at 2:11 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
> > On 5/15/23 22:16, Nikolaus Rath wrote:

> > > One thing that struck me when we discussed FUSE-BPF at LSF was that from
> > > a userspace point of view, FUSE-BPF presents an almost completely
> > > different API than traditional FUSE (at least in its current form).
> > >
> > > As long as there is no support for falling back to standard FUSE
> > > callbacks, using FUSE-BPF means that most of the existing API no longer
> > > works, and instead there is a large new API surface that doesn't work in
> > > standard FUSE (the pre-filter and post-filter callbacks for each
> > > operation).
> > >
> > > I think this means that FUSE-BPF file systems won't work with FUSE, and
> > > FUSE filesystems won't work with FUSE-BPF.
> >
> > Is that so? I think found some incompatibilities in the patches (need to
> > double check), but doesn't it just do normal fuse operations and then
> > replies with an ioctl to do passthrough? BPF is used for additional
> > filtering, that would have to be done otherwise in userspace.

I think Nikolaus' concern is that the BPF hooks add a major upgrade to
the API, i.e. it looks very difficult to port a BPF based fs to
non-BPF based fuse.  The new API should at least come with sufficient
warnings about portability issues.

I don't think the other direction has problems. The fuse API/ABI must
remain backward compatible and old filesystems must be able to work
after this feature is added.

Thanks,
Miklos
Amir Goldstein May 16, 2023, 8:48 a.m. UTC | #33
On Tue, May 16, 2023 at 12:45 AM Paul Lawrence <paullawrence@google.com> wrote:
>
> On Mon, May 15, 2023 at 2:11 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
> > On 5/15/23 22:16, Nikolaus Rath wrote:
> > > On May 15 2023, Amir Goldstein <amir73il@gmail.com> wrote:
> > >> On Mon, May 15, 2023 at 10:29 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> > >>> On Fri, 12 May 2023 at 21:37, Amir Goldstein <amir73il@gmail.com> wrote:
> > >>>
> > >>>> I was waiting for LSFMM to see if and how FUSE-BPF intends to
> > >>>> address the highest value use case of read/write passthrough.
> > >>>>
> > >>>>  From what I've seen, you are still taking a very broad approach of
> > >>>> all-or-nothing which still has a lot of core design issues to address,
> > >>>> while these old patches already address the most important use case
> > >>>> of read/write passthrough of fd without any of the core issues
> > >>>> (credentials, hidden fds).
> > >>>>
> > >>>> As far as I can tell, this old implementation is mostly independent of your
> > >>>> lookup based approach - they share the low level read/write passthrough
> > >>>> functions but not much more than that, so merging them should not be
> > >>>> a blocker to your efforts in the longer run.
> > >>>> Please correct me if I am wrong.
> > >>>>
> > >>>> As things stand, I intend to re-post these old patches with mandatory
> > >>>> FOPEN_PASSTHROUGH_AUTOCLOSE to eliminate the open
> > >>>> questions about managing mappings.
> > >>>>
> > >>>> Miklos, please stop me if I missed something and if you do not
> > >>>> think that these two approaches are independent.
> > >>>
> > >>> Do you mean that the BPF patches should use their own passthrough mechanism?
> > >>>
> > >>> I think it would be better if we could agree on a common interface for
> > >>> passthough (or per Paul's suggestion: backing) mechanism.
> > >>
> > >> Well, not exactly different.
> > >> With BFP patches, if you have a backing inode that was established during
> > >> LOOKUP with rules to do passthrough for open(), you'd get a backing file and
> > >> that backing file would be used to passthrough read/write.
> > >>
> > >> FOPEN_PASSTHROUGH is another way to configure passthrough read/write
> > >> to a backing file that is controlled by the server per open fd instead of by BFP
> > >> for every open of the backing inode.
> > >>
> > >> Obviously, both methods would use the same backing_file field and
> > >> same read/write passthrough methods regardless of how the backing file
> > >> was setup.
> > >>
> > >> Obviously, the BFP patches will not use the same ioctl to setup passthrough
> > >> (and/or BPF program) to a backing inode, but I don't think that matters much.
> > >> When we settle on ioctls for setting up backing inodes, we can also add new
> > >> ioctls for setting up backing file with optional BPF program.
> > >
> > >> I don't see any reason to make the first ioctl more complicated than this:
> > >>
> > >> struct fuse_passthrough_out {
> > >>          uint32_t        fd;
> > >>          /* For future implementation */
> > >>          uint32_t        len;
> > >>          void            *vec;
> > >> };
> > >>
> > >> One advantage with starting with FOPEN_PASSTHROUGH, besides
> > >> dealing with the highest priority performance issue, is how it deals with
> > >> resource limits on open files.
> > >
> > > One thing that struck me when we discussed FUSE-BPF at LSF was that from
> > > a userspace point of view, FUSE-BPF presents an almost completely
> > > different API than traditional FUSE (at least in its current form).
> > >
> > > As long as there is no support for falling back to standard FUSE
> > > callbacks, using FUSE-BPF means that most of the existing API no longer
> > > works, and instead there is a large new API surface that doesn't work in
> > > standard FUSE (the pre-filter and post-filter callbacks for each
> > > operation).

I think there is a confusion here that needs to be clarified.
I was confused when you asked in the session why the usermode
post-filter was needed.

IIUC, there is no usermode post filter. There are only in-kernel BPF
pre/post filters.

Paul/Daniel will correct me if I am wrong, but I think the FUSE server
can be called at most once per op as legacy FUSE, but with
FUSE-BPF, the server may be bypassed.

Pre/post filters are used to toggle the bypass mode permanently
or for a specific op and post filter can also be used to modify the
server response.

> > >
> > > I think this means that FUSE-BPF file systems won't work with FUSE, and
> > > FUSE filesystems won't work with FUSE-BPF.
> >
> > Is that so? I think found some incompatibilities in the patches (need to
> > double check), but doesn't it just do normal fuse operations and then
> > replies with an ioctl to do passthrough?

About that, I wanted to ask.
Alessio's initial patches used to have a similar approach.
Without ioctl, but the passthrough/backing fd was provided as part of the
response to OPEN request.

Following feedback from Miklos and Jens, not only the passthrough
request was moved to ioctl, but it was also decoupled from the OPEN
response.

This allows the server more flexibility in managing the passthrough
mode of files (or inodes in FUSE-BPF case).
FUSE-BPF patches use ioctl for response, but without decoupling.
I wonder if that should be amended for the next version?

> > BPF is used for additional
> > filtering, that would have to be done otherwise in userspace.
> >
> > Really difficult in the current patch set and data structures is to see
> > what is actually BPF and what is passthrough.
>
> I hope that fuse and fuse-bpf play together a little better than that
> ;) In the current design, you can set a backing file from within
> traditional fuse lookups, which moves you to fuse-bpf for that
> file/directory, and you can remove the backing file during the
> post-filter, moving that node back to fuse. You can also return a
> value from the bpf prefilter that tells fuse to use traditional fuse
> for that command. I think this is a very useful feature - it's one of
> the first ones we used in Android.
>
> If we do find any areas where we can't easily switch between
> traditional fuse and fuse-bpf, we would consider that a bug and fix it
> as fast as possible.
>
> And yes, we got the feedback from LSFMMBPF that the current patches
> are hard to follow, and we will be reordering them and resending them
> as three patchsets. One will add backing files, one will add backing
> directories, and the final will add bpf filters to both. Hopefully
> that will make them easier to understand.
>

That sounds great!
I started to dust off Alessio's patches.
I might just post what I have as a reference implementation that
we can compare to your "backing file" series.
I would much rather that your version is the one that ends up being
merged at the end ;-)

Thanks,
Amir.
Nikolaus Rath May 16, 2023, 10:16 a.m. UTC | #34
On May 16 2023, Miklos Szeredi via fuse-devel <fuse-devel@lists.sourceforge.net> wrote:
> On Mon, 15 May 2023 at 23:45, Paul Lawrence <paullawrence@google.com> wrote:
>>
>> On Mon, May 15, 2023 at 2:11 PM Bernd Schubert
>> <bernd.schubert@fastmail.fm> wrote:
>> > On 5/15/23 22:16, Nikolaus Rath wrote:
>
>> > > One thing that struck me when we discussed FUSE-BPF at LSF was that from
>> > > a userspace point of view, FUSE-BPF presents an almost completely
>> > > different API than traditional FUSE (at least in its current form).
>> > >
>> > > As long as there is no support for falling back to standard FUSE
>> > > callbacks, using FUSE-BPF means that most of the existing API no longer
>> > > works, and instead there is a large new API surface that doesn't work in
>> > > standard FUSE (the pre-filter and post-filter callbacks for each
>> > > operation).
>> > >
>> > > I think this means that FUSE-BPF file systems won't work with FUSE, and
>> > > FUSE filesystems won't work with FUSE-BPF.
>> >
>> > Is that so? I think found some incompatibilities in the patches (need to
>> > double check), but doesn't it just do normal fuse operations and then
>> > replies with an ioctl to do passthrough? BPF is used for additional
>> > filtering, that would have to be done otherwise in userspace.
>
> I think Nikolaus' concern is that the BPF hooks add a major upgrade to
> the API, i.e. it looks very difficult to port a BPF based fs to
> non-BPF based fuse.  The new API should at least come with sufficient
> warnings about portability issues.
>
> I don't think the other direction has problems. The fuse API/ABI must
> remain backward compatible and old filesystems must be able to work
> after this feature is added.

I wouldn't say I'm concerned, it's more of an observation.

To me it seemed like we are combining two very different
approaches/interfaces in the same kernel module / userspace
library. This doesn't result in a compatibility problem, but it seems to
me that we could cleanly split this into two different components (that
may share code) with almost no API overlap.

But it seems I may have misunderstood some aspects about how the
fallback works. Let's wait for the FUSE-BPF patches and then revisit the
question :-).



Best,
-Nikolaus
diff mbox series

Patch

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 8c7021fb2cd4..20ed23aa16fa 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -8,6 +8,7 @@  obj-$(CONFIG_CUSE) += cuse.o
 obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
 
 fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+fuse-y += passthrough.o
 fuse-$(CONFIG_FUSE_DAX) += dax.o
 
 virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ff9f3b83f879..5446f13db5a0 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2236,6 +2236,7 @@  static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 	int res;
 	int oldfd;
 	struct fuse_dev *fud = NULL;
+	struct fuse_passthrough_out pto;
 
 	if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC)
 		return -EINVAL;
@@ -2266,6 +2267,17 @@  static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 			}
 		}
 		break;
+	case _IOC_NR(FUSE_DEV_IOC_PASSTHROUGH_OPEN):
+		res = -EFAULT;
+		if (!copy_from_user(&pto,
+				    (struct fuse_passthrough_out __user *)arg,
+				    sizeof(pto))) {
+			res = -EINVAL;
+			fud = fuse_get_dev(file);
+			if (fud)
+				res = fuse_passthrough_open(fud, &pto);
+		}
+		break;
 	default:
 		res = -ENOTTY;
 		break;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 78f9f209078c..c9a1b33c5481 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -513,6 +513,7 @@  static int fuse_create_open(struct inode *dir, struct dentry *entry,
 {
 	int err;
 	struct inode *inode;
+	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_mount *fm = get_fuse_mount(dir);
 	FUSE_ARGS(args);
 	struct fuse_forget_link *forget;
@@ -574,6 +575,7 @@  static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	ff->fh = outopen.fh;
 	ff->nodeid = outentry.nodeid;
 	ff->open_flags = outopen.open_flags;
+	fuse_passthrough_setup(fc, ff, &outopen);
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr, entry_attr_timeout(&outentry), 0);
 	if (!inode) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8cccecb55fb8..953f3034c375 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -158,7 +158,7 @@  int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
 		if (!err) {
 			ff->fh = outarg.fh;
 			ff->open_flags = outarg.open_flags;
-
+			fuse_passthrough_setup(fc, ff, &outarg);
 		} else if (err != -ENOSYS) {
 			fuse_file_free(ff);
 			return err;
@@ -304,6 +304,8 @@  void fuse_release_common(struct file *file, bool isdir)
 	struct fuse_release_args *ra = ff->release_args;
 	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 
+	fuse_passthrough_release(&ff->passthrough);
+
 	fuse_prepare_release(fi, ff, file->f_flags, opcode);
 
 	if (ff->flock) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7c4b8cb93f9f..8d39f5304a11 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -180,6 +180,14 @@  struct fuse_conn;
 struct fuse_mount;
 struct fuse_release_args;
 
+/**
+ * Reference to lower filesystem file for read/write operations handled in
+ * passthrough mode
+ */
+struct fuse_passthrough {
+	struct file *filp;
+};
+
 /** FUSE specific file data */
 struct fuse_file {
 	/** Fuse connection for this file */
@@ -225,6 +233,9 @@  struct fuse_file {
 
 	} readdir;
 
+	/** Container for data related to the passthrough functionality */
+	struct fuse_passthrough passthrough;
+
 	/** RB node to be linked on fuse_conn->polled_files */
 	struct rb_node polled_node;
 
@@ -755,6 +766,9 @@  struct fuse_conn {
 	/* Auto-mount submounts announced by the server */
 	unsigned int auto_submounts:1;
 
+	/** Passthrough mode for read/write IO */
+	unsigned int passthrough:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -798,6 +812,12 @@  struct fuse_conn {
 
 	/** List of filesystems using this connection */
 	struct list_head mounts;
+
+	/** IDR for passthrough requests */
+	struct idr passthrough_req;
+
+	/** Protects passthrough_req */
+	spinlock_t passthrough_req_lock;
 };
 
 /*
@@ -1213,4 +1233,11 @@  void fuse_dax_inode_cleanup(struct inode *inode);
 bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
 void fuse_dax_cancel_work(struct fuse_conn *fc);
 
+/* passthrough.c */
+int fuse_passthrough_open(struct fuse_dev *fud,
+			  struct fuse_passthrough_out *pto);
+int fuse_passthrough_setup(struct fuse_conn *fc, struct fuse_file *ff,
+			   struct fuse_open_out *openarg);
+void fuse_passthrough_release(struct fuse_passthrough *passthrough);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b0e18b470e91..a1104d5abb70 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -691,6 +691,7 @@  void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
 	spin_lock_init(&fc->bg_lock);
+	spin_lock_init(&fc->passthrough_req_lock);
 	init_rwsem(&fc->killsb);
 	refcount_set(&fc->count, 1);
 	atomic_set(&fc->dev_count, 1);
@@ -699,6 +700,7 @@  void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
 	INIT_LIST_HEAD(&fc->devices);
+	idr_init(&fc->passthrough_req);
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
 	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -1052,6 +1054,12 @@  static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 				fc->handle_killpriv_v2 = 1;
 				fm->sb->s_flags |= SB_NOSEC;
 			}
+			if (arg->flags & FUSE_PASSTHROUGH) {
+				fc->passthrough = 1;
+				/* Prevent further stacking */
+				fm->sb->s_stack_depth =
+					FILESYSTEM_MAX_STACK_DEPTH;
+			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1095,7 +1103,7 @@  void fuse_send_init(struct fuse_mount *fm)
 		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
 		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
-		FUSE_HANDLE_KILLPRIV_V2;
+		FUSE_HANDLE_KILLPRIV_V2 | FUSE_PASSTHROUGH;
 #ifdef CONFIG_FUSE_DAX
 	if (fm->fc->dax)
 		ia->in.flags |= FUSE_MAP_ALIGNMENT;
@@ -1123,9 +1131,16 @@  void fuse_send_init(struct fuse_mount *fm)
 }
 EXPORT_SYMBOL_GPL(fuse_send_init);
 
+static int free_fuse_passthrough(int id, void *p, void *data)
+{
+	return 0;
+}
+
 void fuse_free_conn(struct fuse_conn *fc)
 {
 	WARN_ON(!list_empty(&fc->devices));
+	idr_for_each(&fc->passthrough_req, free_fuse_passthrough, NULL);
+	idr_destroy(&fc->passthrough_req);
 	kfree_rcu(fc, rcu);
 }
 EXPORT_SYMBOL_GPL(fuse_free_conn);
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
new file mode 100644
index 000000000000..594060c654f8
--- /dev/null
+++ b/fs/fuse/passthrough.c
@@ -0,0 +1,21 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include "fuse_i.h"
+
+#include <linux/fuse.h>
+
+int fuse_passthrough_open(struct fuse_dev *fud,
+			  struct fuse_passthrough_out *pto)
+{
+	return -EINVAL;
+}
+
+int fuse_passthrough_setup(struct fuse_conn *fc, struct fuse_file *ff,
+			   struct fuse_open_out *openarg)
+{
+	return -EINVAL;
+}
+
+void fuse_passthrough_release(struct fuse_passthrough *passthrough)
+{
+}
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 54442612c48b..9d7685ce0acd 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -360,6 +360,7 @@  struct fuse_file_lock {
 #define FUSE_MAP_ALIGNMENT	(1 << 26)
 #define FUSE_SUBMOUNTS		(1 << 27)
 #define FUSE_HANDLE_KILLPRIV_V2	(1 << 28)
+#define FUSE_PASSTHROUGH	(1 << 29)
 
 /**
  * CUSE INIT request/reply flags
@@ -625,7 +626,7 @@  struct fuse_create_in {
 struct fuse_open_out {
 	uint64_t	fh;
 	uint32_t	open_flags;
-	uint32_t	padding;
+	uint32_t	passthrough_fh;
 };
 
 struct fuse_release_in {
@@ -828,6 +829,13 @@  struct fuse_in_header {
 	uint32_t	padding;
 };
 
+struct fuse_passthrough_out {
+	uint32_t	fd;
+	/* For future implementation */
+	uint32_t	len;
+	void		*vec;
+};
+
 struct fuse_out_header {
 	uint32_t	len;
 	int32_t		error;
@@ -905,6 +913,7 @@  struct fuse_notify_retrieve_in {
 /* Device ioctls: */
 #define FUSE_DEV_IOC_MAGIC		229
 #define FUSE_DEV_IOC_CLONE		_IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
+#define FUSE_DEV_IOC_PASSTHROUGH_OPEN	_IOW(FUSE_DEV_IOC_MAGIC, 1, struct fuse_passthrough_out)
 
 struct fuse_lseek_in {
 	uint64_t	fh;