diff mbox

[v5] fuse: Add support for passthrough read/write

Message ID 56AFAA5B.3000006@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Nikhilesh Reddy Feb. 1, 2016, 6:56 p.m. UTC
Add support for filesystem passthrough read/write of files
when enabled in userspace through the option FUSE_PASSTHROUGH.

There are many FUSE based filesystems that perform checks or
enforce policy or perform some kind of decision making in certain
functions like the "open" call but simply act as a "passthrough"
when performing operations such as read or write.

When FUSE_PASSTHROUGH is enabled, all reads and writes
to the fuse mount point go directly to the passthrough filesystem
(i.e. the native filesystem that actually hosts the files) rather than
through the fuse daemon. All requests that aren't reads or writes still
go through the userspace code.

This allows for significantly better performance on reads and writes.
The difference in performance between fuse and the native lower
filesystem is negligible.

There are also significant CPU/power savings, which are
really important on embedded systems that use fuse for I/O.

Change log:

v5:
Fix the check when setting the passthrough file
[Found when testing by Mike Shal]

v3 and v4:
Use the fs_stack_depth to prevent further stacking and a minor fix
[Suggested by Jann Horn]

v2:
Changed the feature name to passthrough from stacked_io
[Proposed by Linus Torvalds]

Signed-off-by: Nikhilesh Reddy <reddyn@codeaurora.org>
---
 fs/fuse/Makefile           |   2 +-
 fs/fuse/dev.c              |  13 +++--
 fs/fuse/dir.c              |   3 ++
 fs/fuse/file.c             |  37 +++++++++++--
 fs/fuse/fuse_i.h           |  10 ++++
 fs/fuse/fuse_passthrough.h |  31 +++++++++++
 fs/fuse/inode.c            |   8 +++
 fs/fuse/passthrough.c      | 128 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/fuse.h  |   3 +-
 9 files changed, 226 insertions(+), 9 deletions(-)
 create mode 100644 fs/fuse/fuse_passthrough.h
 create mode 100644 fs/fuse/passthrough.c

Comments

Jann Horn Feb. 1, 2016, 7:15 p.m. UTC | #1
On Mon, Feb 01, 2016 at 10:56:27AM -0800, Nikhilesh Reddy wrote:
> diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
[...]
> +static ssize_t fuse_passthrough_read_write_iter(struct kiocb *iocb,
> +					    struct iov_iter *iter, int do_write)
> +{
> +	ssize_t ret_val;
> +	struct fuse_file *ff;
> +	struct file *fuse_file, *passthrough_filp;
> +	struct inode *fuse_inode, *passthrough_inode;
> +
> +	ff = iocb->ki_filp->private_data;
> +	fuse_file = iocb->ki_filp;
> +	passthrough_filp = ff->passthrough_filp;
> +
> +	/* lock passthrough file to prevent it from being released */
> +	get_file(passthrough_filp);
> +	iocb->ki_filp = passthrough_filp;
> +	fuse_inode = fuse_file->f_path.dentry->d_inode;
> +	passthrough_inode = file_inode(passthrough_filp);
> +
> +	if (do_write) {
> +		if (!passthrough_filp->f_op->write_iter)
> +			return -EIO;
> +		ret_val = passthrough_filp->f_op->write_iter(iocb, iter);
> +
> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED) {
> +			fsstack_copy_inode_size(fuse_inode, passthrough_inode);
> +			fsstack_copy_attr_times(fuse_inode, passthrough_inode);
> +		}
> +	} else {
> +		if (!passthrough_filp->f_op->read_iter)
> +			return -EIO;
> +		ret_val = passthrough_filp->f_op->read_iter(iocb, iter);
> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED)
> +			fsstack_copy_attr_atime(fuse_inode, passthrough_inode);
> +	}
> +
> +	iocb->ki_filp = fuse_file;
> +
> +	/* unlock passthrough file */
> +	fput(passthrough_filp);

Why the get_file() and fput() in this method? This doesn't look right. There
is no lock you're releasing between get_file() and fput(). What are they
intended for?
Nikhilesh Reddy Feb. 1, 2016, 7:28 p.m. UTC | #2
On Mon 01 Feb 2016 11:15:56 AM PST, Jann Horn wrote:
> On Mon, Feb 01, 2016 at 10:56:27AM -0800, Nikhilesh Reddy wrote:
>> diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
> [...]
>> +static ssize_t fuse_passthrough_read_write_iter(struct kiocb *iocb,
>> +					    struct iov_iter *iter, int do_write)
>> +{
>> +	ssize_t ret_val;
>> +	struct fuse_file *ff;
>> +	struct file *fuse_file, *passthrough_filp;
>> +	struct inode *fuse_inode, *passthrough_inode;
>> +
>> +	ff = iocb->ki_filp->private_data;
>> +	fuse_file = iocb->ki_filp;
>> +	passthrough_filp = ff->passthrough_filp;
>> +
>> +	/* lock passthrough file to prevent it from being released */
>> +	get_file(passthrough_filp);
>> +	iocb->ki_filp = passthrough_filp;
>> +	fuse_inode = fuse_file->f_path.dentry->d_inode;
>> +	passthrough_inode = file_inode(passthrough_filp);
>> +
>> +	if (do_write) {
>> +		if (!passthrough_filp->f_op->write_iter)
>> +			return -EIO;
>> +		ret_val = passthrough_filp->f_op->write_iter(iocb, iter);
>> +
>> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED) {
>> +			fsstack_copy_inode_size(fuse_inode, passthrough_inode);
>> +			fsstack_copy_attr_times(fuse_inode, passthrough_inode);
>> +		}
>> +	} else {
>> +		if (!passthrough_filp->f_op->read_iter)
>> +			return -EIO;
>> +		ret_val = passthrough_filp->f_op->read_iter(iocb, iter);
>> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED)
>> +			fsstack_copy_attr_atime(fuse_inode, passthrough_inode);
>> +	}
>> +
>> +	iocb->ki_filp = fuse_file;
>> +
>> +	/* unlock passthrough file */
>> +	fput(passthrough_filp);
>
> Why the get_file() and fput() in this method? This doesn't look right. There
> is no lock you're releasing between get_file() and fput(). What are they
> intended for?

Hi

Thanks for reviewing the code.

The passthrough file could be released under our feet, say if the
userspace fuse daemon crashed or was killed (while we are processing
the read or the write), causing bad things to happen.
The calls here are to increase the count temporarily and then decrease
it so that we don't release in the middle of a write and everything is
gracefully handled...

I have a comment right before the get_file call above saying the same 
thing.
Please let me know if you have any more questions.


--
Thanks
Nikhilesh Reddy

Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora 
Forum,
a Linux Foundation Collaborative Project.

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jann Horn Feb. 1, 2016, 7:45 p.m. UTC | #3
On Mon, Feb 01, 2016 at 11:28:51AM -0800, Nikhilesh Reddy wrote:
> On Mon 01 Feb 2016 11:15:56 AM PST, Jann Horn wrote:
> >On Mon, Feb 01, 2016 at 10:56:27AM -0800, Nikhilesh Reddy wrote:
> >>diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
> >[...]
> >>+static ssize_t fuse_passthrough_read_write_iter(struct kiocb *iocb,
> >>+					    struct iov_iter *iter, int do_write)
> >>+{
> >>+	ssize_t ret_val;
> >>+	struct fuse_file *ff;
> >>+	struct file *fuse_file, *passthrough_filp;
> >>+	struct inode *fuse_inode, *passthrough_inode;
> >>+
> >>+	ff = iocb->ki_filp->private_data;
> >>+	fuse_file = iocb->ki_filp;
> >>+	passthrough_filp = ff->passthrough_filp;
> >>+
> >>+	/* lock passthrough file to prevent it from being released */
> >>+	get_file(passthrough_filp);
> >>+	iocb->ki_filp = passthrough_filp;
> >>+	fuse_inode = fuse_file->f_path.dentry->d_inode;
> >>+	passthrough_inode = file_inode(passthrough_filp);
> >>+
> >>+	if (do_write) {
> >>+		if (!passthrough_filp->f_op->write_iter)
> >>+			return -EIO;
> >>+		ret_val = passthrough_filp->f_op->write_iter(iocb, iter);
> >>+
> >>+		if (ret_val >= 0 || ret_val == -EIOCBQUEUED) {
> >>+			fsstack_copy_inode_size(fuse_inode, passthrough_inode);
> >>+			fsstack_copy_attr_times(fuse_inode, passthrough_inode);
> >>+		}
> >>+	} else {
> >>+		if (!passthrough_filp->f_op->read_iter)
> >>+			return -EIO;
> >>+		ret_val = passthrough_filp->f_op->read_iter(iocb, iter);
> >>+		if (ret_val >= 0 || ret_val == -EIOCBQUEUED)
> >>+			fsstack_copy_attr_atime(fuse_inode, passthrough_inode);
> >>+	}
> >>+
> >>+	iocb->ki_filp = fuse_file;
> >>+
> >>+	/* unlock passthrough file */
> >>+	fput(passthrough_filp);
> >
> >Why the get_file() and fput() in this method? This doesn't look right. There
> >is no lock you're releasing between get_file() and fput(). What are they
> >intended for?
> 
> Hi
> 
> Thanks for reviewing the code.
> 
> The passthrough file could be released under our feet say  if the userspace
> fuse daemon crashed or was killed  ( while we are processing the read or the
> write) causing bad things to happen.
> The calls here are to increase the count temporarily  and then decrease it
> so that we dont release in the middle of a write and everything is
> gracefully handled...
> 
> I have a comment right before the get_file call above saying the same thing.
> Please let me know if you have any more questions.

If that is the case, why can't the passthrough file be released before the
get_file() call, e.g. while the core processing the filesystem read request
is entering fuse_passthrough_read_write_iter()?

As far as I can tell, you can drop the get_file() and fput() calls.
fuse_setup_passthrough() already took a reference to the file for you, that
reference can only be dropped in fuse_passthrough_release(), and the VFS
ensures that no release call happens while a read or write is pending.
Jann Horn Feb. 2, 2016, 8:10 a.m. UTC | #4
On Mon, Feb 01, 2016 at 10:56:27AM -0800, Nikhilesh Reddy wrote:
> Add support for filesystem passthrough read/write of files
> when enabled in userspace through the option FUSE_PASSTHROUGH.
[...]
> diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
[...]
> +static ssize_t fuse_passthrough_read_write_iter(struct kiocb *iocb,
> +					    struct iov_iter *iter, int do_write)
> +{
> +	ssize_t ret_val;
> +	struct fuse_file *ff;
> +	struct file *fuse_file, *passthrough_filp;
> +	struct inode *fuse_inode, *passthrough_inode;
> +
> +	ff = iocb->ki_filp->private_data;
> +	fuse_file = iocb->ki_filp;
> +	passthrough_filp = ff->passthrough_filp;
> +
> +	/* lock passthrough file to prevent it from being released */
> +	get_file(passthrough_filp);
> +	iocb->ki_filp = passthrough_filp;
> +	fuse_inode = fuse_file->f_path.dentry->d_inode;
> +	passthrough_inode = file_inode(passthrough_filp);
> +
> +	if (do_write) {
> +		if (!passthrough_filp->f_op->write_iter)
> +			return -EIO;
> +		ret_val = passthrough_filp->f_op->write_iter(iocb, iter);

Uh... how do you know at this point that the file is actually writable?
Normally, e.g. vfs_write() will ensure that the file is writable, and
e.g. generic_file_write_iter() won't check for writability as far as I
can tell. This might allow someone to use the passthrough mechanism to
overwrite a file he is only allowed to read, but not write, like
/etc/passwd.

Also, I think this might bypass mandatory locks, the
security_file_permission hook (which seems like a bad idea anyway
though), inotify/fsnotify and sb_start_write.



> +
> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED) {
> +			fsstack_copy_inode_size(fuse_inode, passthrough_inode);
> +			fsstack_copy_attr_times(fuse_inode, passthrough_inode);
> +		}
> +	} else {
> +		if (!passthrough_filp->f_op->read_iter)
> +			return -EIO;
> +		ret_val = passthrough_filp->f_op->read_iter(iocb, iter);
> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED)
> +			fsstack_copy_attr_atime(fuse_inode, passthrough_inode);
> +	}
> +
> +	iocb->ki_filp = fuse_file;
> +
> +	/* unlock passthrough file */
> +	fput(passthrough_filp);
> +
> +	return ret_val;
> +}
> +
> +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	return fuse_passthrough_read_write_iter(iocb, to, 0);
> +}
> +
> +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	return fuse_passthrough_read_write_iter(iocb, from, 1);
> +}
> +
> +void fuse_passthrough_release(struct fuse_file *ff)
> +{
> +	if (!(ff->passthrough_filp))
> +		return;
> +
> +	/* Release the passthrough file. */
> +	fput(ff->passthrough_filp);
> +	ff->passthrough_filp = NULL;
> +}
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index c9aca04..a08933a 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -250,6 +250,7 @@ struct fuse_file_lock {
>  #define FUSE_ASYNC_DIO		(1 << 15)
>  #define FUSE_WRITEBACK_CACHE	(1 << 16)
>  #define FUSE_NO_OPEN_SUPPORT	(1 << 17)
> +#define FUSE_PASSTHROUGH	(1 << 18)
>  
>  /**
>   * CUSE INIT request/reply flags
> @@ -480,7 +481,7 @@ struct fuse_create_in {
>  struct fuse_open_out {
>  	uint64_t	fh;
>  	uint32_t	open_flags;
> -	uint32_t	padding;
> +	int32_t         passthrough_fd;
>  };
>  
>  struct fuse_release_in {
> -- 
> 1.8.2.1
> 
> 
> -- 
> Thanks
> Nikhilesh Reddy
> 
> Qualcomm Innovation Center, Inc.
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project.
Nikhilesh Reddy Feb. 3, 2016, 7:05 p.m. UTC | #5
On 02/01/2016 11:45 AM, Jann Horn wrote:
> On Mon, Feb 01, 2016 at 11:28:51AM -0800, Nikhilesh Reddy wrote:
>> On Mon 01 Feb 2016 11:15:56 AM PST, Jann Horn wrote:
>>> On Mon, Feb 01, 2016 at 10:56:27AM -0800, Nikhilesh Reddy wrote:
>>>> diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
>>> [...]
>>>> +static ssize_t fuse_passthrough_read_write_iter(struct kiocb *iocb,
>>>> +					    struct iov_iter *iter, int do_write)
>>>> +{
>>>> +	ssize_t ret_val;
>>>> +	struct fuse_file *ff;
>>>> +	struct file *fuse_file, *passthrough_filp;
>>>> +	struct inode *fuse_inode, *passthrough_inode;
>>>> +
>>>> +	ff = iocb->ki_filp->private_data;
>>>> +	fuse_file = iocb->ki_filp;
>>>> +	passthrough_filp = ff->passthrough_filp;
>>>> +
>>>> +	/* lock passthrough file to prevent it from being released */
>>>> +	get_file(passthrough_filp);
>>>> +	iocb->ki_filp = passthrough_filp;
>>>> +	fuse_inode = fuse_file->f_path.dentry->d_inode;
>>>> +	passthrough_inode = file_inode(passthrough_filp);
>>>> +
>>>> +	if (do_write) {
>>>> +		if (!passthrough_filp->f_op->write_iter)
>>>> +			return -EIO;
>>>> +		ret_val = passthrough_filp->f_op->write_iter(iocb, iter);
>>>> +
>>>> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED) {
>>>> +			fsstack_copy_inode_size(fuse_inode, passthrough_inode);
>>>> +			fsstack_copy_attr_times(fuse_inode, passthrough_inode);
>>>> +		}
>>>> +	} else {
>>>> +		if (!passthrough_filp->f_op->read_iter)
>>>> +			return -EIO;
>>>> +		ret_val = passthrough_filp->f_op->read_iter(iocb, iter);
>>>> +		if (ret_val >= 0 || ret_val == -EIOCBQUEUED)
>>>> +			fsstack_copy_attr_atime(fuse_inode, passthrough_inode);
>>>> +	}
>>>> +
>>>> +	iocb->ki_filp = fuse_file;
>>>> +
>>>> +	/* unlock passthrough file */
>>>> +	fput(passthrough_filp);
>>>
>>> Why the get_file() and fput() in this method? This doesn't look right. There
>>> is no lock you're releasing between get_file() and fput(). What are they
>>> intended for?
>>
>> Hi
>>
>> Thanks for reviewing the code.
>>
>> The passthrough file could be released under our feet say  if the userspace
>> fuse daemon crashed or was killed  ( while we are processing the read or the
>> write) causing bad things to happen.
>> The calls here are to increase the count temporarily  and then decrease it
>> so that we dont release in the middle of a write and everything is
>> gracefully handled...
>>
>> I have a comment right before the get_file call above saying the same thing.
>> Please let me know if you have any more questions.
>
> If that is the case, why can't the passthrough file be released before the
> get_file() call, e.g. while the core processing the filesystem read request
> is entering fuse_passthrough_read_write_iter()?
>
> As far as I can tell, you can drop the get_file() and fput() calls.
> fuse_setup_passthrough() already took a reference to the file for you, that
> reference can only be dropped in fuse_passthrough_release(), and the VFS
> ensures that no release call happens while a read or write is pending.
>
I just feel uncomfortable with dropping them. I thought the file could be
released if the I/O takes longer than the actual execution... but if I can
be sure of it then maybe..
Nikhilesh Reddy Feb. 3, 2016, 7:05 p.m. UTC | #6
Hi
Thanks for your review again :)
>
> Uh... how do you know at this point that the file is actually writable?
> Normally, e.g. vfs_write() will ensure that the file is writable, and
> e.g. generic_file_write_iter() won't check for writability as far as I
> can tell. This might allow someone to use the passthrough mechanism to
> overwrite a file he is only allowed to read, but not write, like
> /etc/passwd.

I considered adding the checks (the same ones that VFS does) but am not
sure if we need to.
So the user will need to construct a fuse filesystem (that opens for
O_RDONLY even though the user asks for O_RDWR from the FUSE open)
and then mount it with CAP_SYS_ADMIN, for which you need to be root,
but once he has that he should be able to easily get to the files
without needing to go through FUSE, right, using CAP_DAC_OVERRIDE?

Am I missing something? Please do help me understand.

But yes, if really needed I can add additional checks once I understand it.


>
> Also, I think this might bypass mandatory locks, the
> security_file_permission hook (which seems like a bad idea anyway
> though), inotify/fsnotify and sb_start_write.
>
Can you please elaborate/clarify further? I am not sure what you mean.


Again thanks for your reviews :)
Appreciate your help
Jann Horn Feb. 3, 2016, 7:53 p.m. UTC | #7
On Wed, Feb 03, 2016 at 11:05:57AM -0800, Nikhilesh Reddy wrote:
> Hi
> Thanks for your review again :)
> >
> >Uh... how do you know at this point that the file is actually writable?
> >Normally, e.g. vfs_write() will ensure that the file is writable, and
> >e.g. generic_file_write_iter() won't check for writability as far as I
> >can tell. This might allow someone to use the passthrough mechanism to
> >overwrite a file he is only allowed to read, but not write, like
> >/etc/passwd.
> 
> I considered adding the checks ( the same ones that VFS does)  but not sure
> if we need to.
> So the user will need to construct a fuse filesystem ( that opens for
> O_READONLY even though the user asks for a O_RDWR from the FUSE open) and
> then mount it , with CAP_SYS_ADMIN  for which you need to be root but  once
> he has that he should be able to easily get to the files without needing to
> go through FUSE  right using CAP_DAC_OVERRIDE?
> 
> Am i missing something? Please do help me understand.
> 
> But yes if really needed I can add additional checks once i understand it

On most Linux desktop systems, and on many servers, the userland "fuse"
package is installed, which ships with a setuid root helper "fusermount":

$ ls -l /bin/fusermount
-rwsr-xr-x 1 root root 30800 May 21  2015 /bin/fusermount

This setuid helper allows any user to mount FUSE filesystems anywhere he
wants. This works as follows: main() calls mount_fuse(), which opens
/dev/fuse by calling open_fuse_device(). mount_fuse() then makes sure
that the caller has write access to the directory he is about to mount
over using check_perm(), then calls mount() via do_mount(). mount_fuse()
returns the /dev/fuse fd, which is then sent to the invoker of fusermount
through a unix domain socket.

(What the setuid binary does control are the mount options; those are
used to enforce that the user can't mount filesystems that are
accessible for other users.)

Note that at no point, any data is sent to or read from the FUSE control
fd by fusermount. Therefore, the init reply that is processed in
process_init_reply() and determines whether passthrough will be enabled
is controlled by the unprivileged caller.

This fusermount mechanism is used by pseudo-filesystems like sshfs in
order to allow unprivileged users to use them.

fusermount aside, there is also an (as far as I know) pending patch with
the intent to make FUSE usable in user namespaces, which is going to
allow unprivileged users to mount FUSE filesystems even without a
userspace helper: https://lkml.org/lkml/2015/12/2/472

(Note that this is very different from CUSE, which by design must never
be exposed to non-root code.)

> >Also, I think this might bypass mandatory locks, the
> >security_file_permission hook (which seems like a bad idea anyway
> >though), inotify/fsnotify and sb_start_write.
> >
> Can you please elaborate/clarify further? I am am not sure what you mean.
Jann Horn Feb. 3, 2016, 7:56 p.m. UTC | #8
On Wed, Feb 03, 2016 at 11:05:32AM -0800, Nikhilesh Reddy wrote:
> On 02/01/2016 11:45 AM, Jann Horn wrote:
> >On Mon, Feb 01, 2016 at 11:28:51AM -0800, Nikhilesh Reddy wrote:
> >>On Mon 01 Feb 2016 11:15:56 AM PST, Jann Horn wrote:
> >>>On Mon, Feb 01, 2016 at 10:56:27AM -0800, Nikhilesh Reddy wrote:
> >>>>diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
> >>>[...]
> >>>>+static ssize_t fuse_passthrough_read_write_iter(struct kiocb *iocb,
> >>>>+					    struct iov_iter *iter, int do_write)
> >>>>+{
> >>>>+	ssize_t ret_val;
> >>>>+	struct fuse_file *ff;
> >>>>+	struct file *fuse_file, *passthrough_filp;
> >>>>+	struct inode *fuse_inode, *passthrough_inode;
> >>>>+
> >>>>+	ff = iocb->ki_filp->private_data;
> >>>>+	fuse_file = iocb->ki_filp;
> >>>>+	passthrough_filp = ff->passthrough_filp;
> >>>>+
> >>>>+	/* lock passthrough file to prevent it from being released */
> >>>>+	get_file(passthrough_filp);
> >>>>+	iocb->ki_filp = passthrough_filp;
> >>>>+	fuse_inode = fuse_file->f_path.dentry->d_inode;
> >>>>+	passthrough_inode = file_inode(passthrough_filp);
> >>>>+
> >>>>+	if (do_write) {
> >>>>+		if (!passthrough_filp->f_op->write_iter)
> >>>>+			return -EIO;
> >>>>+		ret_val = passthrough_filp->f_op->write_iter(iocb, iter);
> >>>>+
> >>>>+		if (ret_val >= 0 || ret_val == -EIOCBQUEUED) {
> >>>>+			fsstack_copy_inode_size(fuse_inode, passthrough_inode);
> >>>>+			fsstack_copy_attr_times(fuse_inode, passthrough_inode);
> >>>>+		}
> >>>>+	} else {
> >>>>+		if (!passthrough_filp->f_op->read_iter)
> >>>>+			return -EIO;
> >>>>+		ret_val = passthrough_filp->f_op->read_iter(iocb, iter);
> >>>>+		if (ret_val >= 0 || ret_val == -EIOCBQUEUED)
> >>>>+			fsstack_copy_attr_atime(fuse_inode, passthrough_inode);
> >>>>+	}
> >>>>+
> >>>>+	iocb->ki_filp = fuse_file;
> >>>>+
> >>>>+	/* unlock passthrough file */
> >>>>+	fput(passthrough_filp);
> >>>
> >>>Why the get_file() and fput() in this method? This doesn't look right. There
> >>>is no lock you're releasing between get_file() and fput(). What are they
> >>>intended for?
> >>
> >>Hi
> >>
> >>Thanks for reviewing the code.
> >>
> >>The passthrough file could be released under our feet say  if the userspace
> >>fuse daemon crashed or was killed  ( while we are processing the read or the
> >>write) causing bad things to happen.
> >>The calls here are to increase the count temporarily  and then decrease it
> >>so that we dont release in the middle of a write and everything is
> >>gracefully handled...
> >>
> >>I have a comment right before the get_file call above saying the same thing.
> >>Please let me know if you have any more questions.
> >
> >If that is the case, why can't the passthrough file be released before the
> >get_file() call, e.g. while the core processing the filesystem read request
> >is entering fuse_passthrough_read_write_iter()?
> >
> >As far as I can tell, you can drop the get_file() and fput() calls.
> >fuse_setup_passthrough() already took a reference to the file for you, that
> >reference can only be dropped in fuse_passthrough_release(), and the VFS
> >ensures that no release call happens while a read or write is pending.
> >
> I just feel uncomfortable with dropping them. I thought they could be
> released ( i/o ) takes longer than the actual execution... but if i can be
> sure of it then maybe..

These get_file() and fput() calls aren't useful.

And I think they can lead to a reference count leak, which would lead to a
use-after-free vulnerability on 32bit kernels, because you forgot to fput()
in the error cases where you return -EIO.
Nikhilesh Reddy Feb. 3, 2016, 8:16 p.m. UTC | #9
On 02/03/2016 11:53 AM, Jann Horn wrote:
> On Wed, Feb 03, 2016 at 11:05:57AM -0800, Nikhilesh Reddy wrote:
>> Hi
>> Thanks for your review again :)
>>>
>>> Uh... how do you know at this point that the file is actually writable?
>>> Normally, e.g. vfs_write() will ensure that the file is writable, and
>>> e.g. generic_file_write_iter() won't check for writability as far as I
>>> can tell. This might allow someone to use the passthrough mechanism to
>>> overwrite a file he is only allowed to read, but not write, like
>>> /etc/passwd.
>>
>> I considered adding the checks ( the same ones that VFS does)  but not sure
>> if we need to.
>> So the user will need to construct a fuse filesystem ( that opens for
>> O_READONLY even though the user asks for a O_RDWR from the FUSE open) and
>> then mount it , with CAP_SYS_ADMIN  for which you need to be root but  once
>> he has that he should be able to easily get to the files without needing to
>> go through FUSE  right using CAP_DAC_OVERRIDE?
>>
>> Am i missing something? Please do help me understand.
>>
>> But yes if really needed I can add additional checks once i understand it
>
> On most Linux desktop systems, and on many servers, the userland "fuse"
> package is installed, which ships with a setuid root helper "fusermount":
>
> $ ls -l /bin/fusermount
> -rwsr-xr-x 1 root root 30800 May 21  2015 /bin/fusermount
>
> This setuid helper allows any user to mount FUSE filesystems anywhere he
> wants. This works as follows: main() calls mount_fuse(), which opens
> /dev/fuse by calling open_fuse_device(). mount_fuse() then makes sure
> that the caller has write access to the directory he is about to mount
> over using check_perm(), then calls mount() via do_mount(). mount_fuse()
> returns the /dev/fuse fd, which is then sent to the invoker of fusermount
> through a unix domain socket.
>
> (What the setuid binary does control are the mount options; those are
> used to enforce that the user can't mount filesystems that are
> accessible for other users.)
>
> Note that at no point, any data is sent to or read from the FUSE control
> fd by fusermount. Therefore, the init reply that is processed in
> process_init_reply() and determines whether passthrough will be enabled
> is controlled by the unprivileged caller.
>
> This fusermount mechanism is used by pseudo-filesystems like sshfs in
> order to allow unprivileged users to use them.
>
> fusermount aside, there is also an (as far as I know) pending patch with
> the intent to make FUSE usable in user namespaces, which is going to
> allow unprivileged users to mount FUSE filesystems even without a
> userspace helper: https://lkml.org/lkml/2015/12/2/472
>
> (Note that this is very different from CUSE, which by design must never
> be exposed to non-root code.)

Thanks for the explanation... I am convinced :)
I will add the checks when I send out the next version (probably by the end
of the week.. hopefully it will be the last version :) :) )

Something along the lines of
if (!(file->f_mode & FMODE_WRITE))
	return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
	return -EINVAL;

And
if (!(file->f_mode & FMODE_READ))
	return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
	return -EINVAL;
	

>
>>> Also, I think this might bypass mandatory locks, the
>>> security_file_permission hook (which seems like a bad idea anyway
>>> though), inotify/fsnotify and sb_start_write.
>>>
>> Can you please elaborate/clarify further? I am am not sure what you mean.

Can you please also explain what you meant by :
"might bypass mandatory locks, the security_file_permission hook (which 
seems like a bad idea anyway though), inotify/fsnotify and sb_start_write."
Jann Horn Feb. 3, 2016, 8:42 p.m. UTC | #10
On Wed, Feb 03, 2016 at 12:16:17PM -0800, Nikhilesh Reddy wrote:
> On 02/03/2016 11:53 AM, Jann Horn wrote:
> >On Wed, Feb 03, 2016 at 11:05:57AM -0800, Nikhilesh Reddy wrote:
> >>Hi
> >>Thanks for your review again :)
> >>>
> >>>Uh... how do you know at this point that the file is actually writable?
> >>>Normally, e.g. vfs_write() will ensure that the file is writable, and
> >>>e.g. generic_file_write_iter() won't check for writability as far as I
> >>>can tell. This might allow someone to use the passthrough mechanism to
> >>>overwrite a file he is only allowed to read, but not write, like
> >>>/etc/passwd.
> >>
> >>I considered adding the checks ( the same ones that VFS does)  but not sure
> >>if we need to.
> >>So the user will need to construct a fuse filesystem ( that opens for
> >>O_READONLY even though the user asks for a O_RDWR from the FUSE open) and
> >>then mount it , with CAP_SYS_ADMIN  for which you need to be root but  once
> >>he has that he should be able to easily get to the files without needing to
> >>go through FUSE  right using CAP_DAC_OVERRIDE?
> >>
> >>Am i missing something? Please do help me understand.
> >>
> >>But yes if really needed I can add additional checks once i understand it
> >
> >On most Linux desktop systems, and on many servers, the userland "fuse"
> >package is installed, which ships with a setuid root helper "fusermount":
> >
> >$ ls -l /bin/fusermount
> >-rwsr-xr-x 1 root root 30800 May 21  2015 /bin/fusermount
> >
> >This setuid helper allows any user to mount FUSE filesystems anywhere he
> >wants. This works as follows: main() calls mount_fuse(), which opens
> >/dev/fuse by calling open_fuse_device(). mount_fuse() then makes sure
> >that the caller has write access to the directory he is about to mount
> >over using check_perm(), then calls mount() via do_mount(). mount_fuse()
> >returns the /dev/fuse fd, which is then sent to the invoker of fusermount
> >through a unix domain socket.
> >
> >(What the setuid binary does control are the mount options; those are
> >used to enforce that the user can't mount filesystems that are
> >accessible for other users.)
> >
> >Note that at no point, any data is sent to or read from the FUSE control
> >fd by fusermount. Therefore, the init reply that is processed in
> >process_init_reply() and determines whether passthrough will be enabled
> >is controlled by the unprivileged caller.
> >
> >This fusermount mechanism is used by pseudo-filesystems like sshfs in
> >order to allow unprivileged users to use them.
> >
> >fusermount aside, there is also an (as far as I know) pending patch with
> >the intent to make FUSE usable in user namespaces, which is going to
> >allow unprivileged users to mount FUSE filesystems even without a
> >userspace helper: https://lkml.org/lkml/2015/12/2/472
> >
> >(Note that this is very different from CUSE, which by design must never
> >be exposed to non-root code.)
> 
> Thanks for the explanation ..I am convinced :)
>  will add the checks when i send out the next version ( Probably by end of
> the week.. hopefully will be the last version :) :) )
> 
> Something on the lines of
> if (!(file->f_mode & FMODE_WRITE))
> 	return -EBADF;
> if (!(file->f_mode & FMODE_CAN_WRITE))
> 	return -EINVAL;
> 
> And
> if (!(file->f_mode & FMODE_READ))
> 	return -EBADF;
> if (!(file->f_mode & FMODE_CAN_READ))
> 	return -EINVAL;
> 	
> 
> >
> >>>Also, I think this might bypass mandatory locks, the
> >>>security_file_permission hook (which seems like a bad idea anyway
> >>>though), inotify/fsnotify and sb_start_write.
> >>>
> >>Can you please elaborate/clarify further? I am am not sure what you mean.
> 
> Can you please also explain what you meant by :
> "might bypass mandatory locks, the security_file_permission hook (which
> seems like a bad idea anyway though), inotify/fsnotify and sb_start_write."

Have a look at the stuff that goes on in vfs_write:


It checks that FMODE_WRITE is in file->f_mode (to ensure that the file is
open for writing).

It uses rw_verify_area() to check for various things; in particular, it
verifies that nobody else has taken a mandatory lock on the target of
the write operation and that no LSM wishes to prevent the write access
through the file_permission hook.

It uses file_start_write() and file_end_write() to (as far as I can
tell) prevent the filesystem from being frozen (for standby/hibernate)
while a write access is in progress.

It uses fsnotify_modify() to inform userspace that the file changed.


You might want to call one of the vfs helper functions so that you
don't have to do all of this yourself. Something like
vfs_readv/vfs_writev - but I'm not very familiar with this part.
Andrew Karpow March 4, 2016, 12:23 p.m. UTC | #11
Hi!

Thanks for the passthrough patch, exactly what I was searching for.
I'm currently trying to port your code to a prehistoric RedHat 6 kernel, but I wonder about some implementation details...
 
> +++ b/fs/fuse/passthrough.c
...
> +void fuse_setup_passthrough(struct fuse_conn *fc, struct fuse_req *req)
..
> +	daemon_fd = (int)open_out->passthrough_fd;
...

> +++ b/include/uapi/linux/fuse.h
...
> @@ -480,7 +481,7 @@ struct fuse_create_in {
>  struct fuse_open_out {
>  	uint64_t	fh;
>  	uint32_t	open_flags;
> -	uint32_t	padding;
> +	int32_t         passthrough_fd;
>  };
 
I don't see where the passthrough_fd struct member is ever set.

with best regards,

Andy

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index e95eeb4..3805040 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,4 +5,4 @@ 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
 
-fuse-objs := dev.o dir.o file.o inode.o control.o
+fuse-objs := dev.o dir.o file.o inode.o control.o passthrough.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ebb5e37..e807d98 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -7,6 +7,7 @@ 
 */
 
 #include "fuse_i.h"
+#include "fuse_passthrough.h"
 
 #include <linux/init.h>
 #include <linux/module.h>
@@ -566,9 +567,14 @@  ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
 	       args->out.numargs * sizeof(struct fuse_arg));
 	fuse_request_send(fc, req);
 	ret = req->out.h.error;
-	if (!ret && args->out.argvar) {
-		BUG_ON(args->out.numargs != 1);
-		ret = req->out.args[0].size;
+	if (!ret) {
+		if (args->out.argvar) {
+			BUG_ON(args->out.numargs != 1);
+			ret = req->out.args[0].size;
+		}
+
+		if (req->passthrough_filp != NULL)
+			args->out.passthrough_filp = req->passthrough_filp;
 	}
 	fuse_put_request(fc, req);
 
@@ -1934,6 +1940,7 @@  static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 	err = copy_out_args(cs, &req->out, nbytes);
 	fuse_copy_finish(cs);
 
+	fuse_setup_passthrough(fc, req);
 	spin_lock(&fpq->lock);
 	clear_bit(FR_LOCKED, &req->flags);
 	if (!fpq->connected)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 712601f..ef34298 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -428,6 +428,7 @@  static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	args.out.args[0].value = &outentry;
 	args.out.args[1].size = sizeof(outopen);
 	args.out.args[1].value = &outopen;
+	args.out.passthrough_filp = NULL;
 	err = fuse_simple_request(fc, &args);
 	if (err)
 		goto out_free_ff;
@@ -439,6 +440,8 @@  static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	ff->fh = outopen.fh;
 	ff->nodeid = outentry.nodeid;
 	ff->open_flags = outopen.open_flags;
+	if (args.out.passthrough_filp != NULL)
+		ff->passthrough_filp = args.out.passthrough_filp;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr, entry_attr_timeout(&outentry), 0);
 	if (!inode) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 570ca40..14b0c69 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -7,6 +7,7 @@ 
 */
 
 #include "fuse_i.h"
+#include "fuse_passthrough.h"
 
 #include <linux/pagemap.h>
 #include <linux/slab.h>
@@ -21,8 +22,10 @@ 
 static const struct file_operations fuse_direct_io_file_operations;
 
 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
-			  int opcode, struct fuse_open_out *outargp)
+			  int opcode, struct fuse_open_out *outargp,
+			  struct file **passthrough_filpp)
 {
+	int ret_val;
 	struct fuse_open_in inarg;
 	FUSE_ARGS(args);
 
@@ -38,8 +41,14 @@  static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 	args.out.numargs = 1;
 	args.out.args[0].size = sizeof(*outargp);
 	args.out.args[0].value = outargp;
+	args.out.passthrough_filp = NULL;
 
-	return fuse_simple_request(fc, &args);
+	ret_val = fuse_simple_request(fc, &args);
+
+	if (args.out.passthrough_filp != NULL)
+		*passthrough_filpp = args.out.passthrough_filp;
+
+	return ret_val;
 }
 
 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
@@ -50,6 +59,7 @@  struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 	if (unlikely(!ff))
 		return NULL;
 
+	ff->passthrough_filp = NULL;
 	ff->fc = fc;
 	ff->reserved_req = fuse_request_alloc(0);
 	if (unlikely(!ff->reserved_req)) {
@@ -117,6 +127,7 @@  int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir)
 {
 	struct fuse_file *ff;
+	struct file *passthrough_filp = NULL;
 	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 
 	ff = fuse_file_alloc(fc);
@@ -129,10 +140,12 @@  int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		struct fuse_open_out outarg;
 		int err;
 
-		err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+		err = fuse_send_open(fc, nodeid, file, opcode, &outarg,
+				     &(passthrough_filp));
 		if (!err) {
 			ff->fh = outarg.fh;
 			ff->open_flags = outarg.open_flags;
+			ff->passthrough_filp = passthrough_filp;
 
 		} else if (err != -ENOSYS || isdir) {
 			fuse_file_free(ff);
@@ -252,6 +265,8 @@  void fuse_release_common(struct file *file, int opcode)
 	if (unlikely(!ff))
 		return;
 
+	fuse_passthrough_release(ff);
+
 	req = ff->reserved_req;
 	fuse_prepare_release(ff, file->f_flags, opcode);
 
@@ -896,8 +911,10 @@  out:
 
 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
+	ssize_t ret_val;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = iocb->ki_filp->private_data;
 
 	/*
 	 * In auto invalidate mode, always update attributes on read.
@@ -912,7 +929,12 @@  static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return err;
 	}
 
-	return generic_file_read_iter(iocb, to);
+	if (ff && ff->passthrough_filp)
+		ret_val = fuse_passthrough_read_iter(iocb, to);
+	else
+		ret_val = generic_file_read_iter(iocb, to);
+
+	return ret_val;
 }
 
 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
@@ -1144,6 +1166,7 @@  static ssize_t fuse_perform_write(struct file *file,
 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
 	struct address_space *mapping = file->f_mapping;
 	ssize_t written = 0;
 	ssize_t written_buffered = 0;
@@ -1177,8 +1200,14 @@  static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err)
 		goto out;
 
+	if (ff && ff->passthrough_filp) {
+		written = fuse_passthrough_write_iter(iocb, from);
+		goto out;
+	}
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		loff_t pos = iocb->ki_pos;
+
 		written = generic_file_direct_write(iocb, from, pos);
 		if (written < 0 || !iov_iter_count(from))
 			goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4051131..2f4d986 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -157,6 +157,9 @@  struct fuse_file {
 
 	/** Has flock been performed on this file? */
 	bool flock:1;
+
+	/* the read write file */
+	struct file *passthrough_filp;
 };
 
 /** One input argument of a request */
@@ -236,6 +239,7 @@  struct fuse_args {
 		unsigned argvar:1;
 		unsigned numargs;
 		struct fuse_arg args[2];
+		struct file *passthrough_filp;
 	} out;
 };
 
@@ -374,6 +378,9 @@  struct fuse_req {
 
 	/** Request is stolen from fuse_file->reserved_req */
 	struct file *stolen_file;
+
+	/** fuse passthrough file  */
+	struct file *passthrough_filp;
 };
 
 struct fuse_iqueue {
@@ -531,6 +538,9 @@  struct fuse_conn {
 	/** write-back cache policy (default is write-through) */
 	unsigned writeback_cache:1;
 
+	/** passthrough IO. */
+	unsigned passthrough:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
diff --git a/fs/fuse/fuse_passthrough.h b/fs/fuse/fuse_passthrough.h
new file mode 100644
index 0000000..62f12c1
--- /dev/null
+++ b/fs/fuse/fuse_passthrough.h
@@ -0,0 +1,31 @@ 
+/*
+ * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FS_FUSE_PASSTHROUGH_H
+#define _FS_FUSE_PASSTHROUGH_H
+
+#include "fuse_i.h"
+
+#include <linux/fuse.h>
+#include <linux/file.h>
+
+void fuse_setup_passthrough(struct fuse_conn *fc, struct fuse_req *req);
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *to);
+
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *from);
+
+void fuse_passthrough_release(struct fuse_file *ff);
+
+#endif /* _FS_FUSE_PASSTHROUGH_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2..33ec874 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -860,6 +860,7 @@  static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->conn_error = 1;
 	else {
 		unsigned long ra_pages;
+		struct super_block *sb = fc->sb;
 
 		process_init_limits(fc, arg);
 
@@ -898,6 +899,13 @@  static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 				fc->async_dio = 1;
 			if (arg->flags & FUSE_WRITEBACK_CACHE)
 				fc->writeback_cache = 1;
+			if (arg->flags & FUSE_PASSTHROUGH) {
+				fc->passthrough = 1;
+				/* Prevent further stacking */
+				sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+				pr_info("FUSE: Pass through is enabled [%s : %d]!\n",
+					current->comm, current->pid);
+			}
 			if (arg->time_gran && arg->time_gran <= 1000000000)
 				fc->sb->s_time_gran = arg->time_gran;
 		} else {
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
new file mode 100644
index 0000000..e867194
--- /dev/null
+++ b/fs/fuse/passthrough.c
@@ -0,0 +1,128 @@ 
+/*
+ * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "fuse_passthrough.h"
+
+#include <linux/aio.h>
+#include <linux/fs_stack.h>
+
+/*
+ * Resolve the fd the daemon placed in fuse_open_out for a FUSE_OPEN or
+ * FUSE_CREATE reply and stash the file in req->passthrough_filp, which
+ * stays NULL whenever passthrough cannot be used for this request.
+ */
+void fuse_setup_passthrough(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int daemon_fd;
+	unsigned open_out_index;
+	struct file *passthrough_filp;
+	struct fuse_open_out *open_out;
+
+	req->passthrough_filp = NULL;
+
+	if (!fc->passthrough)
+		return;
+
+	if (req->in.h.opcode != FUSE_OPEN && req->in.h.opcode != FUSE_CREATE)
+		return;
+
+	/* A failed reply is never handed to the opener; the ref would leak. */
+	if (req->out.h.error)
+		return;
+
+	open_out_index = req->in.numargs - 1;
+	BUG_ON(open_out_index != 0 && open_out_index != 1);
+	BUG_ON(req->out.args[open_out_index].size != sizeof(*open_out));
+	open_out = req->out.args[open_out_index].value;
+
+	daemon_fd = (int)open_out->passthrough_fd;
+	if (daemon_fd < 0)
+		return;
+
+	passthrough_filp = fget_raw(daemon_fd);
+	if (!passthrough_filp)
+		return;
+
+	/* If we reached the stacking limit go through regular io */
+	if (file_inode(passthrough_filp)->i_sb->s_stack_depth + 1 >
+	    FILESYSTEM_MAX_STACK_DEPTH) {
+		fput(passthrough_filp);
+		pr_err("FUSE: maximum fs stacking depth exceeded, cannot use passthrough for this file\n");
+		return;
+	}
+	req->passthrough_filp = passthrough_filp;
+}
+
+/*
+ * Run a read or write (do_write != 0) on the passthrough file in place of
+ * the fuse file; -EIO if the passthrough file lacks the needed *_iter op.
+ */
+static ssize_t fuse_passthrough_read_write_iter(struct kiocb *iocb,
+					    struct iov_iter *iter, int do_write)
+{
+	ssize_t ret_val = -EIO;
+	struct file *fuse_file = iocb->ki_filp;
+	struct fuse_file *ff = fuse_file->private_data;
+	struct file *passthrough_filp = ff->passthrough_filp;
+	struct inode *fuse_inode, *passthrough_inode;
+
+	/* lock passthrough file to prevent it from being released */
+	get_file(passthrough_filp);
+	iocb->ki_filp = passthrough_filp;
+	fuse_inode = fuse_file->f_path.dentry->d_inode;
+	passthrough_inode = file_inode(passthrough_filp);
+
+	if (do_write) {
+		if (!passthrough_filp->f_op->write_iter)
+			goto out;
+		ret_val = passthrough_filp->f_op->write_iter(iocb, iter);
+		if (ret_val >= 0 || ret_val == -EIOCBQUEUED) {
+			fsstack_copy_inode_size(fuse_inode, passthrough_inode);
+			fsstack_copy_attr_times(fuse_inode, passthrough_inode);
+		}
+	} else {
+		if (!passthrough_filp->f_op->read_iter)
+			goto out;
+		ret_val = passthrough_filp->f_op->read_iter(iocb, iter);
+		if (ret_val >= 0 || ret_val == -EIOCBQUEUED)
+			fsstack_copy_attr_atime(fuse_inode, passthrough_inode);
+	}
+
+out:
+	/* Also reached on -EIO: restore ki_filp and drop our reference. */
+	iocb->ki_filp = fuse_file;
+	fput(passthrough_filp);
+
+	return ret_val;
+}
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	return fuse_passthrough_read_write_iter(iocb, to, /* do_write */ 0);
+}
+
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	return fuse_passthrough_read_write_iter(iocb, from, /* do_write */ 1);
+}
+
+/* Drop the reference @ff holds on its passthrough file, if it has one. */
+void fuse_passthrough_release(struct fuse_file *ff)
+{
+	if (ff->passthrough_filp) {
+		fput(ff->passthrough_filp);
+		/* Forget the file so it cannot be put a second time. */
+		ff->passthrough_filp = NULL;
+	}
+}
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index c9aca04..a08933a 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -250,6 +250,7 @@  struct fuse_file_lock {
 #define FUSE_ASYNC_DIO		(1 << 15)
 #define FUSE_WRITEBACK_CACHE	(1 << 16)
 #define FUSE_NO_OPEN_SUPPORT	(1 << 17)
+#define FUSE_PASSTHROUGH	(1 << 18)
 
 /**
  * CUSE INIT request/reply flags
@@ -480,7 +481,7 @@  struct fuse_create_in {
 struct fuse_open_out {
 	uint64_t	fh;
 	uint32_t	open_flags;
-	uint32_t	padding;
+	int32_t         passthrough_fd;
 };
 
 struct fuse_release_in {