diff mbox series

[RFC] fuse: add generic file store

Message ID 1622537906-54361-1-git-send-email-tao.peng@linux.alibaba.com (mailing list archive)
State New
Headers show
Series [RFC] fuse: add generic file store | expand

Commit Message

Peng Tao June 1, 2021, 8:58 a.m. UTC
Add a generic file store that userspace can save/restore any open file
descriptor. These file descriptors can be managed by different
applications not just the same user space application.

A possible use case is fuse fd passthrough being developed
by Alessio Balsini [1] where underlying file system fd can be saved in
this file store.

Another possible use case is user space application live upgrade and
failover (upon panic etc.). Currently during userspace live upgrade and
failover, open file descriptors usually have to be saved seprately in
a different management process with AF_UNIX sendmsg.

But it causes chicken and egg problem and such management process needs
to support live upgrade and failover as well. With a generic file store
in the kernel, application live upgrade and failover no longer require such
management process to hold reference for their open file descriptors.

This is an RFC to see if the approach makes sense to upstream and it can be
tested with the following C programe.

Why FUSE?
- Because we are trying to solve FUSE fd passthrough and FUSE daemon
  live upgrade.

Why global IDR rather than per fuse connnection one?
- Because for live upgrade new process, we don't have a valid fuse connection
  in the first place.

Missing cleanup method in case user space messes up?
- We can limit the number of saved FDs and hey it is RFC ;).

[1] https://lore.kernel.org/lkml/20210125153057.3623715-1-balsini@android.com/
--------

/#include <sys/types.h>
/#include <sys/stat.h>
/#include <fcntl.h>
/#include <unistd.h>
/#include <stdlib.h>
/#include <stdio.h>
/#include <stdbool.h>
/#include <string.h>
/#include <sys/ioctl.h>
/#include <stdint.h>

/#define FUSE_DEV_IOC_SAVE_FD    _IOWR(229, 1, uint32_t)
/#define FUSE_DEV_IOC_RESTORE_FD _IOWR(229, 2, uint32_t)
/#define FUSE_DEV_IOC_REMOVE_FD  _IOW(229, 3, uint32_t)

static const char *FUSEDEV = "/dev/fuse";
int fuse_fd;

void show_help(char *pname)
{
	fprintf(stdout, "%s [-sh] [-d <id>] [-f <filename>] [-r <id>]\n", pname);
	fprintf(stdout, "\t-s open <filename>(default foobar) and save its fd\n");
	fprintf(stdout, "\t-r <id> restore fd identified by id\n");
	fprintf(stdout, "\t-f <filename> set filename to save\n");
	fprintf(stdout, "\t-d <id> delete id identified by id\n");
	fprintf(stdout, "\t-h show this help\n");
}

int open_save_fd(char *name)
{
	int fd;

	fd = open(name, O_RDWR | O_CREAT, 0666);
	if (fd < 0)
		return fd;
	if (ioctl(fuse_fd, FUSE_DEV_IOC_SAVE_FD, &fd) < 0)
		return -1;

	return fd;
}

int restore_fd(int id)
{
	if (ioctl(fuse_fd, FUSE_DEV_IOC_RESTORE_FD, &id) < 0)
		return -1;
	return id;
}

int delete_fd(int id)
{
	return ioctl(fuse_fd, FUSE_DEV_IOC_REMOVE_FD, &id);
}

int read_file(int fd)
{
	char buf[1024];
	int ret;

	lseek(fd, 0, SEEK_SET);
	memset(buf, 0, sizeof(buf));
	fprintf(stdout, "file content:\n");
	while ((ret = read(fd, buf, sizeof(buf))) > 0) {
		fprintf(stdout, "%s", buf);
	}
	return 0;
}

int main(int argc, char *argv[])
{
	bool save = false, restore = false;
	int fd, opt, ret, id, restored_fd;
	char *name = "foobar";

	fuse_fd = open(FUSEDEV, O_RDWR);
	if (fuse_fd < 0) {
		fprintf(stderr, "failed to open fuse device\n");
		return -1;
	}

	while ((opt = getopt(argc, argv, "shd:f:r:")) != -1) {
		switch (opt) {
		case 's':
			save = true;
			break;
		case 'r':
			id = atoi(optarg);
			restored_fd = restore_fd(id);
			if (restored_fd < 0)
				fprintf(stderr, "failed to restore fd with id %d\n", id);
			else
				fprintf(stdout, "restored file fd %d with id %d\n", restored_fd, id);
			restore = true;
			break;
		case 'd':
			id = atoi(optarg);
			ret = delete_fd(id);
			if (ret < 0)
				fprintf(stderr, "failed to delete fd with id %d\n", id);
			else
				fprintf(stdout, "deleted file id %d\n", id);
		case 'f':
			name = strdup(optarg);
			break;
		case 'h':
		default:
			show_help(argv[0]);
			return 0;
		}
	}
	if (save) {
		ret = open_save_fd(name);
		if (ret < 0)
			fprintf(stderr, "failed to open save fd with filename %s\n", name);
		else
			fprintf(stdout, "saved file id %d\n", ret);
	}
	if (restore && restored_fd > 0) {
		ret = read_file(restored_fd);
		if (ret < 0)
			fprintf(stderr, "failed to read_file with fd %d\n", restored_fd);
	}

	return 0;
}

Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
---
 fs/fuse/dev.c             | 31 ++++++++++++++---
 fs/fuse/fuse_i.h          |  5 +++
 fs/fuse/inode.c           | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/fuse.h |  3 ++
 4 files changed, 119 insertions(+), 5 deletions(-)

Comments

Alessio Balsini June 2, 2021, 3:50 p.m. UTC | #1
On Tue, Jun 01, 2021 at 04:58:26PM +0800, Peng Tao wrote:
> Add a generic file store that userspace can save/restore any open file
> descriptor. These file descriptors can be managed by different
> applications not just the same user space application.
> 
> A possible use case is fuse fd passthrough being developed
> by Alessio Balsini [1] where underlying file system fd can be saved in
> this file store.
> 
> Another possible use case is user space application live upgrade and
> failover (upon panic etc.). Currently during userspace live upgrade and
> failover, open file descriptors usually have to be saved seprately in
> a different management process with AF_UNIX sendmsg.
> 
> But it causes chicken and egg problem and such management process needs
> to support live upgrade and failover as well. With a generic file store
> in the kernel, application live upgrade and failover no longer require such
> management process to hold reference for their open file descriptors.
> 
> This is an RFC to see if the approach makes sense to upstream and it can be
> tested with the following C programe.
> 
> Why FUSE?
> - Because we are trying to solve FUSE fd passthrough and FUSE daemon
>   live upgrade.
> 
> Why global IDR rather than per fuse connnection one?
> - Because for live upgrade new process, we don't have a valid fuse connection
>   in the first place.
> 
> Missing cleanup method in case user space messes up?
> - We can limit the number of saved FDs and hey it is RFC ;).
> 
> [1] https://lore.kernel.org/lkml/20210125153057.3623715-1-balsini@android.com/
> --------
> 
> [...]
> 


Hi Peng,

This is a cool feature indeed.

I guess we also want to ensure that restoring an FD can only be
performed by a trusted FUSE daemon, and not any other process attached
to /dev/fuse. Maybe adding some permission checks?

I also see that multiple restores can be done on the same FD, is that
intended? Shouldn't the IDR entry be removed once restored?

As far as I understand, the main use case is to be able to replace a
FUSE daemon with another, preserving the opened lower file system files.
How would user space handle the unmounting of the old FUSE file system
and mounting of the new one?
I wonder if something can be done with a pair of ioctls similar to
FUSE_DEV_IOC_CLONE to transfer the FUSE connection from the old to the
new FUSE daemon.  Maybe either the IDR or some other container to store
the files that are intended to be preserved can be put in fuse_conn
instead of keeping it global.

Does it make sense?

Thanks,
Alessio
Peng Tao June 7, 2021, 7:46 a.m. UTC | #2
On Wed, Jun 2, 2021 at 11:52 PM Alessio Balsini <balsini@android.com> wrote:
>
> On Tue, Jun 01, 2021 at 04:58:26PM +0800, Peng Tao wrote:
> > Add a generic file store that userspace can save/restore any open file
> > descriptor. These file descriptors can be managed by different
> > applications not just the same user space application.
> >
> > A possible use case is fuse fd passthrough being developed
> > by Alessio Balsini [1] where underlying file system fd can be saved in
> > this file store.
> >
> > Another possible use case is user space application live upgrade and
> > failover (upon panic etc.). Currently during userspace live upgrade and
> > failover, open file descriptors usually have to be saved seprately in
> > a different management process with AF_UNIX sendmsg.
> >
> > But it causes chicken and egg problem and such management process needs
> > to support live upgrade and failover as well. With a generic file store
> > in the kernel, application live upgrade and failover no longer require such
> > management process to hold reference for their open file descriptors.
> >
> > This is an RFC to see if the approach makes sense to upstream and it can be
> > tested with the following C programe.
> >
> > Why FUSE?
> > - Because we are trying to solve FUSE fd passthrough and FUSE daemon
> >   live upgrade.
> >
> > Why global IDR rather than per fuse connnection one?
> > - Because for live upgrade new process, we don't have a valid fuse connection
> >   in the first place.
> >
> > Missing cleanup method in case user space messes up?
> > - We can limit the number of saved FDs and hey it is RFC ;).
> >
> > [1] https://lore.kernel.org/lkml/20210125153057.3623715-1-balsini@android.com/
> > --------
> >
> > [...]
> >
>
>
> Hi Peng,
Hi Alessio,

>
> This is a cool feature indeed.
>
> I guess we also want to ensure that restoring an FD can only be
> performed by a trusted FUSE daemon, and not any other process attached
> to /dev/fuse. Maybe adding some permission checks?
>
The idea is to allow any daemon capable of opening /dev/fuse to be
able to restore an FD. I don't quite get which permissions do you like
to check? SYS_ADMIN?

> I also see that multiple restores can be done on the same FD, is that
> intended? Shouldn't the IDR entry be removed once restored?
>
In a crash recovery scenario, if the kernel destroys the IDR once an
FD is restored, and the recovering daemon panics again, the FD is lost
forever. So I would prefer to keep it in the kernel until explicit FD
removal.

> As far as I understand, the main use case is to be able to replace a
> FUSE daemon with another, preserving the opened lower file system files.
> How would user space handle the unmounting of the old FUSE file system
> and mounting of the new one?
It can call FUSE_DEV_IOC_REMOVE_FD before or after unmounting the old
FUSE file system. Either way, the last closer of the FUSE connection
FD would actually close the FUSE connection.

> I wonder if something can be done with a pair of ioctls similar to
> FUSE_DEV_IOC_CLONE to transfer the FUSE connection from the old to the
> new FUSE daemon.  Maybe either the IDR or some other container to store
> the files that are intended to be preserved can be put in fuse_conn
> instead of keeping it global.
>
> Does it make sense?
>
It makes sense at first glance since obviously it helps IDR cleaning
up as we can do it on a per fuse_conn basis. But giving it a second
thought, how do we preserve the FUSE connection fd representing the
same fuse_conn itself? We need to do it because we want to handle FUSE
daemon crash recovery cases. Maybe we can have something like:
1. use a tag to uniquely identify a fuse conn (as being done for virtio-fs)
2. in each of these SAVE/GET/REMOVE ioctls, it takes a tag argument so
FDs are kept locally to the specified fuse_conn
3. add FUSE_DEV_IOC_TRANSFER ioctl to transfer ownership of a saved FD
to a new userspace daemon. It can be seen as a combination of GET and
REMOVE ioctls

Cheers,
Tao
Enrico Weigelt, metux IT consult June 7, 2021, 3:32 p.m. UTC | #3
On 02.06.21 17:50, Alessio Balsini wrote:

>> A possible use case is fuse fd passthrough being developed
>> by Alessio Balsini [1] where underlying file system fd can be saved in
>> this file store.

oh, this could be quite what I'm currently intending do implement
(just considered 9p instead of fuse as it looks simpler to me):

I'd like the server being able to directly send an already opened fd to
the client (in response to it calling open()), quite like we can send
fd's via unix sockets.

The primary use case of that is sending already prepared fd's (eg. an
active network connection, locked-down file fd, a device that the client
can't open himself, etc).

Is that what you're working on ?


--mtx
Peng Tao June 8, 2021, 2:58 a.m. UTC | #4
On Mon, Jun 7, 2021 at 11:34 PM Enrico Weigelt, metux IT consult
<lkml@metux.net> wrote:
>
> On 02.06.21 17:50, Alessio Balsini wrote:
>
> >> A possible use case is fuse fd passthrough being developed
> >> by Alessio Balsini [1] where underlying file system fd can be saved in
> >> this file store.
>
> oh, this could be quite what I'm currently intending do implement
> (just considered 9p instead of fuse as it looks simpler to me):
>
> I'd like the server being able to directly send an already opened fd to
> the client (in response to it calling open()), quite like we can send
> fd's via unix sockets.
>
> The primary use case of that is sending already prepared fd's (eg. an
> active network connection, locked-down file fd, a device that the client
> can't open himself, etc).
>
> Is that what you're working on ?
If the server and client run on the same kernel, then yes, the current
RFC supports your use case as well, E.g.,
1. the server opens a file, saves the FD to the kernel, and passes the
IDR to the client.
2. the client retrieves the FD from the kernel

Does it match your use case?

Cheers,
Tao
Enrico Weigelt, metux IT consult June 8, 2021, 10:49 a.m. UTC | #5
On 08.06.21 04:58, Peng Tao wrote:

>> oh, this could be quite what I'm currently intending do implement
>> (just considered 9p instead of fuse as it looks simpler to me):
>>
>> I'd like the server being able to directly send an already opened fd to
>> the client (in response to it calling open()), quite like we can send
>> fd's via unix sockets.
>>
>> The primary use case of that is sending already prepared fd's (eg. an
>> active network connection, locked-down file fd, a device that the client
>> can't open himself, etc).
>>
>> Is that what you're working on ?
> If the server and client run on the same kernel, then yes, the current
> RFC supports your use case as well, E.g.,
> 1. the server opens a file, saves the FD to the kernel, and passes the
> IDR to the client.
> 2. the client retrieves the FD from the kernel
> 
> Does it match your use case?

Seems that's exactly what I'm looking for :)

Could you perhaps give a little example code how it looks in userland ?


--mtx
Peng Tao June 8, 2021, 12:41 p.m. UTC | #6
On Tue, Jun 8, 2021 at 6:50 PM Enrico Weigelt, metux IT consult
<lkml@metux.net> wrote:
>
> On 08.06.21 04:58, Peng Tao wrote:
>
> >> oh, this could be quite what I'm currently intending do implement
> >> (just considered 9p instead of fuse as it looks simpler to me):
> >>
> >> I'd like the server being able to directly send an already opened fd to
> >> the client (in response to it calling open()), quite like we can send
> >> fd's via unix sockets.
> >>
> >> The primary use case of that is sending already prepared fd's (eg. an
> >> active network connection, locked-down file fd, a device that the client
> >> can't open himself, etc).
> >>
> >> Is that what you're working on ?
> > If the server and client run on the same kernel, then yes, the current
> > RFC supports your use case as well, E.g.,
> > 1. the server opens a file, saves the FD to the kernel, and passes the
> > IDR to the client.
> > 2. the client retrieves the FD from the kernel
> >
> > Does it match your use case?
>
> Seems that's exactly what I'm looking for :)
>
> Could you perhaps give a little example code how it looks in userland ?
The initial RFC mail in the thread has a userspace example code. Does
it make sense to you?

Cheers,
Tao
Enrico Weigelt, metux IT consult June 9, 2021, 12:54 p.m. UTC | #7
On 08.06.21 14:41, Peng Tao wrote:

Hi,

> The initial RFC mail in the thread has a userspace example code. Does
> it make sense to you?

Sorry, I had missed that, now found it.

There're some things I don't quite understand:

* it just stores fd's I don't see anything where it is actually returned
   to some open() operation.
* the store is machine wide global - everybody uses the same number
   space, dont see any kind of access conrol ... how about security ?

I don't believe that just storing the fd's somewhere is really helpful
for that purpose - the fuse server shall be able to reply the open()
request with an fd, which then is directly transferred to the client.

--mtx
Peng Tao June 11, 2021, 12:46 p.m. UTC | #8
On Wed, Jun 9, 2021 at 11:16 PM Enrico Weigelt, metux IT consult
<lkml@metux.net> wrote:
>
> On 08.06.21 14:41, Peng Tao wrote:
>
> Hi,
>
Hi,
> > The initial RFC mail in the thread has a userspace example code. Does
> > it make sense to you?
>
> Sorry, I had missed that, now found it.
>
> There're some things I don't quite understand:
>
> * it just stores fd's I don't see anything where it is actually returned
>    to some open() operation.
The FUSE_DEV_IOC_RESTORE_FD ioctl returns the opened fd to a different process.

> * the store is machine wide global - everybody uses the same number
>    space, dont see any kind of access conrol ... how about security ?
>
The idea is that anyone capable of opening /dev/fuse can retrieve the FD.

> I don't believe that just storing the fd's somewhere is really helpful
> for that purpose - the fuse server shall be able to reply the open()
> request with an fd, which then is directly transferred to the client.
>
Could you describe your use case a bit? How does your client talk to
your server? Through open syscall or through some process-to-process
RPC calls?

Cheers,
Tao
Enrico Weigelt, metux IT consult June 15, 2021, 6:50 p.m. UTC | #9
On 11.06.21 14:46, Peng Tao wrote:

>>
>> * it just stores fd's I don't see anything where it is actually returned
>>     to some open() operation.
> The FUSE_DEV_IOC_RESTORE_FD ioctl returns the opened fd to a different process.

So, just open() a file on a fuse fs can't restore the fd directly 
(instead of opening a new file) ? If that's the case, that would mean,
userland has to take very special actions in order to get it. Right ?

>> * the store is machine wide global - everybody uses the same number
>>     space, dont see any kind of access conrol ... how about security ?
>>
> The idea is that anyone capable of opening /dev/fuse can retrieve the FD.
> 
>> I don't believe that just storing the fd's somewhere is really helpful
>> for that purpose - the fuse server shall be able to reply the open()
>> request with an fd, which then is directly transferred to the client.
>>
> Could you describe your use case a bit? How does your client talk to
> your server? Through open syscall or through some process-to-process
> RPC calls?

I'd like to write synthetic file systems (file servers) that allows
certain unprivileged processes (in some confined environment) directly
open()ing prepared file descriptors (e.g. devices, sockets, etc) that it
isn't allowed to open directly (but the server obviously is). Those fds
could be prepared in any ways (eg. sealed, seek'ed, already connected
sockets, etc).

The client thinks it just open()'s a normal file, but actually gets some
fd prepared elsewhere.


--mtx
Peng Tao June 16, 2021, 10:20 a.m. UTC | #10
On Wed, Jun 16, 2021 at 2:53 AM Enrico Weigelt, metux IT consult
<lkml@metux.net> wrote:
>
> On 11.06.21 14:46, Peng Tao wrote:
>
> >>
> >> * it just stores fd's I don't see anything where it is actually returned
> >>     to some open() operation.
> > The FUSE_DEV_IOC_RESTORE_FD ioctl returns the opened fd to a different process.
>
> So, just open() a file on a fuse fs can't restore the fd directly
> (instead of opening a new file) ? If that's the case, that would mean,
> userland has to take very special actions in order to get it. Right ?
Yes.

>
> >> * the store is machine wide global - everybody uses the same number
> >>     space, dont see any kind of access conrol ... how about security ?
> >>
> > The idea is that anyone capable of opening /dev/fuse can retrieve the FD.
> >
> >> I don't believe that just storing the fd's somewhere is really helpful
> >> for that purpose - the fuse server shall be able to reply the open()
> >> request with an fd, which then is directly transferred to the client.
> >>
> > Could you describe your use case a bit? How does your client talk to
> > your server? Through open syscall or through some process-to-process
> > RPC calls?
>
> I'd like to write synthetic file systems (file servers) that allows
> certain unprivileged processes (in some confined environment) directly
> open()ing prepared file descriptors (e.g. devices, sockets, etc) that it
> isn't allowed to open directly (but the server obviously is). Those fds
> could be prepared in any ways (eg. sealed, seek'ed, already connected
> sockets, etc).
>
> The client thinks it just open()'s a normal file, but actually gets some
> fd prepared elsewhere.
>
Oh, nop, that is not how the current RFC works. I see two gaps:
1. /dev/fuse is not accessible to all processes by default
2. open() syscall doesn't take enough arguments to tell the kernel
which file's fd it wants.

It seems that a proper solution to your use case is to:
1. extend the open() syscall to take a flag like FOPEN_FUSE_OPEN_FD (I
agree it's a bad name;)
2. FUSE kernel passes such a flag to fuse daemon
3. FUSE userspace daemon opens the file in the underlying file system,
store it to a kernel FD store, then return its IDR in the reply to
FUSE_OPEN API
4. FUSE kernel looks up underlying FD with the IDR, install it in the
calling process FD table, and return the new FD to the application

Is it what you want? It looks doable and is indeed an extension to the
current RFC.

Cheers,
Tao
Enrico Weigelt, metux IT consult June 16, 2021, 4:09 p.m. UTC | #11
On 16.06.21 12:20, Peng Tao wrote:
>> So, just open() a file on a fuse fs can't restore the fd directly
>> (instead of opening a new file) ? If that's the case, that would mean,
>> userland has to take very special actions in order to get it. Right ?
> Yes.

Then, it's not at all what I'm looking for :(

> Oh, nop, that is not how the current RFC works. I see two gaps:
> 1. /dev/fuse is not accessible to all processes by default

it shouldn't necessary at all - in my use case.

> 2. open() syscall doesn't take enough arguments to tell the kernel
> which file's fd it wants.

open() only works on a path name - that's exactly what I need.

Could you please tell more what your use case is actually about ?
Still struggling what you're actually going to achieve.

Just keeping fd's open while a server restarts ?
If that's what you want, I see much wider use far outside of fuse,
and that might call for some more generic approach - something like
Plan9's /srv filesystem.

> It seems that a proper solution to your use case is to:
> 1. extend the open() syscall to take a flag like FOPEN_FUSE_OPEN_FD (I
> agree it's a bad name;)

But that would still require changes in my userland. Something I do not
want per definition - it should work transparently. The application just
knows some file name (it might be even expecting common device names,
but put inside its own mount ns), nothing more, and no need to change
the application itself.

> 2. FUSE kernel passes such a flag to fuse daemon
> 3. FUSE userspace daemon opens the file in the underlying file system,
> store it to a kernel FD store, then return its IDR in the reply to
> FUSE_OPEN API
> 4. FUSE kernel looks up underlying FD with the IDR, install it in the
> calling process FD table, and return the new FD to the application

Does FUSE actually manipulate the process' fd table directly, while
in the open() callback ?


--mtx
Peng Tao June 17, 2021, 1:23 p.m. UTC | #12
On Thu, Jun 17, 2021 at 12:12 AM Enrico Weigelt, metux IT consult
<lkml@metux.net> wrote:
>
> On 16.06.21 12:20, Peng Tao wrote:
> >> So, just open() a file on a fuse fs can't restore the fd directly
> >> (instead of opening a new file) ? If that's the case, that would mean,
> >> userland has to take very special actions in order to get it. Right ?
> > Yes.
>
> Then, it's not at all what I'm looking for :(
>
> > Oh, nop, that is not how the current RFC works. I see two gaps:
> > 1. /dev/fuse is not accessible to all processes by default
>
> it shouldn't necessary at all - in my use case.
>
> > 2. open() syscall doesn't take enough arguments to tell the kernel
> > which file's fd it wants.
>
> open() only works on a path name - that's exactly what I need.
>
> Could you please tell more what your use case is actually about ?
> Still struggling what you're actually going to achieve.
>
> Just keeping fd's open while a server restarts ?
> If that's what you want, I see much wider use far outside of fuse,
> and that might call for some more generic approach - something like
> Plan9's /srv filesystem.
>
1. keeping FDs across userspace restart
2. help save FD in the FUSE fd passthrough use case as implemented by
Alessio Balsini

> > It seems that a proper solution to your use case is to:
> > 1. extend the open() syscall to take a flag like FOPEN_FUSE_OPEN_FD (I
> > agree it's a bad name;)
>
> But that would still require changes in my userland. Something I do not
> want per definition - it should work transparently. The application just
> knows some file name (it might be even expecting common device names,
> but put inside its own mount ns), nothing more, and no need to change
> the application itself.
>
> > 2. FUSE kernel passes such a flag to fuse daemon
> > 3. FUSE userspace daemon opens the file in the underlying file system,
> > store it to a kernel FD store, then return its IDR in the reply to
> > FUSE_OPEN API
> > 4. FUSE kernel looks up underlying FD with the IDR, install it in the
> > calling process FD table, and return the new FD to the application
>
> Does FUSE actually manipulate the process' fd table directly, while
> in the open() callback ?

hmm, you are right. The open() callback cannot install FD from there.
So in order for your use case to work, the VFS layer needs to be
changed to transparently replace an empty file struct with another
file struct that is prepared by the file system somewhere else. It is
really beyond the current RFC patch's scope IMHO.

Cheers,
Tao


Cheers,
Tao
--
Into Sth. Rich & Strange
Enrico Weigelt, metux IT consult June 21, 2021, 7:05 p.m. UTC | #13
On 17.06.21 15:23, Peng Tao wrote:

>> Just keeping fd's open while a server restarts ?
>> If that's what you want, I see much wider use far outside of fuse,
>> and that might call for some more generic approach - something like
>> Plan9's /srv filesystem.
>>
> 1. keeping FDs across userspace restart

if application needs to be rewritten for that anyways, there're other
ways to achieve this, w/o touching the kernel at all - exec() doesn't
automatically close fd's (unless they're opened w/ O_CLOEXEC)

> 2. help save FD in the FUSE fd passthrough use case as implemented by
> Alessio Balsini

you mean this one ?

https://lore.kernel.org/lkml/20210125153057.3623715-1-balsini@android.com

I fail to see why an extra fd store within the fuse device is necessary
for that - I'd just let the fuse daemon(s) reply the open request with
the fd it already holds.

I'd hate to run into situations where even killing all processes holding
some file open leads to a situation where it remains open inside the
kernel, thus blocking e.g. unmounting. I already see operators getting
very angy ... :o

by the way: alessio's approach is limited to simple read/write
operations anyways - other operations like ioctl() don't seem to work
easily that way.

and for the creds switching: I tend to believe that cases where a fs or
device looks at the calling process' creds in operations on an already
open fd, it's most likely a bad implementation.

yes, some legacy drivers actually do check for CAP_SYS_ADMIN e.g. for
low level hardware configuration (e.g. IO and IRQ on ISA bus), but I
wonder whether these are use at all in the our use cases and should be
ever allowed to non-root.

do you have any case where you really need to use the opener's creds ?
(after the fd is already open)

>> Does FUSE actually manipulate the process' fd table directly, while
>> in the open() callback ?
> 
> hmm, you are right. The open() callback cannot install FD from there.
> So in order for your use case to work, the VFS layer needs to be
> changed to transparently replace an empty file struct with another
> file struct that is prepared by the file system somewhere else. It is
> really beyond the current RFC patch's scope IMHO.

Exactly. That's where I'm struggling right now. Yet have to find out
whether I could just copy from one struct file into another (probably
some refcnt'ing required). And that still has some drawback: fd state
like file position won't be shared.

I've been thinking about changing the vfs_open() chain so that it
doesn't pass in an existing/prepared struct file, but instead returns
one, which is allocated further down the chain, right before the fs'
open operation is called. Then we could add another variant that
returns struct file. If the new one is present, it will be called,
otherwise a new struct file is allocated, the old variant is called
on the newly allocated one, and finally return this one.

this is a bigger job to do ...


--mtx
Peng Tao June 22, 2021, 6:46 a.m. UTC | #14
On Tue, Jun 22, 2021 at 3:07 AM Enrico Weigelt, metux IT consult
<lkml@metux.net> wrote:
>
> On 17.06.21 15:23, Peng Tao wrote:
>
> >> Just keeping fd's open while a server restarts ?
> >> If that's what you want, I see much wider use far outside of fuse,
> >> and that might call for some more generic approach - something like
> >> Plan9's /srv filesystem.
> >>
> > 1. keeping FDs across userspace restart
>
> if application needs to be rewritten for that anyways, there're other
> ways to achieve this, w/o touching the kernel at all - exec() doesn't
> automatically close fd's (unless they're opened w/ O_CLOEXEC)
Or application recovery after panic ;)

>
> > 2. help save FD in the FUSE fd passthrough use case as implemented by
> > Alessio Balsini
>
> you mean this one ?
>
> https://lore.kernel.org/lkml/20210125153057.3623715-1-balsini@android.com
>
> I fail to see why an extra fd store within the fuse device is necessary
> for that - I'd just let the fuse daemon(s) reply the open request with
> the fd it already holds.
Alessio already has a similar implementation in his patchset. The RPC
patch tries to make it generic and thus usable for other use cases
like fuse daemon upgrade and panic-recovery.b

>
> I'd hate to run into situations where even killing all processes holding
> some file open leads to a situation where it remains open inside the
> kernel, thus blocking e.g. unmounting. I already see operators getting
> very angy ... :o
This is really a different design approach. The idea is to keep an FD
active beyond the lifetime of a running process so that we can do
panic recovery. Alessio's patchset has similar side effect in some
corner cases and this RFC patch makes it a semantic promise. Whether
ops like it would really depend on what they want.

>
> by the way: alessio's approach is limited to simple read/write
> operations anyways - other operations like ioctl() don't seem to work
> easily that way.
>
> and for the creds switching: I tend to believe that cases where a fs or
> device looks at the calling process' creds in operations on an already
> open fd, it's most likely a bad implementation.
>
I agree but I understand the rationale as well. A normal FUSE
read/write uses FUSE daemon creds so the semantics are the same.
Otherwise as you outline below, we'd have to go through all the
read/write callbacks to make sure none of them is checking process
creds.

> yes, some legacy drivers actually do check for CAP_SYS_ADMIN e.g. for
> low level hardware configuration (e.g. IO and IRQ on ISA bus), but I
> wonder whether these are use at all in the our use cases and should be
> ever allowed to non-root.
>
> do you have any case where you really need to use the opener's creds ?
> (after the fd is already open)
>
> >> Does FUSE actually manipulate the process' fd table directly, while
> >> in the open() callback ?
> >
> > hmm, you are right. The open() callback cannot install FD from there.
> > So in order for your use case to work, the VFS layer needs to be
> > changed to transparently replace an empty file struct with another
> > file struct that is prepared by the file system somewhere else. It is
> > really beyond the current RFC patch's scope IMHO.
>
> Exactly. That's where I'm struggling right now. Yet have to find out
> whether I could just copy from one struct file into another (probably
> some refcnt'ing required). And that still has some drawback: fd state
> like file position won't be shared.
>
> I've been thinking about changing the vfs_open() chain so that it
> doesn't pass in an existing/prepared struct file, but instead returns
> one, which is allocated further down the chain, right before the fs'
> open operation is called. Then we could add another variant that
> returns struct file. If the new one is present, it will be called,
> otherwise a new struct file is allocated, the old variant is called
> on the newly allocated one, and finally return this one.
>
> this is a bigger job to do ...
>
Agreed.

Cheers,
Tao
Enrico Weigelt, metux IT consult June 24, 2021, 2:19 p.m. UTC | #15
On 22.06.21 08:46, Peng Tao wrote:

> Or application recovery after panic ;)

Okay, thats a different scenario. But, of course, the application needs
to have the fds already registered before it crashes. This gives two
options:

a) always store them right after opening and do it's own garbage
    collection (e.g. on each close() call). - expensive and complex
b) construct it in a way that evem on critical signals (eg. sigsegv,
    sigbus) a signal handler can still store the fds somewhere (eg. using
    statically allocated memory and extra stack) - tricky

OTOH, i'd try to construct in a way that a crash of some master process
(that can hold the fds even across restarts) is very unlikely to crash,
since it doesn't do much more than that (plus spawning workers).

By the way, if you just wanna store fd's - i'm working on a more generic
solution: an Plan9-style srvfs. It's a file system that stores aready
opened fd's and on open() gives your that fd (instead of a new one).

My previous experimental implementation did that indirectly by bridging
all operations under the hood (tedious to synchronize the whole file
state), but I'm now taking a fresh start w/ adding some "file boxing"
mechanism to the kernel (patches not ready for submission):

* a file systems's open operation can put a pointer to another struct
   file into the struct file it's operating on -- that's what I called
   a "boxed file".
* the places (actually two) that actually create new struct file's and
   call into the open chain (eg. through vfs_open() etc) then do the
   unboxing -- if there's a boxed file, they fetch it out and drop the
   just newly fd.

> Alessio already has a similar implementation in his patchset. The RPC
> patch tries to make it generic and thus usable for other use cases
> like fuse daemon upgrade and panic-recovery.b

I believe this shouldn't be some fuse specific thing. And we certainly
have to make sure that it can't be abused for dos'ing the machine.
Not sure whether that should be accounted to a namespace or cgroup.

>> I'd hate to run into situations where even killing all processes holding
>> some file open leads to a situation where it remains open inside the
>> kernel, thus blocking e.g. unmounting. I already see operators getting
>> very angy ... :o
> This is really a different design approach. The idea is to keep an FD
> active beyond the lifetime of a running process so that we can do
> panic recovery. Alessio's patchset has similar side effect in some
> corner cases and this RFC patch makes it a semantic promise. Whether
> ops like it would really depend on what they want.

The problem is: (most) fd's are bound to some processes - when they're
killed, the fd's are closed. Usually you can force a closing files
(and thus allow unmount) by checking via lsof which processes still hold
open fd to them and kill'em. If we can't do that anymore, we can run
into big trouble. There needs to be some clear lifetime control for
that.

Let's look at containers: usually the runtime/orchestration sets up a
bunch of mounts before starting the actual workload inside the 
container. On container shutdown, the processes are killed and then
everything's unmounted again (temporary storage, eg. lvm volumes or
btrfs subvols are removed afterwards).

Now, with the persistant fd's, an unprivileged user can block that
(either accidentially or on purpose). We cannot allow that to happen.

Apropos containers: this really should be, some how, bound to some
namespace (not sure whether mountfs or userns is the right place),
so containers cannot interfer with each other.

> I agree but I understand the rationale as well. A normal FUSE
> read/write uses FUSE daemon creds so the semantics are the same.
> Otherwise as you outline below, we'd have to go through all the
> read/write callbacks to make sure none of them is checking process
> creds.

I've actually looked deeper into this. There indeed are certain places
with checks for CAP_SYS_ADMIN, but these are really root-only things
where we should think very carefully whether they should work with
fds passed to processes under separate users at all. And they also
never worked with fd passing via unix socket.



--mtx
diff mbox series

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a5ceccc..dfce809 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2229,15 +2229,14 @@  static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
 static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 			   unsigned long arg)
 {
-	int res;
-	int oldfd;
+	int res = -EFAULT;
+	int fd;
 	struct fuse_dev *fud = NULL;
 
 	switch (cmd) {
 	case FUSE_DEV_IOC_CLONE:
-		res = -EFAULT;
-		if (!get_user(oldfd, (__u32 __user *)arg)) {
-			struct file *old = fget(oldfd);
+		if (!get_user(fd, (__u32 __user *)arg)) {
+			struct file *old = fget(fd);
 
 			res = -EINVAL;
 			if (old) {
@@ -2258,6 +2257,28 @@  static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 			}
 		}
 		break;
+	case FUSE_DEV_IOC_SAVE_FD:
+		if (!get_user(fd, (__u32 __user *) arg)) {
+			res = fuse_filp_save(fd);
+			if (res > 0) {
+				res = put_user(res, (__u32 __user *) arg);
+			}
+		}
+		break;
+	case FUSE_DEV_IOC_RESTORE_FD:
+		if (!get_user(fd, (__u32 __user *) arg)) {
+			res = fuse_filp_restore(fd);
+			if (res > 0) {
+				res = put_user(res, (__u32 __user *) arg);
+			}
+		}
+		break;
+	case FUSE_DEV_IOC_REMOVE_FD:
+		if (!get_user(fd, (__u32 __user *) arg)) {
+			fuse_filp_remove(fd);
+			res = 0;
+		}
+		break;
 	default:
 		res = -ENOTTY;
 		break;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7e463e2..1060391 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1259,4 +1259,9 @@  struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 		       unsigned int open_flags, fl_owner_t id, bool isdir);
 
+/* fuse fd store management */
+int fuse_filp_save(int fd);
+int fuse_filp_restore(int id);
+void fuse_filp_remove(int id);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 393e36b7..b95c597 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -23,6 +23,7 @@ 
 #include <linux/exportfs.h>
 #include <linux/posix_acl.h>
 #include <linux/pid_namespace.h>
+#include <linux/idr.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -32,6 +33,9 @@ 
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 
+DEFINE_IDR(fuse_filp_store);
+DEFINE_MUTEX(fuse_filp_store_mutex);
+
 static int set_global_limit(const char *val, const struct kernel_param *kp);
 
 unsigned max_user_bgreq;
@@ -1623,6 +1627,86 @@  static inline void unregister_fuseblk(void)
 }
 #endif
 
+int fuse_filp_save(int fd)
+{
+	struct file *filp;
+	int res = -EBADF;
+
+	filp = fget(fd);
+	if (!filp) {
+		pr_err("FUSE: invalid file descriptor to save.\n");
+		goto out;
+	}
+
+	idr_preload(GFP_KERNEL);
+	mutex_lock(&fuse_filp_store_mutex);
+	res = idr_alloc(&fuse_filp_store, filp, 1, 0, GFP_ATOMIC);
+	mutex_unlock(&fuse_filp_store_mutex);
+	idr_preload_end();
+
+	if (res < 0)
+		fput(filp);
+
+out:
+	return res;
+}
+
+static struct file *do_fuse_filp_restore(int id)
+{
+	struct file *filp;
+
+	rcu_read_lock();
+	filp = idr_find(&fuse_filp_store, id);
+	if (filp && !get_file_rcu(filp))
+		filp = NULL;
+	rcu_read_unlock();
+
+	return filp;
+}
+
+int fuse_filp_restore(int id)
+{
+	struct file *filp;
+	int res;
+
+	filp = do_fuse_filp_restore(id);
+	if (!filp)
+		return -ENOENT;
+
+	res = get_unused_fd_flags(0);
+	if (res >= 0)
+		fd_install(res, filp);
+	else
+		fput(filp);
+
+	return res;
+}
+
+void fuse_filp_remove(int id)
+{
+	struct file *filp;
+
+	mutex_lock(&fuse_filp_store_mutex);
+	filp = idr_remove(&fuse_filp_store, id);
+	mutex_unlock(&fuse_filp_store_mutex);
+	if (filp)
+		fput(filp);
+}
+
+static int fuse_filp_free(int id, void *p, void *data)
+{
+	struct file *filp = (struct file *)p;
+	fput(filp);
+
+	return 0;
+}
+
+static void fuse_filp_store_cleanup(void)
+{
+	idr_for_each(&fuse_filp_store, &fuse_filp_free, NULL);
+	idr_destroy(&fuse_filp_store);
+}
+
 static void fuse_inode_init_once(void *foo)
 {
 	struct inode *inode = foo;
@@ -1746,6 +1830,7 @@  static void __exit fuse_exit(void)
 {
 	pr_debug("exit\n");
 
+	fuse_filp_store_cleanup();
 	fuse_ctl_cleanup();
 	fuse_sysfs_cleanup();
 	fuse_fs_cleanup();
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 271ae90..d12a5c2 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -919,6 +919,9 @@  struct fuse_notify_retrieve_in {
 /* Device ioctls: */
 #define FUSE_DEV_IOC_MAGIC		229
 #define FUSE_DEV_IOC_CLONE		_IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
+#define FUSE_DEV_IOC_SAVE_FD		_IOWR(FUSE_DEV_IOC_MAGIC, 1, uint32_t)
+#define FUSE_DEV_IOC_RESTORE_FD		_IOWR(FUSE_DEV_IOC_MAGIC, 2, uint32_t)
+#define FUSE_DEV_IOC_REMOVE_FD		_IOW(FUSE_DEV_IOC_MAGIC, 3, uint32_t)
 
 struct fuse_lseek_in {
 	uint64_t	fh;