Message ID | 20240520-exportfs-u64-mount-id-v1-1-f55fd9215b8e@cyphar.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [RFC] fhandle: expose u64 mount id to name_to_handle_at(2) | expand |
On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote: > Now that we have stabilised the unique 64-bit mount ID interface in > statx, we can now provide a race-free way for name_to_handle_at(2) to > provide a file handle and corresponding mount without needing to worry > about racing with /proc/mountinfo parsing. > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > that doesn't make sense for name_to_handle_at(2). > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > --- > fs/fhandle.c | 27 +++++++++++++++++++-------- > include/uapi/linux/fcntl.h | 2 ++ > 2 files changed, 21 insertions(+), 8 deletions(-) > > diff --git a/fs/fhandle.c b/fs/fhandle.c > index 8a7f86c2139a..6bc7ffccff8c 100644 > --- a/fs/fhandle.c > +++ b/fs/fhandle.c > @@ -16,7 +16,8 @@ > > static long do_sys_name_to_handle(const struct path *path, > struct file_handle __user *ufh, > - int __user *mnt_id, int fh_flags) > + void __user *mnt_id, bool unique_mntid, > + int fh_flags) > { > long retval; > struct file_handle f_handle; > @@ -69,10 +70,16 @@ static long do_sys_name_to_handle(const struct path *path, > } else > retval = 0; > /* copy the mount id */ > - if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || > - copy_to_user(ufh, handle, > - struct_size(handle, f_handle, handle_bytes))) > - retval = -EFAULT; > + if (unique_mntid) > + retval = put_user(real_mount(path->mnt)->mnt_id_unique, > + (u64 __user *) mnt_id); > + else > + retval = put_user(real_mount(path->mnt)->mnt_id, > + (int __user *) mnt_id); > + /* copy the handle */ > + if (!retval) > + retval = copy_to_user(ufh, handle, > + struct_size(handle, f_handle, handle_bytes)); > kfree(handle); > return retval; > } > @@ -83,6 +90,7 @@ static long do_sys_name_to_handle(const struct path *path, > * @name: name that should be converted to handle. > * @handle: resulting file handle > * @mnt_id: mount id of the file system containing the file > + * (u64 if AT_HANDLE_UNIQUE_MNT_ID, otherwise int) > * @flag: flag value to indicate whether to follow symlink or not > * and whether a decodable file handle is required. > * > @@ -92,7 +100,7 @@ static long do_sys_name_to_handle(const struct path *path, > * value required. > */ > SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, > - struct file_handle __user *, handle, int __user *, mnt_id, > + struct file_handle __user *, handle, void __user *, mnt_id, > Changing the syscall signature like this is rather nasty. The new flag seems like it should safely gate the difference, but I still have some concerns about misuse and people passing in too small a buffer for the mnt_id. > int, flag) > { > struct path path; > @@ -100,7 +108,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, > int fh_flags; > int err; > > - if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) > + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | > + AT_HANDLE_UNIQUE_MNT_ID)) > return -EINVAL; > > lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; > @@ -109,7 +118,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, > lookup_flags |= LOOKUP_EMPTY; > err = user_path_at(dfd, name, lookup_flags, &path); > if (!err) { > - err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); > + err = do_sys_name_to_handle(&path, handle, mnt_id, > + flag & AT_HANDLE_UNIQUE_MNT_ID, > + fh_flags); > path_put(&path); > } > return err; > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h > index c0bcc185fa48..fda970f92fba 100644 > --- a/include/uapi/linux/fcntl.h > +++ b/include/uapi/linux/fcntl.h > @@ -118,6 +118,8 @@ > #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to > compare object identity and may not > be usable to open_by_handle_at(2) */ > +#define AT_HANDLE_UNIQUE_MNT_ID AT_STATX_FORCE_SYNC /* returned mount id is > + the u64 unique mount id */ > #if defined(__KERNEL__) > #define AT_GETATTR_NOSEC 0x80000000 > #endif > > --- > base-commit: 584bbf439d0fa83d728ec49f3a38c581bdc828b4 > change-id: 20240515-exportfs-u64-mount-id-9ebb5c58b53c > > Best regards,
On 2024-05-20, Jeff Layton <jlayton@kernel.org> wrote: > On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote: > > Now that we have stabilised the unique 64-bit mount ID interface in > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > provide a file handle and corresponding mount without needing to worry > > about racing with /proc/mountinfo parsing. > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > that doesn't make sense for name_to_handle_at(2). > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > --- > > fs/fhandle.c | 27 +++++++++++++++++++-------- > > include/uapi/linux/fcntl.h | 2 ++ > > 2 files changed, 21 insertions(+), 8 deletions(-) > > > > diff --git a/fs/fhandle.c b/fs/fhandle.c > > index 8a7f86c2139a..6bc7ffccff8c 100644 > > --- a/fs/fhandle.c > > +++ b/fs/fhandle.c > > @@ -16,7 +16,8 @@ > > > > static long do_sys_name_to_handle(const struct path *path, > > struct file_handle __user *ufh, > > - int __user *mnt_id, int fh_flags) > > + void __user *mnt_id, bool unique_mntid, > > + int fh_flags) > > { > > long retval; > > struct file_handle f_handle; > > @@ -69,10 +70,16 @@ static long do_sys_name_to_handle(const struct path *path, > > } else > > retval = 0; > > /* copy the mount id */ > > - if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || > > - copy_to_user(ufh, handle, > > - struct_size(handle, f_handle, handle_bytes))) > > - retval = -EFAULT; > > + if (unique_mntid) > > + retval = put_user(real_mount(path->mnt)->mnt_id_unique, > > + (u64 __user *) mnt_id); > > + else > > + retval = put_user(real_mount(path->mnt)->mnt_id, > > + (int __user *) mnt_id); > > + /* copy the handle */ > > + if (!retval) > > + retval = copy_to_user(ufh, handle, > > + struct_size(handle, f_handle, handle_bytes)); > > kfree(handle); > > return retval; > > } > > @@ -83,6 +90,7 @@ static long do_sys_name_to_handle(const struct path *path, > > * @name: name that should be converted to handle. > > * @handle: resulting file handle > > * @mnt_id: mount id of the file system containing the file > > + * (u64 if AT_HANDLE_UNIQUE_MNT_ID, otherwise int) > > * @flag: flag value to indicate whether to follow symlink or not > > * and whether a decodable file handle is required. > > * > > @@ -92,7 +100,7 @@ static long do_sys_name_to_handle(const struct path *path, > > * value required. > > */ > > SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, > > - struct file_handle __user *, handle, int __user *, mnt_id, > > + struct file_handle __user *, handle, void __user *, mnt_id, > > > > Changing the syscall signature like this is rather nasty. The new flag > seems like it should safely gate the difference, but I still have some > concerns about misuse and people passing in too small a buffer for the > mnt_id. Yeah, it's a little ugly, but an name_to_handle_at2 feels like overkill for such a minor change. I'm also not sure there's a huge risk of users accidentally passing AT_HANDLE_UNIQUE_MNT_ID with an (int *). > > int, flag) > > { > > struct path path; > > @@ -100,7 +108,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, > > int fh_flags; > > int err; > > > > - if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) > > + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | > > + AT_HANDLE_UNIQUE_MNT_ID)) > > return -EINVAL; > > > > lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; > > @@ -109,7 +118,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, > > lookup_flags |= LOOKUP_EMPTY; > > err = user_path_at(dfd, name, lookup_flags, &path); > > if (!err) { > > - err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); > > + err = do_sys_name_to_handle(&path, handle, mnt_id, > > + flag & AT_HANDLE_UNIQUE_MNT_ID, > > + fh_flags); > > path_put(&path); > > } > > return err; > > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h > > index c0bcc185fa48..fda970f92fba 100644 > > --- a/include/uapi/linux/fcntl.h > > +++ b/include/uapi/linux/fcntl.h > > @@ -118,6 +118,8 @@ > > #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to > > compare object identity and may not > > be usable to open_by_handle_at(2) */ > > +#define AT_HANDLE_UNIQUE_MNT_ID AT_STATX_FORCE_SYNC /* returned mount id is > > + the u64 unique mount id */ > > #if defined(__KERNEL__) > > #define AT_GETATTR_NOSEC 0x80000000 > > #endif > > > > --- > > base-commit: 584bbf439d0fa83d728ec49f3a38c581bdc828b4 > > change-id: 20240515-exportfs-u64-mount-id-9ebb5c58b53c > > > > Best regards, > > -- > Jeff Layton <jlayton@kernel.org> >
On Tue, May 21, 2024 at 1:28 AM Aleksa Sarai <cyphar@cyphar.com> wrote: > > On 2024-05-20, Jeff Layton <jlayton@kernel.org> wrote: > > On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote: > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > provide a file handle and corresponding mount without needing to worry > > > about racing with /proc/mountinfo parsing. Both statx() and name_to_handle_at() support AT_EMPTY_PATH, so there is a race-free way to get a file handle and unique mount id for statmount(). Why do you mean /proc/mountinfo parsing? > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > that doesn't make sense for name_to_handle_at(2). Christian is probably regretting merging AT_HANDLE_FID now ;-) Seriously, I would rearrange the AT_* flags namespace this way to explicitly declare the overloaded per-syscall AT_* flags and possibly prepare for the upcoming setxattrat(2) syscall [1]. [1] https://lore.kernel.org/linux-fsdevel/20240426162042.191916-1-cgoettsche@seltendoof.de/ The reason I would avoid overloading the AT_STATX_* flags is that they describe a generic behavior that could potentially be relevant to other syscalls in the future, e.g.: renameat2(..., AT_RENAME_TEMPFILE | AT_FORCE_SYNC); But then again, I don't understand why you need to extend name_to_handle_at() at all for your purpose... Thanks, Amir. --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h [...] + +#define AT_PRIVATE_FLAGS 0x2ff /* Per-syscall flags mask. */ + +/* Common flags for *at() syscalls */ #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ -#define AT_EACCESS 0x200 /* Test access permitted for - effective IDs, not real IDs. */ -#define AT_REMOVEDIR 0x200 /* Remove directory instead of - unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ #define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ +/* Flags for statx(2) */ #define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ #define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ #define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ [...] #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ -/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ -#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to +/* Flags for name_to_handle_at(2) */ +#define AT_HANDLE_FID 0x200 /* file handle is needed to compare object identity and may not be usable to open_by_handle_at(2) */ +/* Flags for faccessat(2) */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ +/* Flags for unlinkat(2) */ +#define AT_REMOVEDIR 0x200 /* Remove directory instead of + unlinking file. */ + +/* Flags for renameat2(2) (should match legacy RENAME_* flags) */ +#define AT_RENAME_NOREPLACE 0x001 /* Don't overwrite target */ +#define AT_RENAME_EXCHANGE 0x002 /* Exchange source and dest */ +#define AT_RENAME_WHITEOUT 0x004 /* Whiteout source */ +#define AT_RENAME_TEMPFILE 0x008 /* Source file is O_TMPFILE */ + +/* Flags for setxattrat(2) (should match legacy XATTR_* flags) */ +#define AT_XATTR_CREATE 0x001 /* set value, fail if attr already exists */ +#define AT_XATTR_REPLACE 0x002 /* set value, fail if attr does not exist */ +
On 2024-05-21, Amir Goldstein <amir73il@gmail.com> wrote: > On Tue, May 21, 2024 at 1:28 AM Aleksa Sarai <cyphar@cyphar.com> wrote: > > > > On 2024-05-20, Jeff Layton <jlayton@kernel.org> wrote: > > > On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote: > > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > > provide a file handle and corresponding mount without needing to worry > > > > about racing with /proc/mountinfo parsing. > > Both statx() and name_to_handle_at() support AT_EMPTY_PATH, so > there is a race-free way to get a file handle and unique mount id > for statmount(). Doing it that way would require doing an open and statx for every path you want to get a filehandle for, tripling the number of syscalls you need to do. This is related to the syscall overhead issue Lennart talked about last week at LSF (though for his usecase we would need to add a hashed filehandle in statx). > Why do you mean /proc/mountinfo parsing? The man page for name_to_handle_at(2) talks about needing to parse /proc/mountinfo as well as the possible races you can hit. > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > > that doesn't make sense for name_to_handle_at(2). > > Christian is probably regretting merging AT_HANDLE_FID now ;-) > > Seriously, I would rearrange the AT_* flags namespace this way to > explicitly declare the overloaded per-syscall AT_* flags and possibly > prepare for the upcoming setxattrat(2) syscall [1]. I'm not sure that unifying the flag namespace is a good idea -- while it would be nicer, burning a flag bit for an extension will be more expensive because we would only have 32 bits for every possible extension we ever plan to have. FWIW, I think that statx should've had their own flag namespace like move_mount and renameat2. > [1] https://lore.kernel.org/linux-fsdevel/20240426162042.191916-1-cgoettsche@seltendoof.de/ > > The reason I would avoid overloading the AT_STATX_* flags is that > they describe a generic behavior that could potentially be relevant to > other syscalls in the future, e.g.: > renameat2(..., AT_RENAME_TEMPFILE | AT_FORCE_SYNC); Yeah, you might be right that the sync-related flags aren't the right ones to overload here. > But then again, I don't understand why you need to extend name_to_handle_at() > at all for your purpose... > > Thanks, > Amir. > > --- a/include/uapi/linux/fcntl.h > +++ b/include/uapi/linux/fcntl.h > [...] > + > +#define AT_PRIVATE_FLAGS 0x2ff /* Per-syscall flags mask. */ > + > +/* Common flags for *at() syscalls */ > #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ > -#define AT_EACCESS 0x200 /* Test access permitted for > - effective IDs, not real IDs. */ > -#define AT_REMOVEDIR 0x200 /* Remove directory instead of > - unlinking file. */ > #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ > #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal > automount traversal */ > #define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ > > +/* Flags for statx(2) */ > #define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation > required from statx() */ > #define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ > #define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to > be sync'd with the server */ > [...] > > #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ > > -/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ > -#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to > +/* Flags for name_to_handle_at(2) */ > +#define AT_HANDLE_FID 0x200 /* file handle is needed to > compare object identity and may not > be usable to open_by_handle_at(2) */ > +/* Flags for faccessat(2) */ > +#define AT_EACCESS 0x200 /* Test access permitted for > + effective IDs, not real IDs. */ > +/* Flags for unlinkat(2) */ > +#define AT_REMOVEDIR 0x200 /* Remove directory instead of > + unlinking file. */ > + > +/* Flags for renameat2(2) (should match legacy RENAME_* flags) */ > +#define AT_RENAME_NOREPLACE 0x001 /* Don't overwrite target */ > +#define AT_RENAME_EXCHANGE 0x002 /* Exchange source and dest */ > +#define AT_RENAME_WHITEOUT 0x004 /* Whiteout source */ > +#define AT_RENAME_TEMPFILE 0x008 /* Source file is O_TMPFILE */ > + > +/* Flags for setxattrat(2) (should match legacy XATTR_* flags) */ > +#define AT_XATTR_CREATE 0x001 /* set value, fail if > attr already exists */ > +#define AT_XATTR_REPLACE 0x002 /* set value, fail if attr > does not exist */ > +
On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > Now that we have stabilised the unique 64-bit mount ID interface in > statx, we can now provide a race-free way for name_to_handle_at(2) to > provide a file handle and corresponding mount without needing to worry > about racing with /proc/mountinfo parsing. > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > that doesn't make sense for name_to_handle_at(2). > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > --- So I think overall this is probably fine (famous last words). If it's just about being able to retrieve the new mount id without having to take the hit of another statx system call it's indeed a bit much to add a revised system call for this. Althoug I did say earlier that I wouldn't rule that out. But if we'd that then it'll be a long discussion on the form of the new system call and the information it exposes. For example, I lack the grey hair needed to understand why name_to_handle_at() returns a mount id at all. The pitch in commit 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that the (old) mount id can be used to "lookup file system specific information [...] in /proc/<pid>/mountinfo". Granted, that's doable but it'll mean a lot of careful checking to avoid races for mount id recycling because they're not even allocated cyclically. With lots of containers it becomes even more of an issue. So it's doubtful whether exposing the mount id through name_to_handle_at() would be something that we'd still do. So really, if this is just about a use-case where you want to spare the additional system call for statx() and you need the mnt_id then overloading is probably ok. But it remains an unpleasant thing to look at.
On Tue, May 21, 2024 at 03:46:06PM +0200, Christian Brauner wrote: > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > Now that we have stabilised the unique 64-bit mount ID interface in > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > provide a file handle and corresponding mount without needing to worry > > about racing with /proc/mountinfo parsing. > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > that doesn't make sense for name_to_handle_at(2). > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > --- > > So I think overall this is probably fine (famous last words). If it's > just about being able to retrieve the new mount id without having to > take the hit of another statx system call it's indeed a bit much to > add a revised system call for this. Althoug I did say earlier that I > wouldn't rule that out. > > But if we'd that then it'll be a long discussion on the form of the new > system call and the information it exposes. > > For example, I lack the grey hair needed to understand why > name_to_handle_at() returns a mount id at all. The pitch in commit > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > the (old) mount id can be used to "lookup file system specific > information [...] in /proc/<pid>/mountinfo". > > Granted, that's doable but it'll mean a lot of careful checking to avoid > races for mount id recycling because they're not even allocated > cyclically. With lots of containers it becomes even more of an issue. So > it's doubtful whether exposing the mount id through name_to_handle_at() > would be something that we'd still do. > > So really, if this is just about a use-case where you want to spare the > additional system call for statx() and you need the mnt_id then > overloading is probably ok. > > But it remains an unpleasant thing to look at. And I'd like an ok from Jeff and Amir if we're going to try this. :)
On Tue, 2024-05-21 at 16:11 +0200, Christian Brauner wrote: > On Tue, May 21, 2024 at 03:46:06PM +0200, Christian Brauner wrote: > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > provide a file handle and corresponding mount without needing to worry > > > about racing with /proc/mountinfo parsing. > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > that doesn't make sense for name_to_handle_at(2). > > > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > > --- > > > > So I think overall this is probably fine (famous last words). If it's > > just about being able to retrieve the new mount id without having to > > take the hit of another statx system call it's indeed a bit much to > > add a revised system call for this. Althoug I did say earlier that I > > wouldn't rule that out. > > > > But if we'd that then it'll be a long discussion on the form of the new > > system call and the information it exposes. > > > > For example, I lack the grey hair needed to understand why > > name_to_handle_at() returns a mount id at all. The pitch in commit > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > > the (old) mount id can be used to "lookup file system specific > > information [...] in /proc/<pid>/mountinfo". > > > > Granted, that's doable but it'll mean a lot of careful checking to avoid > > races for mount id recycling because they're not even allocated > > cyclically. With lots of containers it becomes even more of an issue. So > > it's doubtful whether exposing the mount id through name_to_handle_at() > > would be something that we'd still do. > > > > So really, if this is just about a use-case where you want to spare the > > additional system call for statx() and you need the mnt_id then > > overloading is probably ok. > > > > But it remains an unpleasant thing to look at. > > And I'd like an ok from Jeff and Amir if we're going to try this. :) I don't have strong feelings about it other than "it looks sort of ugly", so I'm OK with doing this. I suspect we will eventually need name_to_handle_at2, or something similar, as it seems like we're starting to grow some new use-cases for filehandles, and hitting the limits of the old syscall. I don't have a good feel for what that should look like though, so I'm happy to put that off for a while.
On Tue, May 21, 2024 at 5:27 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Tue, 2024-05-21 at 16:11 +0200, Christian Brauner wrote: > > On Tue, May 21, 2024 at 03:46:06PM +0200, Christian Brauner wrote: > > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > > provide a file handle and corresponding mount without needing to worry > > > > about racing with /proc/mountinfo parsing. > > > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > > that doesn't make sense for name_to_handle_at(2). > > > > > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > > > --- > > > > > > So I think overall this is probably fine (famous last words). If it's > > > just about being able to retrieve the new mount id without having to > > > take the hit of another statx system call it's indeed a bit much to > > > add a revised system call for this. Althoug I did say earlier that I > > > wouldn't rule that out. > > > > > > But if we'd that then it'll be a long discussion on the form of the new > > > system call and the information it exposes. > > > > > > For example, I lack the grey hair needed to understand why > > > name_to_handle_at() returns a mount id at all. The pitch in commit > > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > > > the (old) mount id can be used to "lookup file system specific > > > information [...] in /proc/<pid>/mountinfo". > > > > > > Granted, that's doable but it'll mean a lot of careful checking to avoid > > > races for mount id recycling because they're not even allocated > > > cyclically. With lots of containers it becomes even more of an issue. So > > > it's doubtful whether exposing the mount id through name_to_handle_at() > > > would be something that we'd still do. > > > > > > So really, if this is just about a use-case where you want to spare the > > > additional system call for statx() and you need the mnt_id then > > > overloading is probably ok. > > > > > > But it remains an unpleasant thing to look at. > > > > And I'd like an ok from Jeff and Amir if we're going to try this. :) > > I don't have strong feelings about it other than "it looks sort of > ugly", so I'm OK with doing this. > > I suspect we will eventually need name_to_handle_at2, or something > similar, as it seems like we're starting to grow some new use-cases for > filehandles, and hitting the limits of the old syscall. I don't have a > good feel for what that should look like though, so I'm happy to put > that off for a while. I'm ok with it, but we cannot possibly allow it without any bikeshedding... Please call it AT_HANDLE_MNT_ID_UNIQUE to align with STATX_MNT_ID_UNIQUE and as I wrote, I do not like overloading the AT_*_SYNC flags and as there is no other obvious candidate to overload, so I think that it is best to at least declare in a comment that /* 0x00ff flags are reserved for per-syscall flags */ and use one of those bits for AT_HANDLE_MNT_ID_UNIQUE. It does not matter whether we decide to unify the AT_ flags namespace with RENAME_ flags namespace or not. The fact that there is a syscall named renameat2() with a flags argument, means that someone is bound to pass in an AT_ flags in this syscall sooner or later, so the least we can do is try to delay the day that this will not result in EINVAL. Thanks, Amir. P.S.: As I mentioned to Jeff in LSFMM, I have a patch in my tree to add AT_HANDLE_CONNECTABLE which I have not yet decided if it is upstream worthy.
On 2024-05-21, Christian Brauner <brauner@kernel.org> wrote: > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > Now that we have stabilised the unique 64-bit mount ID interface in > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > provide a file handle and corresponding mount without needing to worry > > about racing with /proc/mountinfo parsing. > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > that doesn't make sense for name_to_handle_at(2). > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > --- > > So I think overall this is probably fine (famous last words). If it's > just about being able to retrieve the new mount id without having to > take the hit of another statx system call it's indeed a bit much to > add a revised system call for this. Althoug I did say earlier that I > wouldn't rule that out. > > But if we'd that then it'll be a long discussion on the form of the new > system call and the information it exposes. > > For example, I lack the grey hair needed to understand why > name_to_handle_at() returns a mount id at all. The pitch in commit > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > the (old) mount id can be used to "lookup file system specific > information [...] in /proc/<pid>/mountinfo". The logic was presumably to allow you to know what mount the resolved file handle came from. If you use AT_EMPTY_PATH this is not needed because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if you just give name_to_handle_at() almost any path, there is no race-free way to make sure that you know which filesystem the file handle came from. I don't know if that could lead to security issues (I guess an attacker could find a way to try to manipulate the file handle you get back, and then try to trick you into operating on the wrong filesystem with open_by_handle_at()) but it is definitely something you'd want to avoid. > Granted, that's doable but it'll mean a lot of careful checking to avoid > races for mount id recycling because they're not even allocated > cyclically. With lots of containers it becomes even more of an issue. So > it's doubtful whether exposing the mount id through name_to_handle_at() > would be something that we'd still do. > > So really, if this is just about a use-case where you want to spare the > additional system call for statx() and you need the mnt_id then > overloading is probably ok. > > But it remains an unpleasant thing to look at. > Yeah, I agree it's ugly.
On 2024-05-21, Amir Goldstein <amir73il@gmail.com> wrote: > On Tue, May 21, 2024 at 5:27 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Tue, 2024-05-21 at 16:11 +0200, Christian Brauner wrote: > > > On Tue, May 21, 2024 at 03:46:06PM +0200, Christian Brauner wrote: > > > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > > > provide a file handle and corresponding mount without needing to worry > > > > > about racing with /proc/mountinfo parsing. > > > > > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > > > that doesn't make sense for name_to_handle_at(2). > > > > > > > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > > > > --- > > > > > > > > So I think overall this is probably fine (famous last words). If it's > > > > just about being able to retrieve the new mount id without having to > > > > take the hit of another statx system call it's indeed a bit much to > > > > add a revised system call for this. Althoug I did say earlier that I > > > > wouldn't rule that out. > > > > > > > > But if we'd that then it'll be a long discussion on the form of the new > > > > system call and the information it exposes. > > > > > > > > For example, I lack the grey hair needed to understand why > > > > name_to_handle_at() returns a mount id at all. The pitch in commit > > > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > > > > the (old) mount id can be used to "lookup file system specific > > > > information [...] in /proc/<pid>/mountinfo". > > > > > > > > Granted, that's doable but it'll mean a lot of careful checking to avoid > > > > races for mount id recycling because they're not even allocated > > > > cyclically. With lots of containers it becomes even more of an issue. So > > > > it's doubtful whether exposing the mount id through name_to_handle_at() > > > > would be something that we'd still do. > > > > > > > > So really, if this is just about a use-case where you want to spare the > > > > additional system call for statx() and you need the mnt_id then > > > > overloading is probably ok. > > > > > > > > But it remains an unpleasant thing to look at. > > > > > > And I'd like an ok from Jeff and Amir if we're going to try this. :) > > > > I don't have strong feelings about it other than "it looks sort of > > ugly", so I'm OK with doing this. > > > > I suspect we will eventually need name_to_handle_at2, or something > > similar, as it seems like we're starting to grow some new use-cases for > > filehandles, and hitting the limits of the old syscall. I don't have a > > good feel for what that should look like though, so I'm happy to put > > that off for a while. > > I'm ok with it, but we cannot possibly allow it without any bikeshedding... > > Please call it AT_HANDLE_MNT_ID_UNIQUE to align with > STATX_MNT_ID_UNIQUE > > and as I wrote, I do not like overloading the AT_*_SYNC flags > and as there is no other obvious candidate to overload, so > I think that it is best to at least declare in a comment that > > /* 0x00ff flags are reserved for per-syscall flags */ > > and use one of those bits for AT_HANDLE_MNT_ID_UNIQUE. I can switch the flag to use 0x80, but given there are already exceptions to that rule, it seems unlikely that this is going to be a strong guarantee going forward. I will add a comment though. Note that this will mean that we are planning to only have 15 remaining generic AT_* flags. > It does not matter whether we decide to unify the AT_ flags > namespace with RENAME_ flags namespace or not. > > The fact that there is a syscall named renameat2() with a flags > argument, means that someone is bound to pass in an AT_ flags > in this syscall sooner or later, so the least we can do is try to > delay the day that this will not result in EINVAL. While there is a risk this could happen, in theory a user could also incorrectly pass AT_* to open(). While ergonomics is important, I think that most users generally read the docs when figuring out how to use flags for syscalls (mainly because we don't have a unified flag namespace for all syscalls) so I don't think this is a huge problem. (But I'm sure I was part of making this problem worse with RESOLVE_* flags.) > Thanks, > Amir. > > P.S.: As I mentioned to Jeff in LSFMM, I have a patch in my tree > to add AT_HANDLE_CONNECTABLE which I have not yet > decided if it is upstream worthy.
On Thu, May 23, 2024 at 09:52:20AM -0600, Aleksa Sarai wrote: > On 2024-05-21, Christian Brauner <brauner@kernel.org> wrote: > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > provide a file handle and corresponding mount without needing to worry > > > about racing with /proc/mountinfo parsing. > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > that doesn't make sense for name_to_handle_at(2). > > > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > > --- > > > > So I think overall this is probably fine (famous last words). If it's > > just about being able to retrieve the new mount id without having to > > take the hit of another statx system call it's indeed a bit much to > > add a revised system call for this. Althoug I did say earlier that I > > wouldn't rule that out. > > > > But if we'd that then it'll be a long discussion on the form of the new > > system call and the information it exposes. > > > > For example, I lack the grey hair needed to understand why > > name_to_handle_at() returns a mount id at all. The pitch in commit > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > > the (old) mount id can be used to "lookup file system specific > > information [...] in /proc/<pid>/mountinfo". > > The logic was presumably to allow you to know what mount the resolved > file handle came from. If you use AT_EMPTY_PATH this is not needed > because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if > you just give name_to_handle_at() almost any path, there is no race-free > way to make sure that you know which filesystem the file handle came > from. > > I don't know if that could lead to security issues (I guess an attacker > could find a way to try to manipulate the file handle you get back, and > then try to trick you into operating on the wrong filesystem with > open_by_handle_at()) but it is definitely something you'd want to avoid. So the following paragraphs are prefaced with: I'm not an expert on file handle encoding and so might be totally wrong. Afaiu, the uniqueness guarantee of the file handle mostly depends on: (1) the filesystem (2) the actual file handling encoding Looking at file handle encoding to me it looks like it's fairly easy to fake them in userspace (I guess that's ok if you think about them like a path but with a weird permission model built around them.) for quite a few filesystems. For example, for anything that uses fs/libfs.c:generic_encode_ino32_fh() it's easy to construct a file handle by retrieving the inode number via stat and the generation number via FS_IOC_GETVERSION. Encoding using the inode number and the inode generation number is probably not very strong so it's not impossible to generate a file handle that is not unique without knowing in which filesystem to interpret it in. The problem is with what name_to_handle_at() returns imho. A mnt_id doesn't pin the filesystem and the old mnt_id isn't unique. That means the filesystem can be unmounted and go away and the mnt_id can be recycled almost immediately for another mount but the file handle is still there. So to guarantee that a (weakly encoded) file handle is interpreted in the right filesystem the file handle must either be accompanied by a file descriptor that pins the relevant mount or have any other guarantee that the filesystem doesn't go away (Or of course, the file handle encodes the uuid of the filesystem or something or uses some sort of hashing scheme.). One of the features of file handles is that they're globally usable so they're interesting to use as handles that can be shared. IOW, one can send around a file handle to another process without having to pin anything or have a file descriptor open that needs to be sent via AF_UNIX. But as stated above that's potentially risky so one might still have to send around an fd together with the file handle if sender and receiver don't share the filesystem for the handle. However, with the unique mount id things improve quite a bit. Because now it's possible to send around the unique mount id and the file handle. Then one can use statmount() to figure out which filesystem this file handle needs to be interpreted in. > > > Granted, that's doable but it'll mean a lot of careful checking to avoid > > races for mount id recycling because they're not even allocated > > cyclically. With lots of containers it becomes even more of an issue. So > > it's doubtful whether exposing the mount id through name_to_handle_at() > > would be something that we'd still do. > > > > So really, if this is just about a use-case where you want to spare the > > additional system call for statx() and you need the mnt_id then > > overloading is probably ok. > > > > But it remains an unpleasant thing to look at. > > > > Yeah, I agree it's ugly. > > -- > Aleksa Sarai > Senior Software Engineer (Containers) > SUSE Linux GmbH > <https://www.cyphar.com/>
On 2024-05-24, Christian Brauner <brauner@kernel.org> wrote: > On Thu, May 23, 2024 at 09:52:20AM -0600, Aleksa Sarai wrote: > > On 2024-05-21, Christian Brauner <brauner@kernel.org> wrote: > > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > > provide a file handle and corresponding mount without needing to worry > > > > about racing with /proc/mountinfo parsing. > > > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > > that doesn't make sense for name_to_handle_at(2). > > > > > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > > > --- > > > > > > So I think overall this is probably fine (famous last words). If it's > > > just about being able to retrieve the new mount id without having to > > > take the hit of another statx system call it's indeed a bit much to > > > add a revised system call for this. Althoug I did say earlier that I > > > wouldn't rule that out. > > > > > > But if we'd that then it'll be a long discussion on the form of the new > > > system call and the information it exposes. > > > > > > For example, I lack the grey hair needed to understand why > > > name_to_handle_at() returns a mount id at all. The pitch in commit > > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > > > the (old) mount id can be used to "lookup file system specific > > > information [...] in /proc/<pid>/mountinfo". > > > > The logic was presumably to allow you to know what mount the resolved > > file handle came from. If you use AT_EMPTY_PATH this is not needed > > because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if > > you just give name_to_handle_at() almost any path, there is no race-free > > way to make sure that you know which filesystem the file handle came > > from. > > > > I don't know if that could lead to security issues (I guess an attacker > > could find a way to try to manipulate the file handle you get back, and > > then try to trick you into operating on the wrong filesystem with > > open_by_handle_at()) but it is definitely something you'd want to avoid. > > So the following paragraphs are prefaced with: I'm not an expert on file > handle encoding and so might be totally wrong. > > Afaiu, the uniqueness guarantee of the file handle mostly depends on: > > (1) the filesystem > (2) the actual file handling encoding > > Looking at file handle encoding to me it looks like it's fairly easy to > fake them in userspace (I guess that's ok if you think about them like a > path but with a weird permission model built around them.) for quite a > few filesystems. The old Docker breakout attack did brute-force the fhandle for the root directory of the host filesystem, so it is entirely possible. However, the attack I was thinking of was whether a directory tree that an attacker had mount permissions over could be manipulated such that a privileged process doing name_to_handle_at() on a path within the tree would get a file handle that open_by_handle_at() on a different filesystem would result in a potentially dangerous path being opened. For instance (M is management process, C is the malicious container process): C: Bind-mounts the root of the container filesystem at /foo. M: name_to_handle_at($CONTAINER/foo) -> gets an fhandle of / of the container filesystem -> stores a copy of the (recycled) mount id C: Swaps /foo with a bind-mount of the host root filesystem such that the mount id is recycled, before M can scan /proc/self/mountinfo. C: Triggers M to try to use the filehandle for some administrative process. M: open_by_handle_at(...) on the wrong mount id, getting an fd of the host / directory. Possibly does something bad to this directory (deleting files, passing the fd back to the container, etc). It seems possible that this could happen, so having a unique mount id is kind of important if you plan to use open_by_handle_at() with malicious actors in control of the target filesystem tree. Though, regardless of the attack you are worried about, I guess we are in agreement that a unique mount id from name_to_handle_at() would be a good idea if we are planning for userspace to use file handles for everything. > For example, for anything that uses fs/libfs.c:generic_encode_ino32_fh() > it's easy to construct a file handle by retrieving the inode number via > stat and the generation number via FS_IOC_GETVERSION. > > Encoding using the inode number and the inode generation number is > probably not very strong so it's not impossible to generate a file > handle that is not unique without knowing in which filesystem to > interpret it in. > > The problem is with what name_to_handle_at() returns imho. A mnt_id > doesn't pin the filesystem and the old mnt_id isn't unique. That means > the filesystem can be unmounted and go away and the mnt_id can be > recycled almost immediately for another mount but the file handle is > still there. > > So to guarantee that a (weakly encoded) file handle is interpreted in > the right filesystem the file handle must either be accompanied by a > file descriptor that pins the relevant mount or have any other guarantee > that the filesystem doesn't go away (Or of course, the file handle > encodes the uuid of the filesystem or something or uses some sort of > hashing scheme.). > > One of the features of file handles is that they're globally usable so > they're interesting to use as handles that can be shared. IOW, one can > send around a file handle to another process without having to pin > anything or have a file descriptor open that needs to be sent via > AF_UNIX. > > But as stated above that's potentially risky so one might still have to > send around an fd together with the file handle if sender and receiver > don't share the filesystem for the handle. > > However, with the unique mount id things improve quite a bit. Because > now it's possible to send around the unique mount id and the file > handle. Then one can use statmount() to figure out which filesystem this > file handle needs to be interpreted in.
On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > Now that we have stabilised the unique 64-bit mount ID interface in > statx, we can now provide a race-free way for name_to_handle_at(2) to > provide a file handle and corresponding mount without needing to worry > about racing with /proc/mountinfo parsing. What are the guarantees for the mount ID? Is it stable across reboots? If not mixing it with file handles is a very bad idea.
On Fri, May 24, 2024 at 02:25:30PM +0200, Christian Brauner wrote: > On Thu, May 23, 2024 at 09:52:20AM -0600, Aleksa Sarai wrote: > > On 2024-05-21, Christian Brauner <brauner@kernel.org> wrote: > > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > > provide a file handle and corresponding mount without needing to worry > > > > about racing with /proc/mountinfo parsing. > > > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > > that doesn't make sense for name_to_handle_at(2). > > > > > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > > > --- > > > > > > So I think overall this is probably fine (famous last words). If it's > > > just about being able to retrieve the new mount id without having to > > > take the hit of another statx system call it's indeed a bit much to > > > add a revised system call for this. Althoug I did say earlier that I > > > wouldn't rule that out. > > > > > > But if we'd that then it'll be a long discussion on the form of the new > > > system call and the information it exposes. > > > > > > For example, I lack the grey hair needed to understand why > > > name_to_handle_at() returns a mount id at all. The pitch in commit > > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > > > the (old) mount id can be used to "lookup file system specific > > > information [...] in /proc/<pid>/mountinfo". > > > > The logic was presumably to allow you to know what mount the resolved > > file handle came from. If you use AT_EMPTY_PATH this is not needed > > because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if > > you just give name_to_handle_at() almost any path, there is no race-free > > way to make sure that you know which filesystem the file handle came > > from. > > > > I don't know if that could lead to security issues (I guess an attacker > > could find a way to try to manipulate the file handle you get back, and > > then try to trick you into operating on the wrong filesystem with > > open_by_handle_at()) but it is definitely something you'd want to avoid. > > So the following paragraphs are prefaced with: I'm not an expert on file > handle encoding and so might be totally wrong. > > Afaiu, the uniqueness guarantee of the file handle mostly depends on: > > (1) the filesystem > (2) the actual file handling encoding > > Looking at file handle encoding to me it looks like it's fairly easy to > fake them in userspace (I guess that's ok if you think about them like a > path but with a weird permission model built around them.) for quite a > few filesystems. This is a feature specifically used by XFS utilities like xfs_fsr and xfsdump to take pathless inode information retrieved by bulkstat operations to a file handle to enable arbitrary inodes in the filesystem to be opened. i.e. they scan the filesystem by walking the filesystem's internal inode index, not by walking paths to scan the filesytsem. This is *much* faster than path walking, especially on seek-limited storage hardware. Knowing the inode number, generation and fs uuid, we can construct a valid filehandle for that inode, and then do whatever operations we need to do (defrag, backup, move offline (HSM), etc) without needing to know the path to that inode.... > The problem is with what name_to_handle_at() returns imho. A mnt_id > doesn't pin the filesystem and the old mnt_id isn't unique. That means > the filesystem can be unmounted and go away and the mnt_id can be > recycled almost immediately for another mount but the file handle is > still there. This is no different to the NFS server unexporting a filesystem - all attempts to resolve the file handle the client holds for that export must now fail. Hence the filehandle itself must have a superblock specific identifier in it. > So to guarantee that a (weakly encoded) file handle is interpreted in > the right filesystem the file handle must either be accompanied by a > file descriptor that pins the relevant mount or have any other guarantee > that the filesystem doesn't go away (Or of course, the file handle > encodes the uuid of the filesystem or something or uses some sort of > hashing scheme.). Yes, it does, and that's the superblock based identifier, not something that is mount based. i.e. the handle is valid regardless of where the filesystem is mounted, even if the application does not have visibility or permitted access to the given mount. And the filehandle is persistent - it must work across unmount/mount, reboots, change of mount location, etc. That means that if you are using file handles, any filesystem that is shared across multiple containers can by fully accessed by user generated file handles from any container if no special permissions are required. i.e. there are no real path or mount based security boundaries for filehandles in existence righ now. If filehandles are going to be restricted to a specific container (e.g. a single mount) so that less permissions are needed to open the filehandle, then the filehandle itself needs to encode those restrictions. Decoding the filehandle needs to ensure that the opening context has permission to access whatever the filehandle points to in a restricted environment. This will prevent existing persistent, global filehandles from being used as restricted filehandles and vice versa. Restricting filehandles for use as persistent filehandles is going to be tricky - mount IDs are ephermeral and only valid while a mount is active. I'd suggest that restricted filehandles should only be ephemeral, too. That would also allow restricted filehandles to use kernel side crypto based encoding so nobody can ever construct them from userspace. Hence I think we have to look at what we are encoding in the filehandle itself to solve the "shared access from restricted environments" problem, not try to add stuff around the outside of the filehandle API to paper over the fact that filehandles themselves have no permission/restriction model built into them. This would also avoid the whole problem of "users can access any file in the underlying filesystem by constructing their own handles" problem that relaxed permissions on filehandle decoding creates, hence opening the door for more extensive use of filehandles in untrusted environments. > One of the features of file handles is that they're globally usable so > they're interesting to use as handles that can be shared. IOW, one can > send around a file handle to another process without having to pin > anything or have a file descriptor open that needs to be sent via > AF_UNIX. > > But as stated above that's potentially risky so one might still have to > send around an fd together with the file handle if sender and receiver > don't share the filesystem for the handle. Adding a trust context around the outside of an untrusted handle isn't the right way to address this problem - define a filehandle format that can be trusted and constrained within specific security domains and the need for external permission contexts (whatever they look like) to be passed with the filehandle to make it valid go away entirely. -Dave.
On Fri, May 24, 2024 at 08:58:55AM -0700, Aleksa Sarai wrote: > > Though, regardless of the attack you are worried about, I guess we are > in agreement that a unique mount id from name_to_handle_at() would be a > good idea if we are planning for userspace to use file handles for > everything. I somewhat disagree - the information needed to validate and restrict the scope of the filehandle needs to be encoded into the filehandle itself. Otherwise we've done nothing to reduce the potential abuse scope of the filehandle object itself, nor prevented users from generating their own filehandles to objects they don't have direct access to that are still accessible on the given "mount id" that are outside their direct path based permission scope. IOWs, the filehandle must encode the restrictions on it's use internally so that random untrusted third parties cannot use it outside the context in which is was intended for... Whether that internal encoding is a mount ID, and mount namespace identifier or something else completely different is just a detail. I suspect that the creation of a restricted filehandle should be done by a simple API flag (e.g. AT_FH_RESTRICTED), and the kernel decides entirely what goes into the filehandle to restrict it to the context that the file handle has been created within. -Dave.
On Mon, May 27, 2024 at 10:06:15AM +1000, Dave Chinner wrote: > On Fri, May 24, 2024 at 02:25:30PM +0200, Christian Brauner wrote: > > On Thu, May 23, 2024 at 09:52:20AM -0600, Aleksa Sarai wrote: > > > On 2024-05-21, Christian Brauner <brauner@kernel.org> wrote: > > > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote: > > > > > Now that we have stabilised the unique 64-bit mount ID interface in > > > > > statx, we can now provide a race-free way for name_to_handle_at(2) to > > > > > provide a file handle and corresponding mount without needing to worry > > > > > about racing with /proc/mountinfo parsing. > > > > > > > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit > > > > > that doesn't make sense for name_to_handle_at(2). > > > > > > > > > > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > > > > > --- > > > > > > > > So I think overall this is probably fine (famous last words). If it's > > > > just about being able to retrieve the new mount id without having to > > > > take the hit of another statx system call it's indeed a bit much to > > > > add a revised system call for this. Althoug I did say earlier that I > > > > wouldn't rule that out. > > > > > > > > But if we'd that then it'll be a long discussion on the form of the new > > > > system call and the information it exposes. > > > > > > > > For example, I lack the grey hair needed to understand why > > > > name_to_handle_at() returns a mount id at all. The pitch in commit > > > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that > > > > the (old) mount id can be used to "lookup file system specific > > > > information [...] in /proc/<pid>/mountinfo". > > > > > > The logic was presumably to allow you to know what mount the resolved > > > file handle came from. If you use AT_EMPTY_PATH this is not needed > > > because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if > > > you just give name_to_handle_at() almost any path, there is no race-free > > > way to make sure that you know which filesystem the file handle came > > > from. > > > > > > I don't know if that could lead to security issues (I guess an attacker > > > could find a way to try to manipulate the file handle you get back, and > > > then try to trick you into operating on the wrong filesystem with > > > open_by_handle_at()) but it is definitely something you'd want to avoid. > > > > So the following paragraphs are prefaced with: I'm not an expert on file > > handle encoding and so might be totally wrong. > > > > Afaiu, the uniqueness guarantee of the file handle mostly depends on: > > > > (1) the filesystem > > (2) the actual file handling encoding > > > > Looking at file handle encoding to me it looks like it's fairly easy to > > fake them in userspace (I guess that's ok if you think about them like a > > path but with a weird permission model built around them.) for quite a > > few filesystems. > > This is a feature specifically used by XFS utilities like xfs_fsr > and xfsdump to take pathless inode information retrieved by bulkstat > operations to a file handle to enable arbitrary inodes in the > filesystem to be opened. > > i.e. they scan the filesystem by walking the filesystem's internal > inode index, not by walking paths to scan the filesytsem. This is > *much* faster than path walking, especially on seek-limited storage > hardware. > > Knowing the inode number, generation and fs uuid, we can > construct a valid filehandle for that inode, and then do whatever > operations we need to do (defrag, backup, move offline (HSM), etc) > without needing to know the path to that inode.... Yeah, I think you mentioned this in another context before. > > The problem is with what name_to_handle_at() returns imho. A mnt_id > > doesn't pin the filesystem and the old mnt_id isn't unique. That means > > the filesystem can be unmounted and go away and the mnt_id can be > > recycled almost immediately for another mount but the file handle is > > still there. > > This is no different to the NFS server unexporting a filesystem - > all attempts to resolve the file handle the client holds for that > export must now fail. Hence the filehandle itself must have a > superblock specific identifier in it. > > > So to guarantee that a (weakly encoded) file handle is interpreted in > > the right filesystem the file handle must either be accompanied by a > > file descriptor that pins the relevant mount or have any other guarantee > > that the filesystem doesn't go away (Or of course, the file handle > > encodes the uuid of the filesystem or something or uses some sort of > > hashing scheme.). > > Yes, it does, and that's the superblock based identifier, not > something that is mount based. i.e. the handle is valid regardless > of where the filesystem is mounted, even if the application does not > have visibility or permitted access to the given mount. And the > filehandle is persistent - it must work across unmount/mount, > reboots, change of mount location, etc. Agreed, and no one is disputing that. The old mount id as exposed by name_to_handle_at() is one means to get to a persisent identifier and that is racy. But we do have a 64bit non-recyled mount id and statmount() since v6.7 now which allow to get a persistent identifier for the filesystem in a race-free manner.
diff --git a/fs/fhandle.c b/fs/fhandle.c index 8a7f86c2139a..6bc7ffccff8c 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -16,7 +16,8 @@ static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, - int __user *mnt_id, int fh_flags) + void __user *mnt_id, bool unique_mntid, + int fh_flags) { long retval; struct file_handle f_handle; @@ -69,10 +70,16 @@ static long do_sys_name_to_handle(const struct path *path, } else retval = 0; /* copy the mount id */ - if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || - copy_to_user(ufh, handle, - struct_size(handle, f_handle, handle_bytes))) - retval = -EFAULT; + if (unique_mntid) + retval = put_user(real_mount(path->mnt)->mnt_id_unique, + (u64 __user *) mnt_id); + else + retval = put_user(real_mount(path->mnt)->mnt_id, + (int __user *) mnt_id); + /* copy the handle */ + if (!retval) + retval = copy_to_user(ufh, handle, + struct_size(handle, f_handle, handle_bytes)); kfree(handle); return retval; } @@ -83,6 +90,7 @@ static long do_sys_name_to_handle(const struct path *path, * @name: name that should be converted to handle. * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file + * (u64 if AT_HANDLE_UNIQUE_MNT_ID, otherwise int) * @flag: flag value to indicate whether to follow symlink or not * and whether a decodable file handle is required. * @@ -92,7 +100,7 @@ static long do_sys_name_to_handle(const struct path *path, * value required. */ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, - struct file_handle __user *, handle, int __user *, mnt_id, + struct file_handle __user *, handle, void __user *, mnt_id, int, flag) { struct path path; @@ -100,7 +108,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, int fh_flags; int err; - if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | + AT_HANDLE_UNIQUE_MNT_ID)) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; @@ -109,7 +118,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { - err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); + err = do_sys_name_to_handle(&path, handle, mnt_id, + flag & AT_HANDLE_UNIQUE_MNT_ID, + fh_flags); path_put(&path); } return err; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index c0bcc185fa48..fda970f92fba 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -118,6 +118,8 @@ #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to compare object identity and may not be usable to open_by_handle_at(2) */ +#define AT_HANDLE_UNIQUE_MNT_ID AT_STATX_FORCE_SYNC /* returned mount id is + the u64 unique mount id */ #if defined(__KERNEL__) #define AT_GETATTR_NOSEC 0x80000000 #endif
Now that we have stabilised the unique 64-bit mount ID interface in statx, we can now provide a race-free way for name_to_handle_at(2) to provide a file handle and corresponding mount without needing to worry about racing with /proc/mountinfo parsing. As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit that doesn't make sense for name_to_handle_at(2). Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> --- fs/fhandle.c | 27 +++++++++++++++++++-------- include/uapi/linux/fcntl.h | 2 ++ 2 files changed, 21 insertions(+), 8 deletions(-) --- base-commit: 584bbf439d0fa83d728ec49f3a38c581bdc828b4 change-id: 20240515-exportfs-u64-mount-id-9ebb5c58b53c Best regards,