diff mbox series

[v2,3/3] pidfs: implement file handle support

Message ID 20241113-pidfs_fh-v2-3-9a4d28155a37@e43.eu (mailing list archive)
State New
Headers show
Series [v2,1/3] pseudofs: add support for export_ops | expand

Commit Message

Erin Shepherd Nov. 13, 2024, 5:55 p.m. UTC
On 64-bit platforms, userspace can read the pidfd's inode in order to
get a never-repeated PID identifier. On 32-bit platforms this identifier
is not exposed, as inodes are limited to 32 bits. Instead expose the
identifier via export_fh, which makes it available to userspace via
name_to_handle_at

In addition we implement fh_to_dentry, which allows userspace to
recover a pidfd from a PID file handle.

We stash the process' PID in the root pid namespace inside the handle,
and use that to recover the pid (validating that pid->ino matches the
value in the handle, i.e. that the pid has not been reused).

We use the root namespace in order to ensure that file handles can be
moved across namespaces; however, we validate that the PID exists in
the current namespace before returning the inode.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
---
 fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

Comments

Amir Goldstein Nov. 14, 2024, 7:07 a.m. UTC | #1
On Wed, Nov 13, 2024 at 7:11 PM Erin Shepherd <erin.shepherd@e43.eu> wrote:
>
> On 64-bit platforms, userspace can read the pidfd's inode in order to
> get a never-repeated PID identifier. On 32-bit platforms this identifier
> is not exposed, as inodes are limited to 32 bits. Instead expose the
> identifier via export_fh, which makes it available to userspace via
> name_to_handle_at
>
> In addition we implement fh_to_dentry, which allows userspace to
> recover a pidfd from a PID file handle.

"In addition" is a good indication that a separate patch was a good idea..

>
> We stash the process' PID in the root pid namespace inside the handle,
> and use that to recover the pid (validating that pid->ino matches the
> value in the handle, i.e. that the pid has not been reused).
>
> We use the root namespace in order to ensure that file handles can be
> moved across namespaces; however, we validate that the PID exists in
> the current namespace before returning the inode.
>
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>

This patch has changed enough that you should not have kept my RVB.

BTW, please refrain from using the same subject for the cover letter and
a single patch. They are not the same thing, so if they get the same
name, one of them has an inaccurate description.

> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
> ---
>  fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -1,5 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <linux/anon_inodes.h>
> +#include <linux/exportfs.h>
>  #include <linux/file.h>
>  #include <linux/fs.h>
>  #include <linux/magic.h>
> @@ -347,11 +348,69 @@ static const struct dentry_operations pidfs_dentry_operations = {
>         .d_prune        = stashed_dentry_prune,
>  };
>
> +#define PIDFD_FID_LEN 3
> +
> +struct pidfd_fid {
> +       u64 ino;
> +       s32 pid;
> +} __packed;
> +
> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> +                          struct inode *parent)
> +{
> +       struct pid *pid = inode->i_private;
> +       struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> +
> +       if (*max_len < PIDFD_FID_LEN) {
> +               *max_len = PIDFD_FID_LEN;
> +               return FILEID_INVALID;
> +       }
> +
> +       fid->ino = pid->ino;
> +       fid->pid = pid_nr(pid);
> +       *max_len = PIDFD_FID_LEN;
> +       return FILEID_INO64_GEN;
> +}
> +
> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> +                                        struct fid *gen_fid,
> +                                        int fh_len, int fh_type)
> +{
> +       int ret;
> +       struct path path;
> +       struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> +       struct pid *pid;
> +
> +       if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> +               return NULL;
> +
> +       scoped_guard(rcu) {
> +               pid = find_pid_ns(fid->pid, &init_pid_ns);
> +               if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
> +                       return NULL;
> +
> +               pid = get_pid(pid);
> +       }
> +
> +       ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> +       if (ret < 0)
> +               return ERR_PTR(ret);

How come no need to put_pid() in case of error?

> +
> +       mntput(path.mnt);
> +       return path.dentry;
> +}
> +
> +static const struct export_operations pidfs_export_operations = {
> +       .encode_fh = pidfs_encode_fh,
> +       .fh_to_dentry = pidfs_fh_to_dentry,
> +       .flags = EXPORT_OP_UNRESTRICTED_OPEN,
> +};
> +
>  static int pidfs_init_inode(struct inode *inode, void *data)
>  {
>         inode->i_private = data;
>         inode->i_flags |= S_PRIVATE;
> -       inode->i_mode |= S_IRWXU;
> +       inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;

This change is not explained.
Why is it here?

Thanks,
Amir.
Erin Shepherd Nov. 14, 2024, 12:42 p.m. UTC | #2
On 14/11/2024 08:07, Amir Goldstein wrote:
> On Wed, Nov 13, 2024 at 7:11 PM Erin Shepherd <erin.shepherd@e43.eu> wrote:
>> On 64-bit platforms, userspace can read the pidfd's inode in order to
>> get a never-repeated PID identifier. On 32-bit platforms this identifier
>> is not exposed, as inodes are limited to 32 bits. Instead expose the
>> identifier via export_fh, which makes it available to userspace via
>> name_to_handle_at
>>
>> In addition we implement fh_to_dentry, which allows userspace to
>> recover a pidfd from a PID file handle.
> "In addition" is a good indication that a separate patch was a good idea..
>
>> We stash the process' PID in the root pid namespace inside the handle,
>> and use that to recover the pid (validating that pid->ino matches the
>> value in the handle, i.e. that the pid has not been reused).
>>
>> We use the root namespace in order to ensure that file handles can be
>> moved across namespaces; however, we validate that the PID exists in
>> the current namespace before returning the inode.
>>
>> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> This patch has changed enough that you should not have kept my RVB.
>
> BTW, please refrain from using the same subject for the cover letter and
> a single patch. They are not the same thing, so if they get the same
> name, one of them has an inaccurate description.
>
ACK to all three.


>> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
>> ---
>>  fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 61 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/pidfs.c b/fs/pidfs.c
>> index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
>> --- a/fs/pidfs.c
>> +++ b/fs/pidfs.c
>> @@ -1,5 +1,6 @@
>>  // SPDX-License-Identifier: GPL-2.0
>>  #include <linux/anon_inodes.h>
>> +#include <linux/exportfs.h>
>>  #include <linux/file.h>
>>  #include <linux/fs.h>
>>  #include <linux/magic.h>
>> @@ -347,11 +348,69 @@ static const struct dentry_operations pidfs_dentry_operations = {
>>         .d_prune        = stashed_dentry_prune,
>>  };
>>
>> +#define PIDFD_FID_LEN 3
>> +
>> +struct pidfd_fid {
>> +       u64 ino;
>> +       s32 pid;
>> +} __packed;
>> +
>> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
>> +                          struct inode *parent)
>> +{
>> +       struct pid *pid = inode->i_private;
>> +       struct pidfd_fid *fid = (struct pidfd_fid *)fh;
>> +
>> +       if (*max_len < PIDFD_FID_LEN) {
>> +               *max_len = PIDFD_FID_LEN;
>> +               return FILEID_INVALID;
>> +       }
>> +
>> +       fid->ino = pid->ino;
>> +       fid->pid = pid_nr(pid);
>> +       *max_len = PIDFD_FID_LEN;
>> +       return FILEID_INO64_GEN;
>> +}
>> +
>> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
>> +                                        struct fid *gen_fid,
>> +                                        int fh_len, int fh_type)
>> +{
>> +       int ret;
>> +       struct path path;
>> +       struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
>> +       struct pid *pid;
>> +
>> +       if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
>> +               return NULL;
>> +
>> +       scoped_guard(rcu) {
>> +               pid = find_pid_ns(fid->pid, &init_pid_ns);
>> +               if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
>> +                       return NULL;
>> +
>> +               pid = get_pid(pid);
>> +       }
>> +
>> +       ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
>> +       if (ret < 0)
>> +               return ERR_PTR(ret);
> How come no need to put_pid() in case of error?

This one confused me at first too, but path_from_stashed frees it (via
stashed_ops.put_data) on error. You can see the same pattern in
pidfs_alloc_file.

(It already needs to know how to free it for the case where a stashed
dentry already exists)

>> +
>> +       mntput(path.mnt);
>> +       return path.dentry;
>> +}
>> +
>> +static const struct export_operations pidfs_export_operations = {
>> +       .encode_fh = pidfs_encode_fh,
>> +       .fh_to_dentry = pidfs_fh_to_dentry,
>> +       .flags = EXPORT_OP_UNRESTRICTED_OPEN,
>> +};
>> +
>>  static int pidfs_init_inode(struct inode *inode, void *data)
>>  {
>>         inode->i_private = data;
>>         inode->i_flags |= S_PRIVATE;
>> -       inode->i_mode |= S_IRWXU;
>> +       inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;
> This change is not explained.
> Why is it here?

open_by_handle_at eventually passes through the may_open permission check.
The existing permissions only permits root to open them (since the owning
uid & gid is 0). So, it was necessary to widen them to align with how
pidfd_open works.

If I stick with this approach (see [1]) I'll ensure to document this change
better in the commit message.

[1] https://lore.kernel.org/all/6a3ed633-311d-47ff-8a7e-5121d6186139@e43.eu/
Christian Brauner Nov. 14, 2024, 12:52 p.m. UTC | #3
On Wed, Nov 13, 2024 at 05:55:25PM +0000, Erin Shepherd wrote:
> On 64-bit platforms, userspace can read the pidfd's inode in order to
> get a never-repeated PID identifier. On 32-bit platforms this identifier
> is not exposed, as inodes are limited to 32 bits. Instead expose the
> identifier via export_fh, which makes it available to userspace via
> name_to_handle_at
> 
> In addition we implement fh_to_dentry, which allows userspace to
> recover a pidfd from a PID file handle.
> 
> We stash the process' PID in the root pid namespace inside the handle,
> and use that to recover the pid (validating that pid->ino matches the
> value in the handle, i.e. that the pid has not been reused).
> 
> We use the root namespace in order to ensure that file handles can be
> moved across namespaces; however, we validate that the PID exists in
> the current namespace before returning the inode.
> 
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> Signed-off-by: Erin Shepherd <erin.shepherd@e43.eu>
> ---

I think you need at least something like the following completely
untested draft on top:

- the pidfs_finish_open_by_handle_at() is somewhat of a clutch to handle
  thread vs thread-group pidfds but it works.

- In contrast to pidfd_open() that uses dentry_open() to create a pidfd
  open_by_handle_at() uses file_open_root(). That's overall fine but
  makes pidfds subject to security hooks which they aren't via
  pidfd_open(). It also necessitats pidfs_finish_open_by_handle_at().
  There's probably other solutions I'm not currently seeing.

- The exportfs_decode_fh_raw() call that's used to decode the pidfd is
  passed vfs_dentry_acceptable() as acceptability callback. For pidfds
  we don't need any of that functionality and we don't need any of the
  disconnected dentry handling logic. So the easiest way to fix that is
  to rely on EXPORT_OP_UNRESTRICTED_OPEN to skip everything. That in
  turns means the only acceptability we have is the nop->fh_to_dentry()
  callback for pidfs.

- This all really needs rigorous selftests before we can even think of
  merging any of this.

Anway, here's the _completely untested, quickly drafted_ diff on top:

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4f2dd4ab4486..65c93f7132d4 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -450,6 +450,13 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
 		goto err_result;
 	}
 
+	/*
+	 * The filesystem has no acceptance criteria other than those in
+	 * nop->fh_to_dentry().
+	 */
+	if (nop->flags & EXPORT_OP_UNRESTRICTED_OPEN)
+		return result;
+
 	/*
 	 * If no acceptance criteria was specified by caller, a disconnected
 	 * dentry is also accepatable. Callers may use this mode to query if
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 056116e58f43..89c2efacc0c3 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -11,6 +11,7 @@
 #include <linux/personality.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <linux/pidfs.h>
 #include "internal.h"
 #include "mount.h"
 
@@ -218,20 +219,21 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
 {
 	int handle_dwords;
 	struct vfsmount *mnt = ctx->root.mnt;
+	struct dentry *dentry;
 
 	/* change the handle size to multiple of sizeof(u32) */
 	handle_dwords = handle->handle_bytes >> 2;
-	path->dentry = exportfs_decode_fh_raw(mnt,
-					  (struct fid *)handle->f_handle,
-					  handle_dwords, handle->handle_type,
-					  ctx->fh_flags,
-					  vfs_dentry_acceptable, ctx);
-	if (IS_ERR_OR_NULL(path->dentry)) {
-		if (path->dentry == ERR_PTR(-ENOMEM))
+	dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
+					handle_dwords, handle->handle_type,
+					ctx->fh_flags, vfs_dentry_acceptable,
+					ctx);
+	if (IS_ERR_OR_NULL(dentry)) {
+		if (dentry == ERR_PTR(-ENOMEM))
 			return -ENOMEM;
 		return -ESTALE;
 	}
 	path->mnt = mntget(mnt);
+	path->dentry = dentry;
 	return 0;
 }
 
@@ -239,7 +241,7 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
 				 unsigned int o_flags)
 {
 	struct path *root = &ctx->root;
-	struct export_operations *nop = root->mnt->mnt_sb->s_export_op;
+	const struct export_operations *nop = root->mnt->mnt_sb->s_export_op;
 
 	if (nop && nop->flags & EXPORT_OP_UNRESTRICTED_OPEN)
 		return true;
@@ -342,7 +344,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
 			   int open_flag)
 {
 	long retval = 0;
-	struct path path;
+	struct path path __free(path_put) = {};
 	struct file *file;
 	int fd;
 
@@ -351,19 +353,24 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
 		return retval;
 
 	fd = get_unused_fd_flags(open_flag);
-	if (fd < 0) {
-		path_put(&path);
+	if (fd < 0)
 		return fd;
-	}
+
 	file = file_open_root(&path, "", open_flag, 0);
 	if (IS_ERR(file)) {
 		put_unused_fd(fd);
-		retval =  PTR_ERR(file);
-	} else {
-		retval = fd;
-		fd_install(fd, file);
+		return PTR_ERR(file);
 	}
-	path_put(&path);
+
+	retval = pidfs_finish_open_by_handle_at(file, open_flag);
+	if (retval) {
+		put_unused_fd(fd);
+		fput(file);
+		return retval;
+	}
+
+	retval = fd;
+	fd_install(fd, file);
 	return retval;
 }
 
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 0684a9b8fe71..19948002f395 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -237,6 +237,24 @@ struct pid *pidfd_pid(const struct file *file)
 	return file_inode(file)->i_private;
 }
 
+int pidfs_finish_open_by_handle_at(struct file *file, unsigned int oflags)
+{
+	struct pid *pid;
+	bool thread = oflags & PIDFD_THREAD;
+
+	pid = pidfd_pid(file);
+	if (IS_ERR(pid))
+		return 0;
+
+	if (!pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
+		return -EINVAL;
+
+	if (thread)
+		file->f_flags |= PIDFD_THREAD;
+
+	return 0;
+}
+
 static struct vfsmount *pidfs_mnt __ro_after_init;
 
 #if BITS_PER_LONG == 32
@@ -377,7 +395,7 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
 					 int fh_len, int fh_type)
 {
 	int ret;
-	struct path path;
+	struct path path __free(path_put) = {};
 	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
 	struct pid *pid;
 
@@ -393,11 +411,12 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
 	}
 
 	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
-	if (ret < 0)
+	if (ret < 0) {
+		put_pid(pid);
 		return ERR_PTR(ret);
+	}
 
-	mntput(path.mnt);
-	return path.dentry;
+	return dget(path.dentry);
 }
 
 static const struct export_operations pidfs_export_operations = {
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 75bdf9807802..9a4130056e7d 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_FS_H
 
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
+int pidfs_finish_open_by_handle_at(struct file *file, unsigned int oflags);
 void __init pidfs_init(void);
 
 #endif /* _LINUX_PID_FS_H */


>  fs/pidfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 61 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -1,5 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <linux/anon_inodes.h>
> +#include <linux/exportfs.h>
>  #include <linux/file.h>
>  #include <linux/fs.h>
>  #include <linux/magic.h>
> @@ -347,11 +348,69 @@ static const struct dentry_operations pidfs_dentry_operations = {
>  	.d_prune	= stashed_dentry_prune,
>  };
>  
> +#define PIDFD_FID_LEN 3
> +
> +struct pidfd_fid {
> +	u64 ino;
> +	s32 pid;
> +} __packed;
> +
> +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> +			   struct inode *parent)
> +{
> +	struct pid *pid = inode->i_private;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)fh;
> +
> +	if (*max_len < PIDFD_FID_LEN) {
> +		*max_len = PIDFD_FID_LEN;
> +		return FILEID_INVALID;
> +	}
> +
> +	fid->ino = pid->ino;
> +	fid->pid = pid_nr(pid);
> +	*max_len = PIDFD_FID_LEN;
> +	return FILEID_INO64_GEN;
> +}
> +
> +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
> +					 struct fid *gen_fid,
> +					 int fh_len, int fh_type)
> +{
> +	int ret;
> +	struct path path;
> +	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
> +	struct pid *pid;
> +
> +	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
> +		return NULL;
> +
> +	scoped_guard(rcu) {
> +		pid = find_pid_ns(fid->pid, &init_pid_ns);
> +		if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
> +			return NULL;
> +
> +		pid = get_pid(pid);
> +	}
> +
> +	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	mntput(path.mnt);
> +	return path.dentry;
> +}
> +
> +static const struct export_operations pidfs_export_operations = {
> +	.encode_fh = pidfs_encode_fh,
> +	.fh_to_dentry = pidfs_fh_to_dentry,
> +	.flags = EXPORT_OP_UNRESTRICTED_OPEN,
> +};
> +
>  static int pidfs_init_inode(struct inode *inode, void *data)
>  {
>  	inode->i_private = data;
>  	inode->i_flags |= S_PRIVATE;
> -	inode->i_mode |= S_IRWXU;
> +	inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;
>  	inode->i_op = &pidfs_inode_operations;
>  	inode->i_fop = &pidfs_file_operations;
>  	/*
> @@ -382,6 +441,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
>  		return -ENOMEM;
>  
>  	ctx->ops = &pidfs_sops;
> +	ctx->eops = &pidfs_export_operations;
>  	ctx->dops = &pidfs_dentry_operations;
>  	fc->s_fs_info = (void *)&pidfs_stashed_ops;
>  	return 0;
> 
> -- 
> 2.46.1
>
Erin Shepherd Nov. 14, 2024, 1:13 p.m. UTC | #4
On 14/11/2024 13:52, Christian Brauner wrote:

> I think you need at least something like the following completely
> untested draft on top:
>
> - the pidfs_finish_open_by_handle_at() is somewhat of a clutch to handle
>   thread vs thread-group pidfds but it works.
>
> - In contrast to pidfd_open() that uses dentry_open() to create a pidfd
>   open_by_handle_at() uses file_open_root(). That's overall fine but
>   makes pidfds subject to security hooks which they aren't via
>   pidfd_open(). It also necessitats pidfs_finish_open_by_handle_at().
>   There's probably other solutions I'm not currently seeing.

These two concerns combined with the special flag make me wonder if pidfs
is so much of a special snowflake we should just special case it up front
and skip all of the shared handle decode logic?

> - The exportfs_decode_fh_raw() call that's used to decode the pidfd is
>   passed vfs_dentry_acceptable() as acceptability callback. For pidfds
>   we don't need any of that functionality and we don't need any of the
>   disconnected dentry handling logic. So the easiest way to fix that is
>   to rely on EXPORT_OP_UNRESTRICTED_OPEN to skip everything. That in
>   turns means the only acceptability we have is the nop->fh_to_dentry()
>   callback for pidfs.

With the current logic we go exportfs_decode_fh_raw(...) ->
find_acceptable_alias(result) -> vfs_dentry_acceptable(context, result).


vfs_dentry_acceptable immediately returns 1 if ctx->flags is 0, which will
always be the case if EXPORT_OP_UNRESTRICTED_OPEN was set, so we immediately
fall back out of the call tree and return result.

So I'm not 100% sure we actually need this special case but I'm not opposed.

> - This all really needs rigorous selftests before we can even think of
>   merging any of this.
Christian Brauner Nov. 14, 2024, 2:13 p.m. UTC | #5
On Thu, Nov 14, 2024 at 02:13:06PM +0100, Erin Shepherd wrote:
> On 14/11/2024 13:52, Christian Brauner wrote:
> 
> > I think you need at least something like the following completely
> > untested draft on top:
> >
> > - the pidfs_finish_open_by_handle_at() is somewhat of a clutch to handle
> >   thread vs thread-group pidfds but it works.
> >
> > - In contrast to pidfd_open() that uses dentry_open() to create a pidfd
> >   open_by_handle_at() uses file_open_root(). That's overall fine but
> >   makes pidfds subject to security hooks which they aren't via
> >   pidfd_open(). It also necessitats pidfs_finish_open_by_handle_at().
> >   There's probably other solutions I'm not currently seeing.
> 
> These two concerns combined with the special flag make me wonder if pidfs
> is so much of a special snowflake we should just special case it up front
> and skip all of the shared handle decode logic?

Care to try a patch and see what it looks like?

> 
> > - The exportfs_decode_fh_raw() call that's used to decode the pidfd is
> >   passed vfs_dentry_acceptable() as acceptability callback. For pidfds
> >   we don't need any of that functionality and we don't need any of the
> >   disconnected dentry handling logic. So the easiest way to fix that is
> >   to rely on EXPORT_OP_UNRESTRICTED_OPEN to skip everything. That in
> >   turns means the only acceptability we have is the nop->fh_to_dentry()
> >   callback for pidfs.
> 
> With the current logic we go exportfs_decode_fh_raw(...) ->
> find_acceptable_alias(result) -> vfs_dentry_acceptable(context, result).
> vfs_dentry_acceptable immediately returns 1 if ctx->flags is 0, which will
> always be the case if EXPORT_OP_UNRESTRICTED_OPEN was set, so we immediately
> fall back out of the call tree and return result.
> 
> So I'm not 100% sure we actually need this special case but I'm not opposed.

Oh right, I completely forgot that find_acceptable_alias() is a no-op
for pidfs.
diff mbox series

Patch

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 80675b6bf88459c22787edaa68db360bdc0d0782..0684a9b8fe71c5205fb153b2714bc9c672045fd5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -1,5 +1,6 @@ 
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/anon_inodes.h>
+#include <linux/exportfs.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/magic.h>
@@ -347,11 +348,69 @@  static const struct dentry_operations pidfs_dentry_operations = {
 	.d_prune	= stashed_dentry_prune,
 };
 
+#define PIDFD_FID_LEN 3
+
+struct pidfd_fid {
+	u64 ino;
+	s32 pid;
+} __packed;
+
+static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+			   struct inode *parent)
+{
+	struct pid *pid = inode->i_private;
+	struct pidfd_fid *fid = (struct pidfd_fid *)fh;
+
+	if (*max_len < PIDFD_FID_LEN) {
+		*max_len = PIDFD_FID_LEN;
+		return FILEID_INVALID;
+	}
+
+	fid->ino = pid->ino;
+	fid->pid = pid_nr(pid);
+	*max_len = PIDFD_FID_LEN;
+	return FILEID_INO64_GEN;
+}
+
+static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
+					 struct fid *gen_fid,
+					 int fh_len, int fh_type)
+{
+	int ret;
+	struct path path;
+	struct pidfd_fid *fid = (struct pidfd_fid *)gen_fid;
+	struct pid *pid;
+
+	if (fh_type != FILEID_INO64_GEN || fh_len < PIDFD_FID_LEN)
+		return NULL;
+
+	scoped_guard(rcu) {
+		pid = find_pid_ns(fid->pid, &init_pid_ns);
+		if (!pid || pid->ino != fid->ino || pid_vnr(pid) == 0)
+			return NULL;
+
+		pid = get_pid(pid);
+	}
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	mntput(path.mnt);
+	return path.dentry;
+}
+
+static const struct export_operations pidfs_export_operations = {
+	.encode_fh = pidfs_encode_fh,
+	.fh_to_dentry = pidfs_fh_to_dentry,
+	.flags = EXPORT_OP_UNRESTRICTED_OPEN,
+};
+
 static int pidfs_init_inode(struct inode *inode, void *data)
 {
 	inode->i_private = data;
 	inode->i_flags |= S_PRIVATE;
-	inode->i_mode |= S_IRWXU;
+	inode->i_mode |= S_IRWXU | S_IRWXG | S_IRWXO;
 	inode->i_op = &pidfs_inode_operations;
 	inode->i_fop = &pidfs_file_operations;
 	/*
@@ -382,6 +441,7 @@  static int pidfs_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	ctx->ops = &pidfs_sops;
+	ctx->eops = &pidfs_export_operations;
 	ctx->dops = &pidfs_dentry_operations;
 	fc->s_fs_info = (void *)&pidfs_stashed_ops;
 	return 0;