diff mbox series

[v5,2/3] pid: Introduce pidfd_getfd syscall

Message ID 20191220232810.GA20233@ircssh-2.c.rugged-nimbus-611.internal (mailing list archive)
State New, archived
Headers show
Series Add pidfd_getfd syscall | expand

Commit Message

Sargun Dhillon Dec. 20, 2019, 11:28 p.m. UTC
This syscall allows for the retrieval of file descriptors from other
processes, based on their pidfd. This is possible using ptrace, and
injection of parasitic code along with using SCM_RIGHTS to move
file descriptors between a tracee and a tracer. Unfortunately, ptrace
comes with a high cost of requiring the process to be stopped, and
breaks debuggers. This does not require stopping the process under
manipulation.

One reason to use this is to allow sandboxers to take actions on file
descriptors on the behalf of another process. For example, this can be
combined with seccomp-bpf's user notification to do on-demand fd
extraction and take privileged actions. For example, it can be used
to bind a socket to a privileged port.

/* prototype */
  /*
   * pidfd_getfd_options is an extensible struct which can have options
   * added to it. If options is NULL, size, and it will be ignored be
   * ignored, otherwise, size should be set to sizeof(*options). If
   * option is newer than the current kernel version, E2BIG will be
   * returned.
   */
  struct pidfd_getfd_options {};
  long pidfd_getfd(int pidfd, int fd, unsigned int flags,
		   struct pidfd_getfd_options *options, size_t size);

/* testing */
Ran self-test suite on x86_64

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
 MAINTAINERS                                 |   1 +
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 include/linux/syscalls.h                    |   4 +
 include/uapi/asm-generic/unistd.h           |   3 +-
 include/uapi/linux/pidfd.h                  |  10 ++
 kernel/pid.c                                | 115 ++++++++++++++++++++
 23 files changed, 151 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/linux/pidfd.h

Comments

Aleksa Sarai Dec. 21, 2019, 12:27 a.m. UTC | #1
On 2019-12-20, Sargun Dhillon <sargun@sargun.me> wrote:
> This syscall allows for the retrieval of file descriptors from other
> processes, based on their pidfd. This is possible using ptrace, and
> injection of parasitic code along with using SCM_RIGHTS to move
> file descriptors between a tracee and a tracer. Unfortunately, ptrace
> comes with a high cost of requiring the process to be stopped, and
> breaks debuggers. This does not require stopping the process under
> manipulation.
> 
> One reason to use this is to allow sandboxers to take actions on file
> descriptors on the behalf of another process. For example, this can be
> combined with seccomp-bpf's user notification to do on-demand fd
> extraction and take privileged actions. For example, it can be used
> to bind a socket to a privileged port.
> 
> /* prototype */
>   /*
>    * pidfd_getfd_options is an extensible struct which can have options
>    * added to it. If options is NULL, size, and it will be ignored be
>    * ignored, otherwise, size should be set to sizeof(*options). If
>    * option is newer than the current kernel version, E2BIG will be
>    * returned.
>    */
>   struct pidfd_getfd_options {};
>   long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> 		   struct pidfd_getfd_options *options, size_t size);
> 
> /* testing */
> Ran self-test suite on x86_64
> 
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
>  MAINTAINERS                                 |   1 +
>  arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
>  arch/arm/tools/syscall.tbl                  |   1 +
>  arch/arm64/include/asm/unistd.h             |   2 +-
>  arch/arm64/include/asm/unistd32.h           |   2 +
>  arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
>  arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
>  arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
>  arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
>  arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
>  arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
>  arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
>  arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
>  arch/s390/kernel/syscalls/syscall.tbl       |   1 +
>  arch/sh/kernel/syscalls/syscall.tbl         |   1 +
>  arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
>  arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
>  arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
>  include/linux/syscalls.h                    |   4 +
>  include/uapi/asm-generic/unistd.h           |   3 +-
>  include/uapi/linux/pidfd.h                  |  10 ++
>  kernel/pid.c                                | 115 ++++++++++++++++++++
>  23 files changed, 151 insertions(+), 2 deletions(-)
>  create mode 100644 include/uapi/linux/pidfd.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cc0a4a8ae06a..bc370ff59dbf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13014,6 +13014,7 @@ M:	Christian Brauner <christian@brauner.io>
>  L:	linux-kernel@vger.kernel.org
>  S:	Maintained
>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
> +F:	include/uapi/linux/pidfd.h
>  F:	samples/pidfd/
>  F:	tools/testing/selftests/pidfd/
>  F:	tools/testing/selftests/clone3/
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index 8e13b0b2928d..d1cac0d657b7 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -475,3 +475,4 @@
>  543	common	fspick				sys_fspick
>  544	common	pidfd_open			sys_pidfd_open
>  # 545 reserved for clone3
> +548	common	pidfd_getfd			sys_pidfd
> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index 6da7dc4d79cc..ba045e2f3a60 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -449,3 +449,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index 2629a68b8724..b722e47377a5 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -38,7 +38,7 @@
>  #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
>  #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
>  
> -#define __NR_compat_syscalls		436
> +#define __NR_compat_syscalls		439
>  #endif
>  
>  #define __ARCH_WANT_SYS_CLONE
> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index 94ab29cf4f00..a8da97a2de41 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -879,6 +879,8 @@ __SYSCALL(__NR_fspick, sys_fspick)
>  __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
>  #define __NR_clone3 435
>  __SYSCALL(__NR_clone3, sys_clone3)
> +#define __NR_pidfd_getfd 438
> +__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
>  
>  /*
>   * Please add new compat syscalls above this comment and update
> diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
> index 36d5faf4c86c..2b11adfc860c 100644
> --- a/arch/ia64/kernel/syscalls/syscall.tbl
> +++ b/arch/ia64/kernel/syscalls/syscall.tbl
> @@ -356,3 +356,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index a88a285a0e5f..44e879e98459 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -435,3 +435,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 09b0cd7dab0a..7afa00125cc4 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -441,3 +441,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
> index e7c5ab38e403..856d5ba34461 100644
> --- a/arch/mips/kernel/syscalls/syscall_n32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
> @@ -374,3 +374,4 @@
>  433	n32	fspick				sys_fspick
>  434	n32	pidfd_open			sys_pidfd_open
>  435	n32	clone3				__sys_clone3
> +438	n32	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
> index 13cd66581f3b..2db6075352f3 100644
> --- a/arch/mips/kernel/syscalls/syscall_n64.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
> @@ -350,3 +350,4 @@
>  433	n64	fspick				sys_fspick
>  434	n64	pidfd_open			sys_pidfd_open
>  435	n64	clone3				__sys_clone3
> +438	n64	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
> index 353539ea4140..e9f9d4a9b105 100644
> --- a/arch/mips/kernel/syscalls/syscall_o32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
> @@ -423,3 +423,4 @@
>  433	o32	fspick				sys_fspick
>  434	o32	pidfd_open			sys_pidfd_open
>  435	o32	clone3				__sys_clone3
> +438	o32	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
> index 285ff516150c..c58c7eb144ca 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/syscalls/syscall.tbl
> @@ -433,3 +433,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3_wrapper
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
> index 43f736ed47f2..707609bfe3ea 100644
> --- a/arch/powerpc/kernel/syscalls/syscall.tbl
> +++ b/arch/powerpc/kernel/syscalls/syscall.tbl
> @@ -517,3 +517,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	nospu	clone3				ppc_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
> index 3054e9c035a3..185cd624face 100644
> --- a/arch/s390/kernel/syscalls/syscall.tbl
> +++ b/arch/s390/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
>  433  common	fspick			sys_fspick			sys_fspick
>  434  common	pidfd_open		sys_pidfd_open			sys_pidfd_open
>  435  common	clone3			sys_clone3			sys_clone3
> +438  common	pidfd_getfd		sys_pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
> index b5ed26c4c005..88f90895aad8 100644
> --- a/arch/sh/kernel/syscalls/syscall.tbl
> +++ b/arch/sh/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
> index 8c8cc7537fb2..218df6a2326e 100644
> --- a/arch/sparc/kernel/syscalls/syscall.tbl
> +++ b/arch/sparc/kernel/syscalls/syscall.tbl
> @@ -481,3 +481,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 15908eb9b17e..9c3101b65e0f 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -440,3 +440,4 @@
>  433	i386	fspick			sys_fspick			__ia32_sys_fspick
>  434	i386	pidfd_open		sys_pidfd_open			__ia32_sys_pidfd_open
>  435	i386	clone3			sys_clone3			__ia32_sys_clone3
> +438	i386	pidfd_getfd		sys_pidfd_getfd			__ia32_sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index c29976eca4a8..cef85db75a62 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -357,6 +357,7 @@
>  433	common	fspick			__x64_sys_fspick
>  434	common	pidfd_open		__x64_sys_pidfd_open
>  435	common	clone3			__x64_sys_clone3/ptregs
> +438	common	pidfd_getfd		__x64_sys_pidfd_getfd
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
> index 25f4de729a6d..ae15183def12 100644
> --- a/arch/xtensa/kernel/syscalls/syscall.tbl
> +++ b/arch/xtensa/kernel/syscalls/syscall.tbl
> @@ -406,3 +406,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2960dedcfde8..62fe706329d1 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -69,6 +69,7 @@ struct rseq;
>  union bpf_attr;
>  struct io_uring_params;
>  struct clone_args;
> +struct pidfd_getfd_options;
>  
>  #include <linux/types.h>
>  #include <linux/aio_abi.h>
> @@ -1000,6 +1001,9 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
>  asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
>  				       siginfo_t __user *info,
>  				       unsigned int flags);
> +asmlinkage long sys_pidfd_getfd(int pidfd, int fd,
> +				struct pidfd_getfd_options __user *options,
> +				size_t, usize);
>  
>  /*
>   * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1fc8faa6e973..f358488366f6 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -850,9 +850,10 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
>  #define __NR_clone3 435
>  __SYSCALL(__NR_clone3, sys_clone3)
>  #endif
> +#define __NR_pidfd_getfd 438
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 436
> +#define __NR_syscalls 439
>  
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> new file mode 100644
> index 000000000000..0a3fc922661d
> --- /dev/null
> +++ b/include/uapi/linux/pidfd.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PIDFD_H
> +#define _UAPI_LINUX_PIDFD_H
> +
> +struct pidfd_getfd_options {};

Are empty structs well-defined in C (from memory, some compilers make
them non-zero in size)? Since we probably plan to add a flags field in
the future anyway, why not just have a __u64 flags which must be zeroed?

> +
> +#define PIDFD_GETFD_OPTIONS_SIZE_VER0	0
> +#define PIDFD_GETFD_OPTIONS_SIZE_LATEST	PIDFD_GETFD_OPTIONS_SIZE_VER0
> +
> +#endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 2278e249141d..2a9cb4be383f 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -42,6 +42,7 @@
>  #include <linux/sched/signal.h>
>  #include <linux/sched/task.h>
>  #include <linux/idr.h>
> +#include <uapi/linux/pidfd.h>
>  
>  struct pid init_struct_pid = {
>  	.count		= REFCOUNT_INIT(1),
> @@ -578,3 +579,117 @@ void __init pid_idr_init(void)
>  	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
>  			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
>  }
> +
> +static struct file *__pidfd_getfd_fget_task(struct task_struct *task, u32 fd)
> +{
> +	struct file *file;
> +	int ret;
> +
> +	ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) {
> +		file = ERR_PTR(-EPERM);
> +		goto out;
> +	}
> +
> +	file = fget_task(task, fd);
> +	if (!file)
> +		file = ERR_PTR(-EBADF);
> +
> +out:
> +	mutex_unlock(&task->signal->cred_guard_mutex);
> +	return file;
> +}
> +
> +static long pidfd_getfd(struct pid *pid, u32 fd)
> +{
> +	struct task_struct *task;
> +	struct file *file;
> +	int ret, retfd;
> +
> +	task = get_pid_task(pid, PIDTYPE_PID);
> +	if (!task)
> +		return -ESRCH;
> +
> +	file = __pidfd_getfd_fget_task(task, fd);
> +	put_task_struct(task);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	retfd = get_unused_fd_flags(O_CLOEXEC);
> +	if (retfd < 0) {
> +		ret = retfd;
> +		goto out;
> +	}
> +
> +	/*
> +	 * security_file_receive must come last since it may have side effects
> +	 * and cannot be reversed.
> +	 */
> +	ret = security_file_receive(file);
> +	if (ret)
> +		goto out_put_fd;
> +
> +	fd_install(retfd, file);
> +	return retfd;
> +
> +out_put_fd:
> +	put_unused_fd(retfd);
> +out:
> +	fput(file);
> +	return ret;
> +}
> +
> +/**
> + * sys_pidfd_getfd() - Get a file descriptor from another process
> + *
> + * @pidfd:	file descriptor of the process
> + * @fd:		the file descriptor number to get
> + * @options:	options on how to get the fd
> + * @usize:	the size of options
> + *
> + * This syscall requires that the process has the ability to ptrace the
> + * process represented by the pidfd. It will return a duplicated version
> + * of the file descriptor on success. The process who which is having
> + * its file descriptor taken is otherwise unaffected. If options is NULL
> + * it is ignored along with usize.
> + *
> + * Return: On success, a file descriptor with cloexec is returned.
> + *         On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> +		struct pidfd_getfd_options __user *, options, size_t, usize)
> +{
> +	struct pid *pid;
> +	struct fd f;
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(struct pidfd_getfd_options) != PIDFD_GETFD_OPTIONS_SIZE_LATEST);
> +
> +	/*
> +	 * options is currently unused, verify it's unset or if it is set,
> +	 * ensure that size is 0.
> +	 *
> +	 * In the future, this will need to adopt copy_struct_from_user.
> +	 */
> +	if (options && usize > PIDFD_GETFD_OPTIONS_SIZE_VER0)
> +		return -E2BIG;

I wouldn't suggest doing this. I understand that this is simpler, but it
will cause problems once we add a field to pidfd_getfd_options -- newer
programs compiled with a larger struct won't work on older kernels with
this check (even if the struct is completely zeroed).

If copy_struct_from_user() doesn't work with usize == 0, I would *much*
prefer a patch which fixes that (effectively copy_struct_from_user()
with a usize == 0 should just be a call to check_zeroed_user()).

> +	f = fdget(pidfd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	pid = pidfd_pid(f.file);
> +	if (IS_ERR(pid)) {
> +		ret = PTR_ERR(pid);
> +		goto out;
> +	}
> +
> +	ret = pidfd_getfd(pid, fd);
> +
> +out:
> +	fdput(f);
> +	return ret;
> +}
> -- 
> 2.20.1
Sargun Dhillon Dec. 21, 2019, 1:45 a.m. UTC | #2
On Fri, Dec 20, 2019 at 4:27 PM Aleksa Sarai <asarai@suse.de> wrote:
>
> On 2019-12-20, Sargun Dhillon <sargun@sargun.me> wrote:
> > diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> > new file mode 100644
> > index 000000000000..0a3fc922661d
> > --- /dev/null
> > +++ b/include/uapi/linux/pidfd.h
> > @@ -0,0 +1,10 @@
> > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > +#ifndef _UAPI_LINUX_PIDFD_H
> > +#define _UAPI_LINUX_PIDFD_H
> > +
> > +struct pidfd_getfd_options {};
>
> Are empty structs well-defined in C (from memory, some compilers make
> them non-zero in size)? Since we probably plan to add a flags field in
> the future anyway, why not just have a __u64 flags which must be zeroed?
>
It's allowed in GCC:
https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/Empty-Structures.html

I can add an __aligned_u64 flags for now, and just say something like
"reserved". This will also solve the latter issue, and I'll just use
copy_struct_from_user,
as long as Christian is okay with having an unused (reserved) flag member.


> > +     f = fdget(pidfd);
> > +     if (!f.file)
> > +             return -EBADF;
> > +
> > +     pid = pidfd_pid(f.file);
> > +     if (IS_ERR(pid)) {
> > +             ret = PTR_ERR(pid);
> > +             goto out;
> > +     }
> > +
> > +     ret = pidfd_getfd(pid, fd);
> > +
> > +out:
> > +     fdput(f);
> > +     return ret;
> > +}
> > --
> > 2.20.1
>
> --
> Aleksa Sarai
> Senior Software Engineer (Containers)
> SUSE Linux GmbH
> <https://www.cyphar.com/>
Christian Brauner Dec. 22, 2019, 12:47 p.m. UTC | #3
On Fri, Dec 20, 2019 at 11:28:13PM +0000, Sargun Dhillon wrote:
> This syscall allows for the retrieval of file descriptors from other
> processes, based on their pidfd. This is possible using ptrace, and
> injection of parasitic code along with using SCM_RIGHTS to move
> file descriptors between a tracee and a tracer. Unfortunately, ptrace
> comes with a high cost of requiring the process to be stopped, and
> breaks debuggers. This does not require stopping the process under
> manipulation.
> 
> One reason to use this is to allow sandboxers to take actions on file
> descriptors on the behalf of another process. For example, this can be
> combined with seccomp-bpf's user notification to do on-demand fd
> extraction and take privileged actions. For example, it can be used
> to bind a socket to a privileged port.
> 
> /* prototype */
>   /*
>    * pidfd_getfd_options is an extensible struct which can have options
>    * added to it. If options is NULL, size, and it will be ignored be
>    * ignored, otherwise, size should be set to sizeof(*options). If
>    * option is newer than the current kernel version, E2BIG will be
>    * returned.
>    */
>   struct pidfd_getfd_options {};
>   long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> 		   struct pidfd_getfd_options *options, size_t size);

The prototype advertises a flags argument but the actual 

+SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
+		struct pidfd_getfd_options __user *, options, size_t, usize)

does not have a flags argument...

I think having a flags argument makes a lot of sense.

I'm not sure what to think about the struct. I agree with Aleksa that
having an empty struct is not a great idea. From a design perspective it
seems very out of place. If we do a struct at all putting at least a
single reserved field in there might makes more sense.

In general, I think we need to have a _concrete_ reason why putting a
struct versioned by size as arguments for this syscall.
That means we need to have at least a concrete example for a new feature
for this syscall where a flag would not convey enough information.

And I'm not sure that there is a good one... I guess one thing I can
think of is that a caller might want dup-like semantics, i.e. a caller
might want to say:

pidfd_getfd(<pidfd>, <fd-to-get>, <fd-number-to-want>, <flags>, ...)

such that after pidfd_getfd() returns <fd-to-get> corresponds to
<fd-number-to-want> in the caller. But that can also be achieved via:
int fd = pidfd_getfd(<pidfd>, <fd-to-get>, <flags>, ...)
int final_fd = dup3(fd, <newfd>, O_CLOEXEC)

> 
> /* testing */
> Ran self-test suite on x86_64

+1

> 
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
>  MAINTAINERS                                 |   1 +
>  arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
>  arch/arm/tools/syscall.tbl                  |   1 +
>  arch/arm64/include/asm/unistd.h             |   2 +-
>  arch/arm64/include/asm/unistd32.h           |   2 +
>  arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
>  arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
>  arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
>  arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
>  arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
>  arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
>  arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
>  arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
>  arch/s390/kernel/syscalls/syscall.tbl       |   1 +
>  arch/sh/kernel/syscalls/syscall.tbl         |   1 +
>  arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
>  arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
>  arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
>  include/linux/syscalls.h                    |   4 +
>  include/uapi/asm-generic/unistd.h           |   3 +-
>  include/uapi/linux/pidfd.h                  |  10 ++
>  kernel/pid.c                                | 115 ++++++++++++++++++++
>  23 files changed, 151 insertions(+), 2 deletions(-)
>  create mode 100644 include/uapi/linux/pidfd.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cc0a4a8ae06a..bc370ff59dbf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13014,6 +13014,7 @@ M:	Christian Brauner <christian@brauner.io>
>  L:	linux-kernel@vger.kernel.org
>  S:	Maintained
>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
> +F:	include/uapi/linux/pidfd.h
>  F:	samples/pidfd/
>  F:	tools/testing/selftests/pidfd/
>  F:	tools/testing/selftests/clone3/
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index 8e13b0b2928d..d1cac0d657b7 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -475,3 +475,4 @@
>  543	common	fspick				sys_fspick
>  544	common	pidfd_open			sys_pidfd_open
>  # 545 reserved for clone3
> +548	common	pidfd_getfd			sys_pidfd
> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index 6da7dc4d79cc..ba045e2f3a60 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -449,3 +449,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index 2629a68b8724..b722e47377a5 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -38,7 +38,7 @@
>  #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
>  #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
>  
> -#define __NR_compat_syscalls		436
> +#define __NR_compat_syscalls		439
>  #endif
>  
>  #define __ARCH_WANT_SYS_CLONE
> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index 94ab29cf4f00..a8da97a2de41 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -879,6 +879,8 @@ __SYSCALL(__NR_fspick, sys_fspick)
>  __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
>  #define __NR_clone3 435
>  __SYSCALL(__NR_clone3, sys_clone3)
> +#define __NR_pidfd_getfd 438
> +__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
>  
>  /*
>   * Please add new compat syscalls above this comment and update
> diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
> index 36d5faf4c86c..2b11adfc860c 100644
> --- a/arch/ia64/kernel/syscalls/syscall.tbl
> +++ b/arch/ia64/kernel/syscalls/syscall.tbl
> @@ -356,3 +356,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index a88a285a0e5f..44e879e98459 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -435,3 +435,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 09b0cd7dab0a..7afa00125cc4 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -441,3 +441,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
> index e7c5ab38e403..856d5ba34461 100644
> --- a/arch/mips/kernel/syscalls/syscall_n32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
> @@ -374,3 +374,4 @@
>  433	n32	fspick				sys_fspick
>  434	n32	pidfd_open			sys_pidfd_open
>  435	n32	clone3				__sys_clone3
> +438	n32	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
> index 13cd66581f3b..2db6075352f3 100644
> --- a/arch/mips/kernel/syscalls/syscall_n64.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
> @@ -350,3 +350,4 @@
>  433	n64	fspick				sys_fspick
>  434	n64	pidfd_open			sys_pidfd_open
>  435	n64	clone3				__sys_clone3
> +438	n64	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
> index 353539ea4140..e9f9d4a9b105 100644
> --- a/arch/mips/kernel/syscalls/syscall_o32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
> @@ -423,3 +423,4 @@
>  433	o32	fspick				sys_fspick
>  434	o32	pidfd_open			sys_pidfd_open
>  435	o32	clone3				__sys_clone3
> +438	o32	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
> index 285ff516150c..c58c7eb144ca 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/syscalls/syscall.tbl
> @@ -433,3 +433,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3_wrapper
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
> index 43f736ed47f2..707609bfe3ea 100644
> --- a/arch/powerpc/kernel/syscalls/syscall.tbl
> +++ b/arch/powerpc/kernel/syscalls/syscall.tbl
> @@ -517,3 +517,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	nospu	clone3				ppc_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
> index 3054e9c035a3..185cd624face 100644
> --- a/arch/s390/kernel/syscalls/syscall.tbl
> +++ b/arch/s390/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
>  433  common	fspick			sys_fspick			sys_fspick
>  434  common	pidfd_open		sys_pidfd_open			sys_pidfd_open
>  435  common	clone3			sys_clone3			sys_clone3
> +438  common	pidfd_getfd		sys_pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
> index b5ed26c4c005..88f90895aad8 100644
> --- a/arch/sh/kernel/syscalls/syscall.tbl
> +++ b/arch/sh/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
> index 8c8cc7537fb2..218df6a2326e 100644
> --- a/arch/sparc/kernel/syscalls/syscall.tbl
> +++ b/arch/sparc/kernel/syscalls/syscall.tbl
> @@ -481,3 +481,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  # 435 reserved for clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 15908eb9b17e..9c3101b65e0f 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -440,3 +440,4 @@
>  433	i386	fspick			sys_fspick			__ia32_sys_fspick
>  434	i386	pidfd_open		sys_pidfd_open			__ia32_sys_pidfd_open
>  435	i386	clone3			sys_clone3			__ia32_sys_clone3
> +438	i386	pidfd_getfd		sys_pidfd_getfd			__ia32_sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index c29976eca4a8..cef85db75a62 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -357,6 +357,7 @@
>  433	common	fspick			__x64_sys_fspick
>  434	common	pidfd_open		__x64_sys_pidfd_open
>  435	common	clone3			__x64_sys_clone3/ptregs
> +438	common	pidfd_getfd		__x64_sys_pidfd_getfd
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
> index 25f4de729a6d..ae15183def12 100644
> --- a/arch/xtensa/kernel/syscalls/syscall.tbl
> +++ b/arch/xtensa/kernel/syscalls/syscall.tbl
> @@ -406,3 +406,4 @@
>  433	common	fspick				sys_fspick
>  434	common	pidfd_open			sys_pidfd_open
>  435	common	clone3				sys_clone3
> +438	common	pidfd_getfd			sys_pidfd_getfd
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2960dedcfde8..62fe706329d1 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -69,6 +69,7 @@ struct rseq;
>  union bpf_attr;
>  struct io_uring_params;
>  struct clone_args;
> +struct pidfd_getfd_options;
>  
>  #include <linux/types.h>
>  #include <linux/aio_abi.h>
> @@ -1000,6 +1001,9 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
>  asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
>  				       siginfo_t __user *info,
>  				       unsigned int flags);
> +asmlinkage long sys_pidfd_getfd(int pidfd, int fd,
> +				struct pidfd_getfd_options __user *options,
> +				size_t, usize);
>  
>  /*
>   * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1fc8faa6e973..f358488366f6 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -850,9 +850,10 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
>  #define __NR_clone3 435
>  __SYSCALL(__NR_clone3, sys_clone3)
>  #endif
> +#define __NR_pidfd_getfd 438
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 436
> +#define __NR_syscalls 439
>  
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> new file mode 100644
> index 000000000000..0a3fc922661d
> --- /dev/null
> +++ b/include/uapi/linux/pidfd.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PIDFD_H
> +#define _UAPI_LINUX_PIDFD_H
> +
> +struct pidfd_getfd_options {};
> +
> +#define PIDFD_GETFD_OPTIONS_SIZE_VER0	0
> +#define PIDFD_GETFD_OPTIONS_SIZE_LATEST	PIDFD_GETFD_OPTIONS_SIZE_VER0
> +
> +#endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 2278e249141d..2a9cb4be383f 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -42,6 +42,7 @@
>  #include <linux/sched/signal.h>
>  #include <linux/sched/task.h>
>  #include <linux/idr.h>
> +#include <uapi/linux/pidfd.h>
>  
>  struct pid init_struct_pid = {
>  	.count		= REFCOUNT_INIT(1),
> @@ -578,3 +579,117 @@ void __init pid_idr_init(void)
>  	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
>  			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
>  }
> +
> +static struct file *__pidfd_getfd_fget_task(struct task_struct *task, u32 fd)
> +{
> +	struct file *file;
> +	int ret;
> +
> +	ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) {
> +		file = ERR_PTR(-EPERM);
> +		goto out;
> +	}
> +
> +	file = fget_task(task, fd);
> +	if (!file)
> +		file = ERR_PTR(-EBADF);
> +
> +out:
> +	mutex_unlock(&task->signal->cred_guard_mutex);
> +	return file;
> +}
> +
> +static long pidfd_getfd(struct pid *pid, u32 fd)
> +{
> +	struct task_struct *task;
> +	struct file *file;
> +	int ret, retfd;
> +
> +	task = get_pid_task(pid, PIDTYPE_PID);
> +	if (!task)
> +		return -ESRCH;
> +
> +	file = __pidfd_getfd_fget_task(task, fd);
> +	put_task_struct(task);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	retfd = get_unused_fd_flags(O_CLOEXEC);
> +	if (retfd < 0) {
> +		ret = retfd;
> +		goto out;
> +	}
> +
> +	/*
> +	 * security_file_receive must come last since it may have side effects
> +	 * and cannot be reversed.
> +	 */
> +	ret = security_file_receive(file);
> +	if (ret)
> +		goto out_put_fd;
> +
> +	fd_install(retfd, file);
> +	return retfd;
> +
> +out_put_fd:
> +	put_unused_fd(retfd);
> +out:
> +	fput(file);
> +	return ret;
> +}
> +
> +/**
> + * sys_pidfd_getfd() - Get a file descriptor from another process
> + *
> + * @pidfd:	file descriptor of the process
> + * @fd:		the file descriptor number to get
> + * @options:	options on how to get the fd
> + * @usize:	the size of options
> + *
> + * This syscall requires that the process has the ability to ptrace the
> + * process represented by the pidfd. It will return a duplicated version
> + * of the file descriptor on success. The process who which is having

s/who which/which/

> + * its file descriptor taken is otherwise unaffected. If options is NULL
> + * it is ignored along with usize.
> + *
> + * Return: On success, a file descriptor with cloexec is returned.
> + *         On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> +		struct pidfd_getfd_options __user *, options, size_t, usize)
> +{
> +	struct pid *pid;
> +	struct fd f;
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(struct pidfd_getfd_options) != PIDFD_GETFD_OPTIONS_SIZE_LATEST);
> +
> +	/*
> +	 * options is currently unused, verify it's unset or if it is set,
> +	 * ensure that size is 0.
> +	 *
> +	 * In the future, this will need to adopt copy_struct_from_user.
> +	 */
> +	if (options && usize > PIDFD_GETFD_OPTIONS_SIZE_VER0)
> +		return -E2BIG;
> +
> +	f = fdget(pidfd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	pid = pidfd_pid(f.file);
> +	if (IS_ERR(pid)) {
> +		ret = PTR_ERR(pid);
> +		goto out;
> +	}
> +
> +	ret = pidfd_getfd(pid, fd);
> +
> +out:
> +	fdput(f);
> +	return ret;
> +}
> -- 
> 2.20.1
>
Sargun Dhillon Dec. 22, 2019, 6:36 p.m. UTC | #4
, On Sun, Dec 22, 2019 at 4:48 AM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
>
> On Fri, Dec 20, 2019 at 11:28:13PM +0000, Sargun Dhillon wrote:
> > This syscall allows for the retrieval of file descriptors from other
> > processes, based on their pidfd. This is possible using ptrace, and
> > injection of parasitic code along with using SCM_RIGHTS to move
> > file descriptors between a tracee and a tracer. Unfortunately, ptrace
> > comes with a high cost of requiring the process to be stopped, and
> > breaks debuggers. This does not require stopping the process under
> > manipulation.
> >
> > One reason to use this is to allow sandboxers to take actions on file
> > descriptors on the behalf of another process. For example, this can be
> > combined with seccomp-bpf's user notification to do on-demand fd
> > extraction and take privileged actions. For example, it can be used
> > to bind a socket to a privileged port.
> >
> > /* prototype */
> >   /*
> >    * pidfd_getfd_options is an extensible struct which can have options
> >    * added to it. If options is NULL, size, and it will be ignored be
> >    * ignored, otherwise, size should be set to sizeof(*options). If
> >    * option is newer than the current kernel version, E2BIG will be
> >    * returned.
> >    */
> >   struct pidfd_getfd_options {};
> >   long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> >                  struct pidfd_getfd_options *options, size_t size);
That's embarrassing. This was supposed to read:
long pidfd_getfd(int pidfd, int fd, struct pidfd_get_options *options,
size_t size);

>
> The prototype advertises a flags argument but the actual
>
> +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> +               struct pidfd_getfd_options __user *, options, size_t, usize)
>
> does not have a flags argument...
>
> I think having a flags argument makes a lot of sense.
>
> I'm not sure what to think about the struct. I agree with Aleksa that
> having an empty struct is not a great idea. From a design perspective it
> seems very out of place. If we do a struct at all putting at least a
> single reserved field in there might makes more sense.
>
> In general, I think we need to have a _concrete_ reason why putting a
> struct versioned by size as arguments for this syscall.
> That means we need to have at least a concrete example for a new feature
> for this syscall where a flag would not convey enough information.
I can think of at least two reasons we need flags:
* Clearing cgroup flags
* Closing the process under manipulation's FD when we fetch it.

The original reason for wanting to have two places where we can put
flags was to have a different field for fd flags vs. call flags. I'm not sure
there's any flags you'd want to set.

Given this, if we want to go down the route of a syscall, we should just
leave it as a __u64 flags, and drop the pointer to the struct, if we're
not worried about that.

>
> And I'm not sure that there is a good one... I guess one thing I can
> think of is that a caller might want dup-like semantics, i.e. a caller
> might want to say:
>
> pidfd_getfd(<pidfd>, <fd-to-get>, <fd-number-to-want>, <flags>, ...)
>
> such that after pidfd_getfd() returns <fd-to-get> corresponds to
> <fd-number-to-want> in the caller. But that can also be achieved via:
> int fd = pidfd_getfd(<pidfd>, <fd-to-get>, <flags>, ...)
> int final_fd = dup3(fd, <newfd>, O_CLOEXEC)
>
> >
> > /* testing */
> > Ran self-test suite on x86_64
>
> +1
>
Christian Brauner Dec. 22, 2019, 8:15 p.m. UTC | #5
On Sun, Dec 22, 2019 at 10:36:42AM -0800, Sargun Dhillon wrote:
> , On Sun, Dec 22, 2019 at 4:48 AM Christian Brauner
> <christian.brauner@ubuntu.com> wrote:
> >
> > On Fri, Dec 20, 2019 at 11:28:13PM +0000, Sargun Dhillon wrote:
> > > This syscall allows for the retrieval of file descriptors from other
> > > processes, based on their pidfd. This is possible using ptrace, and
> > > injection of parasitic code along with using SCM_RIGHTS to move
> > > file descriptors between a tracee and a tracer. Unfortunately, ptrace
> > > comes with a high cost of requiring the process to be stopped, and
> > > breaks debuggers. This does not require stopping the process under
> > > manipulation.
> > >
> > > One reason to use this is to allow sandboxers to take actions on file
> > > descriptors on the behalf of another process. For example, this can be
> > > combined with seccomp-bpf's user notification to do on-demand fd
> > > extraction and take privileged actions. For example, it can be used
> > > to bind a socket to a privileged port.
> > >
> > > /* prototype */
> > >   /*
> > >    * pidfd_getfd_options is an extensible struct which can have options
> > >    * added to it. If options is NULL, size, and it will be ignored be
> > >    * ignored, otherwise, size should be set to sizeof(*options). If
> > >    * option is newer than the current kernel version, E2BIG will be
> > >    * returned.
> > >    */
> > >   struct pidfd_getfd_options {};
> > >   long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> > >                  struct pidfd_getfd_options *options, size_t size);
> That's embarrassing. This was supposed to read:
> long pidfd_getfd(int pidfd, int fd, struct pidfd_get_options *options,
> size_t size);
> 
> >
> > The prototype advertises a flags argument but the actual
> >
> > +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> > +               struct pidfd_getfd_options __user *, options, size_t, usize)
> >
> > does not have a flags argument...
> >
> > I think having a flags argument makes a lot of sense.
> >
> > I'm not sure what to think about the struct. I agree with Aleksa that
> > having an empty struct is not a great idea. From a design perspective it
> > seems very out of place. If we do a struct at all putting at least a
> > single reserved field in there might makes more sense.
> >
> > In general, I think we need to have a _concrete_ reason why putting a
> > struct versioned by size as arguments for this syscall.
> > That means we need to have at least a concrete example for a new feature
> > for this syscall where a flag would not convey enough information.
> I can think of at least two reasons we need flags:
> * Clearing cgroup flags
> * Closing the process under manipulation's FD when we fetch it.
> 
> The original reason for wanting to have two places where we can put
> flags was to have a different field for fd flags vs. call flags. I'm not sure
> there's any flags you'd want to set.
> 
> Given this, if we want to go down the route of a syscall, we should just
> leave it as a __u64 flags, and drop the pointer to the struct, if we're

I think it needs to be an unsigned int. Having a 64bit register arg is
really messy on 32bit and means you need to have a compat syscall
implementation which handles this.

Christian
kernel test robot Dec. 24, 2019, 11:09 a.m. UTC | #6
Hi Sargun,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kselftest/next]
[also build test ERROR on arm64/for-next/core linus/master v5.5-rc3]
[cannot apply to tip/x86/asm next-20191220]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Sargun-Dhillon/Add-pidfd_getfd-syscall/20191224-061915
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
config: i386-randconfig-a001-20191224 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.2-10+deb8u1) 4.9.2
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from drivers//ptp/ptp_clock.c:16:0:
>> include/linux/syscalls.h:1006:13: error: unknown type name 'usize'
        size_t, usize);
                ^

vim +/usize +1006 include/linux/syscalls.h

   864	
   865	/* mm/, CONFIG_MMU only */
   866	asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
   867	asmlinkage long sys_swapoff(const char __user *specialfile);
   868	asmlinkage long sys_mprotect(unsigned long start, size_t len,
   869					unsigned long prot);
   870	asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
   871	asmlinkage long sys_mlock(unsigned long start, size_t len);
   872	asmlinkage long sys_munlock(unsigned long start, size_t len);
   873	asmlinkage long sys_mlockall(int flags);
   874	asmlinkage long sys_munlockall(void);
   875	asmlinkage long sys_mincore(unsigned long start, size_t len,
   876					unsigned char __user * vec);
   877	asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
   878	asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
   879				unsigned long prot, unsigned long pgoff,
   880				unsigned long flags);
   881	asmlinkage long sys_mbind(unsigned long start, unsigned long len,
   882					unsigned long mode,
   883					const unsigned long __user *nmask,
   884					unsigned long maxnode,
   885					unsigned flags);
   886	asmlinkage long sys_get_mempolicy(int __user *policy,
   887					unsigned long __user *nmask,
   888					unsigned long maxnode,
   889					unsigned long addr, unsigned long flags);
   890	asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
   891					unsigned long maxnode);
   892	asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
   893					const unsigned long __user *from,
   894					const unsigned long __user *to);
   895	asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
   896					const void __user * __user *pages,
   897					const int __user *nodes,
   898					int __user *status,
   899					int flags);
   900	
   901	asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t  pid, int sig,
   902			siginfo_t __user *uinfo);
   903	asmlinkage long sys_perf_event_open(
   904			struct perf_event_attr __user *attr_uptr,
   905			pid_t pid, int cpu, int group_fd, unsigned long flags);
   906	asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
   907	asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
   908				     unsigned int vlen, unsigned flags,
   909				     struct __kernel_timespec __user *timeout);
   910	asmlinkage long sys_recvmmsg_time32(int fd, struct mmsghdr __user *msg,
   911				     unsigned int vlen, unsigned flags,
   912				     struct old_timespec32 __user *timeout);
   913	
   914	asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
   915					int options, struct rusage __user *ru);
   916	asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
   917					const struct rlimit64 __user *new_rlim,
   918					struct rlimit64 __user *old_rlim);
   919	asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
   920	asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
   921					  u64 mask, int fd,
   922					  const char  __user *pathname);
   923	asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
   924					      struct file_handle __user *handle,
   925					      int __user *mnt_id, int flag);
   926	asmlinkage long sys_open_by_handle_at(int mountdirfd,
   927					      struct file_handle __user *handle,
   928					      int flags);
   929	asmlinkage long sys_clock_adjtime(clockid_t which_clock,
   930					struct __kernel_timex __user *tx);
   931	asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
   932					struct old_timex32 __user *tx);
   933	asmlinkage long sys_syncfs(int fd);
   934	asmlinkage long sys_setns(int fd, int nstype);
   935	asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
   936	asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
   937				     unsigned int vlen, unsigned flags);
   938	asmlinkage long sys_process_vm_readv(pid_t pid,
   939					     const struct iovec __user *lvec,
   940					     unsigned long liovcnt,
   941					     const struct iovec __user *rvec,
   942					     unsigned long riovcnt,
   943					     unsigned long flags);
   944	asmlinkage long sys_process_vm_writev(pid_t pid,
   945					      const struct iovec __user *lvec,
   946					      unsigned long liovcnt,
   947					      const struct iovec __user *rvec,
   948					      unsigned long riovcnt,
   949					      unsigned long flags);
   950	asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
   951				 unsigned long idx1, unsigned long idx2);
   952	asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
   953	asmlinkage long sys_sched_setattr(pid_t pid,
   954						struct sched_attr __user *attr,
   955						unsigned int flags);
   956	asmlinkage long sys_sched_getattr(pid_t pid,
   957						struct sched_attr __user *attr,
   958						unsigned int size,
   959						unsigned int flags);
   960	asmlinkage long sys_renameat2(int olddfd, const char __user *oldname,
   961				      int newdfd, const char __user *newname,
   962				      unsigned int flags);
   963	asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
   964				    void __user *uargs);
   965	asmlinkage long sys_getrandom(char __user *buf, size_t count,
   966				      unsigned int flags);
   967	asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
   968	asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
   969	asmlinkage long sys_execveat(int dfd, const char __user *filename,
   970				const char __user *const __user *argv,
   971				const char __user *const __user *envp, int flags);
   972	asmlinkage long sys_userfaultfd(int flags);
   973	asmlinkage long sys_membarrier(int cmd, int flags);
   974	asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
   975	asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
   976					    int fd_out, loff_t __user *off_out,
   977					    size_t len, unsigned int flags);
   978	asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec,
   979				    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
   980				    rwf_t flags);
   981	asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec,
   982				    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
   983				    rwf_t flags);
   984	asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
   985					  unsigned long prot, int pkey);
   986	asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
   987	asmlinkage long sys_pkey_free(int pkey);
   988	asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
   989				  unsigned mask, struct statx __user *buffer);
   990	asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
   991				 int flags, uint32_t sig);
   992	asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
   993	asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
   994				       int to_dfd, const char __user *to_path,
   995				       unsigned int ms_flags);
   996	asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
   997	asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
   998				     const void __user *value, int aux);
   999	asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
  1000	asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags);
  1001	asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
  1002					       siginfo_t __user *info,
  1003					       unsigned int flags);
  1004	asmlinkage long sys_pidfd_getfd(int pidfd, int fd,
  1005					struct pidfd_getfd_options __user *options,
> 1006					size_t, usize);
  1007	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index cc0a4a8ae06a..bc370ff59dbf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13014,6 +13014,7 @@  M:	Christian Brauner <christian@brauner.io>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
+F:	include/uapi/linux/pidfd.h
 F:	samples/pidfd/
 F:	tools/testing/selftests/pidfd/
 F:	tools/testing/selftests/clone3/
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 8e13b0b2928d..d1cac0d657b7 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -475,3 +475,4 @@ 
 543	common	fspick				sys_fspick
 544	common	pidfd_open			sys_pidfd_open
 # 545 reserved for clone3
+548	common	pidfd_getfd			sys_pidfd
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 6da7dc4d79cc..ba045e2f3a60 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -449,3 +449,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 2629a68b8724..b722e47377a5 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@ 
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		436
+#define __NR_compat_syscalls		439
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 94ab29cf4f00..a8da97a2de41 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -879,6 +879,8 @@  __SYSCALL(__NR_fspick, sys_fspick)
 __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 #define __NR_clone3 435
 __SYSCALL(__NR_clone3, sys_clone3)
+#define __NR_pidfd_getfd 438
+__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 36d5faf4c86c..2b11adfc860c 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -356,3 +356,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index a88a285a0e5f..44e879e98459 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -435,3 +435,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 09b0cd7dab0a..7afa00125cc4 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index e7c5ab38e403..856d5ba34461 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -374,3 +374,4 @@ 
 433	n32	fspick				sys_fspick
 434	n32	pidfd_open			sys_pidfd_open
 435	n32	clone3				__sys_clone3
+438	n32	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 13cd66581f3b..2db6075352f3 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -350,3 +350,4 @@ 
 433	n64	fspick				sys_fspick
 434	n64	pidfd_open			sys_pidfd_open
 435	n64	clone3				__sys_clone3
+438	n64	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 353539ea4140..e9f9d4a9b105 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -423,3 +423,4 @@ 
 433	o32	fspick				sys_fspick
 434	o32	pidfd_open			sys_pidfd_open
 435	o32	clone3				__sys_clone3
+438	o32	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 285ff516150c..c58c7eb144ca 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -433,3 +433,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3_wrapper
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 43f736ed47f2..707609bfe3ea 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -517,3 +517,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	nospu	clone3				ppc_clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 3054e9c035a3..185cd624face 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -438,3 +438,4 @@ 
 433  common	fspick			sys_fspick			sys_fspick
 434  common	pidfd_open		sys_pidfd_open			sys_pidfd_open
 435  common	clone3			sys_clone3			sys_clone3
+438  common	pidfd_getfd		sys_pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index b5ed26c4c005..88f90895aad8 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -438,3 +438,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 8c8cc7537fb2..218df6a2326e 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -481,3 +481,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 15908eb9b17e..9c3101b65e0f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -440,3 +440,4 @@ 
 433	i386	fspick			sys_fspick			__ia32_sys_fspick
 434	i386	pidfd_open		sys_pidfd_open			__ia32_sys_pidfd_open
 435	i386	clone3			sys_clone3			__ia32_sys_clone3
+438	i386	pidfd_getfd		sys_pidfd_getfd			__ia32_sys_pidfd_getfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c29976eca4a8..cef85db75a62 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -357,6 +357,7 @@ 
 433	common	fspick			__x64_sys_fspick
 434	common	pidfd_open		__x64_sys_pidfd_open
 435	common	clone3			__x64_sys_clone3/ptregs
+438	common	pidfd_getfd		__x64_sys_pidfd_getfd
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 25f4de729a6d..ae15183def12 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -406,3 +406,4 @@ 
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
+438	common	pidfd_getfd			sys_pidfd_getfd
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2960dedcfde8..62fe706329d1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -69,6 +69,7 @@  struct rseq;
 union bpf_attr;
 struct io_uring_params;
 struct clone_args;
+struct pidfd_getfd_options;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1000,6 +1001,9 @@  asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
 asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
+asmlinkage long sys_pidfd_getfd(int pidfd, int fd,
+				struct pidfd_getfd_options __user *options,
+				size_t, usize);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1fc8faa6e973..f358488366f6 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -850,9 +850,10 @@  __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 #define __NR_clone3 435
 __SYSCALL(__NR_clone3, sys_clone3)
 #endif
+#define __NR_pidfd_getfd 438
 
 #undef __NR_syscalls
-#define __NR_syscalls 436
+#define __NR_syscalls 439
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
new file mode 100644
index 000000000000..0a3fc922661d
--- /dev/null
+++ b/include/uapi/linux/pidfd.h
@@ -0,0 +1,10 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_PIDFD_H
+#define _UAPI_LINUX_PIDFD_H
+
+struct pidfd_getfd_options {};
+
+#define PIDFD_GETFD_OPTIONS_SIZE_VER0	0
+#define PIDFD_GETFD_OPTIONS_SIZE_LATEST	PIDFD_GETFD_OPTIONS_SIZE_VER0
+
+#endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 2278e249141d..2a9cb4be383f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@ 
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
+#include <uapi/linux/pidfd.h>
 
 struct pid init_struct_pid = {
 	.count		= REFCOUNT_INIT(1),
@@ -578,3 +579,117 @@  void __init pid_idr_init(void)
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
 }
+
+static struct file *__pidfd_getfd_fget_task(struct task_struct *task, u32 fd)
+{
+	struct file *file;
+	int ret;
+
+	ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) {
+		file = ERR_PTR(-EPERM);
+		goto out;
+	}
+
+	file = fget_task(task, fd);
+	if (!file)
+		file = ERR_PTR(-EBADF);
+
+out:
+	mutex_unlock(&task->signal->cred_guard_mutex);
+	return file;
+}
+
+static long pidfd_getfd(struct pid *pid, u32 fd)
+{
+	struct task_struct *task;
+	struct file *file;
+	int ret, retfd;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return -ESRCH;
+
+	file = __pidfd_getfd_fget_task(task, fd);
+	put_task_struct(task);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	retfd = get_unused_fd_flags(O_CLOEXEC);
+	if (retfd < 0) {
+		ret = retfd;
+		goto out;
+	}
+
+	/*
+	 * security_file_receive must come last since it may have side effects
+	 * and cannot be reversed.
+	 */
+	ret = security_file_receive(file);
+	if (ret)
+		goto out_put_fd;
+
+	fd_install(retfd, file);
+	return retfd;
+
+out_put_fd:
+	put_unused_fd(retfd);
+out:
+	fput(file);
+	return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd:	file descriptor of the process
+ * @fd:		the file descriptor number to get
+ * @options:	options on how to get the fd
+ * @usize:	the size of options
+ *
+ * This syscall requires that the process has the ability to ptrace the
+ * process represented by the pidfd. It will return a duplicated version
+ * of the file descriptor on success. The process who which is having
+ * its file descriptor taken is otherwise unaffected. If options is NULL
+ * it is ignored along with usize.
+ *
+ * Return: On success, a file descriptor with cloexec is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
+		struct pidfd_getfd_options __user *, options, size_t, usize)
+{
+	struct pid *pid;
+	struct fd f;
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct pidfd_getfd_options) != PIDFD_GETFD_OPTIONS_SIZE_LATEST);
+
+	/*
+	 * options is currently unused, verify it's unset or if it is set,
+	 * ensure that size is 0.
+	 *
+	 * In the future, this will need to adopt copy_struct_from_user.
+	 */
+	if (options && usize > PIDFD_GETFD_OPTIONS_SIZE_VER0)
+		return -E2BIG;
+
+	f = fdget(pidfd);
+	if (!f.file)
+		return -EBADF;
+
+	pid = pidfd_pid(f.file);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto out;
+	}
+
+	ret = pidfd_getfd(pid, fd);
+
+out:
+	fdput(f);
+	return ret;
+}