diff mbox series

[v2] signal: add procfd_signal() syscall

Message ID 20181120105124.14733-1-christian@brauner.io (mailing list archive)
State New, archived
Headers show
Series [v2] signal: add procfd_signal() syscall | expand

Commit Message

Christian Brauner Nov. 20, 2018, 10:51 a.m. UTC
The kill() syscall operates on process identifiers. After a process has
exited its pid can be reused by another process. If a caller sends a signal
to a reused pid it will end up signaling the wrong process. This issue has
often surfaced and there has been a push [1] to address this problem.

This patch uses file descriptors from proc/<pid> as stable handles on
struct pid. Even if a pid is recycled the handle will not change. The file
descriptor can be used to send signals to the referenced process.
Thus, the  new syscall procfd_signal() is introduced to solve this problem.
It operates on a process file descriptor.
The syscall takes an additional siginfo_t and flags argument. If siginfo_t
is NULL then procfd_signal() behaves like kill() if it is not NULL it
behaves like rt_sigqueueinfo.
The flags argument is added to allow for future extensions of this syscall.
It currently needs to be passed as 0.

With this patch a process can be killed via:

 #define _GNU_SOURCE
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <unistd.h>

 int main(int argc, char *argv[])
 {
         int ret;
         char buf[1000];

         if (argc < 2)
                 exit(EXIT_FAILURE);

         ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
         if (ret < 0)
                 exit(EXIT_FAILURE);

         int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
         if (fd < 0) {
                 printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
                 exit(EXIT_FAILURE);
         }

         ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
         if (ret < 0) {
                 printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
                 close(fd);
                 exit(EXIT_FAILURE);
         }

         close(fd);

         exit(EXIT_SUCCESS);
 }

[1]: https://lkml.org/lkml/2018/11/18/130

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <christian@brauner.io>
---
Changelog:
v2:
- define __NR_procfd_signal in unistd.h
- wire up compat syscall
- s/proc_is_procfd/proc_is_tgid_procfd/g
- provide stubs when CONFIG_PROC_FS=n
- move proc_pid() to linux/proc_fs.h header
- use proc_pid() to grab struct pid from /proc/<pid> fd
v1:
- patch introduced
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   2 +
 fs/proc/base.c                         |  11 ++-
 fs/proc/internal.h                     |   5 -
 include/linux/proc_fs.h                |  12 +++
 include/linux/syscalls.h               |   2 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 kernel/signal.c                        | 127 +++++++++++++++++++++++--
 8 files changed, 151 insertions(+), 13 deletions(-)

Comments

Serge E. Hallyn Nov. 22, 2018, 8 a.m. UTC | #1
On Tue, Nov 20, 2018 at 11:51:23AM +0100, Christian Brauner wrote:
> The kill() syscall operates on process identifiers. After a process has
> exited its pid can be reused by another process. If a caller sends a signal
> to a reused pid it will end up signaling the wrong process. This issue has
> often surfaced and there has been a push [1] to address this problem.
> 
> This patch uses file descriptors from proc/<pid> as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The file
> descriptor can be used to send signals to the referenced process.
> Thus, the  new syscall procfd_signal() is introduced to solve this problem.
> It operates on a process file descriptor.
> The syscall takes an additional siginfo_t and flags argument. If siginfo_t
> is NULL then procfd_signal() behaves like kill() if it is not NULL it
> behaves like rt_sigqueueinfo.
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0.
> 
> With this patch a process can be killed via:
> 
>  #define _GNU_SOURCE
>  #include <errno.h>
>  #include <fcntl.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <signal.h>
>  #include <sys/stat.h>
>  #include <sys/syscall.h>
>  #include <sys/types.h>
>  #include <unistd.h>
> 
>  int main(int argc, char *argv[])
>  {
>          int ret;
>          char buf[1000];
> 
>          if (argc < 2)
>                  exit(EXIT_FAILURE);
> 
>          ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
>          if (ret < 0)

  || ret > sizeof(buf) ? :-)  I mean, you *are* passing the string...

>                  exit(EXIT_FAILURE);
> 
>          int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
>          if (fd < 0) {
>                  printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
>                  exit(EXIT_FAILURE);
>          }
> 
>          ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
>          if (ret < 0) {
>                  printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
>                  close(fd);
>                  exit(EXIT_FAILURE);
>          }
> 
>          close(fd);
> 
>          exit(EXIT_SUCCESS);
>  }
> 
> [1]: https://lkml.org/lkml/2018/11/18/130
> 
> Cc: "Eric W. Biederman" <ebiederm@xmission.com>
> Cc: Serge Hallyn <serge@hallyn.com>

Acked-by: Serge Hallyn <serge@hallyn.com>

> Cc: Jann Horn <jannh@google.com>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Andy Lutomirsky <luto@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Cc: Aleksa Sarai <cyphar@cyphar.com>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Christian Brauner <christian@brauner.io>
> ---
> Changelog:
> v2:
> - define __NR_procfd_signal in unistd.h
> - wire up compat syscall
> - s/proc_is_procfd/proc_is_tgid_procfd/g
> - provide stubs when CONFIG_PROC_FS=n
> - move proc_pid() to linux/proc_fs.h header
> - use proc_pid() to grab struct pid from /proc/<pid> fd
> v1:
> - patch introduced
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   2 +
>  fs/proc/base.c                         |  11 ++-
>  fs/proc/internal.h                     |   5 -
>  include/linux/proc_fs.h                |  12 +++
>  include/linux/syscalls.h               |   2 +
>  include/uapi/asm-generic/unistd.h      |   4 +-
>  kernel/signal.c                        | 127 +++++++++++++++++++++++--
>  8 files changed, 151 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
>  384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
>  385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
>  386	i386	rseq			sys_rseq			__ia32_sys_rseq
> +387	i386	procfd_signal		sys_procfd_signal		__ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
>  332	common	statx			__x64_sys_statx
>  333	common	io_pgetevents		__x64_sys_io_pgetevents
>  334	common	rseq			__x64_sys_rseq
> +335	64	procfd_signal		__x64_sys_procfd_signal
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
>  545	x32	execveat		__x32_compat_sys_execveat/ptregs
>  546	x32	preadv2			__x32_compat_sys_preadv64v2
>  547	x32	pwritev2		__x32_compat_sys_pwritev64v2
> +548	x32	procfd_signal		__x32_compat_sys_procfd_signal
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ce3465479447..771c6bd1cac6 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -716,7 +716,10 @@ static int proc_pid_permission(struct inode *inode, int mask)
>  	return generic_permission(inode, mask);
>  }
>  
> -
> +struct pid *proc_pid(const struct inode *inode)
> +{
> +       return PROC_I(inode)->pid;
> +}
>  
>  static const struct inode_operations proc_def_inode_operations = {
>  	.setattr	= proc_setattr,
> @@ -3038,6 +3041,12 @@ static const struct file_operations proc_tgid_base_operations = {
>  	.llseek		= generic_file_llseek,
>  };
>  
> +bool proc_is_tgid_procfd(const struct file *file)
> +{
> +	return d_is_dir(file->f_path.dentry) &&
> +	       (file->f_op == &proc_tgid_base_operations);
> +}
> +
>  static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
>  {
>  	return proc_pident_lookup(dir, dentry,
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 5185d7f6a51e..eb69afba83f3 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -113,11 +113,6 @@ static inline void *__PDE_DATA(const struct inode *inode)
>  	return PDE(inode)->data;
>  }
>  
> -static inline struct pid *proc_pid(const struct inode *inode)
> -{
> -	return PROC_I(inode)->pid;
> -}
> -
>  static inline struct task_struct *get_proc_task(const struct inode *inode)
>  {
>  	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index d0e1f1522a78..96df2fe6311d 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -73,6 +73,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
>  						    int (*show)(struct seq_file *, void *),
>  						    proc_write_t write,
>  						    void *data);
> +extern bool proc_is_tgid_procfd(const struct file *file);
> +extern struct pid *proc_pid(const struct inode *inode);
>  
>  #else /* CONFIG_PROC_FS */
>  
> @@ -114,6 +116,16 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
>  #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
>  #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
>  
> +static inline bool proc_is_tgid_procfd(const struct file *file)
> +{
> +	return false;
> +}
> +
> +static inline struct pid *proc_pid(const struct inode *inode)
> +{
> +	return NULL;
> +}
> +
>  #endif /* CONFIG_PROC_FS */
>  
>  struct net;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2ac3d13a915b..a5ca8cb84566 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -907,6 +907,8 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
>  asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
>  			 int flags, uint32_t sig);
> +asmlinkage long sys_procfd_signal(int fd, int sig, siginfo_t __user *info,
> +				  int flags);
>  
>  /*
>   * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 538546edbfbd..4dc81a994ad1 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -738,9 +738,11 @@ __SYSCALL(__NR_statx,     sys_statx)
>  __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
>  #define __NR_rseq 293
>  __SYSCALL(__NR_rseq, sys_rseq)
> +#define __NR_procfd_signal 294
> +__SC_COMP(__NR_procfd_signal, sys_procfd_signal, compat_sys_procfd_signal)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 294
> +#define __NR_syscalls 295
>  
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9a32bc2088c9..13695342f150 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -19,7 +19,9 @@
>  #include <linux/sched/task.h>
>  #include <linux/sched/task_stack.h>
>  #include <linux/sched/cputime.h>
> +#include <linux/file.h>
>  #include <linux/fs.h>
> +#include <linux/proc_fs.h>
>  #include <linux/tty.h>
>  #include <linux/binfmts.h>
>  #include <linux/coredump.h>
> @@ -3286,6 +3288,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
>  }
>  #endif
>  
> +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
> +{
> +	clear_siginfo(info);
> +	info->si_signo = sig;
> +	info->si_errno = 0;
> +	info->si_code = SI_USER;
> +	info->si_pid = task_tgid_vnr(current);
> +	info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
> +}
> +
>  /**
>   *  sys_kill - send a signal to a process
>   *  @pid: the PID of the process
> @@ -3295,16 +3307,119 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
>  {
>  	struct kernel_siginfo info;
>  
> -	clear_siginfo(&info);
> -	info.si_signo = sig;
> -	info.si_errno = 0;
> -	info.si_code = SI_USER;
> -	info.si_pid = task_tgid_vnr(current);
> -	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
> +	prepare_kill_siginfo(sig, &info);
>  
>  	return kill_something_info(sig, &info, pid);
>  }
>  
> +/*
> + * Verify that the signaler and signalee either are in the same pid namespace
> + * or that the signaler's pid namespace is an ancestor of the signalee's pid
> + * namespace.
> + */
> +static bool may_signal_procfd(struct pid *pid)
> +{
> +	struct pid_namespace *active = task_active_pid_ns(current);
> +	struct pid_namespace *p = ns_of_pid(pid);
> +
> +	for (;;) {
> +		if (!p)
> +			return false;
> +		if (p == active)
> +			break;
> +		p = p->parent;
> +	}
> +
> +	return true;
> +}
> +
> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
> +			    bool had_siginfo)
> +{
> +	int ret;
> +	struct fd f;
> +	struct pid *pid;
> +
> +	/* Enforce flags be set to 0 until we add an extension. */
> +	if (flags)
> +		return -EINVAL;
> +
> +	f = fdget_raw(fd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	/* Is this a process file descriptor? */
> +	ret = -EINVAL;
> +	if (!proc_is_tgid_procfd(f.file))
> +		goto err;
> +
> +	/* Without CONFIG_PROC_FS proc_pid() returns NULL. */
> +	pid = proc_pid(file_inode(f.file));
> +	if (!pid)
> +		goto err;
> +
> +	if (!may_signal_procfd(pid))
> +		goto err;
> +
> +	if (had_siginfo) {
> +		/*
> +		 * Not even root can pretend to send signals from the kernel.
> +		 * Nor can they impersonate a kill()/tgkill(), which adds
> +		 * source info.
> +		 */
> +		ret = -EPERM;
> +		if ((kinfo->si_code >= 0 || kinfo->si_code == SI_TKILL) &&
> +		    (task_pid(current) != pid))
> +			goto err;
> +	} else {
> +		prepare_kill_siginfo(sig, kinfo);
> +	}
> +
> +	ret = kill_pid_info(sig, kinfo, pid);
> +
> +err:
> +	fdput(f);
> +	return ret;
> +}
> +
> +/**
> + *  sys_procfd_signal - send a signal to a process through a process file
> + *                      descriptor
> + *  @fd: the file descriptor of the process
> + *  @sig: signal to be sent
> + *  @info: the signal info
> + *  @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> +		int, flags)
> +{
> +	kernel_siginfo_t kinfo;
> +
> +	if (info) {
> +		int ret = __copy_siginfo_from_user(sig, &kinfo, info);
> +		if (unlikely(ret))
> +			return ret;
> +	}
> +
> +	return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +
> +#ifdef CONFIG_COMPAT
> +COMPAT_SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig,
> +		       struct compat_siginfo __user *, info, int, flags)
> +{
> +	kernel_siginfo_t kinfo;
> +
> +	if (info) {
> +		int ret = __copy_siginfo_from_user32(sig, &kinfo, info);
> +		if (unlikely(ret))
> +			return ret;
> +	}
> +
> +	return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +#endif
> +
>  static int
>  do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
>  {
> -- 
> 2.19.1
Aleksa Sarai Nov. 22, 2018, 8:23 a.m. UTC | #2
On 2018-11-20, Christian Brauner <christian@brauner.io> wrote:
> The kill() syscall operates on process identifiers. After a process has
> exited its pid can be reused by another process. If a caller sends a signal
> to a reused pid it will end up signaling the wrong process. This issue has
> often surfaced and there has been a push [1] to address this problem.
> 
> This patch uses file descriptors from proc/<pid> as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The file
> descriptor can be used to send signals to the referenced process.
> Thus, the  new syscall procfd_signal() is introduced to solve this problem.
> It operates on a process file descriptor.
> The syscall takes an additional siginfo_t and flags argument. If siginfo_t
> is NULL then procfd_signal() behaves like kill() if it is not NULL it
> behaves like rt_sigqueueinfo.
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0.
> 
> With this patch a process can be killed via:
> 
>  #define _GNU_SOURCE
>  #include <errno.h>
>  #include <fcntl.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <signal.h>
>  #include <sys/stat.h>
>  #include <sys/syscall.h>
>  #include <sys/types.h>
>  #include <unistd.h>
> 
>  int main(int argc, char *argv[])
>  {
>          int ret;
>          char buf[1000];
> 
>          if (argc < 2)
>                  exit(EXIT_FAILURE);
> 
>          ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
>          if (ret < 0)
>                  exit(EXIT_FAILURE);
> 
>          int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
>          if (fd < 0) {
>                  printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
>                  exit(EXIT_FAILURE);
>          }
> 
>          ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
>          if (ret < 0) {
>                  printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
>                  close(fd);
>                  exit(EXIT_FAILURE);
>          }
> 
>          close(fd);
> 
>          exit(EXIT_SUCCESS);
>  }
> 
> [1]: https://lkml.org/lkml/2018/11/18/130
> 
> Cc: "Eric W. Biederman" <ebiederm@xmission.com>
> Cc: Serge Hallyn <serge@hallyn.com>
> Cc: Jann Horn <jannh@google.com>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Andy Lutomirsky <luto@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Cc: Aleksa Sarai <cyphar@cyphar.com>

Acked-by: Aleksa Sarai <cyphar@cyphar.com>

> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Christian Brauner <christian@brauner.io>
> ---
> Changelog:
> v2:
> - define __NR_procfd_signal in unistd.h
> - wire up compat syscall
> - s/proc_is_procfd/proc_is_tgid_procfd/g
> - provide stubs when CONFIG_PROC_FS=n
> - move proc_pid() to linux/proc_fs.h header
> - use proc_pid() to grab struct pid from /proc/<pid> fd
> v1:
> - patch introduced
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   2 +
>  fs/proc/base.c                         |  11 ++-
>  fs/proc/internal.h                     |   5 -
>  include/linux/proc_fs.h                |  12 +++
>  include/linux/syscalls.h               |   2 +
>  include/uapi/asm-generic/unistd.h      |   4 +-
>  kernel/signal.c                        | 127 +++++++++++++++++++++++--
>  8 files changed, 151 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
>  384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
>  385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
>  386	i386	rseq			sys_rseq			__ia32_sys_rseq
> +387	i386	procfd_signal		sys_procfd_signal		__ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
>  332	common	statx			__x64_sys_statx
>  333	common	io_pgetevents		__x64_sys_io_pgetevents
>  334	common	rseq			__x64_sys_rseq
> +335	64	procfd_signal		__x64_sys_procfd_signal
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
>  545	x32	execveat		__x32_compat_sys_execveat/ptregs
>  546	x32	preadv2			__x32_compat_sys_preadv64v2
>  547	x32	pwritev2		__x32_compat_sys_pwritev64v2
> +548	x32	procfd_signal		__x32_compat_sys_procfd_signal
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ce3465479447..771c6bd1cac6 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -716,7 +716,10 @@ static int proc_pid_permission(struct inode *inode, int mask)
>  	return generic_permission(inode, mask);
>  }
>  
> -
> +struct pid *proc_pid(const struct inode *inode)
> +{
> +       return PROC_I(inode)->pid;
> +}
>  
>  static const struct inode_operations proc_def_inode_operations = {
>  	.setattr	= proc_setattr,
> @@ -3038,6 +3041,12 @@ static const struct file_operations proc_tgid_base_operations = {
>  	.llseek		= generic_file_llseek,
>  };
>  
> +bool proc_is_tgid_procfd(const struct file *file)
> +{
> +	return d_is_dir(file->f_path.dentry) &&
> +	       (file->f_op == &proc_tgid_base_operations);
> +}
> +
>  static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
>  {
>  	return proc_pident_lookup(dir, dentry,
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 5185d7f6a51e..eb69afba83f3 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -113,11 +113,6 @@ static inline void *__PDE_DATA(const struct inode *inode)
>  	return PDE(inode)->data;
>  }
>  
> -static inline struct pid *proc_pid(const struct inode *inode)
> -{
> -	return PROC_I(inode)->pid;
> -}
> -
>  static inline struct task_struct *get_proc_task(const struct inode *inode)
>  {
>  	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index d0e1f1522a78..96df2fe6311d 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -73,6 +73,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
>  						    int (*show)(struct seq_file *, void *),
>  						    proc_write_t write,
>  						    void *data);
> +extern bool proc_is_tgid_procfd(const struct file *file);
> +extern struct pid *proc_pid(const struct inode *inode);
>  
>  #else /* CONFIG_PROC_FS */
>  
> @@ -114,6 +116,16 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
>  #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
>  #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
>  
> +static inline bool proc_is_tgid_procfd(const struct file *file)
> +{
> +	return false;
> +}
> +
> +static inline struct pid *proc_pid(const struct inode *inode)
> +{
> +	return NULL;
> +}
> +
>  #endif /* CONFIG_PROC_FS */
>  
>  struct net;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2ac3d13a915b..a5ca8cb84566 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -907,6 +907,8 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
>  asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
>  			 int flags, uint32_t sig);
> +asmlinkage long sys_procfd_signal(int fd, int sig, siginfo_t __user *info,
> +				  int flags);
>  
>  /*
>   * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 538546edbfbd..4dc81a994ad1 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -738,9 +738,11 @@ __SYSCALL(__NR_statx,     sys_statx)
>  __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
>  #define __NR_rseq 293
>  __SYSCALL(__NR_rseq, sys_rseq)
> +#define __NR_procfd_signal 294
> +__SC_COMP(__NR_procfd_signal, sys_procfd_signal, compat_sys_procfd_signal)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 294
> +#define __NR_syscalls 295
>  
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9a32bc2088c9..13695342f150 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -19,7 +19,9 @@
>  #include <linux/sched/task.h>
>  #include <linux/sched/task_stack.h>
>  #include <linux/sched/cputime.h>
> +#include <linux/file.h>
>  #include <linux/fs.h>
> +#include <linux/proc_fs.h>
>  #include <linux/tty.h>
>  #include <linux/binfmts.h>
>  #include <linux/coredump.h>
> @@ -3286,6 +3288,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
>  }
>  #endif
>  
> +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
> +{
> +	clear_siginfo(info);
> +	info->si_signo = sig;
> +	info->si_errno = 0;
> +	info->si_code = SI_USER;
> +	info->si_pid = task_tgid_vnr(current);
> +	info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
> +}
> +
>  /**
>   *  sys_kill - send a signal to a process
>   *  @pid: the PID of the process
> @@ -3295,16 +3307,119 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
>  {
>  	struct kernel_siginfo info;
>  
> -	clear_siginfo(&info);
> -	info.si_signo = sig;
> -	info.si_errno = 0;
> -	info.si_code = SI_USER;
> -	info.si_pid = task_tgid_vnr(current);
> -	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
> +	prepare_kill_siginfo(sig, &info);
>  
>  	return kill_something_info(sig, &info, pid);
>  }
>  
> +/*
> + * Verify that the signaler and signalee either are in the same pid namespace
> + * or that the signaler's pid namespace is an ancestor of the signalee's pid
> + * namespace.
> + */
> +static bool may_signal_procfd(struct pid *pid)
> +{
> +	struct pid_namespace *active = task_active_pid_ns(current);
> +	struct pid_namespace *p = ns_of_pid(pid);
> +
> +	for (;;) {
> +		if (!p)
> +			return false;
> +		if (p == active)
> +			break;
> +		p = p->parent;
> +	}
> +
> +	return true;
> +}
> +
> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
> +			    bool had_siginfo)
> +{
> +	int ret;
> +	struct fd f;
> +	struct pid *pid;
> +
> +	/* Enforce flags be set to 0 until we add an extension. */
> +	if (flags)
> +		return -EINVAL;
> +
> +	f = fdget_raw(fd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	/* Is this a process file descriptor? */
> +	ret = -EINVAL;
> +	if (!proc_is_tgid_procfd(f.file))
> +		goto err;
> +
> +	/* Without CONFIG_PROC_FS proc_pid() returns NULL. */
> +	pid = proc_pid(file_inode(f.file));
> +	if (!pid)
> +		goto err;
> +
> +	if (!may_signal_procfd(pid))
> +		goto err;
> +
> +	if (had_siginfo) {
> +		/*
> +		 * Not even root can pretend to send signals from the kernel.
> +		 * Nor can they impersonate a kill()/tgkill(), which adds
> +		 * source info.
> +		 */
> +		ret = -EPERM;
> +		if ((kinfo->si_code >= 0 || kinfo->si_code == SI_TKILL) &&
> +		    (task_pid(current) != pid))
> +			goto err;
> +	} else {
> +		prepare_kill_siginfo(sig, kinfo);
> +	}
> +
> +	ret = kill_pid_info(sig, kinfo, pid);
> +
> +err:
> +	fdput(f);
> +	return ret;
> +}
> +
> +/**
> + *  sys_procfd_signal - send a signal to a process through a process file
> + *                      descriptor
> + *  @fd: the file descriptor of the process
> + *  @sig: signal to be sent
> + *  @info: the signal info
> + *  @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> +		int, flags)
> +{
> +	kernel_siginfo_t kinfo;
> +
> +	if (info) {
> +		int ret = __copy_siginfo_from_user(sig, &kinfo, info);
> +		if (unlikely(ret))
> +			return ret;
> +	}
> +
> +	return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +
> +#ifdef CONFIG_COMPAT
> +COMPAT_SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig,
> +		       struct compat_siginfo __user *, info, int, flags)
> +{
> +	kernel_siginfo_t kinfo;
> +
> +	if (info) {
> +		int ret = __copy_siginfo_from_user32(sig, &kinfo, info);
> +		if (unlikely(ret))
> +			return ret;
> +	}
> +
> +	return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +#endif
> +
>  static int
>  do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
>  {
> -- 
> 2.19.1
>
Arnd Bergmann Nov. 28, 2018, 2:05 p.m. UTC | #3
On Tue, Nov 20, 2018 at 11:54 AM Christian Brauner <christian@brauner.io> wrote:
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   2 +
>  fs/proc/base.c                         |  11 ++-
>  fs/proc/internal.h                     |   5 -
>  include/linux/proc_fs.h                |  12 +++
>  include/linux/syscalls.h               |   2 +
>  include/uapi/asm-generic/unistd.h      |   4 +-
>  kernel/signal.c                        | 127 +++++++++++++++++++++++--
>  8 files changed, 151 insertions(+), 13 deletions(-)

For asm-generic:

Acked-by: Arnd Bergmann <arnd@arndb.de>

I checked that the system call wired up correctly in a way that
works on all architectures using the generic syscall table.
Florian Weimer Nov. 29, 2018, 12:28 p.m. UTC | #4
Disclaimer: I'm looking at this patch because Christian requested it.
I'm not a kernel developer.

* Christian Brauner:

> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
>  384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
>  385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
>  386	i386	rseq			sys_rseq			__ia32_sys_rseq
> +387	i386	procfd_signal		sys_procfd_signal		__ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
>  332	common	statx			__x64_sys_statx
>  333	common	io_pgetevents		__x64_sys_io_pgetevents
>  334	common	rseq			__x64_sys_rseq
> +335	64	procfd_signal		__x64_sys_procfd_signal
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
>  545	x32	execveat		__x32_compat_sys_execveat/ptregs
>  546	x32	preadv2			__x32_compat_sys_preadv64v2
>  547	x32	pwritev2		__x32_compat_sys_pwritev64v2
> +548	x32	procfd_signal		__x32_compat_sys_procfd_signal

Is there a reason why these numbers have to be different?

(See the recent discussion with Andy Lutomirski.)

> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
> +			    bool had_siginfo)
> +{
> +	int ret;
> +	struct fd f;
> +	struct pid *pid;
> +
> +	/* Enforce flags be set to 0 until we add an extension. */
> +	if (flags)
> +		return -EINVAL;
> +
> +	f = fdget_raw(fd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	/* Is this a process file descriptor? */
> +	ret = -EINVAL;
> +	if (!proc_is_tgid_procfd(f.file))
> +		goto err;
[…]
> +	ret = kill_pid_info(sig, kinfo, pid);

I would like to see some comment here what happens to zombie processes.

> +/**
> + *  sys_procfd_signal - send a signal to a process through a process file
> + *                      descriptor
> + *  @fd: the file descriptor of the process
> + *  @sig: signal to be sent
> + *  @info: the signal info
> + *  @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> +		int, flags)

Sorry, I'm quite unhappy with the name.  “signal” is for signal handler
management.  procfd_sendsignal, procfd_sigqueueinfo or something like
that would be fine.  Even procfd_kill would be better IMHO.

Looking at the rt_tgsigqueueinfo interface, is there a way to implement
the “tg” part with the current procfd_signal interface?  Would you use
openat to retrieve the Tgid: line from "status"?

Thanks,
Florian
Andy Lutomirski Nov. 29, 2018, 4:54 p.m. UTC | #5
> On Nov 29, 2018, at 4:28 AM, Florian Weimer <fweimer@redhat.com> wrote:
> 
> Disclaimer: I'm looking at this patch because Christian requested it.
> I'm not a kernel developer.
> 
> * Christian Brauner:
> 
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -398,3 +398,4 @@
>> 384    i386    arch_prctl        sys_arch_prctl            __ia32_compat_sys_arch_prctl
>> 385    i386    io_pgetevents        sys_io_pgetevents        __ia32_compat_sys_io_pgetevents
>> 386    i386    rseq            sys_rseq            __ia32_sys_rseq
>> +387    i386    procfd_signal        sys_procfd_signal        __ia32_compat_sys_procfd_signal
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index f0b1709a5ffb..8a30cde82450 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -343,6 +343,7 @@
>> 332    common    statx            __x64_sys_statx
>> 333    common    io_pgetevents        __x64_sys_io_pgetevents
>> 334    common    rseq            __x64_sys_rseq
>> +335    64    procfd_signal        __x64_sys_procfd_signal
>> 
>> #
>> # x32-specific system call numbers start at 512 to avoid cache impact
>> @@ -386,3 +387,4 @@
>> 545    x32    execveat        __x32_compat_sys_execveat/ptregs
>> 546    x32    preadv2            __x32_compat_sys_preadv64v2
>> 547    x32    pwritev2        __x32_compat_sys_pwritev64v2
>> +548    x32    procfd_signal        __x32_compat_sys_procfd_signal
> 
> Is there a reason why these numbers have to be different?
> 
> (See the recent discussion with Andy Lutomirski.)

Hah, I missed this part of the patch.  Let’s not add new x32 syscall numbers.

Also, can we perhaps rework this a bit to get rid of the compat entry point?  The easier way would be to check in_compat_syscall(). The nicer way IMO would be to use the 64-bit structure for 32-bit as well.
Christian Brauner Nov. 29, 2018, 7:16 p.m. UTC | #6
On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
>
>
>> On Nov 29, 2018, at 4:28 AM, Florian Weimer <fweimer@redhat.com>
>wrote:
>> 
>> Disclaimer: I'm looking at this patch because Christian requested it.
>> I'm not a kernel developer.
>> 
>> * Christian Brauner:
>> 
>>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
>b/arch/x86/entry/syscalls/syscall_32.tbl
>>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>>> @@ -398,3 +398,4 @@
>>> 384    i386    arch_prctl        sys_arch_prctl           
>__ia32_compat_sys_arch_prctl
>>> 385    i386    io_pgetevents        sys_io_pgetevents       
>__ia32_compat_sys_io_pgetevents
>>> 386    i386    rseq            sys_rseq            __ia32_sys_rseq
>>> +387    i386    procfd_signal        sys_procfd_signal       
>__ia32_compat_sys_procfd_signal
>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
>b/arch/x86/entry/syscalls/syscall_64.tbl
>>> index f0b1709a5ffb..8a30cde82450 100644
>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>> @@ -343,6 +343,7 @@
>>> 332    common    statx            __x64_sys_statx
>>> 333    common    io_pgetevents        __x64_sys_io_pgetevents
>>> 334    common    rseq            __x64_sys_rseq
>>> +335    64    procfd_signal        __x64_sys_procfd_signal
>>> 
>>> #
>>> # x32-specific system call numbers start at 512 to avoid cache
>impact
>>> @@ -386,3 +387,4 @@
>>> 545    x32    execveat        __x32_compat_sys_execveat/ptregs
>>> 546    x32    preadv2            __x32_compat_sys_preadv64v2
>>> 547    x32    pwritev2        __x32_compat_sys_pwritev64v2
>>> +548    x32    procfd_signal        __x32_compat_sys_procfd_signal
>> 
>> Is there a reason why these numbers have to be different?
>> 
>> (See the recent discussion with Andy Lutomirski.)
>
>Hah, I missed this part of the patch.  Let’s not add new x32 syscall
>numbers.
>
>Also, can we perhaps rework this a bit to get rid of the compat entry
>point?  The easier way would be to check in_compat_syscall(). The nicer
>way IMO would be to use the 64-bit structure for 32-bit as well.

Do you have a syscall which set precedence/did this before I could look at?
Just if you happen to remember one.
Fwiw, I followed the other signal syscalls.
They all introduce compat syscalls.
Andy Lutomirski Nov. 29, 2018, 7:22 p.m. UTC | #7
On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <christian@brauner.io> wrote:
>
> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
> >
> >
> >> On Nov 29, 2018, at 4:28 AM, Florian Weimer <fweimer@redhat.com>
> >wrote:
> >>
> >> Disclaimer: I'm looking at this patch because Christian requested it.
> >> I'm not a kernel developer.
> >>
> >> * Christian Brauner:
> >>
> >>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
> >b/arch/x86/entry/syscalls/syscall_32.tbl
> >>> index 3cf7b533b3d1..3f27ffd8ae87 100644
> >>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> >>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> >>> @@ -398,3 +398,4 @@
> >>> 384    i386    arch_prctl        sys_arch_prctl
> >__ia32_compat_sys_arch_prctl
> >>> 385    i386    io_pgetevents        sys_io_pgetevents
> >__ia32_compat_sys_io_pgetevents
> >>> 386    i386    rseq            sys_rseq            __ia32_sys_rseq
> >>> +387    i386    procfd_signal        sys_procfd_signal
> >__ia32_compat_sys_procfd_signal
> >>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
> >b/arch/x86/entry/syscalls/syscall_64.tbl
> >>> index f0b1709a5ffb..8a30cde82450 100644
> >>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> >>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> >>> @@ -343,6 +343,7 @@
> >>> 332    common    statx            __x64_sys_statx
> >>> 333    common    io_pgetevents        __x64_sys_io_pgetevents
> >>> 334    common    rseq            __x64_sys_rseq
> >>> +335    64    procfd_signal        __x64_sys_procfd_signal
> >>>
> >>> #
> >>> # x32-specific system call numbers start at 512 to avoid cache
> >impact
> >>> @@ -386,3 +387,4 @@
> >>> 545    x32    execveat        __x32_compat_sys_execveat/ptregs
> >>> 546    x32    preadv2            __x32_compat_sys_preadv64v2
> >>> 547    x32    pwritev2        __x32_compat_sys_pwritev64v2
> >>> +548    x32    procfd_signal        __x32_compat_sys_procfd_signal
> >>
> >> Is there a reason why these numbers have to be different?
> >>
> >> (See the recent discussion with Andy Lutomirski.)
> >
> >Hah, I missed this part of the patch.  Let’s not add new x32 syscall
> >numbers.
> >
> >Also, can we perhaps rework this a bit to get rid of the compat entry
> >point?  The easier way would be to check in_compat_syscall(). The nicer
> >way IMO would be to use the 64-bit structure for 32-bit as well.
>
> Do you have a syscall which set precedence/did this before I could look at?
> Just if you happen to remember one.
> Fwiw, I followed the other signal syscalls.
> They all introduce compat syscalls.
>

Not really.

Let me try to explain.  I have three issues with the approach in your patchset:

1. You're introducing a new syscall, and it behaves differently on
32-bit and 64-bit because the structure you pass in is different.
This is necessary for old syscalls where compatibility matters, but
maybe we can get rid of it for new syscalls.   Could we define a
siginfo64_t that is identical to the 64-bit siginfo_t and just use
that in all cases?

2. Assuming that #1 doesn't work, then we need compat support.  But
you're doing it by having two different entry points.  Instead, you
could have a single entry point that calls in_compat_syscall() to
decide which structure to read.  This would simplify things because
x86 doesn't really support the separate compat entry points, which
leads me to #3.

3. The separate x32 numbers are a huge turd that may have security
holes and certainly have comprehensibility holes.  I will object to
any patch that adds a new one (like yours).  Fixing #1 or #2 makes
this problem go away.

Does that make any sense?  The #2 fix would be something like:

if (in_compat_syscall)
  copy...user32();
else
  copy_from_user();

The #1 fix would add a copy_siginfo_from_user64() or similar.
Christian Brauner Nov. 29, 2018, 7:55 p.m. UTC | #8
On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
> > >
> > >
> > >> On Nov 29, 2018, at 4:28 AM, Florian Weimer <fweimer@redhat.com>
> > >wrote:
> > >>
> > >> Disclaimer: I'm looking at this patch because Christian requested it.
> > >> I'm not a kernel developer.
> > >>
> > >> * Christian Brauner:
> > >>
> > >>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
> > >b/arch/x86/entry/syscalls/syscall_32.tbl
> > >>> index 3cf7b533b3d1..3f27ffd8ae87 100644
> > >>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > >>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > >>> @@ -398,3 +398,4 @@
> > >>> 384    i386    arch_prctl        sys_arch_prctl
> > >__ia32_compat_sys_arch_prctl
> > >>> 385    i386    io_pgetevents        sys_io_pgetevents
> > >__ia32_compat_sys_io_pgetevents
> > >>> 386    i386    rseq            sys_rseq            __ia32_sys_rseq
> > >>> +387    i386    procfd_signal        sys_procfd_signal
> > >__ia32_compat_sys_procfd_signal
> > >>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
> > >b/arch/x86/entry/syscalls/syscall_64.tbl
> > >>> index f0b1709a5ffb..8a30cde82450 100644
> > >>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > >>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > >>> @@ -343,6 +343,7 @@
> > >>> 332    common    statx            __x64_sys_statx
> > >>> 333    common    io_pgetevents        __x64_sys_io_pgetevents
> > >>> 334    common    rseq            __x64_sys_rseq
> > >>> +335    64    procfd_signal        __x64_sys_procfd_signal
> > >>>
> > >>> #
> > >>> # x32-specific system call numbers start at 512 to avoid cache
> > >impact
> > >>> @@ -386,3 +387,4 @@
> > >>> 545    x32    execveat        __x32_compat_sys_execveat/ptregs
> > >>> 546    x32    preadv2            __x32_compat_sys_preadv64v2
> > >>> 547    x32    pwritev2        __x32_compat_sys_pwritev64v2
> > >>> +548    x32    procfd_signal        __x32_compat_sys_procfd_signal
> > >>
> > >> Is there a reason why these numbers have to be different?
> > >>
> > >> (See the recent discussion with Andy Lutomirski.)
> > >
> > >Hah, I missed this part of the patch.  Let’s not add new x32 syscall
> > >numbers.
> > >
> > >Also, can we perhaps rework this a bit to get rid of the compat entry
> > >point?  The easier way would be to check in_compat_syscall(). The nicer
> > >way IMO would be to use the 64-bit structure for 32-bit as well.
> >
> > Do you have a syscall which set precedence/did this before I could look at?
> > Just if you happen to remember one.
> > Fwiw, I followed the other signal syscalls.
> > They all introduce compat syscalls.
> >
> 
> Not really.
> 
> Let me try to explain.  I have three issues with the approach in your patchset:
> 
> 1. You're introducing a new syscall, and it behaves differently on
> 32-bit and 64-bit because the structure you pass in is different.
> This is necessary for old syscalls where compatibility matters, but
> maybe we can get rid of it for new syscalls.   Could we define a
> siginfo64_t that is identical to the 64-bit siginfo_t and just use
> that in all cases?
> 
> 2. Assuming that #1 doesn't work, then we need compat support.  But
> you're doing it by having two different entry points.  Instead, you
> could have a single entry point that calls in_compat_syscall() to
> decide which structure to read.  This would simplify things because
> x86 doesn't really support the separate compat entry points, which
> leads me to #3.
> 
> 3. The separate x32 numbers are a huge turd that may have security
> holes and certainly have comprehensibility holes.  I will object to
> any patch that adds a new one (like yours).  Fixing #1 or #2 makes
> this problem go away.
> 
> Does that make any sense?  The #2 fix would be something like:
> 
> if (in_compat_syscall)
>   copy...user32();
> else
>   copy_from_user();
> 
> The #1 fix would add a copy_siginfo_from_user64() or similar.

Thanks very much! That all helped a bunch already! I'll try to go the
copy_siginfo_from_user64() way first and see if I can make this work. If
we do this I would however only want to use it for the new syscall first
and not change all other signal syscalls over to it too. I'd rather keep
this patchset focussed and small and do such conversions caused by the
new approach later. Does that sound reasonable?
Andy Lutomirski Nov. 29, 2018, 8:14 p.m. UTC | #9
> On Nov 29, 2018, at 11:55 AM, Christian Brauner <christian@brauner.io> wrote:
> 
>> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
>>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <christian@brauner.io> wrote:
>>> 
>>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> 
>>>> 
>>>>> On Nov 29, 2018, at 4:28 AM, Florian Weimer <fweimer@redhat.com>
>>>> wrote:
>>>>> 
>>>>> Disclaimer: I'm looking at this patch because Christian requested it.
>>>>> I'm not a kernel developer.
>>>>> 
>>>>> * Christian Brauner:
>>>>> 
>>>>>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
>>>> b/arch/x86/entry/syscalls/syscall_32.tbl
>>>>>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>>>>>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>>>>>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>>>>>> @@ -398,3 +398,4 @@
>>>>>> 384    i386    arch_prctl        sys_arch_prctl
>>>> __ia32_compat_sys_arch_prctl
>>>>>> 385    i386    io_pgetevents        sys_io_pgetevents
>>>> __ia32_compat_sys_io_pgetevents
>>>>>> 386    i386    rseq            sys_rseq            __ia32_sys_rseq
>>>>>> +387    i386    procfd_signal        sys_procfd_signal
>>>> __ia32_compat_sys_procfd_signal
>>>>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
>>>> b/arch/x86/entry/syscalls/syscall_64.tbl
>>>>>> index f0b1709a5ffb..8a30cde82450 100644
>>>>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>>>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>>>>> @@ -343,6 +343,7 @@
>>>>>> 332    common    statx            __x64_sys_statx
>>>>>> 333    common    io_pgetevents        __x64_sys_io_pgetevents
>>>>>> 334    common    rseq            __x64_sys_rseq
>>>>>> +335    64    procfd_signal        __x64_sys_procfd_signal
>>>>>> 
>>>>>> #
>>>>>> # x32-specific system call numbers start at 512 to avoid cache
>>>> impact
>>>>>> @@ -386,3 +387,4 @@
>>>>>> 545    x32    execveat        __x32_compat_sys_execveat/ptregs
>>>>>> 546    x32    preadv2            __x32_compat_sys_preadv64v2
>>>>>> 547    x32    pwritev2        __x32_compat_sys_pwritev64v2
>>>>>> +548    x32    procfd_signal        __x32_compat_sys_procfd_signal
>>>>> 
>>>>> Is there a reason why these numbers have to be different?
>>>>> 
>>>>> (See the recent discussion with Andy Lutomirski.)
>>>> 
>>>> Hah, I missed this part of the patch.  Let’s not add new x32 syscall
>>>> numbers.
>>>> 
>>>> Also, can we perhaps rework this a bit to get rid of the compat entry
>>>> point?  The easier way would be to check in_compat_syscall(). The nicer
>>>> way IMO would be to use the 64-bit structure for 32-bit as well.
>>> 
>>> Do you have a syscall which set precedence/did this before I could look at?
>>> Just if you happen to remember one.
>>> Fwiw, I followed the other signal syscalls.
>>> They all introduce compat syscalls.
>>> 
>> 
>> Not really.
>> 
>> Let me try to explain.  I have three issues with the approach in your patchset:
>> 
>> 1. You're introducing a new syscall, and it behaves differently on
>> 32-bit and 64-bit because the structure you pass in is different.
>> This is necessary for old syscalls where compatibility matters, but
>> maybe we can get rid of it for new syscalls.   Could we define a
>> siginfo64_t that is identical to the 64-bit siginfo_t and just use
>> that in all cases?
>> 
>> 2. Assuming that #1 doesn't work, then we need compat support.  But
>> you're doing it by having two different entry points.  Instead, you
>> could have a single entry point that calls in_compat_syscall() to
>> decide which structure to read.  This would simplify things because
>> x86 doesn't really support the separate compat entry points, which
>> leads me to #3.
>> 
>> 3. The separate x32 numbers are a huge turd that may have security
>> holes and certainly have comprehensibility holes.  I will object to
>> any patch that adds a new one (like yours).  Fixing #1 or #2 makes
>> this problem go away.
>> 
>> Does that make any sense?  The #2 fix would be something like:
>> 
>> if (in_compat_syscall)
>>  copy...user32();
>> else
>>  copy_from_user();
>> 
>> The #1 fix would add a copy_siginfo_from_user64() or similar.
> 
> Thanks very much! That all helped a bunch already! I'll try to go the
> copy_siginfo_from_user64() way first and see if I can make this work. If
> we do this I would however only want to use it for the new syscall first
> and not change all other signal syscalls over to it too. I'd rather keep
> this patchset focussed and small and do such conversions caused by the
> new approach later. Does that sound reasonable?

Absolutely. I don’t think we can change old syscalls — the ABI is set in stone. But for new syscalls, I think the always-64-bit behavior makes sense.
Arnd Bergmann Nov. 29, 2018, 9:02 p.m. UTC | #10
On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <luto@amacapital.net> wrote:
> > On Nov 29, 2018, at 11:55 AM, Christian Brauner <christian@brauner.io> wrote:
> >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <christian@brauner.io> wrote:
> >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
> >>
> >> The #1 fix would add a copy_siginfo_from_user64() or similar.
> >
> > Thanks very much! That all helped a bunch already! I'll try to go the
> > copy_siginfo_from_user64() way first and see if I can make this work. If
> > we do this I would however only want to use it for the new syscall first
> > and not change all other signal syscalls over to it too. I'd rather keep
> > this patchset focussed and small and do such conversions caused by the
> > new approach later. Does that sound reasonable?
>
> Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
> But for new syscalls, I think the always-64-bit behavior makes sense.

It looks like we already have a 'struct signalfd_siginfo' that is defined in a
sane architecture-independent way, so I'd suggest we use that.

We may then also want to make sure that any system call that takes a
siginfo has a replacement that takes a signalfd_siginfo, and that this
replacement can be used to implement the old version purely in
user space.

Is the current procfd_signal() proposal (under whichever name) sufficient
to correctly implement both sys_rt_sigqueueinfo() and sys_rt_tgsigqueueinfo()?
Can we implement sys_rt_sigtimedwait() based on signalfd()?
If yes, that would leave waitid(), which already needs a replacement
for y2038, and that should then also return a signalfd_siginfo.
My current preference for waitid() would be to do a version that
closely resembles the current interface, but takes a signalfd_siginfo
and a __kernel_timespec based rusage replacement (possibly
two of them to let us map wait6), but does not operate on procfd or
take a signal mask. That would require yet another syscall, but I
don't think I can do that before we want to have the set of y2038
safe syscalls.

       Arnd
Christian Brauner Nov. 29, 2018, 9:35 p.m. UTC | #11
On Thu, Nov 29, 2018 at 10:02:13PM +0100, Arnd Bergmann wrote:
> On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <luto@amacapital.net> wrote:
> > > On Nov 29, 2018, at 11:55 AM, Christian Brauner <christian@brauner.io> wrote:
> > >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> > >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <christian@brauner.io> wrote:
> > >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
> > >>
> > >> The #1 fix would add a copy_siginfo_from_user64() or similar.
> > >
> > > Thanks very much! That all helped a bunch already! I'll try to go the
> > > copy_siginfo_from_user64() way first and see if I can make this work. If
> > > we do this I would however only want to use it for the new syscall first
> > > and not change all other signal syscalls over to it too. I'd rather keep
> > > this patchset focussed and small and do such conversions caused by the
> > > new approach later. Does that sound reasonable?
> >
> > Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
> > But for new syscalls, I think the always-64-bit behavior makes sense.
> 
> It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> sane architecture-independent way, so I'd suggest we use that.

Just so that I understand you correctly: swapping out struct signinfo
for struct signalfd_siginfo in procfd_<whatever-suffix>? If so that
sounds great to me!

> 
> We may then also want to make sure that any system call that takes a
> siginfo has a replacement that takes a signalfd_siginfo, and that this
> replacement can be used to implement the old version purely in
> user space.

Sounds good but is unrelated to this patchset I take it. :)

> 
> Is the current procfd_signal() proposal (under whichever name) sufficient
> to correctly implement both sys_rt_sigqueueinfo() and sys_rt_tgsigqueueinfo()?

Yes, I see no reason why not. My idea is to extend it - after we have a
basic version in - to also work with:
/proc/<pid>/task/<tid>
If I'm not mistaken this should be sufficient to get rt_tgsigqueueinfo.
The thread will be uniquely identified by the tid descriptor and no
combination of /proc/<pid> and /proc/<pid>/task/<tid> is needed. Does
that sound reasonable?

> Can we implement sys_rt_sigtimedwait() based on signalfd()?
> If yes, that would leave waitid(), which already needs a replacement
> for y2038, and that should then also return a signalfd_siginfo.
> My current preference for waitid() would be to do a version that
> closely resembles the current interface, but takes a signalfd_siginfo
> and a __kernel_timespec based rusage replacement (possibly
> two of them to let us map wait6), but does not operate on procfd or
> take a signal mask. That would require yet another syscall, but I
> don't think I can do that before we want to have the set of y2038
> safe syscalls.

All sounds reasonable to me but that's not a blocker for the current
syscall though, is it?

Christian
Arnd Bergmann Nov. 29, 2018, 9:40 p.m. UTC | #12
On Thu, Nov 29, 2018 at 10:35 PM Christian Brauner <christian@brauner.io> wrote:
> On Thu, Nov 29, 2018 at 10:02:13PM +0100, Arnd Bergmann wrote:
> > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <luto@amacapital.net> wrote:
> >
> > Is the current procfd_signal() proposal (under whichever name) sufficient
> > to correctly implement both sys_rt_sigqueueinfo() and sys_rt_tgsigqueueinfo()?
>
> Yes, I see no reason why not. My idea is to extend it - after we have a
> basic version in - to also work with:
> /proc/<pid>/task/<tid>
> If I'm not mistaken this should be sufficient to get rt_tgsigqueueinfo.
> The thread will be uniquely identified by the tid descriptor and no
> combination of /proc/<pid> and /proc/<pid>/task/<tid> is needed. Does
> that sound reasonable?

Yes. So it would currently replace rt_gsigqueueinfo() but
not rt_tgsigqueueinfo(), and could be extended to do both
afterwards, without making the interface ugly in any form?

I suppose we can always add more flags if needed, and you
already ensure that flags is zero for the moment.

> > Can we implement sys_rt_sigtimedwait() based on signalfd()?
> >
> > If yes, that would leave waitid(), which already needs a replacement
> > for y2038, and that should then also return a signalfd_siginfo.
> > My current preference for waitid() would be to do a version that
> > closely resembles the current interface, but takes a signalfd_siginfo
> > and a __kernel_timespec based rusage replacement (possibly
> > two of them to let us map wait6), but does not operate on procfd or
> > take a signal mask. That would require yet another syscall, but I
> > don't think I can do that before we want to have the set of y2038
> > safe syscalls.
>
> All sounds reasonable to me but that's not a blocker for the current
> syscall though, is it?

I'd like to at least understand about sys_rt_sigtimedwait() before
we go on, so we all know what's coming, and document the
plans in the changelog.

waitid() probably remains on my plate anyway, and I hope understand
where we're at with it.

     Arnd
Aleksa Sarai Nov. 30, 2018, 2:40 a.m. UTC | #13
On 2018-11-29, Arnd Bergmann <arnd@arndb.de> wrote:
> waitid() probably remains on my plate anyway, and I hope understand
> where we're at with it.

Having a way to wait on a processfd is something we'll eventually need,
though the semantics of zombies might get a little bit hairy. I propose
we work through that rewrite in a future series once this one goes in.
Eric W. Biederman Nov. 30, 2018, 5:13 a.m. UTC | #14
Arnd Bergmann <arnd@arndb.de> writes:

> On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <luto@amacapital.net> wrote:
>> > On Nov 29, 2018, at 11:55 AM, Christian Brauner <christian@brauner.io> wrote:
>> >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
>> >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <christian@brauner.io> wrote:
>> >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
>> >>
>> >> The #1 fix would add a copy_siginfo_from_user64() or similar.
>> >
>> > Thanks very much! That all helped a bunch already! I'll try to go the
>> > copy_siginfo_from_user64() way first and see if I can make this work. If
>> > we do this I would however only want to use it for the new syscall first
>> > and not change all other signal syscalls over to it too. I'd rather keep
>> > this patchset focussed and small and do such conversions caused by the
>> > new approach later. Does that sound reasonable?
>>
>> Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
>> But for new syscalls, I think the always-64-bit behavior makes sense.
>
> It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> sane architecture-independent way, so I'd suggest we use that.

Unfortunately it isn't maintained very well.  What you can
express with signalfd_siginfo is a subset what you can express with
siginfo.  Many of the linux extensions to siginfo for exception
information add pointers and have integers right after those pointers.
Not all of those linux specific extentions for exceptions are handled
by signalfd_siginfo (it needs new fields).

As originally defined siginfo had the sigval_t union at the end so it
was perfectly fine on both 32bit and 64bit as it only had a single
pointer in the structure and the other fields were 32bits in size.

Although I do feel the pain as x86_64 has to deal with 3 versions
of siginfo.  A 64bit one.  A 32bit one for ia32.  A 32bit one for x32
with a 64bit si_clock_t.

> We may then also want to make sure that any system call that takes a
> siginfo has a replacement that takes a signalfd_siginfo, and that this
> replacement can be used to implement the old version purely in
> user space.

If you are not implementing CRIU and reinserting exceptions to yourself.
At most user space wants the ability to implement sigqueue:

AKA:
sigqueue(pid_t pid, int sig, const union sigval value);

Well sigqueue with it's own si_codes so the following would cover all
the use cases I know of:
int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);

The si_code could even be set to SI_USER to request that the normal
trusted SI_USER values be filled in.  si_code values of < 0 if not
recognized could reasonably safely be treated as the _rt member of
the siginfo union.  Or perhaps better we error out in such a case.

If we want to be flexible and not have N kinds of system calls that
is the direction I would go.  That is simple, and you don't need any of
the rest.


Alternatively we abandon the mistake of sigqueueinfo and not allow
a signal number in the arguments that differs from the si_signo in the
siginfo and allow passing the entire thing unchanged from sender to
receiver.  That is maximum flexibility.

signalfd_siginfo just sucks in practice.  It is larger that siginfo 104
bytes versus 48.  We must deliver it to userspace as a siginfo so it
must be translated.  Because of the translation signalfd_siginfo adds
no flexiblity in practice, because it can not just be passed through.
Finallay signalfd_siginfo does not have encodings for all of the
siginfo union members, so it fails to be fully general.

Personally if I was to define signalfd_siginfo today I would make it:
struct siginfo_subset {
	__u32 sis_signo;
	__s32 sis_errno;
	__s32 sis_code;
        __u32 sis_pad;
	__u32 sis_pid;
	__u32 sis_uid;
	__u64 sis_data (A pointer or integer data field);
};

That is just 32bytes, and is all that is needed for everything
except for synchronous exceptions.  Oh and that happens to be a proper
subset of a any sane siginfo layout, on both 32bit and 64bit.

This is one of those rare times where POSIX is sane and what linux
has implemented is not.

Eric
Christian Brauner Nov. 30, 2018, 6:56 a.m. UTC | #15
On Thu, Nov 29, 2018 at 11:13:57PM -0600, Eric W. Biederman wrote:
> Arnd Bergmann <arnd@arndb.de> writes:
> 
> > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <luto@amacapital.net> wrote:
> >> > On Nov 29, 2018, at 11:55 AM, Christian Brauner <christian@brauner.io> wrote:
> >> >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> >> >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <christian@brauner.io> wrote:
> >> >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
> >> >>
> >> >> The #1 fix would add a copy_siginfo_from_user64() or similar.
> >> >
> >> > Thanks very much! That all helped a bunch already! I'll try to go the
> >> > copy_siginfo_from_user64() way first and see if I can make this work. If
> >> > we do this I would however only want to use it for the new syscall first
> >> > and not change all other signal syscalls over to it too. I'd rather keep
> >> > this patchset focussed and small and do such conversions caused by the
> >> > new approach later. Does that sound reasonable?
> >>
> >> Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
> >> But for new syscalls, I think the always-64-bit behavior makes sense.
> >
> > It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> > sane architecture-independent way, so I'd suggest we use that.
> 
> Unfortunately it isn't maintained very well.  What you can
> express with signalfd_siginfo is a subset what you can express with
> siginfo.  Many of the linux extensions to siginfo for exception
> information add pointers and have integers right after those pointers.
> Not all of those linux specific extentions for exceptions are handled
> by signalfd_siginfo (it needs new fields).
> 
> As originally defined siginfo had the sigval_t union at the end so it
> was perfectly fine on both 32bit and 64bit as it only had a single
> pointer in the structure and the other fields were 32bits in size.
> 
> Although I do feel the pain as x86_64 has to deal with 3 versions
> of siginfo.  A 64bit one.  A 32bit one for ia32.  A 32bit one for x32
> with a 64bit si_clock_t.
> 
> > We may then also want to make sure that any system call that takes a
> > siginfo has a replacement that takes a signalfd_siginfo, and that this
> > replacement can be used to implement the old version purely in
> > user space.
> 
> If you are not implementing CRIU and reinserting exceptions to yourself.
> At most user space wants the ability to implement sigqueue:
> 
> AKA:
> sigqueue(pid_t pid, int sig, const union sigval value);
> 
> Well sigqueue with it's own si_codes so the following would cover all
> the use cases I know of:
> int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);
> 
> The si_code could even be set to SI_USER to request that the normal
> trusted SI_USER values be filled in.  si_code values of < 0 if not
> recognized could reasonably safely be treated as the _rt member of
> the siginfo union.  Or perhaps better we error out in such a case.
> 
> If we want to be flexible and not have N kinds of system calls that
> is the direction I would go.  That is simple, and you don't need any of
> the rest.
> 
> 
> Alternatively we abandon the mistake of sigqueueinfo and not allow
> a signal number in the arguments that differs from the si_signo in the
> siginfo and allow passing the entire thing unchanged from sender to
> receiver.  That is maximum flexibility.
> 
> signalfd_siginfo just sucks in practice.  It is larger that siginfo 104
> bytes versus 48.  We must deliver it to userspace as a siginfo so it
> must be translated.  Because of the translation signalfd_siginfo adds
> no flexiblity in practice, because it can not just be passed through.
> Finallay signalfd_siginfo does not have encodings for all of the
> siginfo union members, so it fails to be fully general.
> 
> Personally if I was to define signalfd_siginfo today I would make it:
> struct siginfo_subset {
> 	__u32 sis_signo;
> 	__s32 sis_errno;
> 	__s32 sis_code;
>         __u32 sis_pad;
> 	__u32 sis_pid;
> 	__u32 sis_uid;
> 	__u64 sis_data (A pointer or integer data field);
> };
> 
> That is just 32bytes, and is all that is needed for everything
> except for synchronous exceptions.  Oh and that happens to be a proper
> subset of a any sane siginfo layout, on both 32bit and 64bit.
> 
> This is one of those rare times where POSIX is sane and what linux
> has implemented is not.

Thanks for the in-depth explanation. So your point is that we are better
off if we stick with siginfo_t instead of struct signalfd_siginfo in
procfd_signal()?
Arnd Bergmann Nov. 30, 2018, 11:41 a.m. UTC | #16
On Fri, Nov 30, 2018 at 7:56 AM Christian Brauner <christian@brauner.io> wrote:
> On Thu, Nov 29, 2018 at 11:13:57PM -0600, Eric W. Biederman wrote:
> > Arnd Bergmann <arnd@arndb.de> writes:
> > > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <luto@amacapital.net> wrote:
> > >
> > > It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> > > sane architecture-independent way, so I'd suggest we use that.
> >
> > Unfortunately it isn't maintained very well.  What you can
> > express with signalfd_siginfo is a subset what you can express with
> > siginfo.  Many of the linux extensions to siginfo for exception
> > information add pointers and have integers right after those pointers.
> > Not all of those linux specific extentions for exceptions are handled
> > by signalfd_siginfo (it needs new fields).

Would those fit in the 30 remaining padding bytes?

> > As originally defined siginfo had the sigval_t union at the end so it
> > was perfectly fine on both 32bit and 64bit as it only had a single
> > pointer in the structure and the other fields were 32bits in size.

The problem with sigval_t of course is that it is incompatible between
32-bit and 64-bit. We can add the same information, but at least on
the syscall level that would have to be a __u64.

> > Although I do feel the pain as x86_64 has to deal with 3 versions
> > of siginfo.  A 64bit one.  A 32bit one for ia32.  A 32bit one for x32
> > with a 64bit si_clock_t.

At least you and Al have managed to get it down to a single source-level
definition across all architectures, but there is also the (lesser) problem
that the structure has a slightly different binary layout on each of the
classic architectures.

If we go with Andy's suggestion of having only a single binary layout
on x86 for the new call, I'd argue that we should also make it have
the exact same layout on all other architectures.

> > > We may then also want to make sure that any system call that takes a
> > > siginfo has a replacement that takes a signalfd_siginfo, and that this
> > > replacement can be used to implement the old version purely in
> > > user space.
> >
> > If you are not implementing CRIU and reinserting exceptions to yourself.
> > At most user space wants the ability to implement sigqueue:
> >
> > AKA:
> > sigqueue(pid_t pid, int sig, const union sigval value);
> >
> > Well sigqueue with it's own si_codes so the following would cover all
> > the use cases I know of:
> > int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);
> >
> > The si_code could even be set to SI_USER to request that the normal
> > trusted SI_USER values be filled in.  si_code values of < 0 if not
> > recognized could reasonably safely be treated as the _rt member of
> > the siginfo union.  Or perhaps better we error out in such a case.
> >
> > If we want to be flexible and not have N kinds of system calls that
> > is the direction I would go.  That is simple, and you don't need any of
> > the rest.

I'm not sure I understand what you are suggesting here. Would the
low-level implementation of this be based on procfd, or do you
mean that would be done for pid_t at the kernel level, plus another
syscall for procfd?

> > Alternatively we abandon the mistake of sigqueueinfo and not allow
> > a signal number in the arguments that differs from the si_signo in the
> > siginfo and allow passing the entire thing unchanged from sender to
> > receiver.  That is maximum flexibility.

This would be regardless of which flavor of siginfo (today's arch
specific one, signalfd_siginfo, or a new one) we pass, right?

> > signalfd_siginfo just sucks in practice.  It is larger that siginfo 104
> > bytes versus 48.  We must deliver it to userspace as a siginfo so it
> > must be translated.  Because of the translation signalfd_siginfo adds
> > no flexiblity in practice, because it can not just be passed through.
> > Finallay signalfd_siginfo does not have encodings for all of the
> > siginfo union members, so it fails to be fully general.
> >
> > Personally if I was to define signalfd_siginfo today I would make it:
> > struct siginfo_subset {
> >       __u32 sis_signo;
> >       __s32 sis_errno;
> >       __s32 sis_code;
> >         __u32 sis_pad;
> >       __u32 sis_pid;
> >       __u32 sis_uid;
> >       __u64 sis_data (A pointer or integer data field);
> > };
> >
> > That is just 32bytes, and is all that is needed for everything
> > except for synchronous exceptions.  Oh and that happens to be a proper
> > subset of a any sane siginfo layout, on both 32bit and 64bit.

And that would work for signalfd and waitid_time64, but not for
procfd_signal/kill/sendsiginfo?

> > This is one of those rare times where POSIX is sane and what linux
> > has implemented is not.
>
> Thanks for the in-depth explanation. So your point is that we are better
> off if we stick with siginfo_t instead of struct signalfd_siginfo in
> procfd_signal()?

I think it means we still need more discussion. Using signalfd_siginfo
without further changes doesn't seem sufficient because of the missing
sigval and the excessive length adds some cost.

siginfo_t as it is now still has a number of other downsides, and Andy in
particular didn't like the idea of having three new variants on x86
(depending on how you count). His alternative suggestion of having
a single syscall entry point that takes a 'signfo_t __user *' but interprets
it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
should work correctly, but feels wrong to me, or at least inconsistent
with how we do this elsewhere.

        Arnd
Andy Lutomirski Nov. 30, 2018, 4:35 p.m. UTC | #17
On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
> siginfo_t as it is now still has a number of other downsides, and Andy in
> particular didn't like the idea of having three new variants on x86
> (depending on how you count). His alternative suggestion of having
> a single syscall entry point that takes a 'signfo_t __user *' but interprets
> it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> should work correctly, but feels wrong to me, or at least inconsistent
> with how we do this elsewhere.

If everyone else is okay with it, I can get on board with three
variants on x86.  What I can't get on board with is *five* variants on
x86, which would be:

procfd_signal via int80 / the 32-bit vDSO: the ia32 structure

syscall64 with nr == 335 (presumably): 64-bit

syscall64 with nr == 548 | 0x40000000: x32

syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
true, behavior is arbitrary

syscall64 with nr == 335 | 0x40000000: x32 entry, but
in_compat_syscall() == false, behavior is arbitrary

This mess isn't really Christian's fault -- it's been there for a
while, but it's awful and I don't think we want to perpetuate it.

Obviously, I'd prefer a variant where the structure that's passed in
is always the same.

BTW, do we consider siginfo_t to be extensible?  If so, and if we pass
in a pointer, presumably we should pass a length as well.
Christian Brauner Nov. 30, 2018, 9:57 p.m. UTC | #18
On December 1, 2018 5:35:45 AM GMT+13:00, Andy Lutomirski <luto@kernel.org> wrote:
>On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
>> siginfo_t as it is now still has a number of other downsides, and
>Andy in
>> particular didn't like the idea of having three new variants on x86
>> (depending on how you count). His alternative suggestion of having
>> a single syscall entry point that takes a 'signfo_t __user *' but
>interprets
>> it as compat_siginfo depending on
>in_compat_syscall()/in_x32_syscall()
>> should work correctly, but feels wrong to me, or at least
>inconsistent
>> with how we do this elsewhere.
>
>If everyone else is okay with it, I can get on board with three
>variants on x86.  What I can't get on board with is *five* variants on

Thanks Andy, that helps a lot.
I'm okay with it. Does this require any additional changes to how the syscall
is currently hooked up?

>x86, which would be:
>
>procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
>
>syscall64 with nr == 335 (presumably): 64-bit
>
>syscall64 with nr == 548 | 0x40000000: x32
>
>syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
>true, behavior is arbitrary
>
>syscall64 with nr == 335 | 0x40000000: x32 entry, but
>in_compat_syscall() == false, behavior is arbitrary
>
>This mess isn't really Christian's fault -- it's been there for a
>while, but it's awful and I don't think we want to perpetuate it.
>
>Obviously, I'd prefer a variant where the structure that's passed in
>is always the same.
>
>BTW, do we consider siginfo_t to be extensible?  If so, and if we pass

I would prefer if we could consider it extensible.
If so I would prefer if we could pass in a size argument.

>in a pointer, presumably we should pass a length as well.
Arnd Bergmann Nov. 30, 2018, 10:09 p.m. UTC | #19
On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > siginfo_t as it is now still has a number of other downsides, and Andy in
> > particular didn't like the idea of having three new variants on x86
> > (depending on how you count). His alternative suggestion of having
> > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > should work correctly, but feels wrong to me, or at least inconsistent
> > with how we do this elsewhere.
>
> If everyone else is okay with it, I can get on board with three
> variants on x86.  What I can't get on board with is *five* variants on
> x86, which would be:
>
> procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
>
> syscall64 with nr == 335 (presumably): 64-bit

These seem unavoidable

> syscall64 with nr == 548 | 0x40000000: x32
>
> syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
> true, behavior is arbitrary
>
> syscall64 with nr == 335 | 0x40000000: x32 entry, but
> in_compat_syscall() == false, behavior is arbitrary

Am I misreading the code? The way I understand it, setting the
0x40000000 bit means that both in_compat_syscall() and
in_x32_syscall become() true, based on

static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
        if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
                return true;
#endif
        return false;
}

The '548 | 0x40000000' part seems to be the only sensible
way to handle x32 here. What exactly would you propose to
avoid defining the other entry points?

> This mess isn't really Christian's fault -- it's been there for a
> while, but it's awful and I don't think we want to perpetuate it.

I'm not convinced that not assigning an x32 syscall number
improves the situation, it just means that we now have one
syscall that behaves completely differently from all others,
in that the x32 version requires being called through a
SYSCALL_DEFINE() entry point rather than a
COMPAT_SYSCALL_DEFINE() one, and we have to
add more complexity to the copy_siginfo_from_user()
implementation to duplicate the hack that exists in
copy_siginfo_from_user32().

Of course, the nicest option would be to completely remove
x32 so we can stop worrying about it.

      Arnd
Christian Brauner Nov. 30, 2018, 10:26 p.m. UTC | #20
On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
>On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <luto@kernel.org>
>wrote:
>>
>> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
>> > siginfo_t as it is now still has a number of other downsides, and
>Andy in
>> > particular didn't like the idea of having three new variants on x86
>> > (depending on how you count). His alternative suggestion of having
>> > a single syscall entry point that takes a 'signfo_t __user *' but
>interprets
>> > it as compat_siginfo depending on
>in_compat_syscall()/in_x32_syscall()
>> > should work correctly, but feels wrong to me, or at least
>inconsistent
>> > with how we do this elsewhere.
>>
>> If everyone else is okay with it, I can get on board with three
>> variants on x86.  What I can't get on board with is *five* variants
>on
>> x86, which would be:
>>
>> procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
>>
>> syscall64 with nr == 335 (presumably): 64-bit
>
>These seem unavoidable
>
>> syscall64 with nr == 548 | 0x40000000: x32
>>
>> syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
>> true, behavior is arbitrary
>>
>> syscall64 with nr == 335 | 0x40000000: x32 entry, but
>> in_compat_syscall() == false, behavior is arbitrary
>
>Am I misreading the code? The way I understand it, setting the
>0x40000000 bit means that both in_compat_syscall() and
>in_x32_syscall become() true, based on
>
>static inline bool in_x32_syscall(void)
>{
>#ifdef CONFIG_X86_X32_ABI
>        if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
>                return true;
>#endif
>        return false;
>}
>
>The '548 | 0x40000000' part seems to be the only sensible
>way to handle x32 here. What exactly would you propose to
>avoid defining the other entry points?
>
>> This mess isn't really Christian's fault -- it's been there for a
>> while, but it's awful and I don't think we want to perpetuate it.
>
>I'm not convinced that not assigning an x32 syscall number
>improves the situation, it just means that we now have one
>syscall that behaves completely differently from all others,
>in that the x32 version requires being called through a
>SYSCALL_DEFINE() entry point rather than a
>COMPAT_SYSCALL_DEFINE() one, and we have to
>add more complexity to the copy_siginfo_from_user()
>implementation to duplicate the hack that exists in
>copy_siginfo_from_user32().
>
>Of course, the nicest option would be to completely remove
>x32 so we can stop worrying about it.

One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
However, I do sympathize and understand these concerns.
Daniel Colascione Nov. 30, 2018, 11:05 p.m. UTC | #21
On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner <christian@brauner.io> wrote:
>
> On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
> >On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <luto@kernel.org>
> >wrote:
> >>
> >> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
> >> > siginfo_t as it is now still has a number of other downsides, and
> >Andy in
> >> > particular didn't like the idea of having three new variants on x86
> >> > (depending on how you count). His alternative suggestion of having
> >> > a single syscall entry point that takes a 'signfo_t __user *' but
> >interprets
> >> > it as compat_siginfo depending on
> >in_compat_syscall()/in_x32_syscall()
> >> > should work correctly, but feels wrong to me, or at least
> >inconsistent
> >> > with how we do this elsewhere.
> >>
> >> If everyone else is okay with it, I can get on board with three
> >> variants on x86.  What I can't get on board with is *five* variants
> >on
> >> x86, which would be:
> >>
> >> procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
> >>
> >> syscall64 with nr == 335 (presumably): 64-bit
> >
> >These seem unavoidable
> >
> >> syscall64 with nr == 548 | 0x40000000: x32
> >>
> >> syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
> >> true, behavior is arbitrary
> >>
> >> syscall64 with nr == 335 | 0x40000000: x32 entry, but
> >> in_compat_syscall() == false, behavior is arbitrary
> >
> >Am I misreading the code? The way I understand it, setting the
> >0x40000000 bit means that both in_compat_syscall() and
> >in_x32_syscall become() true, based on
> >
> >static inline bool in_x32_syscall(void)
> >{
> >#ifdef CONFIG_X86_X32_ABI
> >        if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
> >                return true;
> >#endif
> >        return false;
> >}
> >
> >The '548 | 0x40000000' part seems to be the only sensible
> >way to handle x32 here. What exactly would you propose to
> >avoid defining the other entry points?
> >
> >> This mess isn't really Christian's fault -- it's been there for a
> >> while, but it's awful and I don't think we want to perpetuate it.
> >
> >I'm not convinced that not assigning an x32 syscall number
> >improves the situation, it just means that we now have one
> >syscall that behaves completely differently from all others,
> >in that the x32 version requires being called through a
> >SYSCALL_DEFINE() entry point rather than a
> >COMPAT_SYSCALL_DEFINE() one, and we have to
> >add more complexity to the copy_siginfo_from_user()
> >implementation to duplicate the hack that exists in
> >copy_siginfo_from_user32().
> >
> >Of course, the nicest option would be to completely remove
> >x32 so we can stop worrying about it.
>
> One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
> I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
> However, I do sympathize and understand these concerns.

IMHO, it's fine to just replicate all the splits we have for the
existing signal system calls. It's ugly, but once it's done, it'll be
done for a long time. I can't see a need to add even more signal
system calls after this one.
Arnd Bergmann Nov. 30, 2018, 11:12 p.m. UTC | #22
On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <dancol@google.com> wrote:
> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner <christian@brauner.io> wrote:
> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
> > I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
> > However, I do sympathize and understand these concerns.
>
> IMHO, it's fine to just replicate all the splits we have for the
> existing signal system calls. It's ugly, but once it's done, it'll be
> done for a long time. I can't see a need to add even more signal
> system calls after this one.

We definitely need waitid_time64() and rt_sigtimedwait_time64()
in the very near future.

       Arnd
Arnd Bergmann Nov. 30, 2018, 11:15 p.m. UTC | #23
On Sat, Dec 1, 2018 at 12:12 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <dancol@google.com> wrote:
> > On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner <christian@brauner.io> wrote:
> > > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
> > >
> > > One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
> > > I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
> > > However, I do sympathize and understand these concerns.
> >
> > IMHO, it's fine to just replicate all the splits we have for the
> > existing signal system calls. It's ugly, but once it's done, it'll be
> > done for a long time. I can't see a need to add even more signal
> > system calls after this one.
>
> We definitely need waitid_time64() and rt_sigtimedwait_time64()
> in the very near future.

To clarify: we probably don't need rt_sigtimedwait_time64() for
x32, as it already has a 64-bit time_t. We might need waitid_time64()
or something similar though, since the plan now is to change the
time resolution for rusage to nanoseconds (__kernel_timespec)
now. The exact behavior and name of waitid_time64() is still
a matter of discussion.

       Arnd
Christian Brauner Nov. 30, 2018, 11:37 p.m. UTC | #24
On December 1, 2018 12:12:53 PM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
>On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <dancol@google.com>
>wrote:
>> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner
><christian@brauner.io> wrote:
>> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann
><arnd@arndb.de> wrote:
>> >
>> > One humble point I would like to make is that what I care about
>most is a sensible way forward without having to redo essential parts
>of how syscalls work.
>> > I don't want to introduce a sane, small syscall that ends up
>breaking all over the place because we decided to fix past mistakes
>that technically have nothing to do with the patch itself.
>> > However, I do sympathize and understand these concerns.
>>
>> IMHO, it's fine to just replicate all the splits we have for the
>> existing signal system calls. It's ugly, but once it's done, it'll be
>> done for a long time. I can't see a need to add even more signal
>> system calls after this one.
>
>We definitely need waitid_time64() and rt_sigtimedwait_time64()
>in the very near future.

Right, I remember you pointing this out in a prior mail.
Thanks for working on this for such a long time now, Arnd!
Can we agree to move on with the procfd syscall given the current constraints?
I just don't want to see the syscall being 
blocked by a generic problem whose
ultimate solution is to get rid of weird
architectural constraints. :)
Andy Lutomirski Nov. 30, 2018, 11:46 p.m. UTC | #25
On Fri, Nov 30, 2018 at 3:40 PM Christian Brauner <christian@brauner.io> wrote:
>
> On December 1, 2018 12:12:53 PM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
> >On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <dancol@google.com>
> >wrote:
> >> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner
> ><christian@brauner.io> wrote:
> >> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann
> ><arnd@arndb.de> wrote:
> >> >
> >> > One humble point I would like to make is that what I care about
> >most is a sensible way forward without having to redo essential parts
> >of how syscalls work.
> >> > I don't want to introduce a sane, small syscall that ends up
> >breaking all over the place because we decided to fix past mistakes
> >that technically have nothing to do with the patch itself.
> >> > However, I do sympathize and understand these concerns.
> >>
> >> IMHO, it's fine to just replicate all the splits we have for the
> >> existing signal system calls. It's ugly, but once it's done, it'll be
> >> done for a long time. I can't see a need to add even more signal
> >> system calls after this one.
> >
> >We definitely need waitid_time64() and rt_sigtimedwait_time64()
> >in the very near future.
>
> Right, I remember you pointing this out in a prior mail.
> Thanks for working on this for such a long time now, Arnd!
> Can we agree to move on with the procfd syscall given the current constraints?
> I just don't want to see the syscall being
> blocked by a generic problem whose
> ultimate solution is to get rid of weird
> architectural constraints.

Creating and using a copy_siginfo_from_user64() function would work
for everyone, no?
Christian Brauner Nov. 30, 2018, 11:52 p.m. UTC | #26
On November 30, 2018 1:28:15 AM GMT+13:00, Florian Weimer <fweimer@redhat.com> wrote:
>Disclaimer: I'm looking at this patch because Christian requested it.
>I'm not a kernel developer.

Given all your expertise this really doesn't matter. :)
You're the one having to deal with this
in glibc after all.
Thanks for doing this and sorry for the late reply.
I missed that mail.

>
>* Christian Brauner:
>
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
>b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -398,3 +398,4 @@
>>  384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
>> 
>385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
>>  386	i386	rseq			sys_rseq			__ia32_sys_rseq
>>
>+387	i386	procfd_signal		sys_procfd_signal		__ia32_compat_sys_procfd_signal
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
>b/arch/x86/entry/syscalls/syscall_64.tbl
>> index f0b1709a5ffb..8a30cde82450 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -343,6 +343,7 @@
>>  332	common	statx			__x64_sys_statx
>>  333	common	io_pgetevents		__x64_sys_io_pgetevents
>>  334	common	rseq			__x64_sys_rseq
>> +335	64	procfd_signal		__x64_sys_procfd_signal
>>  
>>  #
>>  # x32-specific system call numbers start at 512 to avoid cache
>impact
>> @@ -386,3 +387,4 @@
>>  545	x32	execveat		__x32_compat_sys_execveat/ptregs
>>  546	x32	preadv2			__x32_compat_sys_preadv64v2
>>  547	x32	pwritev2		__x32_compat_sys_pwritev64v2
>> +548	x32	procfd_signal		__x32_compat_sys_procfd_signal
>
>Is there a reason why these numbers have to be different?
>
>(See the recent discussion with Andy Lutomirski.)
>
>> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t
>*kinfo, int flags,
>> +			    bool had_siginfo)
>> +{
>> +	int ret;
>> +	struct fd f;
>> +	struct pid *pid;
>> +
>> +	/* Enforce flags be set to 0 until we add an extension. */
>> +	if (flags)
>> +		return -EINVAL;
>> +
>> +	f = fdget_raw(fd);
>> +	if (!f.file)
>> +		return -EBADF;
>> +
>> +	/* Is this a process file descriptor? */
>> +	ret = -EINVAL;
>> +	if (!proc_is_tgid_procfd(f.file))
>> +		goto err;
>[…]
>> +	ret = kill_pid_info(sig, kinfo, pid);
>
>I would like to see some comment here what happens to zombie processes.

You'd get ESRCH.
I'm not sure if that has always been the case.
Eric recently did some excellent refactoring of the signal code.
Iirc, part of that involved not delivering signals to zombies.
That's at least how I remember it.
I don't have access to source code though atm.

>
>> +/**
>> + *  sys_procfd_signal - send a signal to a process through a process
>file
>> + *                      descriptor
>> + *  @fd: the file descriptor of the process
>> + *  @sig: signal to be sent
>> + *  @info: the signal info
>> + *  @flags: future flags to be passed
>> + */
>> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user
>*, info,
>> +		int, flags)
>
>Sorry, I'm quite unhappy with the name.  “signal” is for signal handler
>management.  procfd_sendsignal, procfd_sigqueueinfo or something like
>that would be fine.  Even procfd_kill would be better IMHO.

Ok. I only have strong opinions to procfd_kill().
Mainly because the new syscall takes
the job of multiple other syscalls
so kill gives the wrong impression.
I'll come up with a better name in the next iteration.

>
>Looking at the rt_tgsigqueueinfo interface, is there a way to implement
>the “tg” part with the current procfd_signal interface?  Would you use
>openat to retrieve the Tgid: line from "status"?

Yes, the tg part can be implemented.
As I pointed out in another mail my
I is to make this work by using file
descriptors for /proc/<pid>/task/<tid>.
I don't want this in the initial patchset though.
I prefer to slowly add those features
once we have gotten the basic functionality
in.


>
>Thanks,
>Florian
Andy Lutomirski Nov. 30, 2018, 11:53 p.m. UTC | #27
On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > > siginfo_t as it is now still has a number of other downsides, and Andy in
> > > particular didn't like the idea of having three new variants on x86
> > > (depending on how you count). His alternative suggestion of having
> > > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > > should work correctly, but feels wrong to me, or at least inconsistent
> > > with how we do this elsewhere.
> >
> > If everyone else is okay with it, I can get on board with three
> > variants on x86.  What I can't get on board with is *five* variants on
> > x86, which would be:
> >
> > procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
> >
> > syscall64 with nr == 335 (presumably): 64-bit
>
> These seem unavoidable

Indeed, although, in hindsight, they should have had the same numbers.

>
> > syscall64 with nr == 548 | 0x40000000: x32
> >
> > syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
> > true, behavior is arbitrary
> >
> > syscall64 with nr == 335 | 0x40000000: x32 entry, but
> > in_compat_syscall() == false, behavior is arbitrary
>
> Am I misreading the code? The way I understand it, setting the
> 0x40000000 bit means that both in_compat_syscall() and
> in_x32_syscall become() true, based on
>
> static inline bool in_x32_syscall(void)
> {
> #ifdef CONFIG_X86_X32_ABI
>         if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
>                 return true;
> #endif
>         return false;
> }

Yes.

>
> The '548 | 0x40000000' part seems to be the only sensible
> way to handle x32 here. What exactly would you propose to
> avoid defining the other entry points?

I would propose that it should be 335 | 0x40000000.  I can't see any
reasonable way to teach the kernel to reject 335 | 0x40000000 that
wouldn't work just as well to accept it and make it do the right
thing.  Currently we accept it and do the *wrong* thing, which is no
good.

>
> > This mess isn't really Christian's fault -- it's been there for a
> > while, but it's awful and I don't think we want to perpetuate it.
>
> I'm not convinced that not assigning an x32 syscall number
> improves the situation, it just means that we now have one
> syscall that behaves completely differently from all others,
> in that the x32 version requires being called through a
> SYSCALL_DEFINE() entry point rather than a
> COMPAT_SYSCALL_DEFINE() one,

There's nothing particularly novel about this.  seccomp(), for
example, already works like this.

> and we have to
> add more complexity to the copy_siginfo_from_user()
> implementation to duplicate the hack that exists in
> copy_siginfo_from_user32().

What hack are you referring to here?

>
> Of course, the nicest option would be to completely remove
> x32 so we can stop worrying about it.

True :)
Christian Brauner Dec. 1, 2018, 1:20 a.m. UTC | #28
On December 1, 2018 12:46:22 PM GMT+13:00, Andy Lutomirski <luto@kernel.org> wrote:
>On Fri, Nov 30, 2018 at 3:40 PM Christian Brauner
><christian@brauner.io> wrote:
>>
>> On December 1, 2018 12:12:53 PM GMT+13:00, Arnd Bergmann
><arnd@arndb.de> wrote:
>> >On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione
><dancol@google.com>
>> >wrote:
>> >> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner
>> ><christian@brauner.io> wrote:
>> >> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann
>> ><arnd@arndb.de> wrote:
>> >> >
>> >> > One humble point I would like to make is that what I care about
>> >most is a sensible way forward without having to redo essential
>parts
>> >of how syscalls work.
>> >> > I don't want to introduce a sane, small syscall that ends up
>> >breaking all over the place because we decided to fix past mistakes
>> >that technically have nothing to do with the patch itself.
>> >> > However, I do sympathize and understand these concerns.
>> >>
>> >> IMHO, it's fine to just replicate all the splits we have for the
>> >> existing signal system calls. It's ugly, but once it's done, it'll
>be
>> >> done for a long time. I can't see a need to add even more signal
>> >> system calls after this one.
>> >
>> >We definitely need waitid_time64() and rt_sigtimedwait_time64()
>> >in the very near future.
>>
>> Right, I remember you pointing this out in a prior mail.
>> Thanks for working on this for such a long time now, Arnd!
>> Can we agree to move on with the procfd syscall given the current
>constraints?
>> I just don't want to see the syscall being
>> blocked by a generic problem whose
>> ultimate solution is to get rid of weird
>> architectural constraints.
>
>Creating and using a copy_siginfo_from_user64() function would work
>for everyone, no?

Meaning, no compat syscalls, introduce 
new struct siginfo64_t and the copy 
function you named above?
Christian Brauner Dec. 1, 2018, 1:25 a.m. UTC | #29
On November 30, 2018 10:40:49 AM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
>On Thu, Nov 29, 2018 at 10:35 PM Christian Brauner
><christian@brauner.io> wrote:
>> On Thu, Nov 29, 2018 at 10:02:13PM +0100, Arnd Bergmann wrote:
>> > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski
><luto@amacapital.net> wrote:
>> >
>> > Is the current procfd_signal() proposal (under whichever name)
>sufficient
>> > to correctly implement both sys_rt_sigqueueinfo() and
>sys_rt_tgsigqueueinfo()?
>>
>> Yes, I see no reason why not. My idea is to extend it - after we have
>a
>> basic version in - to also work with:
>> /proc/<pid>/task/<tid>
>> If I'm not mistaken this should be sufficient to get
>rt_tgsigqueueinfo.
>> The thread will be uniquely identified by the tid descriptor and no
>> combination of /proc/<pid> and /proc/<pid>/task/<tid> is needed. Does
>> that sound reasonable?
>
>Yes. So it would currently replace rt_gsigqueueinfo() but
>not rt_tgsigqueueinfo(), and could be extended to do both
>afterwards, without making the interface ugly in any form?

Yes. :)

>
>I suppose we can always add more flags if needed, and you
>already ensure that flags is zero for the moment.

Yep.

>
>> > Can we implement sys_rt_sigtimedwait() based on signalfd()?
>> >
>> > If yes, that would leave waitid(), which already needs a
>replacement
>> > for y2038, and that should then also return a signalfd_siginfo.
>> > My current preference for waitid() would be to do a version that
>> > closely resembles the current interface, but takes a
>signalfd_siginfo
>> > and a __kernel_timespec based rusage replacement (possibly
>> > two of them to let us map wait6), but does not operate on procfd or
>> > take a signal mask. That would require yet another syscall, but I
>> > don't think I can do that before we want to have the set of y2038
>> > safe syscalls.
>>
>> All sounds reasonable to me but that's not a blocker for the current
>> syscall though, is it?
>
>I'd like to at least understand about sys_rt_sigtimedwait() before
>we go on, so we all know what's coming, and document the
>plans in the changelog.
>
>waitid() probably remains on my plate anyway, and I hope understand
>where we're at with it.
>
>     Arnd
Arnd Bergmann Dec. 1, 2018, 8:51 a.m. UTC | #30
On Sat, Dec 1, 2018 at 12:54 AM Andy Lutomirski <luto@kernel.org> wrote:
> On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > > > siginfo_t as it is now still has a number of other downsides, and Andy in
> > > > particular didn't like the idea of having three new variants on x86
> > > > (depending on how you count). His alternative suggestion of having
> > > > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > > > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > > > should work correctly, but feels wrong to me, or at least inconsistent
> > > > with how we do this elsewhere.

> > The '548 | 0x40000000' part seems to be the only sensible
> > way to handle x32 here. What exactly would you propose to
> > avoid defining the other entry points?
>
> I would propose that it should be 335 | 0x40000000.  I can't see any
> reasonable way to teach the kernel to reject 335 | 0x40000000 that
> wouldn't work just as well to accept it and make it do the right
> thing.  Currently we accept it and do the *wrong* thing, which is no
> good.
>
> > and we have to
> > add more complexity to the copy_siginfo_from_user()
> > implementation to duplicate the hack that exists in
> > copy_siginfo_from_user32().
>
> What hack are you referring to here?

I mean this part:

#ifdef CONFIG_COMPAT
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
                           const struct kernel_siginfo *from)
#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
{
        return __copy_siginfo_to_user32(to, from, in_x32_syscall());
}
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
                             const struct kernel_siginfo *from, bool x32_ABI)
#endif
{
...
        case SIL_CHLD:
                new.si_pid    = from->si_pid;
                new.si_uid    = from->si_uid;
                new.si_status = from->si_status;
#ifdef CONFIG_X86_X32_ABI
                if (x32_ABI) {
                        new._sifields._sigchld_x32._utime = from->si_utime;
                        new._sifields._sigchld_x32._stime = from->si_stime;
                } else
#endif
                {
                        new.si_utime = from->si_utime;
                        new.si_stime = from->si_stime;
                }
                break;
...
}
#endif

If we have a '548 | 0x40000000' entry pointing to
__x32_compat_sys_procfd_kill, then that will do the right
thing. If you instead try to have x32 call into the native
sys_procfd_kill, then copy_siginfo_to_user() will also have
to know about x32, effectively duplicating that mess above,
unless you want to also change all users of
copy_siginfo_to_user32() to use copy_siginfo_to_user()
and handle all cases in one function.

        Arnd
Christian Brauner Dec. 1, 2018, 9:17 a.m. UTC | #31
On December 1, 2018 9:51:18 PM GMT+13:00, Arnd Bergmann <arnd@arndb.de> wrote:
>On Sat, Dec 1, 2018 at 12:54 AM Andy Lutomirski <luto@kernel.org>
>wrote:
>> On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <arnd@arndb.de> wrote:
>> > On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <luto@kernel.org>
>wrote:
>> > > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de>
>wrote:
>> > > > siginfo_t as it is now still has a number of other downsides,
>and Andy in
>> > > > particular didn't like the idea of having three new variants on
>x86
>> > > > (depending on how you count). His alternative suggestion of
>having
>> > > > a single syscall entry point that takes a 'signfo_t __user *'
>but interprets
>> > > > it as compat_siginfo depending on
>in_compat_syscall()/in_x32_syscall()
>> > > > should work correctly, but feels wrong to me, or at least
>inconsistent
>> > > > with how we do this elsewhere.
>
>> > The '548 | 0x40000000' part seems to be the only sensible
>> > way to handle x32 here. What exactly would you propose to
>> > avoid defining the other entry points?
>>
>> I would propose that it should be 335 | 0x40000000.  I can't see any
>> reasonable way to teach the kernel to reject 335 | 0x40000000 that
>> wouldn't work just as well to accept it and make it do the right
>> thing.  Currently we accept it and do the *wrong* thing, which is no
>> good.
>>
>> > and we have to
>> > add more complexity to the copy_siginfo_from_user()
>> > implementation to duplicate the hack that exists in
>> > copy_siginfo_from_user32().
>>
>> What hack are you referring to here?
>
>I mean this part:
>
>#ifdef CONFIG_COMPAT
>int copy_siginfo_to_user32(struct compat_siginfo __user *to,
>                           const struct kernel_siginfo *from)
>#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
>{
>        return __copy_siginfo_to_user32(to, from, in_x32_syscall());
>}
>int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
>                       const struct kernel_siginfo *from, bool x32_ABI)
>#endif
>{
>...
>        case SIL_CHLD:
>                new.si_pid    = from->si_pid;
>                new.si_uid    = from->si_uid;
>                new.si_status = from->si_status;
>#ifdef CONFIG_X86_X32_ABI
>                if (x32_ABI) {
>                    new._sifields._sigchld_x32._utime = from->si_utime;
>                    new._sifields._sigchld_x32._stime = from->si_stime;
>                } else
>#endif
>                {
>                        new.si_utime = from->si_utime;
>                        new.si_stime = from->si_stime;
>                }
>                break;
>...
>}
>#endif
>
>If we have a '548 | 0x40000000' entry pointing to
>__x32_compat_sys_procfd_kill, then that will do the right
>thing. If you instead try to have x32 call into the native
>sys_procfd_kill, then copy_siginfo_to_user() will also have
>to know about x32, effectively duplicating that mess above,
>unless you want to also change all users of
>copy_siginfo_to_user32() to use copy_siginfo_to_user()
>and handle all cases in one function.

I've been looking into having siginfo64_t
with the new copy_siginfo_to_user64()
function. It looks like a pretty intricate 
task. Are we sure that we want to go
down this road? I'm not sure that it'll be
worth it. Especially since we force yet
another signal struct on user space.
Arnd Bergmann Dec. 1, 2018, 10:27 a.m. UTC | #32
On Sat, Dec 1, 2018 at 9:51 AM Arnd Bergmann <arnd@arndb.de> wrote:
> On Sat, Dec 1, 2018 at 12:54 AM Andy Lutomirski <luto@kernel.org> wrote:
> > On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > > > > siginfo_t as it is now still has a number of other downsides, and Andy in
> > > > > particular didn't like the idea of having three new variants on x86
> > > > > (depending on how you count). His alternative suggestion of having
> > > > > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > > > > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > > > > should work correctly, but feels wrong to me, or at least inconsistent
> > > > > with how we do this elsewhere.
>
> > > The '548 | 0x40000000' part seems to be the only sensible
> > > way to handle x32 here. What exactly would you propose to
> > > avoid defining the other entry points?
> >
> > I would propose that it should be 335 | 0x40000000.  I can't see any
> > reasonable way to teach the kernel to reject 335 | 0x40000000 that
> > wouldn't work just as well to accept it and make it do the right
> > thing.  Currently we accept it and do the *wrong* thing, which is no
> > good.

I guess we could start with something like the change below, which
would unify the entry points for rt_{tg,}sigqueueinfo, so that
e.g. the 129 and 536 syscall numbers do the exact same thing, and
that would be the lp64 or ilp32 behavior, depending on the
0x40000000 bit. For the new syscalls, we can then do the same
thing without assigning another number.

      Arnd

diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index d5252bc1e380..3233fb889a51 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -7,6 +7,11 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>

+#ifdef CONFIG_X86_X32_ABI
+#define __x64_sys_x86_rt_sigqueueinfo  __x64_sys_rt_sigqueueinfo
+#define __x64_sys_x86_rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo
+#endif
+
 /* this is a lie, but it does not hurt as sys_ni_syscall just returns
-EINVAL */
 extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
 #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const
struct pt_regs *);
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
b/arch/x86/entry/syscalls/syscall_64.tbl
index 0823eed2b02e..4a7393d34e03 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -137,7 +137,7 @@
 126    common  capset                  __x64_sys_capset
 127    64      rt_sigpending           __x64_sys_rt_sigpending
 128    64      rt_sigtimedwait         __x64_sys_rt_sigtimedwait
-129    64      rt_sigqueueinfo         __x64_sys_rt_sigqueueinfo
+129    64      rt_sigqueueinfo         __x64_sys_x86_rt_sigqueueinfo
 130    common  rt_sigsuspend           __x64_sys_rt_sigsuspend
 131    64      sigaltstack             __x64_sys_sigaltstack
 132    common  utime                   __x64_sys_utime
@@ -305,7 +305,7 @@
 294    common  inotify_init1           __x64_sys_inotify_init1
 295    64      preadv                  __x64_sys_preadv
 296    64      pwritev                 __x64_sys_pwritev
-297    64      rt_tgsigqueueinfo       __x64_sys_rt_tgsigqueueinfo
+297    64      rt_tgsigqueueinfo       __x64_sys_x86_rt_tgsigqueueinfo
 298    common  perf_event_open         __x64_sys_perf_event_open
 299    64      recvmmsg                __x64_sys_recvmmsg
 300    common  fanotify_init           __x64_sys_fanotify_init
@@ -369,7 +369,7 @@
 521    x32     ptrace                  __x32_compat_sys_ptrace
 522    x32     rt_sigpending           __x32_compat_sys_rt_sigpending
 523    x32     rt_sigtimedwait         __x32_compat_sys_rt_sigtimedwait
-524    x32     rt_sigqueueinfo         __x32_compat_sys_rt_sigqueueinfo
+524    x32     rt_sigqueueinfo         __x64_sys_x86_rt_sigqueueinfo
 525    x32     sigaltstack             __x32_compat_sys_sigaltstack
 526    x32     timer_create            __x32_compat_sys_timer_create
 527    x32     mq_notify               __x32_compat_sys_mq_notify
@@ -381,7 +381,7 @@
 533    x32     move_pages              __x32_compat_sys_move_pages
 534    x32     preadv                  __x32_compat_sys_preadv64
 535    x32     pwritev                 __x32_compat_sys_pwritev64
-536    x32     rt_tgsigqueueinfo       __x32_compat_sys_rt_tgsigqueueinfo
+536    x32     rt_tgsigqueueinfo       __x64_sys_x86_rt_tgsigqueueinfo
 537    x32     recvmmsg                __x32_compat_sys_recvmmsg
 538    x32     sendmmsg                __x32_compat_sys_sendmmsg
 539    x32     process_vm_readv        __x32_compat_sys_process_vm_readv
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 92a3b312a53c..2f16330cac83 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -892,4 +892,38 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
        signal_fault(regs, frame, "x32 rt_sigreturn");
        return 0;
 }
+
+SYSCALL_DEFINE3(x86_rt_sigqueueinfo, pid_t, pid, int, sig,
+                siginfo_t __user *, uinfo)
+{
+        kernel_siginfo_t info;
+        int ret;
+
+       if (!in_x32_syscall()
+               ret = __copy_siginfo_from_user(sig, &info, uinfo);
+       else
+               ret = __copy_siginfo_from_user32(sig, &info, uinfo);
+
+        if (unlikely(ret))
+                return ret;
+        return do_rt_sigqueueinfo(pid, sig, &info);
+}
+
+SYSCALL_DEFINE3(x86_rt_tsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+                siginfo_t __user *, uinfo)
+{
+        kernel_siginfo_t info;
+        int ret;
+
+       if (!in_x32_syscall()
+               ret = __copy_siginfo_from_user(sig, &info, uinfo);
+       else
+               ret = __copy_siginfo_from_user32(sig, &info, uinfo);
+
+        if (unlikely(ret))
+                return ret;
+        return do_rt_tsigqueueinfo(tgid, pid, sig, &info);
+}
+
+
 #endif
Eric W. Biederman Dec. 1, 2018, 1:41 p.m. UTC | #33
Andy Lutomirski <luto@kernel.org> writes:

> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <arnd@arndb.de> wrote:
>> siginfo_t as it is now still has a number of other downsides, and Andy in
>> particular didn't like the idea of having three new variants on x86
>> (depending on how you count). His alternative suggestion of having
>> a single syscall entry point that takes a 'signfo_t __user *' but interprets
>> it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
>> should work correctly, but feels wrong to me, or at least inconsistent
>> with how we do this elsewhere.
>
> BTW, do we consider siginfo_t to be extensible?  If so, and if we pass
> in a pointer, presumably we should pass a length as well.

siginfo is extensible in the sense that the structure is 128 bytes
and we use at most 48 bytes.  siginfo gets embedded in stack frames when
signals get delivered so a size change upwards is non-trivial, and is
possibly and ABI break so I believe a length field would be pointless.

Eric
Eric W. Biederman Dec. 1, 2018, 2:46 p.m. UTC | #34
Arnd Bergmann <arnd@arndb.de> writes:

> On Fri, Nov 30, 2018 at 7:56 AM Christian Brauner <christian@brauner.io> wrote:
>> On Thu, Nov 29, 2018 at 11:13:57PM -0600, Eric W. Biederman wrote:
>> > Arnd Bergmann <arnd@arndb.de> writes:
>> > > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <luto@amacapital.net> wrote:
>> > >
>> > > It looks like we already have a 'struct signalfd_siginfo' that is defined in a
>> > > sane architecture-independent way, so I'd suggest we use that.
>> >
>> > Unfortunately it isn't maintained very well.  What you can
>> > express with signalfd_siginfo is a subset what you can express with
>> > siginfo.  Many of the linux extensions to siginfo for exception
>> > information add pointers and have integers right after those pointers.
>> > Not all of those linux specific extentions for exceptions are handled
>> > by signalfd_siginfo (it needs new fields).
>
> Would those fit in the 30 remaining padding bytes?

Probably.  I expect at some point the technique signalfd has been using
won't scale.  Most of what are missing are synchronous exceptions but
the crazy seccomp extensions might be missing as well.  I say crazy
because seccomp places an integer immediately after a pointer which
makes 32bit 64bit compatibility impossible.

>> > As originally defined siginfo had the sigval_t union at the end so it
>> > was perfectly fine on both 32bit and 64bit as it only had a single
>> > pointer in the structure and the other fields were 32bits in size.
>
> The problem with sigval_t of course is that it is incompatible between
> 32-bit and 64-bit. We can add the same information, but at least on
> the syscall level that would have to be a __u64.

But sigval_t with nothing after it is not incompatible between 32bit and
64bit.  With nothing after it sigval_t in struct siginfo already has
the extra 32bits you would need.  It is sort of like transparently
adding a __u64 member to the union.

That is where we really are with sigval_t.   Except for some crazy
corner cases.

>> > Although I do feel the pain as x86_64 has to deal with 3 versions
>> > of siginfo.  A 64bit one.  A 32bit one for ia32.  A 32bit one for x32
>> > with a 64bit si_clock_t.
>
> At least you and Al have managed to get it down to a single source-level
> definition across all architectures, but there is also the (lesser) problem
> that the structure has a slightly different binary layout on each of the
> classic architectures.
>
> If we go with Andy's suggestion of having only a single binary layout
> on x86 for the new call, I'd argue that we should also make it have
> the exact same layout on all other architectures.

Yes.

>> > > We may then also want to make sure that any system call that takes a
>> > > siginfo has a replacement that takes a signalfd_siginfo, and that this
>> > > replacement can be used to implement the old version purely in
>> > > user space.
>> >
>> > If you are not implementing CRIU and reinserting exceptions to yourself.
>> > At most user space wants the ability to implement sigqueue:
>> >
>> > AKA:
>> > sigqueue(pid_t pid, int sig, const union sigval value);
>> >
>> > Well sigqueue with it's own si_codes so the following would cover all
>> > the use cases I know of:
>> > int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);
>> >
>> > The si_code could even be set to SI_USER to request that the normal
>> > trusted SI_USER values be filled in.  si_code values of < 0 if not
>> > recognized could reasonably safely be treated as the _rt member of
>> > the siginfo union.  Or perhaps better we error out in such a case.
>> >
>> > If we want to be flexible and not have N kinds of system calls that
>> > is the direction I would go.  That is simple, and you don't need any of
>> > the rest.
>
> I'm not sure I understand what you are suggesting here. Would the
> low-level implementation of this be based on procfd, or do you
> mean that would be done for pid_t at the kernel level, plus another
> syscall for procfd?

I was thinking in general and for something that would be enough to back
sigqueue, and normal kill semantics.

For Christians proposed system call I can think of two ways we could go:

	long procfd_sigsend(int fd, int sig, int si_code, __u64 sigval);

Where sigval is an appropriately written version of union sigval that
works on both 32bit and 64bit.  Or we could go with:

	long procfd_sigqueueinfo(int fd, struct siginfo_subset *info)

Both would support the same set of system calls today and both would
have some similar challenges.  Both would take an si_code value of
SI_USER as a request that the kernel fill in the struct siginfo as kill
would.  As otherwise it is invalid for userspace to send a signal with
SI_USER.  I would not worry about the signal injection case for CRIU
where we explicitly allow any siginfo to be sent if we are sending it to
ourselves.  We already have a system call for that.

What I would do is carefully define siginfo_subset architecturally as
a proper subset of siginfo on each architecture.  That in most if not
all cases will work on 32bit and 64bit.   Because of the extra padding
after union sigval.

That siginfo_subset userspace could just consider a siginfo_t.  We would
just leave out the trouble some bits.  So we could do a simple
untranslated copy_from_user on it, and then validate that si_code is in
a range we support.

That definition of siginfo_subset doesn't preclude extensions in the
future.  And so it could very much be a pass through interface on 32bit
on 64bit and into everyone's existing stack frames that embed siginfo.

So if we want to fix the situation that is what I would do, as userspace
is not allowed to send any of the problematic siginfo definitions today.

There might be some finicky detail of union sigval or some architure
might have funny alignment between 32bit and 64bit, but I don't think
there is.  If we want to be more flexible than my procfd_sigsend above
I think we should definitely look at that.

>> > Alternatively we abandon the mistake of sigqueueinfo and not allow
>> > a signal number in the arguments that differs from the si_signo in the
>> > siginfo and allow passing the entire thing unchanged from sender to
>> > receiver.  That is maximum flexibility.
>
> This would be regardless of which flavor of siginfo (today's arch
> specific one, signalfd_siginfo, or a new one) we pass, right?

No.  signalfd_siginfo offers much less future extensibility because it
must be translated.  Once you can not just pass the structure through
you fail.  As you can't be transparently forward compatible.

In a similar way Linux filesystems written assuming all the world was
ascii wound up forward compatible with utf8 as they are both byte
encodings of strings that can use a NULL terminator.

>> > signalfd_siginfo just sucks in practice.  It is larger that siginfo 104
>> > bytes versus 48.  We must deliver it to userspace as a siginfo so it
>> > must be translated.  Because of the translation signalfd_siginfo adds
>> > no flexiblity in practice, because it can not just be passed through.
>> > Finallay signalfd_siginfo does not have encodings for all of the
>> > siginfo union members, so it fails to be fully general.
>> >
>> > Personally if I was to define signalfd_siginfo today I would make it:
>> > struct siginfo_subset {
>> >       __u32 sis_signo;
>> >       __s32 sis_errno;
>> >       __s32 sis_code;
>> >         __u32 sis_pad;
>> >       __u32 sis_pid;
>> >       __u32 sis_uid;
>> >       __u64 sis_data (A pointer or integer data field);
>> > };
>> >
>> > That is just 32bytes, and is all that is needed for everything
>> > except for synchronous exceptions.  Oh and that happens to be a proper
>> > subset of a any sane siginfo layout, on both 32bit and 64bit.
>
> And that would work for signalfd and waitid_time64, but not for
> procfd_signal/kill/sendsiginfo?

No.  I was bringing this up because it will work for procfd_signal.  See
above.  It supports every signal userspace can send today.  We should be
able to specify it in a way that is compatible with the existing siginfo
definitions on all architectures.

For signalfd or let me say a future signalfd2 the practical problem is
that we have Linux specific signal extensions that are not synchronous
signals like seccomp and sigchld that have fields in a bad location.

If we successfully define a siginfo_subset that is the same on
32bit and 64bit  I would include the defintion in the kernels definition
of siginfo_t so new extensions are automatically picked up and require
any extensions to siginfo_t to be added to siginfo_subset_t, in the
appropriate 32bit/64bit way.

>> > This is one of those rare times where POSIX is sane and what linux
>> > has implemented is not.
>>
>> Thanks for the in-depth explanation. So your point is that we are better
>> off if we stick with siginfo_t instead of struct signalfd_siginfo in
>> procfd_signal()?

My point is that we are better sticking with siginfo_t or just cherry
picking the relevant fields and making them system call parameters.

The only advantage of using siginfo_t is forwards compatibility.  If we
give up forward compatibility we can just as easily have our system call
prototype include  "...(...,  int sig, int code, union sigval value)"
for the definition of the signal bits.  Simple easy and enough for all
posix signals.

Eric
Eric W. Biederman Dec. 1, 2018, 3:28 p.m. UTC | #35
It just occurs to me that the simple way to implement
procfd_sigqueueinfo info is like:

int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
{
#ifdef CONFIG_COMPAT
	if (in_compat_syscall)
		return copy_siginfo_from_user32(info, uinfo);
#endif
	return copy_siginfo_from_user(info, uinfo);                        
}

long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
{
	kernel_siginfo info;

        if (copy_siginfo_from_user_any(&info, uinfo))
        	return -EFAULT;
	...;                
}

It looks like there is already a place in ptrace.c that already
hand rolls copy_siginfo_from_user_any.

So while I would love to figure out the subset of siginfo_t tha we can
just pass through, as I think that would make a better more forward
compatible copy_siginfo_from_user32.  I think for this use case we just
add the in_compat_syscall test and then we just need to ensure this new
system call is placed in the proper places in the syscall table.

Because we will need 3 call sights: x86_64, x32 and ia32.  As the layout
changes between those three subarchitecuters.

Eric
Andy Lutomirski Dec. 1, 2018, 3:52 p.m. UTC | #36
> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <ebiederm@xmission.com> wrote:
> 
> 
> It just occurs to me that the simple way to implement
> procfd_sigqueueinfo info is like:
> 
> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
> {
> #ifdef CONFIG_COMPAT
>    if (in_compat_syscall)
>        return copy_siginfo_from_user32(info, uinfo);
> #endif
>    return copy_siginfo_from_user(info, uinfo);                        
> }
> 
> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
> {
>    kernel_siginfo info;
> 
>        if (copy_siginfo_from_user_any(&info, uinfo))
>            return -EFAULT;
>    ...;                
> }
> 
> It looks like there is already a place in ptrace.c that already
> hand rolls copy_siginfo_from_user_any.
> 
> So while I would love to figure out the subset of siginfo_t tha we can
> just pass through, as I think that would make a better more forward
> compatible copy_siginfo_from_user32.

Seems reasonable to me. It’s less code overall than any other suggestion, too.

>  I think for this use case we just
> add the in_compat_syscall test and then we just need to ensure this new
> system call is placed in the proper places in the syscall table.
> 
> Because we will need 3 call sights: x86_64, x32 and ia32.  As the layout
> changes between those three subarchitecuters.
> 
> 

If it’s done this way, it can just be “common” in the 64-bit table. And we kick the can a bit farther down the road :)

I’m working on patches to clean up x86’s syscall mess. It’s slow because I keep finding new messes.  So far I have rt_sigreturn working like every other syscall — whee.

Also, Eric, for your edification, I have a draft patch set to radically simplify x86’s signal delivery and return.  Once that’s done, I can trivially speed up delivery by a ton by using sysret.
Christian Brauner Dec. 1, 2018, 4:27 p.m. UTC | #37
On December 2, 2018 4:52:37 AM GMT+13:00, Andy Lutomirski <luto@amacapital.net> wrote:
>
>
>> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <ebiederm@xmission.com>
>wrote:
>> 
>> 
>> It just occurs to me that the simple way to implement
>> procfd_sigqueueinfo info is like:
>> 
>> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t
>*uinfo)
>> {
>> #ifdef CONFIG_COMPAT
>>    if (in_compat_syscall)
>>        return copy_siginfo_from_user32(info, uinfo);
>> #endif
>>    return copy_siginfo_from_user(info, uinfo);                       
>
>> }
>> 
>> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
>> {
>>    kernel_siginfo info;
>> 
>>        if (copy_siginfo_from_user_any(&info, uinfo))
>>            return -EFAULT;
>>    ...;                
>> }
>> 
>> It looks like there is already a place in ptrace.c that already
>> hand rolls copy_siginfo_from_user_any.
>> 
>> So while I would love to figure out the subset of siginfo_t tha we
>can
>> just pass through, as I think that would make a better more forward
>> compatible copy_siginfo_from_user32.
>
>Seems reasonable to me. It’s less code overall than any other
>suggestion, too.

Thanks everyone, that was super helpful!
All things equal I'm going to send out an
updated version of the patch latest next week!

>
>>  I think for this use case we just
>> add the in_compat_syscall test and then we just need to ensure this
>new
>> system call is placed in the proper places in the syscall table.
>> 
>> Because we will need 3 call sights: x86_64, x32 and ia32.  As the
>layout
>> changes between those three subarchitecuters.
>> 
>> 
>
>If it’s done this way, it can just be “common” in the 64-bit table. And
>we kick the can a bit farther down the road :)
>
>I’m working on patches to clean up x86’s syscall mess. It’s slow
>because I keep finding new messes.  So far I have rt_sigreturn working
>like every other syscall — whee.
>
>Also, Eric, for your edification, I have a draft patch set to radically
>simplify x86’s signal delivery and return.  Once that’s done, I can
>trivially speed up delivery by a ton by using sysret.
Eric W. Biederman Dec. 2, 2018, 12:06 a.m. UTC | #38
Andy Lutomirski <luto@amacapital.net> writes:

>> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <ebiederm@xmission.com> wrote:
>> 
>> 
>> It just occurs to me that the simple way to implement
>> procfd_sigqueueinfo info is like:
>> 
>> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
>> {
>> #ifdef CONFIG_COMPAT
>>    if (in_compat_syscall)
>>        return copy_siginfo_from_user32(info, uinfo);
>> #endif
>>    return copy_siginfo_from_user(info, uinfo);                        
>> }
>> 
>> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
>> {
>>    kernel_siginfo info;
>> 
>>        if (copy_siginfo_from_user_any(&info, uinfo))
>>            return -EFAULT;
>>    ...;                
>> }
>> 
>> It looks like there is already a place in ptrace.c that already
>> hand rolls copy_siginfo_from_user_any.
>> 
>> So while I would love to figure out the subset of siginfo_t tha we can
>> just pass through, as I think that would make a better more forward
>> compatible copy_siginfo_from_user32.
>
> Seems reasonable to me. It’s less code overall than any other suggestion, too.
>
>>  I think for this use case we just
>> add the in_compat_syscall test and then we just need to ensure this new
>> system call is placed in the proper places in the syscall table.
>> 
>> Because we will need 3 call sights: x86_64, x32 and ia32.  As the layout
>> changes between those three subarchitecuters.
>> 
>> 
>
> If it’s done this way, it can just be “common” in the 64-bit
> table. And we kick the can a bit farther down the road :)
>
> I’m working on patches to clean up x86’s syscall mess. It’s slow
> because I keep finding new messes.  So far I have rt_sigreturn working
> like every other syscall — whee.
>
> Also, Eric, for your edification, I have a draft patch set to
> radically simplify x86’s signal delivery and return.  Once that’s
> done, I can trivially speed up delivery by a ton by using sysret.

Nice.

Do we care about the performance of synchronous signal delivery (AKA
hardware exceptions) vs ordinary signal delivery.  I get the feeling
there are serious simplifications to be had in that case.

Eric
Andy Lutomirski Dec. 2, 2018, 1:14 a.m. UTC | #39
On Sat, Dec 1, 2018 at 4:07 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Andy Lutomirski <luto@amacapital.net> writes:
>
> >> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >>
> >> It just occurs to me that the simple way to implement
> >> procfd_sigqueueinfo info is like:
> >>
> >> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
> >> {
> >> #ifdef CONFIG_COMPAT
> >>    if (in_compat_syscall)
> >>        return copy_siginfo_from_user32(info, uinfo);
> >> #endif
> >>    return copy_siginfo_from_user(info, uinfo);
> >> }
> >>
> >> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
> >> {
> >>    kernel_siginfo info;
> >>
> >>        if (copy_siginfo_from_user_any(&info, uinfo))
> >>            return -EFAULT;
> >>    ...;
> >> }
> >>
> >> It looks like there is already a place in ptrace.c that already
> >> hand rolls copy_siginfo_from_user_any.
> >>
> >> So while I would love to figure out the subset of siginfo_t tha we can
> >> just pass through, as I think that would make a better more forward
> >> compatible copy_siginfo_from_user32.
> >
> > Seems reasonable to me. It’s less code overall than any other suggestion, too.
> >
> >>  I think for this use case we just
> >> add the in_compat_syscall test and then we just need to ensure this new
> >> system call is placed in the proper places in the syscall table.
> >>
> >> Because we will need 3 call sights: x86_64, x32 and ia32.  As the layout
> >> changes between those three subarchitecuters.
> >>
> >>
> >
> > If it’s done this way, it can just be “common” in the 64-bit
> > table. And we kick the can a bit farther down the road :)
> >
> > I’m working on patches to clean up x86’s syscall mess. It’s slow
> > because I keep finding new messes.  So far I have rt_sigreturn working
> > like every other syscall — whee.
> >
> > Also, Eric, for your edification, I have a draft patch set to
> > radically simplify x86’s signal delivery and return.  Once that’s
> > done, I can trivially speed up delivery by a ton by using sysret.
>
> Nice.
>
> Do we care about the performance of synchronous signal delivery (AKA
> hardware exceptions) vs ordinary signal delivery.  I get the feeling
> there are serious simplifications to be had in that case.
>


I dunno what user code cares about.  Linux's support for synchronous
exception handling is so far behind, say, Windows, that I don't know
if it's even used for anything very serious.  We should probably
profile it after I finish my changes and we can see how bad it is.  We
can't do anything at all about the time it takes the CPU to deliver
the exception, and trying to avoid IRET when we return would be tricky
at best, although siglongjmp() might end up skipping it.
Christian Brauner Dec. 2, 2018, 8:52 a.m. UTC | #40
On Sat, Dec 01, 2018 at 09:28:47AM -0600, Eric W. Biederman wrote:
> 
> It just occurs to me that the simple way to implement
> procfd_sigqueueinfo info is like:
> 
> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
> {
> #ifdef CONFIG_COMPAT
> 	if (in_compat_syscall)
> 		return copy_siginfo_from_user32(info, uinfo);

Right, though that would require a cast afaict.

static int __copy_siginfo_from_user_generic(int signo, kernel_siginfo_t *kinfo,
                                           siginfo_t *info)
{
#ifdef CONFIG_COMPAT
       if (in_compat_syscall())
               return __copy_siginfo_from_user32(
                       signo, kinfo, (struct compat_siginfo __user *)info);
#endif
       return __copy_siginfo_from_user(signo, kinfo, info);
}

It seems that a cast to (compat_siginfo __user *) should be safe in this
context? I've at least seen similar things done for __sys_sendmsg().

> #endif
> 	return copy_siginfo_from_user(info, uinfo);                        
> }
> 
> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)

**bikeshedding**
Not a fan of that name. I'm going to go with procfd_send_signal().
sigqueue gives non-native speakers a lot of room for spelling errors and
it always seemed opaque to me what this function is doing without
consulting the manpage. :)

> {
> 	kernel_siginfo info;
> 
>         if (copy_siginfo_from_user_any(&info, uinfo))
>         	return -EFAULT;
> 	...;                
> }
> 
> It looks like there is already a place in ptrace.c that already
> hand rolls copy_siginfo_from_user_any.
> 
> So while I would love to figure out the subset of siginfo_t tha we can
> just pass through, as I think that would make a better more forward
> compatible copy_siginfo_from_user32.  I think for this use case we just
> add the in_compat_syscall test and then we just need to ensure this new
> system call is placed in the proper places in the syscall table.
> 
> Because we will need 3 call sights: x86_64, x32 and ia32.  As the layout
> changes between those three subarchitecuters.
> 
> Eric
>
Christian Brauner Dec. 2, 2018, 10:03 a.m. UTC | #41
On Sat, Dec 01, 2018 at 12:52:24PM +1300, Christian Brauner wrote:
> On November 30, 2018 1:28:15 AM GMT+13:00, Florian Weimer <fweimer@redhat.com> wrote:
> >Disclaimer: I'm looking at this patch because Christian requested it.
> >I'm not a kernel developer.
> 
> Given all your expertise this really doesn't matter. :)
> You're the one having to deal with this
> in glibc after all.
> Thanks for doing this and sorry for the late reply.
> I missed that mail.
> 
> >
> >* Christian Brauner:
> >
> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
> >b/arch/x86/entry/syscalls/syscall_32.tbl
> >> index 3cf7b533b3d1..3f27ffd8ae87 100644
> >> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> >> @@ -398,3 +398,4 @@
> >>  384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
> >> 
> >385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
> >>  386	i386	rseq			sys_rseq			__ia32_sys_rseq
> >>
> >+387	i386	procfd_signal		sys_procfd_signal		__ia32_compat_sys_procfd_signal
> >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
> >b/arch/x86/entry/syscalls/syscall_64.tbl
> >> index f0b1709a5ffb..8a30cde82450 100644
> >> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> >> @@ -343,6 +343,7 @@
> >>  332	common	statx			__x64_sys_statx
> >>  333	common	io_pgetevents		__x64_sys_io_pgetevents
> >>  334	common	rseq			__x64_sys_rseq
> >> +335	64	procfd_signal		__x64_sys_procfd_signal
> >>  
> >>  #
> >>  # x32-specific system call numbers start at 512 to avoid cache
> >impact
> >> @@ -386,3 +387,4 @@
> >>  545	x32	execveat		__x32_compat_sys_execveat/ptregs
> >>  546	x32	preadv2			__x32_compat_sys_preadv64v2
> >>  547	x32	pwritev2		__x32_compat_sys_pwritev64v2
> >> +548	x32	procfd_signal		__x32_compat_sys_procfd_signal
> >
> >Is there a reason why these numbers have to be different?
> >
> >(See the recent discussion with Andy Lutomirski.)
> >
> >> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t
> >*kinfo, int flags,
> >> +			    bool had_siginfo)
> >> +{
> >> +	int ret;
> >> +	struct fd f;
> >> +	struct pid *pid;
> >> +
> >> +	/* Enforce flags be set to 0 until we add an extension. */
> >> +	if (flags)
> >> +		return -EINVAL;
> >> +
> >> +	f = fdget_raw(fd);
> >> +	if (!f.file)
> >> +		return -EBADF;
> >> +
> >> +	/* Is this a process file descriptor? */
> >> +	ret = -EINVAL;
> >> +	if (!proc_is_tgid_procfd(f.file))
> >> +		goto err;
> >[…]
> >> +	ret = kill_pid_info(sig, kinfo, pid);
> >
> >I would like to see some comment here what happens to zombie processes.
> 
> You'd get ESRCH.
> I'm not sure if that has always been the case.
> Eric recently did some excellent refactoring of the signal code.
> Iirc, part of that involved not delivering signals to zombies.
> That's at least how I remember it.
> I don't have access to source code though atm.

Ok, I finally have access to source code again. Scratch what I said above!
I looked at the code and tested it. If the process has exited but not
yet waited upon aka is a zombie procfd_send_signal() will return 0. This
is identical to kill(2) behavior. It should've been sort-of obvious
since when a process is in zombie state /proc/<pid> will still be around
which means that struct pid must still be around.

> 
> >
> >> +/**
> >> + *  sys_procfd_signal - send a signal to a process through a process
> >file
> >> + *                      descriptor
> >> + *  @fd: the file descriptor of the process
> >> + *  @sig: signal to be sent
> >> + *  @info: the signal info
> >> + *  @flags: future flags to be passed
> >> + */
> >> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user
> >*, info,
> >> +		int, flags)
> >
> >Sorry, I'm quite unhappy with the name.  “signal” is for signal handler
> >management.  procfd_sendsignal, procfd_sigqueueinfo or something like
> >that would be fine.  Even procfd_kill would be better IMHO.
> 
> Ok. I only have strong opinions to procfd_kill().
> Mainly because the new syscall takes
> the job of multiple other syscalls
> so kill gives the wrong impression.
> I'll come up with a better name in the next iteration.
> 
> >
> >Looking at the rt_tgsigqueueinfo interface, is there a way to implement
> >the “tg” part with the current procfd_signal interface?  Would you use
> >openat to retrieve the Tgid: line from "status"?
> 
> Yes, the tg part can be implemented.
> As I pointed out in another mail my
> I is to make this work by using file
> descriptors for /proc/<pid>/task/<tid>.
> I don't want this in the initial patchset though.
> I prefer to slowly add those features
> once we have gotten the basic functionality
> in.
> 
> 
> >
> >Thanks,
> >Florian
>
Florian Weimer Dec. 3, 2018, 4:57 p.m. UTC | #42
* Christian Brauner:

> Ok, I finally have access to source code again. Scratch what I said above!
> I looked at the code and tested it. If the process has exited but not
> yet waited upon aka is a zombie procfd_send_signal() will return 0. This
> is identical to kill(2) behavior. It should've been sort-of obvious
> since when a process is in zombie state /proc/<pid> will still be around
> which means that struct pid must still be around.

Should we make this state more accessible, by providing a different
error code?

Will the system call ever return ESRCH, given that you have a handle for
the process?

>> >Looking at the rt_tgsigqueueinfo interface, is there a way to implement
>> >the “tg” part with the current procfd_signal interface?  Would you use
>> >openat to retrieve the Tgid: line from "status"?
> 
> Yes, the tg part can be implemented.

I meant on top of the existing interface.

> As I pointed out in another mail my I is to make this work by using
> file descriptors for /proc/<pid>/task/<tid>.  I don't want this in the
> initial patchset though.  I prefer to slowly add those features once
> we have gotten the basic functionality in.

Do you want to land all this in one kernel release?  I wonder how
applications are supposed to discover kernel support if functionality is
split across several kernel releases.  If you get EINVAL or EBADF, it
may not be obvious what is going on.

What happens if you use the new interface with an O_PATH descriptor?

Thanks,
Florian
Christian Brauner Dec. 3, 2018, 6:02 p.m. UTC | #43
On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
> * Christian Brauner:
> 
> > Ok, I finally have access to source code again. Scratch what I said above!
> > I looked at the code and tested it. If the process has exited but not
> > yet waited upon aka is a zombie procfd_send_signal() will return 0. This
> > is identical to kill(2) behavior. It should've been sort-of obvious
> > since when a process is in zombie state /proc/<pid> will still be around
> > which means that struct pid must still be around.
> 
> Should we make this state more accessible, by providing a different
> error code?

No, I don't think we want that. Imho, It's not really helpful. Signals
are still delivered to zombies. If zombie state were to always mean that
no-one is going to wait on this thread anymore then it would make sense
to me. But given that zombie can also mean that someone put a
sleep(1000) right before their wait() call in the parent it seems odd to
report back that it is a zombie.

> 
> Will the system call ever return ESRCH, given that you have a handle for
> the process?

Yes, whenever you signal a process that has already been waited upon:
- get procfd handle referring to <proc>
- <proc> exits and is waited upon
- procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH

> 
> >> >Looking at the rt_tgsigqueueinfo interface, is there a way to implement
> >> >the “tg” part with the current procfd_signal interface?  Would you use
> >> >openat to retrieve the Tgid: line from "status"?
> > 
> > Yes, the tg part can be implemented.
> 
> I meant on top of the existing interface.

See below.

> 
> > As I pointed out in another mail my I is to make this work by using
> > file descriptors for /proc/<pid>/task/<tid>.  I don't want this in the
> > initial patchset though.  I prefer to slowly add those features once
> > we have gotten the basic functionality in.
> 
> Do you want to land all this in one kernel release?  I wonder how
> applications are supposed to discover kernel support if functionality is
> split across several kernel releases.  If you get EINVAL or EBADF, it
> may not be obvious what is going on.

Sigh, I get that but I really don't want to have to land this in one big
chunk. I want this syscall to go in in a as soon as we can to fulfill
the most basic need: having a way that guarantees us that we signal the
process that we intended to signal.

The thread case is easy to implement on top of it. But I suspect we will
quibble about the exact semantics for a long time. Even now we have been
on multiple - justified - detrous. That's all pefectly fine and
expected. But if we have the basic functionality in we have time to do
all of that. We might even land it in the same kernel release still. I
really don't want to come of as tea-party-kernel-conservative here but I
have time-and-time again seen that making something fancy and cover ever
interesting feature in one patchset takes a very very long time.

If you care about userspace being able to detect that case I can return
EOPNOTSUPP when a tid descriptor is passed.

> 
> What happens if you use the new interface with an O_PATH descriptor?

You get EINVAL. When an O_PATH file descriptor is created the kernel
will set file->f_op = &empty_fops at which point the check I added 
        if (!proc_is_tgid_procfd(f.file))
                goto err;
will fail. Imho this is correct behavior since technically signaling a
struct pid is the equivalent of writing to a file and hence doesn't
purely operate on the file descriptor level.

Christian
Aleksa Sarai Dec. 4, 2018, 6:03 a.m. UTC | #44
On 2018-12-03, Christian Brauner <christian@brauner.io> wrote:
> > > As I pointed out in another mail my I is to make this work by using
> > > file descriptors for /proc/<pid>/task/<tid>.  I don't want this in the
> > > initial patchset though.  I prefer to slowly add those features once
> > > we have gotten the basic functionality in.
> > 
> > Do you want to land all this in one kernel release?  I wonder how
> > applications are supposed to discover kernel support if functionality is
> > split across several kernel releases.  If you get EINVAL or EBADF, it
> > may not be obvious what is going on.
> 
> Sigh, I get that but I really don't want to have to land this in one big
> chunk. I want this syscall to go in in a as soon as we can to fulfill
> the most basic need: having a way that guarantees us that we signal the
> process that we intended to signal.
> 
> The thread case is easy to implement on top of it. But I suspect we will
> quibble about the exact semantics for a long time. Even now we have been
> on multiple - justified - detrous. That's all pefectly fine and
> expected. But if we have the basic functionality in we have time to do
> all of that. We might even land it in the same kernel release still. I
> really don't want to come of as tea-party-kernel-conservative here but I
> have time-and-time again seen that making something fancy and cover ever
> interesting feature in one patchset takes a very very long time.
> 
> If you care about userspace being able to detect that case I can return
> EOPNOTSUPP when a tid descriptor is passed.

Personally, I'm +1 on -EOPNOTSUPP so we can get an MVP merged, and add
new features in later patches.

> > What happens if you use the new interface with an O_PATH descriptor?
> 
> You get EINVAL. When an O_PATH file descriptor is created the kernel
> will set file->f_op = &empty_fops at which point the check I added 
>         if (!proc_is_tgid_procfd(f.file))
>                 goto err;
> will fail. Imho this is correct behavior since technically signaling a
> struct pid is the equivalent of writing to a file and hence doesn't
> purely operate on the file descriptor level.

Not to mention that O_PATH file descriptors are a whole kettle of fish
when it comes to permission checking semantics.
Florian Weimer Dec. 4, 2018, 12:55 p.m. UTC | #45
* Christian Brauner:

> On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
>> * Christian Brauner:
>> 
>> > Ok, I finally have access to source code again. Scratch what I said above!
>> > I looked at the code and tested it. If the process has exited but not
>> > yet waited upon aka is a zombie procfd_send_signal() will return 0. This
>> > is identical to kill(2) behavior. It should've been sort-of obvious
>> > since when a process is in zombie state /proc/<pid> will still be around
>> > which means that struct pid must still be around.
>> 
>> Should we make this state more accessible, by providing a different
>> error code?
>
> No, I don't think we want that. Imho, It's not really helpful. Signals
> are still delivered to zombies. If zombie state were to always mean that
> no-one is going to wait on this thread anymore then it would make sense
> to me. But given that zombie can also mean that someone put a
> sleep(1000) right before their wait() call in the parent it seems odd to
> report back that it is a zombie.

It allows for error checking that the recipient of a signal is still
running.  It's obviously not reliable, but I think it could be helpful
in the context of closely cooperating processes.

>> Will the system call ever return ESRCH, given that you have a handle for
>> the process?
>
> Yes, whenever you signal a process that has already been waited upon:
> - get procfd handle referring to <proc>
> - <proc> exits and is waited upon
> - procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH

I see, thanks.

>> Do you want to land all this in one kernel release?  I wonder how
>> applications are supposed to discover kernel support if functionality is
>> split across several kernel releases.  If you get EINVAL or EBADF, it
>> may not be obvious what is going on.
>
> Sigh, I get that but I really don't want to have to land this in one big
> chunk. I want this syscall to go in in a as soon as we can to fulfill
> the most basic need: having a way that guarantees us that we signal the
> process that we intended to signal.
>
> The thread case is easy to implement on top of it. But I suspect we will
> quibble about the exact semantics for a long time. Even now we have been
> on multiple - justified - detrous. That's all pefectly fine and
> expected. But if we have the basic functionality in we have time to do
> all of that. We might even land it in the same kernel release still. I
> really don't want to come of as tea-party-kernel-conservative here but I
> have time-and-time again seen that making something fancy and cover ever
> interesting feature in one patchset takes a very very long time.
>
> If you care about userspace being able to detect that case I can return
> EOPNOTSUPP when a tid descriptor is passed.

I suppose that's fine.  Or alternatively, when thread group support is
added, introduce a flag that applications have to use to enable it, so
that they can probe for support by checking support for the flag.

I wouldn't be opposed to a new system call like this either:

  int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);

But I think this is frowned upon on the kernel side.

>> What happens if you use the new interface with an O_PATH descriptor?
>
> You get EINVAL. When an O_PATH file descriptor is created the kernel
> will set file->f_op = &empty_fops at which point the check I added 
>         if (!proc_is_tgid_procfd(f.file))
>                 goto err;
> will fail. Imho this is correct behavior since technically signaling a
> struct pid is the equivalent of writing to a file and hence doesn't
> purely operate on the file descriptor level.

Yes, that's quite reasonable.  Thanks.

Florian
Christian Brauner Dec. 4, 2018, 1:26 p.m. UTC | #46
On Tue, Dec 04, 2018 at 01:55:10PM +0100, Florian Weimer wrote:
> * Christian Brauner:
> 
> > On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
> >> * Christian Brauner:
> >> 
> >> > Ok, I finally have access to source code again. Scratch what I said above!
> >> > I looked at the code and tested it. If the process has exited but not
> >> > yet waited upon aka is a zombie procfd_send_signal() will return 0. This
> >> > is identical to kill(2) behavior. It should've been sort-of obvious
> >> > since when a process is in zombie state /proc/<pid> will still be around
> >> > which means that struct pid must still be around.
> >> 
> >> Should we make this state more accessible, by providing a different
> >> error code?
> >
> > No, I don't think we want that. Imho, It's not really helpful. Signals
> > are still delivered to zombies. If zombie state were to always mean that
> > no-one is going to wait on this thread anymore then it would make sense
> > to me. But given that zombie can also mean that someone put a
> > sleep(1000) right before their wait() call in the parent it seems odd to
> > report back that it is a zombie.
> 
> It allows for error checking that the recipient of a signal is still
> running.  It's obviously not reliable, but I think it could be helpful
> in the context of closely cooperating processes.
> 
> >> Will the system call ever return ESRCH, given that you have a handle for
> >> the process?
> >
> > Yes, whenever you signal a process that has already been waited upon:
> > - get procfd handle referring to <proc>
> > - <proc> exits and is waited upon
> > - procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH
> 
> I see, thanks.
> 
> >> Do you want to land all this in one kernel release?  I wonder how
> >> applications are supposed to discover kernel support if functionality is
> >> split across several kernel releases.  If you get EINVAL or EBADF, it
> >> may not be obvious what is going on.
> >
> > Sigh, I get that but I really don't want to have to land this in one big
> > chunk. I want this syscall to go in in a as soon as we can to fulfill
> > the most basic need: having a way that guarantees us that we signal the
> > process that we intended to signal.
> >
> > The thread case is easy to implement on top of it. But I suspect we will
> > quibble about the exact semantics for a long time. Even now we have been
> > on multiple - justified - detrous. That's all pefectly fine and
> > expected. But if we have the basic functionality in we have time to do
> > all of that. We might even land it in the same kernel release still. I
> > really don't want to come of as tea-party-kernel-conservative here but I
> > have time-and-time again seen that making something fancy and cover ever
> > interesting feature in one patchset takes a very very long time.
> >
> > If you care about userspace being able to detect that case I can return
> > EOPNOTSUPP when a tid descriptor is passed.
> 
> I suppose that's fine.  Or alternatively, when thread group support is
> added, introduce a flag that applications have to use to enable it, so
> that they can probe for support by checking support for the flag.
> 
> I wouldn't be opposed to a new system call like this either:
> 
>   int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);
> 
> But I think this is frowned upon on the kernel side.

If this is purely about getting a procfd then I think this isn't really
necessary since you can get it from /proc/<pid> and
/proc/<pid>/task/<tid> so a syscall just for that is likely overkill.
However, I started to pick up the CLONE_FD patchset but ideally I would
like it to be way simpler to what was proposed back in the day (which is
not a critique, I just don't feel comfortable with bringing massive
patches to the table that I can barely judge wrt to their correctness.
:)). I have toyed around with this a little and I'm tempted to simply
have the syscall always return an fd for the process and not require a
separate flag for this. But I need to work through the details and this
is really far out into the (kernel) future.

> 
> >> What happens if you use the new interface with an O_PATH descriptor?
> >
> > You get EINVAL. When an O_PATH file descriptor is created the kernel
> > will set file->f_op = &empty_fops at which point the check I added 
> >         if (!proc_is_tgid_procfd(f.file))
> >                 goto err;
> > will fail. Imho this is correct behavior since technically signaling a
> > struct pid is the equivalent of writing to a file and hence doesn't
> > purely operate on the file descriptor level.
> 
> Yes, that's quite reasonable.  Thanks.
> 
> Florian
Andy Lutomirski Dec. 6, 2018, 6:54 p.m. UTC | #47
> On Dec 4, 2018, at 4:55 AM, Florian Weimer <fweimer@redhat.com> wrote:
>
> * Christian Brauner:
>
>>> On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
>>> * Christian Brauner:
>>>
>>>> Ok, I finally have access to source code again. Scratch what I said above!
>>>> I looked at the code and tested it. If the process has exited but not
>>>> yet waited upon aka is a zombie procfd_send_signal() will return 0. This
>>>> is identical to kill(2) behavior. It should've been sort-of obvious
>>>> since when a process is in zombie state /proc/<pid> will still be around
>>>> which means that struct pid must still be around.
>>>
>>> Should we make this state more accessible, by providing a different
>>> error code?
>>
>> No, I don't think we want that. Imho, It's not really helpful. Signals
>> are still delivered to zombies. If zombie state were to always mean that
>> no-one is going to wait on this thread anymore then it would make sense
>> to me. But given that zombie can also mean that someone put a
>> sleep(1000) right before their wait() call in the parent it seems odd to
>> report back that it is a zombie.
>
> It allows for error checking that the recipient of a signal is still
> running.  It's obviously not reliable, but I think it could be helpful
> in the context of closely cooperating processes.
>
>>> Will the system call ever return ESRCH, given that you have a handle for
>>> the process?
>>
>> Yes, whenever you signal a process that has already been waited upon:
>> - get procfd handle referring to <proc>
>> - <proc> exits and is waited upon
>> - procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH
>
> I see, thanks.
>
>>> Do you want to land all this in one kernel release?  I wonder how
>>> applications are supposed to discover kernel support if functionality is
>>> split across several kernel releases.  If you get EINVAL or EBADF, it
>>> may not be obvious what is going on.
>>
>> Sigh, I get that but I really don't want to have to land this in one big
>> chunk. I want this syscall to go in in a as soon as we can to fulfill
>> the most basic need: having a way that guarantees us that we signal the
>> process that we intended to signal.
>>
>> The thread case is easy to implement on top of it. But I suspect we will
>> quibble about the exact semantics for a long time. Even now we have been
>> on multiple - justified - detrous. That's all pefectly fine and
>> expected. But if we have the basic functionality in we have time to do
>> all of that. We might even land it in the same kernel release still. I
>> really don't want to come of as tea-party-kernel-conservative here but I
>> have time-and-time again seen that making something fancy and cover ever
>> interesting feature in one patchset takes a very very long time.
>>
>> If you care about userspace being able to detect that case I can return
>> EOPNOTSUPP when a tid descriptor is passed.
>
> I suppose that's fine.  Or alternatively, when thread group support is
> added, introduce a flag that applications have to use to enable it, so
> that they can probe for support by checking support for the flag.
>
> I wouldn't be opposed to a new system call like this either:
>
>  int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);
>
> But I think this is frowned upon on the kernel side.

I have no problem with it, except that I think it shouldn’t return an
fd that can be used for proc filesystem access.
Florian Weimer Dec. 6, 2018, 6:56 p.m. UTC | #48
* Andy Lutomirski:

>> I suppose that's fine.  Or alternatively, when thread group support is
>> added, introduce a flag that applications have to use to enable it, so
>> that they can probe for support by checking support for the flag.
>>
>> I wouldn't be opposed to a new system call like this either:
>>
>>  int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);
>>
>> But I think this is frowned upon on the kernel side.
>
> I have no problem with it, except that I think it shouldn’t return an
> fd that can be used for proc filesystem access.

Oh no, my intention was that it would just be used with  *_send_signal
and related functions.

Thanks,
Florian
Christian Brauner Dec. 6, 2018, 7:03 p.m. UTC | #49
On December 7, 2018 7:56:44 AM GMT+13:00, Florian Weimer <fweimer@redhat.com> wrote:
>* Andy Lutomirski:
>
>>> I suppose that's fine.  Or alternatively, when thread group support
>is
>>> added, introduce a flag that applications have to use to enable it,
>so
>>> that they can probe for support by checking support for the flag.
>>>
>>> I wouldn't be opposed to a new system call like this either:
>>>
>>>  int procfd_open (pid_t thread_group, pid_t thread_id, unsigned
>flags);
>>>
>>> But I think this is frowned upon on the kernel side.
>>
>> I have no problem with it, except that I think it shouldn’t return an
>> fd that can be used for proc filesystem access.
>
>Oh no, my intention was that it would just be used with  *_send_signal
>and related functions.

Let's postpone that discussion a little.
I think we don't need a syscall to base this off of pids.
As I said I rather send my revived version of CLONE_NEWFD that would serve the same task.
The same way we could also just add a new open() flag that blocks fs access completely.
I just pitched that idea to Serge a few days back: O_NOCHDIR or similar.
That could even be part of Aleksa's path resolution patchset.

>
>Thanks,
>Florian
Lai Jiangshan Dec. 25, 2018, 5:32 a.m. UTC | #50
Is it possible to avoid adding any syscall?

Since holding /proc/pid/reg_file can also hold the pid.
With this guarantee, /proc/pid/uuid (universally unique identifier ) can be
introduced to identify tasks, the kernel generates
a uuid for every task when created.

save_pid_uuid_pair_for_later_kill(int pid) {
  /* save via /proc/$pid/uuid */
  /* don't need to keep any fd after save */
}

safe_kill(pid, uuid, sig) {
    fd = open(/proc/$pid/uuid); /* also hold the pid until close() if
open() successes */
    if (open successes and read uuid from fd and if it equals to uuid)
        kill(pid, sig)
    close(fd)
}

All things needed to be done is to implement /proc/pid/uuid. And if pid can't
be recycled within 1 ticket, or the user can ensure it. The user can use
starttime(in /proc/pid/stat) instead.

save_pid_starttime_pair_for_later_kill(int pid) {
  /* save via /proc/$pid/stat */
  /* don't need to keep any fd after save or keep it for 1 ticket at most */
}

safe_kill(pid, starttime, sig) {
    fd = open(/proc/$pid/stat); /* also hold the pid until close() if
open() successes */
    if (open successes and read starttime from fd and if it equals to starttime)
        kill(pid, sig)
    close(fd)
}

In this case, zero LOC is added in the kernel. All of it depends on
the guarantee that holding /proc/pid/reg_file also holds the pid,
one of which I haven't checked carefully either.

On Fri, Dec 7, 2018 at 3:05 AM Christian Brauner <christian@brauner.io> wrote:
>
> On December 7, 2018 7:56:44 AM GMT+13:00, Florian Weimer <fweimer@redhat.com> wrote:
> >* Andy Lutomirski:
> >
> >>> I suppose that's fine.  Or alternatively, when thread group support
> >is
> >>> added, introduce a flag that applications have to use to enable it,
> >so
> >>> that they can probe for support by checking support for the flag.
> >>>
> >>> I wouldn't be opposed to a new system call like this either:
> >>>
> >>>  int procfd_open (pid_t thread_group, pid_t thread_id, unsigned
> >flags);
> >>>
> >>> But I think this is frowned upon on the kernel side.
> >>
> >> I have no problem with it, except that I think it shouldn’t return an
> >> fd that can be used for proc filesystem access.
> >
> >Oh no, my intention was that it would just be used with  *_send_signal
> >and related functions.
>
> Let's postpone that discussion a little.
> I think we don't need a syscall to base this off of pids.
> As I said I rather send my revived version of CLONE_NEWFD that would serve the same task.
> The same way we could also just add a new open() flag that blocks fs access completely.
> I just pitched that idea to Serge a few days back: O_NOCHDIR or similar.
> That could even be part of Aleksa's path resolution patchset.
>
> >
> >Thanks,
> >Florian
>
Lai Jiangshan Dec. 25, 2018, 7:11 a.m. UTC | #51
On Tue, Dec 25, 2018 at 1:32 PM Lai Jiangshan
<jiangshanlai+lkml@gmail.com> wrote:
>
> Is it possible to avoid adding any syscall?
>
> Since holding /proc/pid/reg_file can also hold the pid.
> With this guarantee, /proc/pid/uuid (universally unique identifier ) can be
> introduced to identify tasks, the kernel generates
> a uuid for every task when created.
>
> save_pid_uuid_pair_for_later_kill(int pid) {
>   /* save via /proc/$pid/uuid */
>   /* don't need to keep any fd after save */
> }
>
> safe_kill(pid, uuid, sig) {
>     fd = open(/proc/$pid/uuid); /* also hold the pid until close() if
> open() successes */
>     if (open successes and read uuid from fd and if it equals to uuid)
>         kill(pid, sig)
>     close(fd)
> }
>
> All things needed to be done is to implement /proc/pid/uuid. And if pid can't
> be recycled within 1 ticket, or the user can ensure it. The user can use
> starttime(in /proc/pid/stat) instead.
>
> save_pid_starttime_pair_for_later_kill(int pid) {
>   /* save via /proc/$pid/stat */
>   /* don't need to keep any fd after save or keep it for 1 ticket at most */
> }
>
> safe_kill(pid, starttime, sig) {
>     fd = open(/proc/$pid/stat); /* also hold the pid until close() if
> open() successes */
>     if (open successes and read starttime from fd and if it equals to starttime)
>         kill(pid, sig)
>     close(fd)
> }
>
> In this case, zero LOC is added in the kernel. All of it depends on
> the guarantee that holding /proc/pid/reg_file also holds the pid,
> one of which I haven't checked carefully either.
>

Oh, Sorry, I was wrong, the pid isn't reserved even when
the fd is kept in the user space. And I'm sorry that I had
replied to an "old" email thread.
Aleksa Sarai Dec. 25, 2018, 12:07 p.m. UTC | #52
On 2018-12-25, Lai Jiangshan <jiangshanlai+lkml@gmail.com> wrote:
> On Tue, Dec 25, 2018 at 1:32 PM Lai Jiangshan
> <jiangshanlai+lkml@gmail.com> wrote:
> >
> > Is it possible to avoid adding any syscall?
> >
> > Since holding /proc/pid/reg_file can also hold the pid.
> > With this guarantee, /proc/pid/uuid (universally unique identifier ) can be
> > introduced to identify tasks, the kernel generates
> > a uuid for every task when created.
> >
> > save_pid_uuid_pair_for_later_kill(int pid) {
> >   /* save via /proc/$pid/uuid */
> >   /* don't need to keep any fd after save */
> > }
> >
> > safe_kill(pid, uuid, sig) {
> >     fd = open(/proc/$pid/uuid); /* also hold the pid until close() if
> > open() successes */
> >     if (open successes and read uuid from fd and if it equals to uuid)
> >         kill(pid, sig)
> >     close(fd)
> > }
> >
> > All things needed to be done is to implement /proc/pid/uuid. And if pid can't
> > be recycled within 1 ticket, or the user can ensure it. The user can use
> > starttime(in /proc/pid/stat) instead.
> >
> > save_pid_starttime_pair_for_later_kill(int pid) {
> >   /* save via /proc/$pid/stat */
> >   /* don't need to keep any fd after save or keep it for 1 ticket at most */
> > }
> >
> > safe_kill(pid, starttime, sig) {
> >     fd = open(/proc/$pid/stat); /* also hold the pid until close() if
> > open() successes */
> >     if (open successes and read starttime from fd and if it equals to starttime)
> >         kill(pid, sig)
> >     close(fd)
> > }
> >
> > In this case, zero LOC is added in the kernel. All of it depends on
> > the guarantee that holding /proc/pid/reg_file also holds the pid,
> > one of which I haven't checked carefully either.
> >
> 
> Oh, Sorry, I was wrong, the pid isn't reserved even when
> the fd is kept in the user space. And I'm sorry that I had
> replied to an "old" email thread.

Don't worry, this was a common point of confusion during this (and
sister) threads. All the fd ensures is that access through that fd will
give you -ESRCH if the process is gone (and if the PID is reused it will
still give you -ESRCH).
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..3f27ffd8ae87 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,4 @@ 
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+387	i386	procfd_signal		sys_procfd_signal		__ia32_compat_sys_procfd_signal
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..8a30cde82450 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@ 
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+335	64	procfd_signal		__x64_sys_procfd_signal
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
@@ -386,3 +387,4 @@ 
 545	x32	execveat		__x32_compat_sys_execveat/ptregs
 546	x32	preadv2			__x32_compat_sys_preadv64v2
 547	x32	pwritev2		__x32_compat_sys_pwritev64v2
+548	x32	procfd_signal		__x32_compat_sys_procfd_signal
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ce3465479447..771c6bd1cac6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -716,7 +716,10 @@  static int proc_pid_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask);
 }
 
-
+struct pid *proc_pid(const struct inode *inode)
+{
+       return PROC_I(inode)->pid;
+}
 
 static const struct inode_operations proc_def_inode_operations = {
 	.setattr	= proc_setattr,
@@ -3038,6 +3041,12 @@  static const struct file_operations proc_tgid_base_operations = {
 	.llseek		= generic_file_llseek,
 };
 
+bool proc_is_tgid_procfd(const struct file *file)
+{
+	return d_is_dir(file->f_path.dentry) &&
+	       (file->f_op == &proc_tgid_base_operations);
+}
+
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	return proc_pident_lookup(dir, dentry,
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51e..eb69afba83f3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -113,11 +113,6 @@  static inline void *__PDE_DATA(const struct inode *inode)
 	return PDE(inode)->data;
 }
 
-static inline struct pid *proc_pid(const struct inode *inode)
-{
-	return PROC_I(inode)->pid;
-}
-
 static inline struct task_struct *get_proc_task(const struct inode *inode)
 {
 	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d0e1f1522a78..96df2fe6311d 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -73,6 +73,8 @@  struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    int (*show)(struct seq_file *, void *),
 						    proc_write_t write,
 						    void *data);
+extern bool proc_is_tgid_procfd(const struct file *file);
+extern struct pid *proc_pid(const struct inode *inode);
 
 #else /* CONFIG_PROC_FS */
 
@@ -114,6 +116,16 @@  static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
 #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
 #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
 
+static inline bool proc_is_tgid_procfd(const struct file *file)
+{
+	return false;
+}
+
+static inline struct pid *proc_pid(const struct inode *inode)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_PROC_FS */
 
 struct net;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2ac3d13a915b..a5ca8cb84566 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -907,6 +907,8 @@  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 			 int flags, uint32_t sig);
+asmlinkage long sys_procfd_signal(int fd, int sig, siginfo_t __user *info,
+				  int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 538546edbfbd..4dc81a994ad1 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -738,9 +738,11 @@  __SYSCALL(__NR_statx,     sys_statx)
 __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 #define __NR_rseq 293
 __SYSCALL(__NR_rseq, sys_rseq)
+#define __NR_procfd_signal 294
+__SC_COMP(__NR_procfd_signal, sys_procfd_signal, compat_sys_procfd_signal)
 
 #undef __NR_syscalls
-#define __NR_syscalls 294
+#define __NR_syscalls 295
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/signal.c b/kernel/signal.c
index 9a32bc2088c9..13695342f150 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -19,7 +19,9 @@ 
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/coredump.h>
@@ -3286,6 +3288,16 @@  COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 }
 #endif
 
+static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+{
+	clear_siginfo(info);
+	info->si_signo = sig;
+	info->si_errno = 0;
+	info->si_code = SI_USER;
+	info->si_pid = task_tgid_vnr(current);
+	info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
+}
+
 /**
  *  sys_kill - send a signal to a process
  *  @pid: the PID of the process
@@ -3295,16 +3307,119 @@  SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 {
 	struct kernel_siginfo info;
 
-	clear_siginfo(&info);
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_USER;
-	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+	prepare_kill_siginfo(sig, &info);
 
 	return kill_something_info(sig, &info, pid);
 }
 
+/*
+ * Verify that the signaler and signalee either are in the same pid namespace
+ * or that the signaler's pid namespace is an ancestor of the signalee's pid
+ * namespace.
+ */
+static bool may_signal_procfd(struct pid *pid)
+{
+	struct pid_namespace *active = task_active_pid_ns(current);
+	struct pid_namespace *p = ns_of_pid(pid);
+
+	for (;;) {
+		if (!p)
+			return false;
+		if (p == active)
+			break;
+		p = p->parent;
+	}
+
+	return true;
+}
+
+static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
+			    bool had_siginfo)
+{
+	int ret;
+	struct fd f;
+	struct pid *pid;
+
+	/* Enforce flags be set to 0 until we add an extension. */
+	if (flags)
+		return -EINVAL;
+
+	f = fdget_raw(fd);
+	if (!f.file)
+		return -EBADF;
+
+	/* Is this a process file descriptor? */
+	ret = -EINVAL;
+	if (!proc_is_tgid_procfd(f.file))
+		goto err;
+
+	/* Without CONFIG_PROC_FS proc_pid() returns NULL. */
+	pid = proc_pid(file_inode(f.file));
+	if (!pid)
+		goto err;
+
+	if (!may_signal_procfd(pid))
+		goto err;
+
+	if (had_siginfo) {
+		/*
+		 * Not even root can pretend to send signals from the kernel.
+		 * Nor can they impersonate a kill()/tgkill(), which adds
+		 * source info.
+		 */
+		ret = -EPERM;
+		if ((kinfo->si_code >= 0 || kinfo->si_code == SI_TKILL) &&
+		    (task_pid(current) != pid))
+			goto err;
+	} else {
+		prepare_kill_siginfo(sig, kinfo);
+	}
+
+	ret = kill_pid_info(sig, kinfo, pid);
+
+err:
+	fdput(f);
+	return ret;
+}
+
+/**
+ *  sys_procfd_signal - send a signal to a process through a process file
+ *                      descriptor
+ *  @fd: the file descriptor of the process
+ *  @sig: signal to be sent
+ *  @info: the signal info
+ *  @flags: future flags to be passed
+ */
+SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
+		int, flags)
+{
+	kernel_siginfo_t kinfo;
+
+	if (info) {
+		int ret = __copy_siginfo_from_user(sig, &kinfo, info);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig,
+		       struct compat_siginfo __user *, info, int, flags)
+{
+	kernel_siginfo_t kinfo;
+
+	if (info) {
+		int ret = __copy_siginfo_from_user32(sig, &kinfo, info);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
+}
+#endif
+
 static int
 do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
 {