diff mbox series

[v2,1/4] pidfd: support PIDFD_NONBLOCK in pidfd_open()

Message ID 20200902102130.147672-2-christian.brauner@ubuntu.com
State New
Headers show
Series Support non-blocking pidfds | expand

Commit Message

Christian Brauner Sept. 2, 2020, 10:21 a.m. UTC
Introduce PIDFD_NONBLOCK to support non-blocking pidfd file descriptors.

Ever since the introduction of pidfds and more advanced async io various
programming languages such as Rust have grown support for async event
libraries. These libraries are created to help build epoll-based event loops
around file descriptors. A common pattern is to automatically set O_NONBLOCK on
all file descriptors they manage.

For such libraries the EAGAIN error code is treated specially. When a function
is called that returns EAGAIN the function isn't called again until the event
loop indicates that the file descriptor is ready. Supporting EAGAIN when
waiting on pidfds makes such libraries just work with little effort. In the
following patch we will extend waitid() internally to support non-blocking
pidfds.

This introduces a new flag PIDFD_NONBLOCK that is equivalent to O_NONBLOCK.
This follows the same patterns we have for other (anon inode) file descriptors
such as EFD_NONBLOCK, IN_NONBLOCK, SFD_NONBLOCK, TFD_NONBLOCK and the same for
close-on-exec flags.

Link: https://lore.kernel.org/lkml/20200811181236.GA18763@localhost/
Link: https://github.com/joshtriplett/async-pidfd
Cc: Kees Cook <keescook@chromium.org>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v2 */
- Christian Brauner <christian.brauner@ubuntu.com>:
  - Improve commit message.
---
 include/uapi/linux/pidfd.h | 12 ++++++++++++
 kernel/pid.c               | 12 +++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)
 create mode 100644 include/uapi/linux/pidfd.h

Comments

Oleg Nesterov Sept. 3, 2020, 2:31 p.m. UTC | #1
On 09/02, Christian Brauner wrote:
>
> -static int pidfd_create(struct pid *pid)
> +static int pidfd_create(struct pid *pid, unsigned int flags)
>  {
>  	int fd;
>  
>  	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
> -			      O_RDWR | O_CLOEXEC);
> +			      flags | O_RDWR | O_CLOEXEC);
>  	if (fd < 0)
>  		put_pid(pid);
>  
> @@ -565,7 +567,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
>  	int fd;
>  	struct pid *p;
>  
> -	if (flags)
> +	if (flags & ~PIDFD_NONBLOCK)
>  		return -EINVAL;
>  
>  	if (pid <= 0)
> @@ -576,7 +578,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
>  		return -ESRCH;
>  
>  	if (pid_has_task(p, PIDTYPE_TGID))
> -		fd = pidfd_create(p);
> +		fd = pidfd_create(p, flags);
>  	else
>  		fd = -EINVAL;
>  

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Oleg Nesterov Sept. 3, 2020, 2:58 p.m. UTC | #2
Christian, off-topic question...

On 09/02, Christian Brauner wrote:
>
> -static int pidfd_create(struct pid *pid)
> +static int pidfd_create(struct pid *pid, unsigned int flags)
>  {
>  	int fd;
>
>  	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
> -			      O_RDWR | O_CLOEXEC);
> +			      flags | O_RDWR | O_CLOEXEC);

I just noticed this comment above pidfd_create:

	 * Note, that this function can only be called after the fd table has
	 * been unshared to avoid leaking the pidfd to the new process.

what does it mean?

Of course, if fd table is shared then pidfd can "leak" to another process,
but this is true for any file and sys_pidfd_open() doesn't do any check?



In fact I think this helper buys nothing but adds the unnecessary get/put_pid,
we can kill it and change pidfd_open() to do

	/*
	 * pidfd_open() - open a pidfd for the process identified by @pid.
	 *
	 * @pid:   numeric pid of the target; must be greater than zero
	 * @flags: only PIDFD_NONBLOCK is accepted
	 *
	 * Return: a pidfd on success; -EINVAL for unknown flags, a
	 *         non-positive pid, or a pid that has no PIDTYPE_TGID task;
	 *         -ESRCH if no struct pid is found; otherwise the error
	 *         returned by anon_inode_getfd().
	 */
	SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
	{
		int fd;
		struct pid *p;

		if (flags & ~PIDFD_NONBLOCK)
			return -EINVAL;

		if (pid <= 0)
			return -EINVAL;

		p = find_get_pid(pid);
		if (!p)
			return -ESRCH;

		fd = -EINVAL;
		if (pid_has_task(p, PIDTYPE_TGID)) {
			/*
			 * The file's private data must be the struct pid (p),
			 * not the numeric pid argument. On success the
			 * reference obtained from find_get_pid() is handed
			 * over to the new file.
			 */
			fd = anon_inode_getfd("[pidfd]", &pidfd_fops, p,
						flags | O_RDWR | O_CLOEXEC);
		}
		/* No file took over the reference, so drop it here. */
		if (fd < 0)
			put_pid(p);
		return fd;
	}

but this is cosmetic and off-topic too.

Oleg.
Christian Brauner Sept. 3, 2020, 3:25 p.m. UTC | #3
On Thu, Sep 03, 2020 at 04:58:09PM +0200, Oleg Nesterov wrote:
> Christian, off-topic question...
> 
> On 09/02, Christian Brauner wrote:
> >
> > -static int pidfd_create(struct pid *pid)
> > +static int pidfd_create(struct pid *pid, unsigned int flags)
> >  {
> >  	int fd;
> >
> >  	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
> > -			      O_RDWR | O_CLOEXEC);
> > +			      flags | O_RDWR | O_CLOEXEC);
> 
> I just noticed this comment above pidfd_create:
> 
> 	 * Note, that this function can only be called after the fd table has
> 	 * been unshared to avoid leaking the pidfd to the new process.
> 
> what does it mean?
> 
> Of course, if fd table is shared then pidfd can "leak" to another process,
> but this is true for any file and sys_pidfd_open() doesn't do any check?

It's the same comment we added in kernel/fork.c to make callers aware
that they can leak a pidfd to another process unintentionally. Sure,
this is true of any fd but since pidfds were a new type of handle and on
another process at that we felt that this was important to spell out. The
"can only" should've arguably been "should probably".

> 
> 
> 
> In fact I think this helper buys nothing but adds the unnecessary get/put_pid,
> we can kill it and change pidfd_open() to do
> 
> 	SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> 	{
> 		int fd;
> 		struct pid *p;
> 
> 		if (flags & ~PIDFD_NONBLOCK)
> 			return -EINVAL;
> 
> 		if (pid <= 0)
> 			return -EINVAL;
> 
> 		p = find_get_pid(pid);
> 		if (!p)
> 			return -ESRCH;
> 
> 		fd = -EINVAL;
> 		if (pid_has_task(p, PIDTYPE_TGID)) {
> 			fd = anon_inode_getfd("[pidfd]", &pidfd_fops, pid,
> 						flags | O_RDWR | O_CLOEXEC);
> 		}
> 		if (fd < 0)
> 			put_pid(p);
> 		return fd;
> 	}

Sure, I'd totally take a patch like that!

> 
> but this is cosmetic and off-topic too.

No, much appreciated. Good-looking code is important. :)

Christian
Josh Triplett Sept. 3, 2020, 11:50 p.m. UTC | #4
On Wed, Sep 02, 2020 at 12:21:27PM +0200, Christian Brauner wrote:
> Introduce PIDFD_NONBLOCK to support non-blocking pidfd file descriptors.
> 
> Ever since the introduction of pidfds and more advanced async io various
> programming languages such as Rust have grown support for async event
> libraries. These libraries are created to help build epoll-based event loops
> around file descriptors. A common pattern is to automatically make all file
> descriptors they manage to O_NONBLOCK.
> 
> For such libraries the EAGAIN error code is treated specially. When a function
> is called that returns EAGAIN the function isn't called again until the event
> loop indicates that the file descriptor is ready. Supporting EAGAIN when
> waiting on pidfds makes such libraries just work with little effort. In the
> following patch we will extend waitid() internally to support non-blocking
> pidfds.
> 
> This introduces a new flag PIDFD_NONBLOCK that is equivalent to O_NONBLOCK.
> This follows the same patterns we have for other (anon inode) file descriptors
> such as EFD_NONBLOCK, IN_NONBLOCK, SFD_NONBLOCK, TFD_NONBLOCK and the same for
> close-on-exec flags.
> 
> Link: https://lore.kernel.org/lkml/20200811181236.GA18763@localhost/
> Link: https://github.com/joshtriplett/async-pidfd
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Sargun Dhillon <sargun@sargun.me>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Suggested-by: Josh Triplett <josh@joshtriplett.org>
> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>

Reviewed-by: Josh Triplett <josh@joshtriplett.org>
diff mbox series

Patch

diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
new file mode 100644
index 000000000000..5406fbc13074
--- /dev/null
+++ b/include/uapi/linux/pidfd.h
@@ -0,0 +1,12 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_PIDFD_H
+#define _UAPI_LINUX_PIDFD_H
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+
+/* Flags for pidfd_open().  */
+#define PIDFD_NONBLOCK O_NONBLOCK
+
+#endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index b2562a7ce525..74ddbff1a6ba 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@ 
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 #include <net/sock.h>
+#include <uapi/linux/pidfd.h>
 
 struct pid init_struct_pid = {
 	.count		= REFCOUNT_INIT(1),
@@ -522,7 +523,8 @@  struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 /**
  * pidfd_create() - Create a new pid file descriptor.
  *
- * @pid:  struct pid that the pidfd will reference
+ * @pid:   struct pid that the pidfd will reference
+ * @flags: flags to pass
  *
  * This creates a new pid file descriptor with the O_CLOEXEC flag set.
  *
@@ -532,12 +534,12 @@  struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
  */
-static int pidfd_create(struct pid *pid)
+static int pidfd_create(struct pid *pid, unsigned int flags)
 {
 	int fd;
 
 	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
-			      O_RDWR | O_CLOEXEC);
+			      flags | O_RDWR | O_CLOEXEC);
 	if (fd < 0)
 		put_pid(pid);
 
@@ -565,7 +567,7 @@  SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	int fd;
 	struct pid *p;
 
-	if (flags)
+	if (flags & ~PIDFD_NONBLOCK)
 		return -EINVAL;
 
 	if (pid <= 0)
@@ -576,7 +578,7 @@  SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 		return -ESRCH;
 
 	if (pid_has_task(p, PIDTYPE_TGID))
-		fd = pidfd_create(p);
+		fd = pidfd_create(p, flags);
 	else
 		fd = -EINVAL;