diff mbox series

[1/4] pidfd: support PIDFD_NONBLOCK in pidfd_open()

Message ID 20200831134551.1599689-2-christian.brauner@ubuntu.com
State New
Headers show
Series Support non-blocking pidfds | expand

Commit Message

Christian Brauner Aug. 31, 2020, 1:45 p.m. UTC
Introduce PIDFD_NONBLOCK to support non-blocking pidfd file descriptors.

Ever since the introduction of pidfds and more advanced async I/O, various
programming languages such as Rust have grown support for async event
libraries. These libraries are created to help build epoll-based event loops
around file descriptors. A common pattern is to automatically set O_NONBLOCK
on all file descriptors they manage.

For such libraries the EAGAIN error code is treated specially. When a function
is called that returns EAGAIN the function isn't called again until the event
loop indicates that the file descriptor is ready. Supporting EAGAIN when
waiting on pidfds makes such libraries just work with little effort. In the
following patch we will extend waitid() internally to support non-blocking
pidfds.

Link: https://lore.kernel.org/lkml/20200811181236.GA18763@localhost/
Link: https://github.com/joshtriplett/async-pidfd
Cc: Kees Cook <keescook@chromium.org>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/uapi/linux/pidfd.h | 12 ++++++++++++
 kernel/pid.c               | 12 +++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)
 create mode 100644 include/uapi/linux/pidfd.h

Comments

Oleg Nesterov Sept. 1, 2020, 4:23 p.m. UTC | #1
On 08/31, Christian Brauner wrote:
>
> --- /dev/null
> +++ b/include/uapi/linux/pidfd.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +
> +#ifndef _UAPI_LINUX_PIDFD_H
> +#define _UAPI_LINUX_PIDFD_H
> +
> +#include <linux/types.h>
> +#include <linux/fcntl.h>
> +
> +/* Flags for pidfd_open().  */
> +#define PIDFD_NONBLOCK O_NONBLOCK
> +
> +#endif /* _UAPI_LINUX_PIDFD_H */

Why? Can't we simply use O_NONBLOCK ?

Oleg.
Christian Brauner Sept. 1, 2020, 4:33 p.m. UTC | #2
On Tue, Sep 01, 2020 at 06:23:10PM +0200, Oleg Nesterov wrote:
> On 08/31, Christian Brauner wrote:
> >
> > --- /dev/null
> > +++ b/include/uapi/linux/pidfd.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > +
> > +#ifndef _UAPI_LINUX_PIDFD_H
> > +#define _UAPI_LINUX_PIDFD_H
> > +
> > +#include <linux/types.h>
> > +#include <linux/fcntl.h>
> > +
> > +/* Flags for pidfd_open().  */
> > +#define PIDFD_NONBLOCK O_NONBLOCK
> > +
> > +#endif /* _UAPI_LINUX_PIDFD_H */
> 
> Why? Can't we simply use O_NONBLOCK ?

It's the same thing we seem to do for any other (anon inode) fds:

include/linux/eventfd.h:#define		EFD_NONBLOCK O_NONBLOCK
include/uapi/linux/inotify.h:#define	IN_NONBLOCK O_NONBLOCK
include/uapi/linux/signalfd.h:#define	SFD_NONBLOCK O_NONBLOCK
include/uapi/linux/timerfd.h:#define	TFD_NONBLOCK O_NONBLOCK

also for O_CLOEXEC:

include/linux/eventfd.h:#define		EFD_CLOEXEC O_CLOEXEC
include/linux/userfaultfd_k.h:#define	UFFD_CLOEXEC O_CLOEXEC
include/uapi/linux/eventpoll.h:#define	EPOLL_CLOEXEC O_CLOEXEC
include/uapi/linux/mount.h:#define	OPEN_TREE_CLOEXEC    O_CLOEXEC
include/uapi/linux/perf_event.h:#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
include/uapi/linux/signalfd.h:#define	SFD_CLOEXEC O_CLOEXEC
include/uapi/linux/timerfd.h:#define	TFD_CLOEXEC O_CLOEXEC

So I think we should just do the same. A clean flag namespace seems
nicer to me too tbh.

Christian
Oleg Nesterov Sept. 1, 2020, 4:53 p.m. UTC | #3
On 09/01, Christian Brauner wrote:
>
> On Tue, Sep 01, 2020 at 06:23:10PM +0200, Oleg Nesterov wrote:
> > On 08/31, Christian Brauner wrote:
> > >
> > > --- /dev/null
> > > +++ b/include/uapi/linux/pidfd.h
> > > @@ -0,0 +1,12 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > > +
> > > +#ifndef _UAPI_LINUX_PIDFD_H
> > > +#define _UAPI_LINUX_PIDFD_H
> > > +
> > > +#include <linux/types.h>
> > > +#include <linux/fcntl.h>
> > > +
> > > +/* Flags for pidfd_open().  */
> > > +#define PIDFD_NONBLOCK O_NONBLOCK
> > > +
> > > +#endif /* _UAPI_LINUX_PIDFD_H */
> >
> > Why? Can't we simply use O_NONBLOCK ?
>
> It's the same thing we seem to do for any other (anon inode) fds:
>
> include/linux/eventfd.h:#define		EFD_NONBLOCK O_NONBLOCK
> include/uapi/linux/inotify.h:#define	IN_NONBLOCK O_NONBLOCK
> include/uapi/linux/signalfd.h:#define	SFD_NONBLOCK O_NONBLOCK
> include/uapi/linux/timerfd.h:#define	TFD_NONBLOCK O_NONBLOCK
>
> also for O_CLOEXEC:
>
> include/linux/eventfd.h:#define		EFD_CLOEXEC O_CLOEXEC
> include/linux/userfaultfd_k.h:#define	UFFD_CLOEXEC O_CLOEXEC
> include/uapi/linux/eventpoll.h:#define	EPOLL_CLOEXEC O_CLOEXEC
> include/uapi/linux/mount.h:#define	OPEN_TREE_CLOEXEC    O_CLOEXEC
> include/uapi/linux/perf_event.h:#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
> include/uapi/linux/signalfd.h:#define	SFD_CLOEXEC O_CLOEXEC
> include/uapi/linux/timerfd.h:#define	TFD_CLOEXEC O_CLOEXEC
>
> So I think we should just do the same.

Hmm, OK, then I have to agree.

> A clean flag namespace seems
> nicer to me too tbh.

Disagree but this doesn't matter ;)

Oleg.
diff mbox series

Patch

diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
new file mode 100644
index 000000000000..5406fbc13074
--- /dev/null
+++ b/include/uapi/linux/pidfd.h
@@ -0,0 +1,12 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_PIDFD_H
+#define _UAPI_LINUX_PIDFD_H
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+
+/* Flags for pidfd_open().  */
+#define PIDFD_NONBLOCK O_NONBLOCK
+
+#endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index b2562a7ce525..74ddbff1a6ba 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@ 
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 #include <net/sock.h>
+#include <uapi/linux/pidfd.h>
 
 struct pid init_struct_pid = {
 	.count		= REFCOUNT_INIT(1),
@@ -522,7 +523,8 @@  struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 /**
  * pidfd_create() - Create a new pid file descriptor.
  *
- * @pid:  struct pid that the pidfd will reference
+ * @pid:   struct pid that the pidfd will reference
+ * @flags: flags to pass
  *
  * This creates a new pid file descriptor with the O_CLOEXEC flag set.
  *
@@ -532,12 +534,12 @@  struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
  */
-static int pidfd_create(struct pid *pid)
+static int pidfd_create(struct pid *pid, unsigned int flags)
 {
 	int fd;
 
 	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
-			      O_RDWR | O_CLOEXEC);
+			      flags | O_RDWR | O_CLOEXEC);
 	if (fd < 0)
 		put_pid(pid);
 
@@ -565,7 +567,7 @@  SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	int fd;
 	struct pid *p;
 
-	if (flags)
+	if (flags & ~PIDFD_NONBLOCK)
 		return -EINVAL;
 
 	if (pid <= 0)
@@ -576,7 +578,7 @@  SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 		return -ESRCH;
 
 	if (pid_has_task(p, PIDTYPE_TGID))
-		fd = pidfd_create(p);
+		fd = pidfd_create(p, flags);
 	else
 		fd = -EINVAL;