diff mbox series

[5/5] fanotify: Add pidfd info record support to the fanotify API

Message ID 48d18055deb4617d97c695a08dca77eb573097e9.1621473846.git.repnop@google.com (mailing list archive)
State New, archived
Headers show
Series Add pidfd support to the fanotify API | expand

Commit Message

Matthew Bobrowski May 20, 2021, 2:11 a.m. UTC
Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
allows userspace applications to control whether a pidfd info record
containing a pidfd is to be returned with each event.

If FAN_REPORT_PIDFD is enabled for a notification group, an additional
struct fanotify_event_info_pidfd object will be supplied alongside the
generic struct fanotify_event_metadata within a single event. This
functionality is analogous to that of FAN_REPORT_FID in terms of how
the event structure is supplied to the userspace application. Usage of
FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
permitted, and in this case a struct fanotify_event_info_pidfd object
will follow any struct fanotify_event_info_fid object.

Usage of FAN_REPORT_TID is not permitted with FAN_REPORT_PIDFD as the
pidfd API only supports the creation of pidfds for thread-group
leaders. Attempting to do so will result with a -EINVAL being returned
when calling fanotify_init(2).

If pidfd creation fails via pidfd_create(), the pidfd field within
struct fanotify_event_info_pidfd is set to FAN_NOPIDFD.

Signed-off-by: Matthew Bobrowski <repnop@google.com>
---
 fs/notify/fanotify/fanotify_user.c | 65 +++++++++++++++++++++++++++---
 include/linux/fanotify.h           |  3 +-
 include/uapi/linux/fanotify.h      | 12 ++++++
 3 files changed, 74 insertions(+), 6 deletions(-)

Comments

Christian Brauner May 20, 2021, 8:17 a.m. UTC | #1
On Thu, May 20, 2021 at 12:11:51PM +1000, Matthew Bobrowski wrote:
> Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
> allows userspace applications to control whether a pidfd info record
> containing a pidfd is to be returned with each event.
> 
> If FAN_REPORT_PIDFD is enabled for a notification group, an additional
> struct fanotify_event_info_pidfd object will be supplied alongside the
> generic struct fanotify_event_metadata within a single event. This
> functionality is analogous to that of FAN_REPORT_FID in terms of how
> the event structure is supplied to the userspace application. Usage of
> FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
> permitted, and in this case a struct fanotify_event_info_pidfd object
> will follow any struct fanotify_event_info_fid object.
> 
> Usage of FAN_REPORT_TID is not permitted with FAN_REPORT_PIDFD as the
> pidfd API only supports the creation of pidfds for thread-group
> leaders. Attempting to do so will result with a -EINVAL being returned
> when calling fanotify_init(2).
> 
> If pidfd creation fails via pidfd_create(), the pidfd field within
> struct fanotify_event_info_pidfd is set to FAN_NOPIDFD.
> 
> Signed-off-by: Matthew Bobrowski <repnop@google.com>
> ---
>  fs/notify/fanotify/fanotify_user.c | 65 +++++++++++++++++++++++++++---
>  include/linux/fanotify.h           |  3 +-
>  include/uapi/linux/fanotify.h      | 12 ++++++
>  3 files changed, 74 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 1e15f3222eb2..bba61988f4a0 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -106,6 +106,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
>  #define FANOTIFY_EVENT_ALIGN 4
>  #define FANOTIFY_FID_INFO_HDR_LEN \
>  	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
> +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> +	sizeof(struct fanotify_event_info_pidfd)
>  
>  static int fanotify_fid_info_len(int fh_len, int name_len)
>  {
> @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
>  	if (fh_len)
>  		info_len += fanotify_fid_info_len(fh_len, dot_len);
>  
> +	if (info_mode & FAN_REPORT_PIDFD)
> +		info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> +
>  	return info_len;
>  }
>  
> @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
>  	return info_len;
>  }
>  
> +static int copy_pidfd_info_to_user(struct pid *pid,
> +				   char __user *buf,
> +				   size_t count)
> +{
> +	struct fanotify_event_info_pidfd info = { };
> +	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> +
> +	if (WARN_ON_ONCE(info_len > count))
> +		return -EFAULT;
> +
> +	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> +	info.hdr.len = info_len;
> +
> +	info.pidfd = pidfd_create(pid, 0);
> +	if (info.pidfd < 0)
> +		info.pidfd = FAN_NOPIDFD;
> +
> +	if (copy_to_user(buf, &info, info_len))
> +		return -EFAULT;

Hm, well this kinda sucks. The caller can end up with a pidfd in their
fd table and when the copy_to_user() failed they won't know what fd it
is. I think this might be better served by sm like (see what I did in
kernel/fork.c): 

	pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (pidfd < 0)
		/* error handling */
	
	pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, O_RDWR | O_CLOEXEC);
	if (IS_ERR(pidfile)) {
		put_unused_fd(pidfd);
		pidfd = PTR_ERR(pidfile);
		/* error handling */
	}
	get_pid(pid);	/* held by pidfile now */

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
	info.hdr.len = info_len;
	info.pidfd = pidfd;

	if (copy_to_user(buf, &info, info_len)) {
		fput(pidfile);
		put_unused_fd(pidfd);
		return -EFAULT;
	}

	fd_install(pidfd, pidfile);
	return 0;

> +
> +	return info_len;
> +}
> +
>  static int copy_info_to_user(struct fanotify_event *event,
>  			     struct fanotify_info *info,
>  			     unsigned int info_mode,
> @@ -408,9 +436,12 @@ static int copy_info_to_user(struct fanotify_event *event,
>  {
>  	int ret, info_type = 0;
>  	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
> +	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
>  
>  	/*
>  	 * Event info records order is as follows: dir fid + name, child fid.
> +	 * If FAN_REPORT_PIDFD has been specified, then a pidfd info record
> +	 * will follow the fid info records.
>  	 */
>  	if (fanotify_event_dir_fh_len(event)) {
>  		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
> @@ -465,10 +496,18 @@ static int copy_info_to_user(struct fanotify_event *event,
>  		}
>  
>  		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
> -					    fanotify_event_object_fh(event),
> -					    info_type, dot, dot_len,
> -					    buf, count);
> -	}
> +					    fanotify_event_object_fh(event),
> +					    info_type, dot, dot_len,
> +					    buf, count);
> +		if (ret < 0)
> +			return ret;
> +
> +		buf += ret;
> +		count -= ret;
> +	}
> +
> +	if (pidfd_mode)
> +		return copy_pidfd_info_to_user(event->pid, buf, count);
>  
>  	return ret;
>  }
> @@ -530,6 +569,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>  		fd_install(fd, f);
>  
>  	if (info_mode) {
> +		/*
> +		 * Complain if FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
> +		 * exclusion is ever lifted. At the time of implementing
> +		 * FAN_REPORT_PIDFD, the pidfd API only supported the creation
> +		 * of pidfds on thread-group leaders.
> +		 */
> +		WARN_ON_ONCE((info_mode & FAN_REPORT_PIDFD) &&
> +			     FAN_GROUP_FLAG(group, FAN_REPORT_TID));
> +
>  		ret = copy_info_to_user(event, info, info_mode, buf, count);
>  		if (ret < 0)
>  			return ret;
> @@ -1079,6 +1127,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  #endif
>  		return -EINVAL;
>  
> +	/*
> +	 * A pidfd can only be returned for a thread-group leader; thus
> +	 * FAN_REPORT_TID and FAN_REPORT_PIDFD need to be mutually exclusive.
> +	 */
> +	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
> +		return -EINVAL;
> +
>  	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
>  		return -EINVAL;
>  
> @@ -1477,7 +1532,7 @@ static int __init fanotify_user_setup(void)
>  	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
>  				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
>  
> -	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
> +	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
>  	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
>  
>  	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
> diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
> index f76c7635efc8..bb2898240e5a 100644
> --- a/include/linux/fanotify.h
> +++ b/include/linux/fanotify.h
> @@ -27,7 +27,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
>  
>  #define FANOTIFY_FID_BITS	(FAN_REPORT_FID | FAN_REPORT_DFID_NAME)
>  
> -#define FANOTIFY_INFO_MODES	(FANOTIFY_FID_BITS)
> +#define FANOTIFY_INFO_MODES	(FANOTIFY_FID_BITS | FAN_REPORT_PIDFD)
>  
>  /*
>   * fanotify_init() flags that require CAP_SYS_ADMIN.
> @@ -37,6 +37,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
>   */
>  #define FANOTIFY_ADMIN_INIT_FLAGS	(FANOTIFY_PERM_CLASSES | \
>  					 FAN_REPORT_TID | \
> +					 FAN_REPORT_PIDFD | \
>  					 FAN_UNLIMITED_QUEUE | \
>  					 FAN_UNLIMITED_MARKS)
>  
> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index fbf9c5c7dd59..36c3bddcf690 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -55,6 +55,7 @@
>  #define FAN_REPORT_FID		0x00000200	/* Report unique file id */
>  #define FAN_REPORT_DIR_FID	0x00000400	/* Report unique directory id */
>  #define FAN_REPORT_NAME		0x00000800	/* Report events with name */
> +#define FAN_REPORT_PIDFD	0x00001000	/* Report pidfd for event->pid */
>  
>  /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */
>  #define FAN_REPORT_DFID_NAME	(FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
> @@ -123,6 +124,7 @@ struct fanotify_event_metadata {
>  #define FAN_EVENT_INFO_TYPE_FID		1
>  #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
>  #define FAN_EVENT_INFO_TYPE_DFID	3
> +#define FAN_EVENT_INFO_TYPE_PIDFD	4
>  
>  /* Variable length info record following event metadata */
>  struct fanotify_event_info_header {
> @@ -148,6 +150,15 @@ struct fanotify_event_info_fid {
>  	unsigned char handle[0];
>  };
>  
> +/*
> + * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD.
> + * It holds a pidfd for the pid responsible for generating an event.
> + */
> +struct fanotify_event_info_pidfd {
> +	struct fanotify_event_info_header hdr;
> +	__s32 pidfd;
> +};
> +
>  struct fanotify_response {
>  	__s32 fd;
>  	__u32 response;
> @@ -160,6 +171,7 @@ struct fanotify_response {
>  
>  /* No fd set in event */
>  #define FAN_NOFD	-1
> +#define FAN_NOPIDFD	FAN_NOFD
>  
>  /* Helper functions to deal with fanotify_event_metadata buffers */
>  #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
> -- 
> 2.31.1.751.gd2f1c929bd-goog
> 
> /M
Amir Goldstein May 20, 2021, 1:43 p.m. UTC | #2
On Thu, May 20, 2021 at 11:17 AM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
>
> On Thu, May 20, 2021 at 12:11:51PM +1000, Matthew Bobrowski wrote:
> > Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
> > allows userspace applications to control whether a pidfd info record
> > containing a pidfd is to be returned with each event.
> >
> > If FAN_REPORT_PIDFD is enabled for a notification group, an additional
> > struct fanotify_event_info_pidfd object will be supplied alongside the
> > generic struct fanotify_event_metadata within a single event. This
> > functionality is analogous to that of FAN_REPORT_FID in terms of how
> > the event structure is supplied to the userspace application. Usage of
> > FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
> > permitted, and in this case a struct fanotify_event_info_pidfd object
> > will follow any struct fanotify_event_info_fid object.
> >
> > Usage of FAN_REPORT_TID is not permitted with FAN_REPORT_PIDFD as the
> > pidfd API only supports the creation of pidfds for thread-group
> > leaders. Attempting to do so will result with a -EINVAL being returned
> > when calling fanotify_init(2).
> >
> > If pidfd creation fails via pidfd_create(), the pidfd field within
> > struct fanotify_event_info_pidfd is set to FAN_NOPIDFD.
> >
> > Signed-off-by: Matthew Bobrowski <repnop@google.com>
> > ---
> >  fs/notify/fanotify/fanotify_user.c | 65 +++++++++++++++++++++++++++---
> >  include/linux/fanotify.h           |  3 +-
> >  include/uapi/linux/fanotify.h      | 12 ++++++
> >  3 files changed, 74 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> > index 1e15f3222eb2..bba61988f4a0 100644
> > --- a/fs/notify/fanotify/fanotify_user.c
> > +++ b/fs/notify/fanotify/fanotify_user.c
> > @@ -106,6 +106,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
> >  #define FANOTIFY_EVENT_ALIGN 4
> >  #define FANOTIFY_FID_INFO_HDR_LEN \
> >       (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
> > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > +     sizeof(struct fanotify_event_info_pidfd)
> >
> >  static int fanotify_fid_info_len(int fh_len, int name_len)
> >  {
> > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> >       if (fh_len)
> >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> >
> > +     if (info_mode & FAN_REPORT_PIDFD)
> > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > +
> >       return info_len;
> >  }
> >
> > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> >       return info_len;
> >  }
> >
> > +static int copy_pidfd_info_to_user(struct pid *pid,
> > +                                char __user *buf,
> > +                                size_t count)
> > +{
> > +     struct fanotify_event_info_pidfd info = { };
> > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > +
> > +     if (WARN_ON_ONCE(info_len > count))
> > +             return -EFAULT;
> > +
> > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > +     info.hdr.len = info_len;
> > +
> > +     info.pidfd = pidfd_create(pid, 0);
> > +     if (info.pidfd < 0)
> > +             info.pidfd = FAN_NOPIDFD;
> > +
> > +     if (copy_to_user(buf, &info, info_len))
> > +             return -EFAULT;
>
> Hm, well this kinda sucks. The caller can end up with a pidfd in their
> fd table and when the copy_to_user() failed they won't know what fd it

Good catch!
But I prefer to solve it differently, because moving fd_install() to the
end of this function does not guarantee that copy_event_to_user()
won't return an error one day with dangling pidfd in fd table.

It might be simpler to do pidfd_create() next to create_fd() in
copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
pidfd can be closed on error along with fd on out_close_fd label.

You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
(even though fanotify_init() does check for that).

Anyway, something like:

        if (!capable(CAP_SYS_ADMIN) &&
            task_tgid(current) != event->pid)
                metadata.pid = 0;
+      else if (pidfd_mode)
+              pidfd = pidfd_create(pid, 0);

[...]

+       if (pidfd_mode)
+               ret = copy_pidfd_info_to_user(pidfd, buf, count);

        return metadata.event_len;

out_close_fd:
+        if (pidfd != FAN_NOPIDFD) {
...


And in any case, it wrong to call copy_pidfd_info_to_user() from
copy_info_to_user(). It needs to be called once from copy_event_to_user()
because copy_pidfd_info_to_user() may be called twice to report both
FAN_EVENT_INFO_TYPE_FID and FAN_EVENT_INFO_TYPE_DFID
records for the same event.

Thanks,
Amir.
Matthew Bobrowski May 21, 2021, 9:21 a.m. UTC | #3
Hey Amir/Christian,

On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> <christian.brauner@ubuntu.com> wrote:
> > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > +     sizeof(struct fanotify_event_info_pidfd)
> > >
> > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > >  {
> > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > >       if (fh_len)
> > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > >
> > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > +
> > >       return info_len;
> > >  }
> > >
> > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > >       return info_len;
> > >  }
> > >
> > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > +                                char __user *buf,
> > > +                                size_t count)
> > > +{
> > > +     struct fanotify_event_info_pidfd info = { };
> > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > +
> > > +     if (WARN_ON_ONCE(info_len > count))
> > > +             return -EFAULT;
> > > +
> > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > +     info.hdr.len = info_len;
> > > +
> > > +     info.pidfd = pidfd_create(pid, 0);
> > > +     if (info.pidfd < 0)
> > > +             info.pidfd = FAN_NOPIDFD;
> > > +
> > > +     if (copy_to_user(buf, &info, info_len))
> > > +             return -EFAULT;
> >
> > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > fd table and when the copy_to_user() failed they won't know what fd it
> 
> Good catch!

Super awesome catch Christian, thanks pulling this up!

> But I prefer to solve it differently, because moving fd_install() to the
> end of this function does not guarantee that copy_event_to_user()
> won't return an error one day with dangling pidfd in fd table.

I can see the angle you're approaching this from...

> It might be simpler to do pidfd_create() next to create_fd() in
> copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> pidfd can be closed on error along with fd on out_close_fd label.
> 
> You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> (even though fanotify_init() does check for that).

I didn't really understand the need for this check here given that the
administrative bits are already being checked for in fanotify_init()
i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
thus never walking any of the pidfd_mode paths. Is this just a defense
in depth approach here, or is it something else that I'm missing?

> Anyway, something like:
> 
>         if (!capable(CAP_SYS_ADMIN) &&
>             task_tgid(current) != event->pid)
>                 metadata.pid = 0;
> +      else if (pidfd_mode)
> +              pidfd = pidfd_create(pid, 0);
> 
> [...]
> 
> +       if (pidfd_mode)
> +               ret = copy_pidfd_info_to_user(pidfd, buf, count);
> 
>         return metadata.event_len;
> 
> out_close_fd:
> +        if (pidfd != FAN_NOPIDFD) {
> ...

The early call to pidfd_create() and clean up in copy_event_to_user()
makes most sense to me.

> And in any case, it wrong to call copy_pidfd_info_to_user() from
> copy_info_to_user(). It needs to be called once from copy_event_to_user()
> because copy_pidfd_info_to_user() may be called twice to report both
> FAN_EVENT_INFO_TYPE_FID and FAN_EVENT_INFO_TYPE_DFID
> records for the same event.

Right, as mentioned in patch 4 of this series, copy_info_to_user() has
been repurposed to account for the double call into
copy_fid_info_to_user() when reporting FAN_EVENT_INFO_TYPE_FID and
FAN_EVENT_INFO_TYPE_DFID.

/M
Amir Goldstein May 21, 2021, 9:41 a.m. UTC | #4
On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote:
>
> Hey Amir/Christian,
>
> On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> > On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> > <christian.brauner@ubuntu.com> wrote:
> > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > > +     sizeof(struct fanotify_event_info_pidfd)
> > > >
> > > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > > >  {
> > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > > >       if (fh_len)
> > > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > > >
> > > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > +
> > > >       return info_len;
> > > >  }
> > > >
> > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > > >       return info_len;
> > > >  }
> > > >
> > > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > > +                                char __user *buf,
> > > > +                                size_t count)
> > > > +{
> > > > +     struct fanotify_event_info_pidfd info = { };
> > > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > +
> > > > +     if (WARN_ON_ONCE(info_len > count))
> > > > +             return -EFAULT;
> > > > +
> > > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > > +     info.hdr.len = info_len;
> > > > +
> > > > +     info.pidfd = pidfd_create(pid, 0);
> > > > +     if (info.pidfd < 0)
> > > > +             info.pidfd = FAN_NOPIDFD;
> > > > +
> > > > +     if (copy_to_user(buf, &info, info_len))
> > > > +             return -EFAULT;
> > >
> > > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > > fd table and when the copy_to_user() failed they won't know what fd it
> >
> > Good catch!
>
> Super awesome catch Christian, thanks pulling this up!
>
> > But I prefer to solve it differently, because moving fd_install() to the
> > end of this function does not guarantee that copy_event_to_user()
> > won't return an error one day with dangling pidfd in fd table.
>
> I can see the angle you're approaching this from...
>
> > It might be simpler to do pidfd_create() next to create_fd() in
> > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> > pidfd can be closed on error along with fd on out_close_fd label.
> >
> > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > (even though fanotify_init() does check for that).
>
> I didn't really understand the need for this check here given that the
> administrative bits are already being checked for in fanotify_init()
> i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> thus never walking any of the pidfd_mode paths. Is this just a defense
> in depth approach here, or is it something else that I'm missing?
>

We want to be extra careful not to create privilege escalations,
so even if the fanotify fd is leaked or intentionally passed to a less
privileged user, it cannot get an open pidfd.

IOW, it is *much* easier to be defensive in this case than to prove
that the change cannot introduce any privilege escalations.

Thanks,
Amir.
Jan Kara May 21, 2021, 10:24 a.m. UTC | #5
On Fri 21-05-21 12:41:51, Amir Goldstein wrote:
> On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote:
> >
> > Hey Amir/Christian,
> >
> > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> > > <christian.brauner@ubuntu.com> wrote:
> > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > > > +     sizeof(struct fanotify_event_info_pidfd)
> > > > >
> > > > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > > > >  {
> > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > > > >       if (fh_len)
> > > > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > > > >
> > > > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > +
> > > > >       return info_len;
> > > > >  }
> > > > >
> > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > > > >       return info_len;
> > > > >  }
> > > > >
> > > > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > > > +                                char __user *buf,
> > > > > +                                size_t count)
> > > > > +{
> > > > > +     struct fanotify_event_info_pidfd info = { };
> > > > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > +
> > > > > +     if (WARN_ON_ONCE(info_len > count))
> > > > > +             return -EFAULT;
> > > > > +
> > > > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > > > +     info.hdr.len = info_len;
> > > > > +
> > > > > +     info.pidfd = pidfd_create(pid, 0);
> > > > > +     if (info.pidfd < 0)
> > > > > +             info.pidfd = FAN_NOPIDFD;
> > > > > +
> > > > > +     if (copy_to_user(buf, &info, info_len))
> > > > > +             return -EFAULT;
> > > >
> > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > > > fd table and when the copy_to_user() failed they won't know what fd it
> > >
> > > Good catch!
> >
> > Super awesome catch Christian, thanks pulling this up!
> >
> > > But I prefer to solve it differently, because moving fd_install() to the
> > > end of this function does not guarantee that copy_event_to_user()
> > > won't return an error one day with dangling pidfd in fd table.
> >
> > I can see the angle you're approaching this from...
> >
> > > It might be simpler to do pidfd_create() next to create_fd() in
> > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> > > pidfd can be closed on error along with fd on out_close_fd label.
> > >
> > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > > (even though fanotify_init() does check for that).
> >
> > I didn't really understand the need for this check here given that the
> > administrative bits are already being checked for in fanotify_init()
> > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> > thus never walking any of the pidfd_mode paths. Is this just a defense
> > in depth approach here, or is it something else that I'm missing?
> >
> 
> We want to be extra careful not to create privilege escalations,
> so even if the fanotify fd is leaked or intentionally passed to a less
> privileged user, it cannot get an open pidfd.
> 
> IOW, it is *much* easier to be defensive in this case than to prove
> that the change cannot introduce any privilege escalations.

I have no problems with being more defensive (it's certainly better than
being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN
task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged
process, that process is also free to update all the passwords.
Traditionally permission checks in Unix are performed on open and then who
has fd can do whatever that fd allows... I've tried to follow similar
philosophy with fanotify as well and e.g. open happening as a result of
fanotify path events does not check permissions either.

								Honza
Amir Goldstein May 21, 2021, 11:10 a.m. UTC | #6
On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote:
>
> On Fri 21-05-21 12:41:51, Amir Goldstein wrote:
> > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote:
> > >
> > > Hey Amir/Christian,
> > >
> > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> > > > <christian.brauner@ubuntu.com> wrote:
> > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > > > > +     sizeof(struct fanotify_event_info_pidfd)
> > > > > >
> > > > > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > > > > >  {
> > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > > > > >       if (fh_len)
> > > > > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > > > > >
> > > > > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > > > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > +
> > > > > >       return info_len;
> > > > > >  }
> > > > > >
> > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > > > > >       return info_len;
> > > > > >  }
> > > > > >
> > > > > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > > > > +                                char __user *buf,
> > > > > > +                                size_t count)
> > > > > > +{
> > > > > > +     struct fanotify_event_info_pidfd info = { };
> > > > > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > +
> > > > > > +     if (WARN_ON_ONCE(info_len > count))
> > > > > > +             return -EFAULT;
> > > > > > +
> > > > > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > > > > +     info.hdr.len = info_len;
> > > > > > +
> > > > > > +     info.pidfd = pidfd_create(pid, 0);
> > > > > > +     if (info.pidfd < 0)
> > > > > > +             info.pidfd = FAN_NOPIDFD;
> > > > > > +
> > > > > > +     if (copy_to_user(buf, &info, info_len))
> > > > > > +             return -EFAULT;
> > > > >
> > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > > > > fd table and when the copy_to_user() failed they won't know what fd it
> > > >
> > > > Good catch!
> > >
> > > Super awesome catch Christian, thanks pulling this up!
> > >
> > > > But I prefer to solve it differently, because moving fd_install() to the
> > > > end of this function does not guarantee that copy_event_to_user()
> > > > won't return an error one day with dangling pidfd in fd table.
> > >
> > > I can see the angle you're approaching this from...
> > >
> > > > It might be simpler to do pidfd_create() next to create_fd() in
> > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> > > > pidfd can be closed on error along with fd on out_close_fd label.
> > > >
> > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > > > (even though fanotify_init() does check for that).
> > >
> > > I didn't really understand the need for this check here given that the
> > > administrative bits are already being checked for in fanotify_init()
> > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> > > thus never walking any of the pidfd_mode paths. Is this just a defense
> > > in depth approach here, or is it something else that I'm missing?
> > >
> >
> > We want to be extra careful not to create privilege escalations,
> > so even if the fanotify fd is leaked or intentionally passed to a less
> > privileged user, it cannot get an open pidfd.
> >
> > IOW, it is *much* easier to be defensive in this case than to prove
> > that the change cannot introduce any privilege escalations.
>
> I have no problems with being more defensive (it's certainly better than
> being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN
> task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged
> process, that process is also free to update all the passwords.
> Traditionally permission checks in Unix are performed on open and then who
> has fd can do whatever that fd allows... I've tried to follow similar
> philosophy with fanotify as well and e.g. open happening as a result of
> fanotify path events does not check permissions either.
>

Agreed.

However, because we had this issue with no explicit FAN_REPORT_PID
we added the CAP_SYS_ADMIN check for reporting event->pid as next
best thing. So now that becomes weird if priv process created fanotify fd
and passes it to unpriv process, then unpriv process gets events with
pidfd but without event->pid.

We can change the code to:

        if (!capable(CAP_SYS_ADMIN) && !pidfd_mode &&
            task_tgid(current) != event->pid)
                metadata.pid = 0;

So the case I decscribed above ends up reporting both pidfd
and event->pid to unpriv user, but that is a bit inconsistent...

Thanks,
Amir.
Jan Kara May 21, 2021, 1:19 p.m. UTC | #7
On Fri 21-05-21 14:10:32, Amir Goldstein wrote:
> On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Fri 21-05-21 12:41:51, Amir Goldstein wrote:
> > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote:
> > > >
> > > > Hey Amir/Christian,
> > > >
> > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> > > > > <christian.brauner@ubuntu.com> wrote:
> > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > > > > > +     sizeof(struct fanotify_event_info_pidfd)
> > > > > > >
> > > > > > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > > > > > >  {
> > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > > > > > >       if (fh_len)
> > > > > > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > > > > > >
> > > > > > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > > > > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > +
> > > > > > >       return info_len;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > > > > > >       return info_len;
> > > > > > >  }
> > > > > > >
> > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > > > > > +                                char __user *buf,
> > > > > > > +                                size_t count)
> > > > > > > +{
> > > > > > > +     struct fanotify_event_info_pidfd info = { };
> > > > > > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > +
> > > > > > > +     if (WARN_ON_ONCE(info_len > count))
> > > > > > > +             return -EFAULT;
> > > > > > > +
> > > > > > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > > > > > +     info.hdr.len = info_len;
> > > > > > > +
> > > > > > > +     info.pidfd = pidfd_create(pid, 0);
> > > > > > > +     if (info.pidfd < 0)
> > > > > > > +             info.pidfd = FAN_NOPIDFD;
> > > > > > > +
> > > > > > > +     if (copy_to_user(buf, &info, info_len))
> > > > > > > +             return -EFAULT;
> > > > > >
> > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > > > > > fd table and when the copy_to_user() failed they won't know what fd it
> > > > >
> > > > > Good catch!
> > > >
> > > > Super awesome catch Christian, thanks pulling this up!
> > > >
> > > > > But I prefer to solve it differently, because moving fd_install() to the
> > > > > end of this function does not guarantee that copy_event_to_user()
> > > > > won't return an error one day with dangling pidfd in fd table.
> > > >
> > > > I can see the angle you're approaching this from...
> > > >
> > > > > It might be simpler to do pidfd_create() next to create_fd() in
> > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> > > > > pidfd can be closed on error along with fd on out_close_fd label.
> > > > >
> > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > > > > (even though fanotify_init() does check for that).
> > > >
> > > > I didn't really understand the need for this check here given that the
> > > > administrative bits are already being checked for in fanotify_init()
> > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> > > > thus never walking any of the pidfd_mode paths. Is this just a defense
> > > > in depth approach here, or is it something else that I'm missing?
> > > >
> > >
> > > We want to be extra careful not to create privilege escalations,
> > > so even if the fanotify fd is leaked or intentionally passed to a less
> > > privileged user, it cannot get an open pidfd.
> > >
> > > IOW, it is *much* easier to be defensive in this case than to prove
> > > that the change cannot introduce any privilege escalations.
> >
> > I have no problems with being more defensive (it's certainly better than
> > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN
> > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged
> > process, that process is also free to update all the passwords.
> > Traditionally permission checks in Unix are performed on open and then who
> > has fd can do whatever that fd allows... I've tried to follow similar
> > philosophy with fanotify as well and e.g. open happening as a result of
> > fanotify path events does not check permissions either.
> >
> 
> Agreed.
> 
> However, because we had this issue with no explicit FAN_REPORT_PID
> we added the CAP_SYS_ADMIN check for reporting event->pid as next
> best thing. So now that becomes weird if priv process created fanotify fd
> and passes it to unpriv process, then unpriv process gets events with
> pidfd but without event->pid.
> 
> We can change the code to:
> 
>         if (!capable(CAP_SYS_ADMIN) && !pidfd_mode &&
>             task_tgid(current) != event->pid)
>                 metadata.pid = 0;
> 
> So the case I decscribed above ends up reporting both pidfd
> and event->pid to unpriv user, but that is a bit inconsistent...

Oh, now I see where you are coming from :) Thanks for explanation. And
remind me please, cannot we just have internal FAN_REPORT_PID flag that
gets set on notification group when priviledged process creates it and then
test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is
mostly equivalent but I guess more in the spirit of how fanotify
traditionally does things. Also FAN_REPORT_PIDFD could then behave in the
same way...

								Honza
Amir Goldstein May 21, 2021, 1:52 p.m. UTC | #8
On Fri, May 21, 2021 at 4:19 PM Jan Kara <jack@suse.cz> wrote:
>
> On Fri 21-05-21 14:10:32, Amir Goldstein wrote:
> > On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote:
> > > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote:
> > > > >
> > > > > Hey Amir/Christian,
> > > > >
> > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> > > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> > > > > > <christian.brauner@ubuntu.com> wrote:
> > > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > > > > > > +     sizeof(struct fanotify_event_info_pidfd)
> > > > > > > >
> > > > > > > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > > > > > > >  {
> > > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > > > > > > >       if (fh_len)
> > > > > > > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > > > > > > >
> > > > > > > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > > > > > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > > +
> > > > > > > >       return info_len;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > > > > > > >       return info_len;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > > > > > > +                                char __user *buf,
> > > > > > > > +                                size_t count)
> > > > > > > > +{
> > > > > > > > +     struct fanotify_event_info_pidfd info = { };
> > > > > > > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > > +
> > > > > > > > +     if (WARN_ON_ONCE(info_len > count))
> > > > > > > > +             return -EFAULT;
> > > > > > > > +
> > > > > > > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > > > > > > +     info.hdr.len = info_len;
> > > > > > > > +
> > > > > > > > +     info.pidfd = pidfd_create(pid, 0);
> > > > > > > > +     if (info.pidfd < 0)
> > > > > > > > +             info.pidfd = FAN_NOPIDFD;
> > > > > > > > +
> > > > > > > > +     if (copy_to_user(buf, &info, info_len))
> > > > > > > > +             return -EFAULT;
> > > > > > >
> > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > > > > > > fd table and when the copy_to_user() failed they won't know what fd it
> > > > > >
> > > > > > Good catch!
> > > > >
> > > > > Super awesome catch Christian, thanks pulling this up!
> > > > >
> > > > > > But I prefer to solve it differently, because moving fd_install() to the
> > > > > > end of this function does not guarantee that copy_event_to_user()
> > > > > > won't return an error one day with dangling pidfd in fd table.
> > > > >
> > > > > I can see the angle you're approaching this from...
> > > > >
> > > > > > It might be simpler to do pidfd_create() next to create_fd() in
> > > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> > > > > > pidfd can be closed on error along with fd on out_close_fd label.
> > > > > >
> > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > > > > > (even though fanotify_init() does check for that).
> > > > >
> > > > > I didn't really understand the need for this check here given that the
> > > > > administrative bits are already being checked for in fanotify_init()
> > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> > > > > thus never walking any of the pidfd_mode paths. Is this just a defense
> > > > > in depth approach here, or is it something else that I'm missing?
> > > > >
> > > >
> > > > We want to be extra careful not to create privilege escalations,
> > > > so even if the fanotify fd is leaked or intentionally passed to a less
> > > > privileged user, it cannot get an open pidfd.
> > > >
> > > > IOW, it is *much* easier to be defensive in this case than to prove
> > > > that the change cannot introduce any privilege escalations.
> > >
> > > I have no problems with being more defensive (it's certainly better than
> > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN
> > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged
> > > process, that process is also free to update all the passwords.
> > > Traditionally permission checks in Unix are performed on open and then who
> > > has fd can do whatever that fd allows... I've tried to follow similar
> > > philosophy with fanotify as well and e.g. open happening as a result of
> > > fanotify path events does not check permissions either.
> > >
> >
> > Agreed.
> >
> > However, because we had this issue with no explicit FAN_REPORT_PID
> > we added the CAP_SYS_ADMIN check for reporting event->pid as next
> > best thing. So now that becomes weird if priv process created fanotify fd
> > and passes it to unpriv process, then unpriv process gets events with
> > pidfd but without event->pid.
> >
> > We can change the code to:
> >
> >         if (!capable(CAP_SYS_ADMIN) && !pidfd_mode &&
> >             task_tgid(current) != event->pid)
> >                 metadata.pid = 0;
> >
> > So the case I decscribed above ends up reporting both pidfd
> > and event->pid to unpriv user, but that is a bit inconsistent...
>
> Oh, now I see where you are coming from :) Thanks for explanation. And
> remind me please, cannot we just have internal FAN_REPORT_PID flag that
> gets set on notification group when priviledged process creates it and then
> test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is
> mostly equivalent but I guess more in the spirit of how fanotify
> traditionally does things. Also FAN_REPORT_PIDFD could then behave in the
> same way...

Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV
as it described the situation better than FAN_REPORT_PID.
This happens to be how I implemented it in the initial RFC [1].

It's not easy to follow our entire discussion on this thread, but I think
we can resurrect the FANOTIFY_UNPRIV internal flag and use it
in this case instead of CAP_SYS_ADMIN.

Thanks,
Amir.

https://lore.kernel.org/linux-fsdevel/20210124184204.899729-3-amir73il@gmail.com/
Jan Kara May 21, 2021, 3:14 p.m. UTC | #9
On Fri 21-05-21 16:52:08, Amir Goldstein wrote:
> On Fri, May 21, 2021 at 4:19 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Fri 21-05-21 14:10:32, Amir Goldstein wrote:
> > > On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote:
> > > >
> > > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote:
> > > > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote:
> > > > > >
> > > > > > Hey Amir/Christian,
> > > > > >
> > > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> > > > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> > > > > > > <christian.brauner@ubuntu.com> wrote:
> > > > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > > > > > > > +     sizeof(struct fanotify_event_info_pidfd)
> > > > > > > > >
> > > > > > > > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > > > > > > > >  {
> > > > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > > > > > > > >       if (fh_len)
> > > > > > > > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > > > > > > > >
> > > > > > > > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > > > > > > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > > > +
> > > > > > > > >       return info_len;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > > > > > > > >       return info_len;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > > > > > > > +                                char __user *buf,
> > > > > > > > > +                                size_t count)
> > > > > > > > > +{
> > > > > > > > > +     struct fanotify_event_info_pidfd info = { };
> > > > > > > > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > > > +
> > > > > > > > > +     if (WARN_ON_ONCE(info_len > count))
> > > > > > > > > +             return -EFAULT;
> > > > > > > > > +
> > > > > > > > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > > > > > > > +     info.hdr.len = info_len;
> > > > > > > > > +
> > > > > > > > > +     info.pidfd = pidfd_create(pid, 0);
> > > > > > > > > +     if (info.pidfd < 0)
> > > > > > > > > +             info.pidfd = FAN_NOPIDFD;
> > > > > > > > > +
> > > > > > > > > +     if (copy_to_user(buf, &info, info_len))
> > > > > > > > > +             return -EFAULT;
> > > > > > > >
> > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > > > > > > > fd table and when the copy_to_user() failed they won't know what fd it
> > > > > > >
> > > > > > > Good catch!
> > > > > >
> > > > > > Super awesome catch Christian, thanks pulling this up!
> > > > > >
> > > > > > > But I prefer to solve it differently, because moving fd_install() to the
> > > > > > > end of this function does not guarantee that copy_event_to_user()
> > > > > > > won't return an error one day with dangling pidfd in fd table.
> > > > > >
> > > > > > I can see the angle you're approaching this from...
> > > > > >
> > > > > > > It might be simpler to do pidfd_create() next to create_fd() in
> > > > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> > > > > > > pidfd can be closed on error along with fd on out_close_fd label.
> > > > > > >
> > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > > > > > > (even though fanotify_init() does check for that).
> > > > > >
> > > > > > I didn't really understand the need for this check here given that the
> > > > > > administrative bits are already being checked for in fanotify_init()
> > > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> > > > > > thus never walking any of the pidfd_mode paths. Is this just a defense
> > > > > > in depth approach here, or is it something else that I'm missing?
> > > > > >
> > > > >
> > > > > We want to be extra careful not to create privilege escalations,
> > > > > so even if the fanotify fd is leaked or intentionally passed to a less
> > > > > privileged user, it cannot get an open pidfd.
> > > > >
> > > > > IOW, it is *much* easier to be defensive in this case than to prove
> > > > > that the change cannot introduce any privilege escalations.
> > > >
> > > > I have no problems with being more defensive (it's certainly better than
> > > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN
> > > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged
> > > > process, that process is also free to update all the passwords.
> > > > Traditionally permission checks in Unix are performed on open and then who
> > > > has fd can do whatever that fd allows... I've tried to follow similar
> > > > philosophy with fanotify as well and e.g. open happening as a result of
> > > > fanotify path events does not check permissions either.
> > > >
> > >
> > > Agreed.
> > >
> > > However, because we had this issue with no explicit FAN_REPORT_PID
> > > we added the CAP_SYS_ADMIN check for reporting event->pid as next
> > > best thing. So now that becomes weird if priv process created fanotify fd
> > > and passes it to unpriv process, then unpriv process gets events with
> > > pidfd but without event->pid.
> > >
> > > We can change the code to:
> > >
> > >         if (!capable(CAP_SYS_ADMIN) && !pidfd_mode &&
> > >             task_tgid(current) != event->pid)
> > >                 metadata.pid = 0;
> > >
> > > So the case I decscribed above ends up reporting both pidfd
> > > and event->pid to unpriv user, but that is a bit inconsistent...
> >
> > Oh, now I see where you are coming from :) Thanks for explanation. And
> > remind me please, cannot we just have internal FAN_REPORT_PID flag that
> > gets set on notification group when priviledged process creates it and then
> > test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is
> > mostly equivalent but I guess more in the spirit of how fanotify
> > traditionally does things. Also FAN_REPORT_PIDFD could then behave in the
> > same way...
> 
> Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV
> as it described the situation better than FAN_REPORT_PID.
> This happens to be how I implemented it in the initial RFC [1].
> 
> It's not easy to follow our entire discussion on this thread, but I think
> we can resurrect the FANOTIFY_UNPRIV internal flag and use it
> in this case instead of CAP_SYS_ADMIN.

I think at that time we were discussing how to handle opening of fds and
we decided to not depend on FANOTIFY_UNPRIV and then I didn't see a value
of that flag because I forgot about pids... Anyway now I agree to go for
that flag. :)

								Honza
Matthew Bobrowski May 22, 2021, 12:41 a.m. UTC | #10
On Fri, May 21, 2021 at 05:14:15PM +0200, Jan Kara wrote:
> On Fri 21-05-21 16:52:08, Amir Goldstein wrote:
> > On Fri, May 21, 2021 at 4:19 PM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Fri 21-05-21 14:10:32, Amir Goldstein wrote:
> > > > On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote:
> > > > >
> > > > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote:
> > > > > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote:
> > > > > > >
> > > > > > > Hey Amir/Christian,
> > > > > > >
> > > > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote:
> > > > > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner
> > > > > > > > <christian.brauner@ubuntu.com> wrote:
> > > > > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> > > > > > > > > > +     sizeof(struct fanotify_event_info_pidfd)
> > > > > > > > > >
> > > > > > > > > >  static int fanotify_fid_info_len(int fh_len, int name_len)
> > > > > > > > > >  {
> > > > > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
> > > > > > > > > >       if (fh_len)
> > > > > > > > > >               info_len += fanotify_fid_info_len(fh_len, dot_len);
> > > > > > > > > >
> > > > > > > > > > +     if (info_mode & FAN_REPORT_PIDFD)
> > > > > > > > > > +             info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > > > > +
> > > > > > > > > >       return info_len;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
> > > > > > > > > >       return info_len;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid,
> > > > > > > > > > +                                char __user *buf,
> > > > > > > > > > +                                size_t count)
> > > > > > > > > > +{
> > > > > > > > > > +     struct fanotify_event_info_pidfd info = { };
> > > > > > > > > > +     size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> > > > > > > > > > +
> > > > > > > > > > +     if (WARN_ON_ONCE(info_len > count))
> > > > > > > > > > +             return -EFAULT;
> > > > > > > > > > +
> > > > > > > > > > +     info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> > > > > > > > > > +     info.hdr.len = info_len;
> > > > > > > > > > +
> > > > > > > > > > +     info.pidfd = pidfd_create(pid, 0);
> > > > > > > > > > +     if (info.pidfd < 0)
> > > > > > > > > > +             info.pidfd = FAN_NOPIDFD;
> > > > > > > > > > +
> > > > > > > > > > +     if (copy_to_user(buf, &info, info_len))
> > > > > > > > > > +             return -EFAULT;
> > > > > > > > >
> > > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their
> > > > > > > > > fd table and when the copy_to_user() failed they won't know what fd it
> > > > > > > >
> > > > > > > > Good catch!
> > > > > > >
> > > > > > > Super awesome catch Christian, thanks pulling this up!
> > > > > > >
> > > > > > > > But I prefer to solve it differently, because moving fd_install() to the
> > > > > > > > end of this function does not guarantee that copy_event_to_user()
> > > > > > > > won't return an error one day with dangling pidfd in fd table.
> > > > > > >
> > > > > > > I can see the angle you're approaching this from...
> > > > > > >
> > > > > > > > It might be simpler to do pidfd_create() next to create_fd() in
> > > > > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user().
> > > > > > > > pidfd can be closed on error along with fd on out_close_fd label.
> > > > > > > >
> > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > > > > > > > (even though fanotify_init() does check for that).
> > > > > > >
> > > > > > > I didn't really understand the need for this check here given that the
> > > > > > > administrative bits are already being checked for in fanotify_init()
> > > > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> > > > > > > thus never walking any of the pidfd_mode paths. Is this just a defense
> > > > > > > in depth approach here, or is it something else that I'm missing?
> > > > > > >
> > > > > >
> > > > > > We want to be extra careful not to create privilege escalations,
> > > > > > so even if the fanotify fd is leaked or intentionally passed to a less
> > > > > > privileged user, it cannot get an open pidfd.
> > > > > >
> > > > > > IOW, it is *much* easier to be defensive in this case than to prove
> > > > > > that the change cannot introduce any privilege escalations.
> > > > >
> > > > > I have no problems with being more defensive (it's certainly better than
> > > > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN
> > > > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged
> > > > > process, that process is also free to update all the passwords.
> > > > > Traditionally permission checks in Unix are performed on open and then who
> > > > > has fd can do whatever that fd allows... I've tried to follow similar
> > > > > philosophy with fanotify as well and e.g. open happening as a result of
> > > > > fanotify path events does not check permissions either.
> > > > >
> > > >
> > > > Agreed.
> > > >
> > > > However, because we had this issue with no explicit FAN_REPORT_PID
> > > > we added the CAP_SYS_ADMIN check for reporting event->pid as next
> > > > best thing. So now that becomes weird if priv process created fanotify fd
> > > > and passes it to unpriv process, then unpriv process gets events with
> > > > pidfd but without event->pid.
> > > >
> > > > We can change the code to:
> > > >
> > > >         if (!capable(CAP_SYS_ADMIN) && !pidfd_mode &&
> > > >             task_tgid(current) != event->pid)
> > > >                 metadata.pid = 0;
> > > >
> > > > So the case I decscribed above ends up reporting both pidfd
> > > > and event->pid to unpriv user, but that is a bit inconsistent...
> > >
> > > Oh, now I see where you are coming from :) Thanks for explanation. And
> > > remind me please, cannot we just have internal FAN_REPORT_PID flag that
> > > gets set on notification group when priviledged process creates it and then
> > > test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is
> > > mostly equivalent but I guess more in the spirit of how fanotify
> > > traditionally does things. Also FAN_REPORT_PIDFD could then behave in the
> > > same way...
> > 
> > Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV
> > as it described the situation better than FAN_REPORT_PID.
> > This happens to be how I implemented it in the initial RFC [1].
> > 
> > It's not easy to follow our entire discussion on this thread, but I think
> > we can resurrect the FANOTIFY_UNPRIV internal flag and use it
> > in this case instead of CAP_SYS_ADMIN.
> 
> I think at that time we were discussing how to handle opening of fds and
> we decided to not depend on FANOTIFY_UNPRIV and then I didn't see a value
> of that flag because I forgot about pids... Anyway now I agree to go for
> that flag. :)

Resurrection of this flag SGTM! However, it also sounds like we need
to land that series before this PIDFD series or simply incorporate the
UNPRIV flag into this one.

Will chat with Amir to get this done.

/M
Amir Goldstein May 22, 2021, 9:01 a.m. UTC | #11
> > > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create()
> > > > > > > > > (even though fanotify_init() does check for that).
> > > > > > > >
> > > > > > > > I didn't really understand the need for this check here given that the
> > > > > > > > administrative bits are already being checked for in fanotify_init()
> > > > > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener;
> > > > > > > > thus never walking any of the pidfd_mode paths. Is this just a defense
> > > > > > > > in depth approach here, or is it something else that I'm missing?
> > > > > > > >
> > > > > > >
> > > > > > > We want to be extra careful not to create privilege escalations,
> > > > > > > so even if the fanotify fd is leaked or intentionally passed to a less
> > > > > > > privileged user, it cannot get an open pidfd.
> > > > > > >
> > > > > > > IOW, it is *much* easier to be defensive in this case than to prove
> > > > > > > that the change cannot introduce any privilege escalations.
> > > > > >
> > > > > > I have no problems with being more defensive (it's certainly better than
> > > > > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN
> > > > > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged
> > > > > > process, that process is also free to update all the passwords.
> > > > > > Traditionally permission checks in Unix are performed on open and then who
> > > > > > has fd can do whatever that fd allows... I've tried to follow similar
> > > > > > philosophy with fanotify as well and e.g. open happening as a result of
> > > > > > fanotify path events does not check permissions either.
> > > > > >
> > > > >
> > > > > Agreed.
> > > > >
> > > > > However, because we had this issue with no explicit FAN_REPORT_PID
> > > > > we added the CAP_SYS_ADMIN check for reporting event->pid as next
> > > > > best thing. So now that becomes weird if priv process created fanotify fd
> > > > > and passes it to unpriv process, then unpriv process gets events with
> > > > > pidfd but without event->pid.
> > > > >
> > > > > We can change the code to:
> > > > >
> > > > >         if (!capable(CAP_SYS_ADMIN) && !pidfd_mode &&
> > > > >             task_tgid(current) != event->pid)
> > > > >                 metadata.pid = 0;
> > > > >
> > > > > So the case I decscribed above ends up reporting both pidfd
> > > > > and event->pid to unpriv user, but that is a bit inconsistent...
> > > >
> > > > Oh, now I see where you are coming from :) Thanks for explanation. And
> > > > remind me please, cannot we just have internal FAN_REPORT_PID flag that
> > > > gets set on notification group when priviledged process creates it and then
> > > > test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is
> > > > mostly equivalent but I guess more in the spirit of how fanotify
> > > > traditionally does things. Also FAN_REPORT_PIDFD could then behave in the
> > > > same way...
> > >
> > > Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV
> > > as it described the situation better than FAN_REPORT_PID.
> > > This happens to be how I implemented it in the initial RFC [1].
> > >
> > > It's not easy to follow our entire discussion on this thread, but I think
> > > we can resurrect the FANOTIFY_UNPRIV internal flag and use it
> > > in this case instead of CAP_SYS_ADMIN.
> >
> > I think at that time we were discussing how to handle opening of fds and
> > we decided to not depend on FANOTIFY_UNPRIV and then I didn't see a value
> > of that flag because I forgot about pids... Anyway now I agree to go for
> > that flag. :)
>
> Resurrection of this flag SGTM! However, it also sounds like we need
> to land that series before this PIDFD series or simply incorporate the
> UNPRIV flag into this one.
>
> Will chat with Amir to get this done.

Let me post this patch as a fix patch to unprivileged group.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1e15f3222eb2..bba61988f4a0 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,6 +106,8 @@  struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 #define FANOTIFY_EVENT_ALIGN 4
 #define FANOTIFY_FID_INFO_HDR_LEN \
 	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
+#define FANOTIFY_PIDFD_INFO_HDR_LEN \
+	sizeof(struct fanotify_event_info_pidfd)
 
 static int fanotify_fid_info_len(int fh_len, int name_len)
 {
@@ -141,6 +143,9 @@  static int fanotify_event_info_len(unsigned int info_mode,
 	if (fh_len)
 		info_len += fanotify_fid_info_len(fh_len, dot_len);
 
+	if (info_mode & FAN_REPORT_PIDFD)
+		info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+
 	return info_len;
 }
 
@@ -401,6 +406,29 @@  static int copy_fid_info_to_user(__kernel_fsid_t *fsid,
 	return info_len;
 }
 
+static int copy_pidfd_info_to_user(struct pid *pid,
+				   char __user *buf,
+				   size_t count)
+{
+	struct fanotify_event_info_pidfd info = { };
+	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
+
+	if (WARN_ON_ONCE(info_len > count))
+		return -EFAULT;
+
+	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
+	info.hdr.len = info_len;
+
+	info.pidfd = pidfd_create(pid, 0);
+	if (info.pidfd < 0)
+		info.pidfd = FAN_NOPIDFD;
+
+	if (copy_to_user(buf, &info, info_len))
+		return -EFAULT;
+
+	return info_len;
+}
+
 static int copy_info_to_user(struct fanotify_event *event,
 			     struct fanotify_info *info,
 			     unsigned int info_mode,
@@ -408,9 +436,12 @@  static int copy_info_to_user(struct fanotify_event *event,
 {
 	int ret, info_type = 0;
 	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
+	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 
 	/*
 	 * Event info records order is as follows: dir fid + name, child fid.
+	 * If FAN_REPORT_PIDFD has been specified, then a pidfd info record
+	 * will follow the fid info records.
 	 */
 	if (fanotify_event_dir_fh_len(event)) {
 		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
@@ -465,10 +496,18 @@  static int copy_info_to_user(struct fanotify_event *event,
 		}
 
 		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
-					    fanotify_event_object_fh(event),
-					    info_type, dot, dot_len,
-					    buf, count);
-	}
+					    fanotify_event_object_fh(event),
+					    info_type, dot, dot_len,
+					    buf, count);
+		if (ret < 0)
+			return ret;
+
+		buf += ret;
+		count -= ret;
+	}
+
+	if (pidfd_mode)
+		return copy_pidfd_info_to_user(event->pid, buf, count);
 
 	return ret;
 }
@@ -530,6 +569,15 @@  static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		fd_install(fd, f);
 
 	if (info_mode) {
+		/*
+		 * Complain if FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
+		 * exclusion is ever lifted. At the time of implementing
+		 * FAN_REPORT_PIDFD, the pidfd API only supported the creation
+		 * of pidfds on thread-group leaders.
+		 */
+		WARN_ON_ONCE((info_mode & FAN_REPORT_PIDFD) &&
+			     FAN_GROUP_FLAG(group, FAN_REPORT_TID));
+
 		ret = copy_info_to_user(event, info, info_mode, buf, count);
 		if (ret < 0)
 			return ret;
@@ -1079,6 +1127,13 @@  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 #endif
 		return -EINVAL;
 
+	/*
+	 * A pidfd can only be returned for a thread-group leader; thus
+	 * FAN_REPORT_TID and FAN_REPORT_PIDFD need to be mutually exclusive.
+	 */
+	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
+		return -EINVAL;
+
 	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
 		return -EINVAL;
 
@@ -1477,7 +1532,7 @@  static int __init fanotify_user_setup(void)
 	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
 
-	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index f76c7635efc8..bb2898240e5a 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -27,7 +27,7 @@  extern struct ctl_table fanotify_table[]; /* for sysctl */
 
 #define FANOTIFY_FID_BITS	(FAN_REPORT_FID | FAN_REPORT_DFID_NAME)
 
-#define FANOTIFY_INFO_MODES	(FANOTIFY_FID_BITS)
+#define FANOTIFY_INFO_MODES	(FANOTIFY_FID_BITS | FAN_REPORT_PIDFD)
 
 /*
  * fanotify_init() flags that require CAP_SYS_ADMIN.
@@ -37,6 +37,7 @@  extern struct ctl_table fanotify_table[]; /* for sysctl */
  */
 #define FANOTIFY_ADMIN_INIT_FLAGS	(FANOTIFY_PERM_CLASSES | \
 					 FAN_REPORT_TID | \
+					 FAN_REPORT_PIDFD | \
 					 FAN_UNLIMITED_QUEUE | \
 					 FAN_UNLIMITED_MARKS)
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index fbf9c5c7dd59..36c3bddcf690 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -55,6 +55,7 @@ 
 #define FAN_REPORT_FID		0x00000200	/* Report unique file id */
 #define FAN_REPORT_DIR_FID	0x00000400	/* Report unique directory id */
 #define FAN_REPORT_NAME		0x00000800	/* Report events with name */
+#define FAN_REPORT_PIDFD	0x00001000	/* Report pidfd for event->pid */
 
 /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */
 #define FAN_REPORT_DFID_NAME	(FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
@@ -123,6 +124,7 @@  struct fanotify_event_metadata {
 #define FAN_EVENT_INFO_TYPE_FID		1
 #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
 #define FAN_EVENT_INFO_TYPE_DFID	3
+#define FAN_EVENT_INFO_TYPE_PIDFD	4
 
 /* Variable length info record following event metadata */
 struct fanotify_event_info_header {
@@ -148,6 +150,15 @@  struct fanotify_event_info_fid {
 	unsigned char handle[0];
 };
 
+/*
+ * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD.
+ * It holds a pidfd for the pid responsible for generating an event.
+ */
+struct fanotify_event_info_pidfd {
+	struct fanotify_event_info_header hdr;
+	__s32 pidfd;
+};
+
 struct fanotify_response {
 	__s32 fd;
 	__u32 response;
@@ -160,6 +171,7 @@  struct fanotify_response {
 
 /* No fd set in event */
 #define FAN_NOFD	-1
+#define FAN_NOPIDFD	FAN_NOFD
 
 /* Helper functions to deal with fanotify_event_metadata buffers */
 #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))