diff mbox series

[v2,5/5] fanotify: add pidfd support to the fanotify API

Message ID 7f9d3b7815e72bfee92945cab51992f9db6533dd.1623282854.git.repnop@google.com (mailing list archive)
State New, archived
Headers show
Series Add pidfd support to the fanotify API | expand

Commit Message

Matthew Bobrowski June 10, 2021, 12:21 a.m. UTC
Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
allows userspace applications to control whether a pidfd info record
containing a pidfd is to be returned with each event.

If FAN_REPORT_PIDFD is enabled for a notification group, an additional
struct fanotify_event_info_pidfd object will be supplied alongside the
generic struct fanotify_event_metadata within a single event. This
functionality is analogous to that of FAN_REPORT_FID in terms of how
the event structure is supplied to the userspace application. Usage of
FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
permitted, and in this case a struct fanotify_event_info_pidfd object
will follow any struct fanotify_event_info_fid object.

Currently, the usage of FAN_REPORT_TID is not permitted along with
FAN_REPORT_PIDFD as the pidfd API only supports the creation of pidfds
for thread-group leaders. Additionally, the FAN_REPORT_PIDFD is
limited to privileged processes only i.e. listeners that are running
with the CAP_SYS_ADMIN capability. Attempting to supply either of
these initialisation flags with FAN_REPORT_PIDFD will result in
EINVAL being returned to the caller.

In the event of a pidfd creation error, there are two types of error
values that can be reported back to the listener. There is
FAN_NOPIDFD, which will be reported in cases where the process
responsible for generating the event has terminated prior to fanotify
being able to create a pidfd for event->pid via pidfd_create(). Then
there is FAN_EPIDFD, which will be reported if a more generic pidfd
creation error occurred when calling pidfd_create().

Signed-off-by: Matthew Bobrowski <repnop@google.com>

---

Changes since v1:

* Explicit checks added to copy_event_to_user() for unprivileged
  listeners via FANOTIFY_UNPRIV. Only processes running with the
  CAP_SYS_ADMIN capability can receive pidfds for events.

* The pidfd creation via pidfd_create() has been taken out from
  copy_pidfd_info_to_user() and put into copy_event_to_user() so that
  proper clean up of the installed file descriptor can take place in
  the event that we error out during one of the info copying routines.

* Before pidfd creation is done via pidfd_create(), we perform an
  explicit check using pid_has_task() to make sure that the process
  responsible for generating the event in the first place hasn't been
  terminated. If it has, we supply the FAN_NOPIDFD error to the
  listener which explicitly indicates this was the case. All other
  pidfd creation errors are represented by FAN_EPIDFD.

* An additional check has been implemented before calling into
  pidfd_create() to see whether pid_vnr() had returned 0 for
  event->pid. In such cases, we also return FAN_NOPIDFD within the
  pidfd info record as returning metadata->pid = 0 with a valid pidfd
  doesn't make much sense and could lead to a possible security problem.

 fs/notify/fanotify/fanotify_user.c | 98 ++++++++++++++++++++++++++++--
 include/linux/fanotify.h           |  3 +-
 include/uapi/linux/fanotify.h      | 13 ++++
 3 files changed, 107 insertions(+), 7 deletions(-)

Comments

Amir Goldstein June 10, 2021, 5:18 a.m. UTC | #1
On Thu, Jun 10, 2021 at 3:22 AM Matthew Bobrowski <repnop@google.com> wrote:
>
> Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
> allows userspace applications to control whether a pidfd info record
> containing a pidfd is to be returned with each event.
>
> If FAN_REPORT_PIDFD is enabled for a notification group, an additional
> struct fanotify_event_info_pidfd object will be supplied alongside the
> generic struct fanotify_event_metadata within a single event. This
> functionality is analogous to that of FAN_REPORT_FID in terms of how
> the event structure is supplied to the userspace application. Usage of
> FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
> permitted, and in this case a struct fanotify_event_info_pidfd object
> will follow any struct fanotify_event_info_fid object.
>
> Currently, the usage of FAN_REPORT_TID is not permitted along with
> FAN_REPORT_PIDFD as the pidfd API only supports the creation of pidfds
> for thread-group leaders. Additionally, the FAN_REPORT_PIDFD is
> limited to privileged processes only i.e. listeners that are running
> with the CAP_SYS_ADMIN capability. Attempting to supply either of
> these initialisation flags with FAN_REPORT_PIDFD will result with
> EINVAL being returned to the caller.
>
> In the event of a pidfd creation error, there are two types of error
> values that can be reported back to the listener. There is
> FAN_NOPIDFD, which will be reported in cases where the process
> responsible for generating the event has terminated prior to fanotify
> being able to create pidfd for event->pid via pidfd_create(). The
> there is FAN_EPIDFD, which will be reported if a more generic pidfd
> creation error occurred when calling pidfd_create().
>
> Signed-off-by: Matthew Bobrowski <repnop@google.com>
>
> ---
>
> Changes since v1:
>
> * Explicit checks added to copy_event_to_user() for unprivileged
>   listeners via FANOTIFY_UNPRIV. Only processes running with the
>   CAP_SYS_ADMIN capability can receive pidfds for events.
>
> * The pidfd creation via pidfd_create() has been taken out from
>   copy_pidfd_info_to_user() and put into copy_event_to_user() so that
>   proper clean up of the installed file descriptor can take place in
>   the event that we error out during one of the info copying routines.
>
> * Before pidfd creation is done via pidfd_create(), we perform an
>   explicit check using pid_has_task() to make sure that the process
>   responsible for generating the event in the first place hasn't been
>   terminated. If it has, we supply the FAN_NOPIDFD error to the
>   listener which explicitly indicates this was the case. All other
>   pidfd creation errors are represented by FAN_EPIDFD.
>
> * An additional check has been implemented before calling into
>   pidfd_create() to see whether pid_vnr() had returned 0 for
>   event->pid. In such cases, we also return FAN_NOPIDFD within the
>   pidfd info record as returning metadata->pid = 0 with a valid pidfd
>   doesn't make much sense and could lead to possible security problem.
>
>  fs/notify/fanotify/fanotify_user.c | 98 ++++++++++++++++++++++++++++--
>  include/linux/fanotify.h           |  3 +-
>  include/uapi/linux/fanotify.h      | 13 ++++
>  3 files changed, 107 insertions(+), 7 deletions(-)
>
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 85d6eea8d45d..1ce66bcfd9b5 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -106,6 +106,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
>  #define FANOTIFY_EVENT_ALIGN 4
>  #define FANOTIFY_FID_INFO_HDR_LEN \
>         (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
> +#define FANOTIFY_PIDFD_INFO_HDR_LEN \
> +       sizeof(struct fanotify_event_info_pidfd)
>
>  static int fanotify_fid_info_len(int fh_len, int name_len)
>  {
> @@ -138,6 +140,9 @@ static int fanotify_event_info_len(unsigned int info_mode,
>                 dot_len = 1;
>         }
>
> +       if (info_mode & FAN_REPORT_PIDFD)
> +               info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
> +
>         if (fh_len)
>                 info_len += fanotify_fid_info_len(fh_len, dot_len);
>
> @@ -401,13 +406,34 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
>         return info_len;
>  }
>
> +static int copy_pidfd_info_to_user(int pidfd,
> +                                  char __user *buf,
> +                                  size_t count)
> +{
> +       struct fanotify_event_info_pidfd info = { };
> +       size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
> +
> +       if (WARN_ON_ONCE(info_len > count))
> +               return -EFAULT;
> +
> +       info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
> +       info.hdr.len = info_len;
> +       info.pidfd = pidfd;
> +
> +       if (copy_to_user(buf, &info, info_len))
> +               return -EFAULT;
> +
> +       return info_len;
> +}
> +
>  static int copy_info_records_to_user(struct fanotify_event *event,
>                                      struct fanotify_info *info,
> -                                    unsigned int info_mode,
> +                                    unsigned int info_mode, int pidfd,
>                                      char __user *buf, size_t count)
>  {
>         int ret, total_bytes = 0, info_type = 0;
>         unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
> +       unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
>
>         /*
>          * Event info records order is as follows: dir fid + name, child fid.
> @@ -478,6 +504,16 @@ static int copy_info_records_to_user(struct fanotify_event *event,
>                 total_bytes += ret;
>         }
>
> +       if (pidfd_mode) {
> +               ret = copy_pidfd_info_to_user(pidfd, buf, count);
> +               if (ret < 0)
> +                       return ret;
> +
> +               buf += ret;
> +               count -= ret;
> +               total_bytes += ret;
> +       }
> +
>         return total_bytes;
>  }
>
> @@ -489,8 +525,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>         struct path *path = fanotify_event_path(event);
>         struct fanotify_info *info = fanotify_event_info(event);
>         unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
> +       unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
>         struct file *f = NULL;
> -       int ret, fd = FAN_NOFD;
> +       int ret, pidfd = 0, fd = FAN_NOFD;

It feels like this should be pidfd = FAN_NOPIDFD?

>
>         pr_debug("%s: group=%p event=%p\n", __func__, group, event);
>
> @@ -524,6 +561,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>         }
>         metadata.fd = fd;
>
> +       /*
> +        * Currently, reporting a pidfd to an unprivileged listener is not
> +        * supported. The FANOTIFY_UNPRIV flag is to be kept here so that a
> +        * pidfd is not accidentally leaked to an unprivileged listener.
> +        */
> +       if (pidfd_mode && !FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) {
> +               /*
> +                * The PIDTYPE_TGID check for an event->pid is performed
> +                * preemptively in attempt to catch those rare instances
> +                * where the process responsible for generating the event has
> +                * terminated prior to calling into pidfd_create() and
> +                * acquiring a valid pidfd. Report FAN_NOPIDFD to the listener
> +                * in those cases.
> +                */
> +               if (metadata.pid == 0 ||
> +                   !pid_has_task(event->pid, PIDTYPE_TGID)) {
> +                       pidfd = FAN_NOPIDFD;
> +               } else {
> +                       pidfd = pidfd_create(event->pid, 0);
> +                       if (pidfd < 0)
> +                               /*
> +                                * All other pidfd creation errors are reported
> +                                * as FAN_EPIDFD to the listener.
> +                                */
> +                               pidfd = FAN_EPIDFD;

That's an anti-pattern. A multi-line statement, even if it is only multi-line
because of a comment, should be inside {}. In this case, though, I think it is
better to move this comment into the big comment above, which explains both
the if and the else, because it is in fact a continuation of that comment.

> +               }
> +       }
> +
>         ret = -EFAULT;
>         /*
>          * Sanity check copy size in case get_one_event() and
> @@ -545,10 +610,19 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>                 fd_install(fd, f);
>
>         if (info_mode) {
> -               ret = copy_info_records_to_user(event, info, info_mode,
> -                                               buf, count);
> +               /*
> +                * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
> +                * exclusion is ever lifted. At the time of incorporating pidfd
> +                * support within fanotify, the pidfd API only supported the
> +                * creation of pidfds for thread-group leaders.
> +                */
> +               WARN_ON_ONCE(pidfd_mode &&
> +                            FAN_GROUP_FLAG(group, FAN_REPORT_TID));
> +

This WARN_ON, if needed at all, would be better placed inside the
if (pidfd_mode && ...) code block above, where you would only need
     WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
as close as possible to the PIDTYPE_TGID line.

> +               ret = copy_info_records_to_user(event, info, info_mode, pidfd,
> +                                               buf, count);
>                 if (ret < 0)
> -                       return ret;
> +                       goto out_close_fd;

This looks like a bug in upstream.
It should have been goto out_close_fd to begin with.
We did already copy metadata.fd to user, but the read() call
returns an error.
You should probably fix it before the refactoring patch, so it
can be applied to stable kernels.

>         }
>
>         return metadata.event_len;
> @@ -558,6 +632,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>                 put_unused_fd(fd);
>                 fput(f);
>         }
> +
> +       if (pidfd < 0)

That condition is reversed.
We do not seem to have any test coverage for this error handling
Not so surprising that upstream had a bug...

> +               put_unused_fd(pidfd);
> +
>         return ret;
>  }
>

Thanks,
Amir.
Matthew Bobrowski June 10, 2021, 6:35 a.m. UTC | #2
On Thu, Jun 10, 2021 at 08:18:01AM +0300, Amir Goldstein wrote:
> On Thu, Jun 10, 2021 at 3:22 AM Matthew Bobrowski <repnop@google.com> wrote:
> > @@ -489,8 +525,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> >         struct path *path = fanotify_event_path(event);
> >         struct fanotify_info *info = fanotify_event_info(event);
> >         unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
> > +       unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
> >         struct file *f = NULL;
> > -       int ret, fd = FAN_NOFD;
> > +       int ret, pidfd = 0, fd = FAN_NOFD;
> 
> It feels like this should be pidfd = FAN_NOPIDFD?

I had considered this, but in all honesty I wasn't sure what the behavior
is when put_unused_fd() is provided a negative value, nor whether it is
accepted. The way I saw it was that if fid info record copying had errored
out for whatever reason and we jumped to the out_close_fd label we'd also,
perhaps unnecessarily, take the pidfd clean up route, which IMO wouldn't be
required.

> >
> >         pr_debug("%s: group=%p event=%p\n", __func__, group, event);
> >
> > @@ -524,6 +561,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> >         }
> >         metadata.fd = fd;
> >
> > +       /*
> > +        * Currently, reporting a pidfd to an unprivileged listener is not
> > +        * supported. The FANOTIFY_UNPRIV flag is to be kept here so that a
> > +        * pidfd is not accidentally leaked to an unprivileged listener.
> > +        */
> > +       if (pidfd_mode && !FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) {
> > +               /*
> > +                * The PIDTYPE_TGID check for an event->pid is performed
> > +                * preemptively in attempt to catch those rare instances
> > +                * where the process responsible for generating the event has
> > +                * terminated prior to calling into pidfd_create() and
> > +                * acquiring a valid pidfd. Report FAN_NOPIDFD to the listener
> > +                * in those cases.
> > +                */
> > +               if (metadata.pid == 0 ||
> > +                   !pid_has_task(event->pid, PIDTYPE_TGID)) {
> > +                       pidfd = FAN_NOPIDFD;
> > +               } else {
> > +                       pidfd = pidfd_create(event->pid, 0);
> > +                       if (pidfd < 0)
> > +                               /*
> > +                                * All other pidfd creation errors are reported
> > +                                * as FAN_EPIDFD to the listener.
> > +                                */
> > +                               pidfd = FAN_EPIDFD;
> 
> That's an anti pattern. a multi-line statement, even due to comment should
> be inside {}, but in this case, I think it is better to put this
> comment as another
> line in the big comment above which explains both the if and the else, because
> it is in fact a continuation of the comment above.

Ah, right, I didn't know that this was considered as an anti-pattern. But
then again, I can totally understand why it would be. No objections with
merging this comment with the one that precedes the parent if statement.

> > +               }
> > +       }
> > +
> >         ret = -EFAULT;
> >         /*
> >          * Sanity check copy size in case get_one_event() and
> > @@ -545,10 +610,19 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> >                 fd_install(fd, f);
> >
> >         if (info_mode) {
> > -               ret = copy_info_records_to_user(event, info, info_mode,
> > -                                               buf, count);
> > +               /*
> > +                * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
> > +                * exclusion is ever lifted. At the time of incorporating pidfd
> > +                * support within fanotify, the pidfd API only supported the
> > +                * creation of pidfds for thread-group leaders.
> > +                */
> > +               WARN_ON_ONCE(pidfd_mode &&
> > +                            FAN_GROUP_FLAG(group, FAN_REPORT_TID));
> > +
> 
> This WARN_ON, if needed at all, would be better places inside if (pidfd_mode &&
> code block above where you would only need to
>      WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
> as close as possible to PIDTYPE_TGID line.

Agree, there's no reason why it can't be moved to the above pidfd_mode
check.

> > +               ret = copy_info_records_to_user(event, info, info_mode, pidfd,
> > +                                               buf, count);
> >                 if (ret < 0)
> > -                       return ret;
> > +                       goto out_close_fd;
> 
> This looks like a bug in upstream.

Yes, I'm glad this was picked up and I was actually wondering why it was
acceptable to directly return without jumping to the out_close_fd label in
the case of an error. I felt like it may have been a burden to raise the
question in the first place because I thought that this got picked up in
the review already and there was a good reason for having it, despite not
really making much sense.

> It should have been goto out_close_fd to begin with.
> We did already copy metadata.fd to user, but the read() call
> returns an error.
> You should probably fix it before the refactoring patch, so it
> can be applied to stable kernels.

Sure, I will send through a patch fixing this before submitting the next
version of this series though. How do I tag the patch so that it's picked
up and backported accordingly?

> >         }
> >
> >         return metadata.event_len;
> > @@ -558,6 +632,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> >                 put_unused_fd(fd);
> >                 fput(f);
> >         }
> > +
> > +       if (pidfd < 0)
> 
> That condition is reversed.
> We do not seem to have any test coverage for this error handling
> Not so surprising that upstream had a bug...

Sorry Amir, I don't quite understand what you mean by "That condition is
reversed". Presumably you're referring to the fd != FAN_NOFD check and not
pidfd < 0 here.

/M
Amir Goldstein June 10, 2021, 7:11 a.m. UTC | #3
> > > +               ret = copy_info_records_to_user(event, info, info_mode, pidfd,
> > > +                                               buf, count);
> > >                 if (ret < 0)
> > > -                       return ret;
> > > +                       goto out_close_fd;
> >
> > This looks like a bug in upstream.
>
> Yes, I'm glad this was picked up and I was actually wondering why it was
> acceptable to directly return without jumping to the out_close_fd label in
> the case of an error. I felt like it may have been a burden to raise the
> question in the first place because I thought that this got picked up in
> the review already and there was a good reason for having it, despite not
> really making much sense.
>
> > It should have been goto out_close_fd to begin with.
> > We did already copy metadata.fd to user, but the read() call
> > returns an error.
> > You should probably fix it before the refactoring patch, so it
> > can be applied to stable kernels.
>
> Sure, I will send through a patch fixing this before submitting the next
> version of this series though. How do I tag the patch so that it's picked
> up an back ported accordingly?
>

The best option, in case this is a regression (it probably is),
is the Fixes: tag, which is both a clear indication of a stable
candidate patch and tells the bots exactly which stable kernels the
patch should be applied to.

Otherwise, you can Cc: stable (see examples in git),
and generally any commit title with the right keywords
('fix', 'regression', 'bug') should be caught by the stable AI bots.

> > >         }
> > >
> > >         return metadata.event_len;
> > > @@ -558,6 +632,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> > >                 put_unused_fd(fd);
> > >                 fput(f);
> > >         }
> > > +
> > > +       if (pidfd < 0)
> >
> > That condition is reversed.
> > We do not seem to have any test coverage for this error handling
> > Not so surprising that upstream had a bug...
>
> Sorry Amir, I don't quite understand what you mean by "That condition is
> reversed". Presumably you're referring to the fd != FAN_NOFD check and not
> pidfd < 0 here.
>

IDGI, why is the init/cleanup code not as simple as

    int pidfd = FAN_NOPIDFD;
...
out_close_fd:
...
       if (pidfd >= 0)
                 put_unused_fd(pidfd);

What am I missing?

Thanks,
Amir.
Matthew Bobrowski June 10, 2021, 7:24 a.m. UTC | #4
On Thu, Jun 10, 2021 at 10:11:51AM +0300, Amir Goldstein wrote:
> > > > +               ret = copy_info_records_to_user(event, info, info_mode, pidfd,
> > > > +                                               buf, count);
> > > >                 if (ret < 0)
> > > > -                       return ret;
> > > > +                       goto out_close_fd;
> > >
> > > This looks like a bug in upstream.
> >
> > Yes, I'm glad this was picked up and I was actually wondering why it was
> > acceptable to directly return without jumping to the out_close_fd label in
> > the case of an error. I felt like it may have been a burden to raise the
> > question in the first place because I thought that this got picked up in
> > the review already and there was a good reason for having it, despite not
> > really making much sense.
> >
> > > It should have been goto out_close_fd to begin with.
> > > We did already copy metadata.fd to user, but the read() call
> > > returns an error.
> > > You should probably fix it before the refactoring patch, so it
> > > can be applied to stable kernels.
> >
> > Sure, I will send through a patch fixing this before submitting the next
> > version of this series though. How do I tag the patch so that it's picked
> > up an back ported accordingly?
> >
> 
> The best option, in case this is a regression (it probably is)
> is the Fixes: tag which is both a clear indication for stale
> candidate patch tells the bots exactly which stable kernel the
> patch should be applied to.
> 
> Otherwise, you can Cc: stable (see examples in git)
> and generally any commit title with the right keywords
> 'fix' 'regression' 'bug' should be caught but the stable AI bots.

Ah, OK, noted.

> > > >         }
> > > >
> > > >         return metadata.event_len;
> > > > @@ -558,6 +632,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> > > >                 put_unused_fd(fd);
> > > >                 fput(f);
> > > >         }
> > > > +
> > > > +       if (pidfd < 0)
> > >
> > > That condition is reversed.
> > > We do not seem to have any test coverage for this error handling
> > > Not so surprising that upstream had a bug...
> >
> > Sorry Amir, I don't quite understand what you mean by "That condition is
> > reversed". Presumably you're referring to the fd != FAN_NOFD check and not
> > pidfd < 0 here.
> >
> 
> IDGI, why is the init/cleanup code not as simple as
> 
>     int pidfd = FAN_NOPIDFD;
> ...
> out_close_fd:
> ...
>        if (pidfd >= 0)
>                  put_unused_fd(fd);

You're missing nothing, it's me that's missing a few brain cells. Sorry,
the context switching on my end is real and I had overlooked what you
meant. But yes, this will most definitely work.

/M
Jan Kara June 10, 2021, 11:23 a.m. UTC | #5
Hi Matthew!

On Thu 10-06-21 10:21:50, Matthew Bobrowski wrote:
> Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
> allows userspace applications to control whether a pidfd info record
> containing a pidfd is to be returned with each event.
> 
> If FAN_REPORT_PIDFD is enabled for a notification group, an additional
> struct fanotify_event_info_pidfd object will be supplied alongside the
> generic struct fanotify_event_metadata within a single event. This
> functionality is analogous to that of FAN_REPORT_FID in terms of how
> the event structure is supplied to the userspace application. Usage of
> FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
> permitted, and in this case a struct fanotify_event_info_pidfd object
> will follow any struct fanotify_event_info_fid object.
> 
> Currently, the usage of FAN_REPORT_TID is not permitted along with
> FAN_REPORT_PIDFD as the pidfd API only supports the creation of pidfds
> for thread-group leaders. Additionally, the FAN_REPORT_PIDFD is
> limited to privileged processes only i.e. listeners that are running
> with the CAP_SYS_ADMIN capability. Attempting to supply either of
> these initialisation flags with FAN_REPORT_PIDFD will result with
> EINVAL being returned to the caller.
> 
> In the event of a pidfd creation error, there are two types of error
> values that can be reported back to the listener. There is
> FAN_NOPIDFD, which will be reported in cases where the process
> responsible for generating the event has terminated prior to fanotify
> being able to create pidfd for event->pid via pidfd_create(). The
> there is FAN_EPIDFD, which will be reported if a more generic pidfd
> creation error occurred when calling pidfd_create().
> 
> Signed-off-by: Matthew Bobrowski <repnop@google.com>

A few comments in addition to what Amir wrote:

> @@ -524,6 +561,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>  	}
>  	metadata.fd = fd;
>  
> +	/*
> +	 * Currently, reporting a pidfd to an unprivileged listener is not
> +	 * supported. The FANOTIFY_UNPRIV flag is to be kept here so that a
> +	 * pidfd is not accidentally leaked to an unprivileged listener.
> +	 */
> +	if (pidfd_mode && !FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) {

Hum, you've added FAN_REPORT_PIDFD to FANOTIFY_ADMIN_INIT_FLAGS so this
condition should be always true? I don't think we need to be that
defensive and would just drop the check here.

> +		/*
> +		 * The PIDTYPE_TGID check for an event->pid is performed
> +		 * preemptively in attempt to catch those rare instances
> +		 * where the process responsible for generating the event has
> +		 * terminated prior to calling into pidfd_create() and
> +		 * acquiring a valid pidfd. Report FAN_NOPIDFD to the listener
> +		 * in those cases.
> +		 */
> +		if (metadata.pid == 0 ||
> +		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
> +			pidfd = FAN_NOPIDFD;
> +		} else {
> +			pidfd = pidfd_create(event->pid, 0);
> +			if (pidfd < 0)
> +				/*
> +				 * All other pidfd creation errors are reported
> +				 * as FAN_EPIDFD to the listener.
> +				 */
> +				pidfd = FAN_EPIDFD;
> +		}
> +	}
> +
>  	ret = -EFAULT;
>  	/*
>  	 * Sanity check copy size in case get_one_event() and
...

> @@ -558,6 +632,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>  		put_unused_fd(fd);
>  		fput(f);
>  	}
> +
> +	if (pidfd < 0)
> +		put_unused_fd(pidfd);
> +

put_unused_fd() is not enough to destroy the pidfd you have. That will just
mark 'pidfd' as free in the fd table. You rather need to call close_fd()
here to fully close the open file.

								Honza
Matthew Bobrowski June 11, 2021, 12:32 a.m. UTC | #6
On Thu, Jun 10, 2021 at 01:23:31PM +0200, Jan Kara wrote:
> > @@ -524,6 +561,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> >  	}
> >  	metadata.fd = fd;
> >  
> > +	/*
> > +	 * Currently, reporting a pidfd to an unprivileged listener is not
> > +	 * supported. The FANOTIFY_UNPRIV flag is to be kept here so that a
> > +	 * pidfd is not accidentally leaked to an unprivileged listener.
> > +	 */
> > +	if (pidfd_mode && !FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) {
> 
> Hum, you've added FAN_REPORT_PIDFD to FANOTIFY_ADMIN_INIT_FLAGS so this
> condition should be always true? I don't think we need to be that much
> defensive and would just drop the check here.

Yes, that's right, so dropping this check is also fine with me.

> > @@ -558,6 +632,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
> >  		put_unused_fd(fd);
> >  		fput(f);
> >  	}
> > +
> > +	if (pidfd < 0)
> > +		put_unused_fd(pidfd);
> > +
> 
> put_unused_fd() is not enough to destroy the pidfd you have. That will just
> mark 'pidfd' as free in the fd table. You rather need to call close_fd()
> here to fully close open file.

Ah, I see, put_unused_fd() doesn't free up the file instance. I will swap
this out with close_fd() instead.

Thanks for the suggestions Jan!

/M
Amir Goldstein July 10, 2021, 2:49 p.m. UTC | #7
On Thu, Jun 10, 2021 at 3:22 AM Matthew Bobrowski <repnop@google.com> wrote:
>
> Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
> allows userspace applications to control whether a pidfd info record
> containing a pidfd is to be returned with each event.
>
> If FAN_REPORT_PIDFD is enabled for a notification group, an additional
> struct fanotify_event_info_pidfd object will be supplied alongside the
> generic struct fanotify_event_metadata within a single event. This
> functionality is analogous to that of FAN_REPORT_FID in terms of how
> the event structure is supplied to the userspace application. Usage of
> FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
> permitted, and in this case a struct fanotify_event_info_pidfd object
> will follow any struct fanotify_event_info_fid object.
>
> Currently, the usage of FAN_REPORT_TID is not permitted along with
> FAN_REPORT_PIDFD as the pidfd API only supports the creation of pidfds
> for thread-group leaders. Additionally, the FAN_REPORT_PIDFD is
> limited to privileged processes only i.e. listeners that are running
> with the CAP_SYS_ADMIN capability. Attempting to supply either of
> these initialisation flags with FAN_REPORT_PIDFD will result with
> EINVAL being returned to the caller.
>
> In the event of a pidfd creation error, there are two types of error
> values that can be reported back to the listener. There is
> FAN_NOPIDFD, which will be reported in cases where the process
> responsible for generating the event has terminated prior to fanotify
> being able to create a pidfd for event->pid via pidfd_create(). Then
> there is FAN_EPIDFD, which will be reported if a more generic pidfd
> creation error occurred when calling pidfd_create().
>
> Signed-off-by: Matthew Bobrowski <repnop@google.com>
>

[...]

> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index fbf9c5c7dd59..5cb3e2369b96 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -55,6 +55,7 @@
>  #define FAN_REPORT_FID         0x00000200      /* Report unique file id */
>  #define FAN_REPORT_DIR_FID     0x00000400      /* Report unique directory id */
>  #define FAN_REPORT_NAME                0x00000800      /* Report events with name */
> +#define FAN_REPORT_PIDFD       0x00001000      /* Report pidfd for event->pid */
>

Matthew,

One very minor comment.
I have a patch in progress to add FAN_REPORT_CHILD_FID (for reporting fid
of created inode) and it would be nice if we can reserve the flag space in the
same block with the rest of the FID flags.

If it's not a problem, maybe we could move FAN_REPORT_PIDFD up to 0x80
right above FAN_REPORT_TID; the two also happen to be related flags.

Thanks,
Amir.
Matthew Bobrowski July 14, 2021, 12:18 a.m. UTC | #8
On Sat, Jul 10, 2021 at 05:49:57PM +0300, Amir Goldstein wrote:
> On Thu, Jun 10, 2021 at 3:22 AM Matthew Bobrowski <repnop@google.com> wrote:
> >
> > Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which
> > allows userspace applications to control whether a pidfd info record
> > containing a pidfd is to be returned with each event.
> >
> > If FAN_REPORT_PIDFD is enabled for a notification group, an additional
> > struct fanotify_event_info_pidfd object will be supplied alongside the
> > generic struct fanotify_event_metadata within a single event. This
> > functionality is analogous to that of FAN_REPORT_FID in terms of how
> > the event structure is supplied to the userspace application. Usage of
> > FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is
> > permitted, and in this case a struct fanotify_event_info_pidfd object
> > will follow any struct fanotify_event_info_fid object.
> >
> > Currently, the usage of FAN_REPORT_TID is not permitted along with
> > FAN_REPORT_PIDFD as the pidfd API only supports the creation of pidfds
> > for thread-group leaders. Additionally, FAN_REPORT_PIDFD is
> > limited to privileged processes only, i.e. listeners that are running
> > with the CAP_SYS_ADMIN capability. Attempting to supply either of
> > these initialisation flags with FAN_REPORT_PIDFD will result in
> > EINVAL being returned to the caller.
> >
> > In the event of a pidfd creation error, there are two types of error
> > values that can be reported back to the listener. There is
> > FAN_NOPIDFD, which will be reported in cases where the process
> > responsible for generating the event has terminated prior to fanotify
> > being able to create a pidfd for event->pid via pidfd_create(). Then
> > there is FAN_EPIDFD, which will be reported if a more generic pidfd
> > creation error occurred when calling pidfd_create().
> >
> > Signed-off-by: Matthew Bobrowski <repnop@google.com>
> >
> 
> [...]
> 
> > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> > index fbf9c5c7dd59..5cb3e2369b96 100644
> > --- a/include/uapi/linux/fanotify.h
> > +++ b/include/uapi/linux/fanotify.h
> > @@ -55,6 +55,7 @@
> >  #define FAN_REPORT_FID         0x00000200      /* Report unique file id */
> >  #define FAN_REPORT_DIR_FID     0x00000400      /* Report unique directory id */
> >  #define FAN_REPORT_NAME                0x00000800      /* Report events with name */
> > +#define FAN_REPORT_PIDFD       0x00001000      /* Report pidfd for event->pid */
> >
> 
> Matthew,
> 
> One very minor comment.
> I have a patch in progress to add FAN_REPORT_CHILD_FID (for reporting fid
> of created inode) and it would be nice if we can reserve the flag space in the
> same block with the rest of the FID flags.
> 
> If it's not a problem, maybe we could move FAN_REPORT_PIDFD up to 0x80
> right above FAN_REPORT_TID; the two also happen to be related flags.

That's fine by me, no objections. I've updated my patch series with the new
flag value [0].

[0]
https://github.com/matthewbobrowski/linux/commit/02ba3581fee21c34bd986e093d9eb0b9897fa741#diff-c83e05fe10af36146658416e55756f304a099606f4a2b18d2fcb683b277c3c62R54

/M
diff mbox series

Patch

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 85d6eea8d45d..1ce66bcfd9b5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,6 +106,8 @@  struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 #define FANOTIFY_EVENT_ALIGN 4
 #define FANOTIFY_FID_INFO_HDR_LEN \
 	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
+#define FANOTIFY_PIDFD_INFO_HDR_LEN \
+	sizeof(struct fanotify_event_info_pidfd)
 
 static int fanotify_fid_info_len(int fh_len, int name_len)
 {
@@ -138,6 +140,9 @@  static int fanotify_event_info_len(unsigned int info_mode,
 		dot_len = 1;
 	}
 
+	if (info_mode & FAN_REPORT_PIDFD)
+		info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+
 	if (fh_len)
 		info_len += fanotify_fid_info_len(fh_len, dot_len);
 
@@ -401,13 +406,34 @@  static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
 	return info_len;
 }
 
+static int copy_pidfd_info_to_user(int pidfd,
+				   char __user *buf,
+				   size_t count)
+{
+	struct fanotify_event_info_pidfd info = { };
+	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
+
+	if (WARN_ON_ONCE(info_len > count))
+		return -EFAULT;
+
+	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
+	info.hdr.len = info_len;
+	info.pidfd = pidfd;
+
+	if (copy_to_user(buf, &info, info_len))
+		return -EFAULT;
+
+	return info_len;
+}
+
 static int copy_info_records_to_user(struct fanotify_event *event,
 				     struct fanotify_info *info,
-				     unsigned int info_mode,
+				     unsigned int info_mode, int pidfd,
 				     char __user *buf, size_t count)
 {
 	int ret, total_bytes = 0, info_type = 0;
 	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
+	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 
 	/*
 	 * Event info records order is as follows: dir fid + name, child fid.
@@ -478,6 +504,16 @@  static int copy_info_records_to_user(struct fanotify_event *event,
 		total_bytes += ret;
 	}
 
+	if (pidfd_mode) {
+		ret = copy_pidfd_info_to_user(pidfd, buf, count);
+		if (ret < 0)
+			return ret;
+
+		buf += ret;
+		count -= ret;
+		total_bytes += ret;
+	}
+
 	return total_bytes;
 }
 
@@ -489,8 +525,9 @@  static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	struct path *path = fanotify_event_path(event);
 	struct fanotify_info *info = fanotify_event_info(event);
 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
+	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 	struct file *f = NULL;
-	int ret, fd = FAN_NOFD;
+	int ret, pidfd = 0, fd = FAN_NOFD;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
@@ -524,6 +561,34 @@  static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	}
 	metadata.fd = fd;
 
+	/*
+	 * Currently, reporting a pidfd to an unprivileged listener is not
+	 * supported. The FANOTIFY_UNPRIV flag is to be kept here so that a
+	 * pidfd is not accidentally leaked to an unprivileged listener.
+	 */
+	if (pidfd_mode && !FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) {
+		/*
+		 * The PIDTYPE_TGID check for an event->pid is performed
+		 * preemptively in an attempt to catch those rare instances
+		 * where the process responsible for generating the event has
+		 * terminated prior to calling into pidfd_create() and
+		 * acquiring a valid pidfd. Report FAN_NOPIDFD to the listener
+		 * in those cases.
+		 */
+		if (metadata.pid == 0 ||
+		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
+			pidfd = FAN_NOPIDFD;
+		} else {
+			pidfd = pidfd_create(event->pid, 0);
+			if (pidfd < 0)
+				/*
+				 * All other pidfd creation errors are reported
+				 * as FAN_EPIDFD to the listener.
+				 */
+				pidfd = FAN_EPIDFD;
+		}
+	}
+
 	ret = -EFAULT;
 	/*
 	 * Sanity check copy size in case get_one_event() and
@@ -545,10 +610,19 @@  static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		fd_install(fd, f);
 
 	if (info_mode) {
-		ret = copy_info_records_to_user(event, info, info_mode,
-						buf, count);
+		/*
+		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
+		 * exclusion is ever lifted. At the time of incorporating pidfd
+		 * support within fanotify, the pidfd API only supported the
+		 * creation of pidfds for thread-group leaders.
+		 */
+		WARN_ON_ONCE(pidfd_mode &&
+			     FAN_GROUP_FLAG(group, FAN_REPORT_TID));
+
+		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
+				                buf, count);
 		if (ret < 0)
-			return ret;
+			goto out_close_fd;
 	}
 
 	return metadata.event_len;
@@ -558,6 +632,10 @@  static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		put_unused_fd(fd);
 		fput(f);
 	}
+
+	if (pidfd < 0)
+		put_unused_fd(pidfd);
+
 	return ret;
 }
 
@@ -1103,6 +1181,14 @@  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 #endif
 		return -EINVAL;
 
+	/*
+	 * A pidfd can only be returned for a thread-group leader; thus
+	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
+	 * exclusive.
+	 */
+	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
+		return -EINVAL;
+
 	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
 		return -EINVAL;
 
@@ -1504,7 +1590,7 @@  static int __init fanotify_user_setup(void)
 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
 
 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
-	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 10a7e26ddba6..eec3b7c40811 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -27,7 +27,7 @@  extern struct ctl_table fanotify_table[]; /* for sysctl */
 
 #define FANOTIFY_FID_BITS	(FAN_REPORT_FID | FAN_REPORT_DFID_NAME)
 
-#define FANOTIFY_INFO_MODES	(FANOTIFY_FID_BITS)
+#define FANOTIFY_INFO_MODES	(FANOTIFY_FID_BITS | FAN_REPORT_PIDFD)
 
 /*
  * fanotify_init() flags that require CAP_SYS_ADMIN.
@@ -37,6 +37,7 @@  extern struct ctl_table fanotify_table[]; /* for sysctl */
  */
 #define FANOTIFY_ADMIN_INIT_FLAGS	(FANOTIFY_PERM_CLASSES | \
 					 FAN_REPORT_TID | \
+					 FAN_REPORT_PIDFD | \
 					 FAN_UNLIMITED_QUEUE | \
 					 FAN_UNLIMITED_MARKS)
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index fbf9c5c7dd59..5cb3e2369b96 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -55,6 +55,7 @@ 
 #define FAN_REPORT_FID		0x00000200	/* Report unique file id */
 #define FAN_REPORT_DIR_FID	0x00000400	/* Report unique directory id */
 #define FAN_REPORT_NAME		0x00000800	/* Report events with name */
+#define FAN_REPORT_PIDFD	0x00001000	/* Report pidfd for event->pid */
 
 /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */
 #define FAN_REPORT_DFID_NAME	(FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
@@ -123,6 +124,7 @@  struct fanotify_event_metadata {
 #define FAN_EVENT_INFO_TYPE_FID		1
 #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
 #define FAN_EVENT_INFO_TYPE_DFID	3
+#define FAN_EVENT_INFO_TYPE_PIDFD	4
 
 /* Variable length info record following event metadata */
 struct fanotify_event_info_header {
@@ -148,6 +150,15 @@  struct fanotify_event_info_fid {
 	unsigned char handle[0];
 };
 
+/*
+ * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD.
+ * It holds a pidfd for the pid that was responsible for generating an event.
+ */
+struct fanotify_event_info_pidfd {
+	struct fanotify_event_info_header hdr;
+	__s32 pidfd;
+};
+
 struct fanotify_response {
 	__s32 fd;
 	__u32 response;
@@ -160,6 +171,8 @@  struct fanotify_response {
 
 /* No fd set in event */
 #define FAN_NOFD	-1
+#define FAN_NOPIDFD	FAN_NOFD
+#define FAN_EPIDFD	-2
 
 /* Helper functions to deal with fanotify_event_metadata buffers */
 #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))