Message ID | 48d18055deb4617d97c695a08dca77eb573097e9.1621473846.git.repnop@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add pidfd support to the fanotify API | expand |
On Thu, May 20, 2021 at 12:11:51PM +1000, Matthew Bobrowski wrote: > Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which > allows userspace applications to control whether a pidfd info record > containing a pidfd is to be returned with each event. > > If FAN_REPORT_PIDFD is enabled for a notification group, an additional > struct fanotify_event_info_pidfd object will be supplied alongside the > generic struct fanotify_event_metadata within a single event. This > functionality is analogous to that of FAN_REPORT_FID in terms of how > the event structure is supplied to the userspace application. Usage of > FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is > permitted, and in this case a struct fanotify_event_info_pidfd object > will follow any struct fanotify_event_info_fid object. > > Usage of FAN_REPORT_TID is not permitted with FAN_REPORT_PIDFD as the > pidfd API only supports the creation of pidfds for thread-group > leaders. Attempting to do so will result with a -EINVAL being returned > when calling fanotify_init(2). > > If pidfd creation fails via pidfd_create(), the pidfd field within > struct fanotify_event_info_pidfd is set to FAN_NOPIDFD. > > Signed-off-by: Matthew Bobrowski <repnop@google.com> > --- > fs/notify/fanotify/fanotify_user.c | 65 +++++++++++++++++++++++++++--- > include/linux/fanotify.h | 3 +- > include/uapi/linux/fanotify.h | 12 ++++++ > 3 files changed, 74 insertions(+), 6 deletions(-) > > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c > index 1e15f3222eb2..bba61988f4a0 100644 > --- a/fs/notify/fanotify/fanotify_user.c > +++ b/fs/notify/fanotify/fanotify_user.c > @@ -106,6 +106,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; > #define FANOTIFY_EVENT_ALIGN 4 > #define FANOTIFY_FID_INFO_HDR_LEN \ > (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > + sizeof(struct fanotify_event_info_pidfd) > > static int fanotify_fid_info_len(int fh_len, int name_len) > { > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > if (fh_len) > info_len += fanotify_fid_info_len(fh_len, dot_len); > > + if (info_mode & FAN_REPORT_PIDFD) > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > + > return info_len; > } > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > return info_len; > } > > +static int copy_pidfd_info_to_user(struct pid *pid, > + char __user *buf, > + size_t count) > +{ > + struct fanotify_event_info_pidfd info = { }; > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > + > + if (WARN_ON_ONCE(info_len > count)) > + return -EFAULT; > + > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > + info.hdr.len = info_len; > + > + info.pidfd = pidfd_create(pid, 0); > + if (info.pidfd < 0) > + info.pidfd = FAN_NOPIDFD; > + > + if (copy_to_user(buf, &info, info_len)) > + return -EFAULT; Hm, well this kinda sucks. The caller can end up with a pidfd in their fd table and when the copy_to_user() failed they won't know what fd it is. I think this might be better served by sm like (see what I did in kernel/fork.c): pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (pidfd < 0) /* error handling */ pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, O_RDWR | O_CLOEXEC); if (IS_ERR(pidfile)) { put_unused_fd(pidfd); pidfd = PTR_ERR(pidfile); /* error handling */ } get_pid(pid); /* held by pidfile now */ info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; info.hdr.len = info_len; info.pidfd = pidfd; if (copy_to_user(buf, &info, info_len)) { fput(pidfile); put_unused_fd(pidfd); return -EFAULT; } fd_install(pidfd, pidfile); return 0; > + > + return info_len; > +} > + > static int copy_info_to_user(struct fanotify_event *event, > struct fanotify_info *info, > unsigned int info_mode, > @@ -408,9 +436,12 @@ static int copy_info_to_user(struct fanotify_event *event, > { > int ret, info_type = 0; > unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; > + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; > > /* > * Event info records order is as follows: dir fid + name, child fid. > + * If FAN_REPORT_PIDFD has been specified, then a pidfd info record > + * will follow the fid info records. > */ > if (fanotify_event_dir_fh_len(event)) { > info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : > @@ -465,10 +496,18 @@ static int copy_info_to_user(struct fanotify_event *event, > } > > ret = copy_fid_info_to_user(fanotify_event_fsid(event), > - fanotify_event_object_fh(event), > - info_type, dot, dot_len, > - buf, count); > - } > + fanotify_event_object_fh(event), > + info_type, dot, dot_len, > + buf, count); > + if (ret < 0) > + return ret; > + > + buf += ret; > + count -= ret; > + } > + > + if (pidfd_mode) > + return copy_pidfd_info_to_user(event->pid, buf, count); > > return ret; > } > @@ -530,6 +569,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, > fd_install(fd, f); > > if (info_mode) { > + /* > + * Complain if FAN_REPORT_PIDFD and FAN_REPORT_TID mutual > + * exclusion is ever lifted. At the time of implementing > + * FAN_REPORT_PIDFD, the pidfd API only supported the creation > + * of pidfds on thread-group leaders. > + */ > + WARN_ON_ONCE((info_mode & FAN_REPORT_PIDFD) && > + FAN_GROUP_FLAG(group, FAN_REPORT_TID)); > + > ret = copy_info_to_user(event, info, info_mode, buf, count); > if (ret < 0) > return ret; > @@ -1079,6 +1127,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) > #endif > return -EINVAL; > > + /* > + * A pidfd can only be returned for a thread-group leader; thus > + * FAN_REPORT_TID and FAN_REPORT_PIDFD need to be mutually exclusive. > + */ > + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) > + return -EINVAL; > + > if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) > return -EINVAL; > > @@ -1477,7 +1532,7 @@ static int __init fanotify_user_setup(void) > max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, > FANOTIFY_DEFAULT_MAX_USER_MARKS); > > - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); > + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); > BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); > > fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, > diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h > index f76c7635efc8..bb2898240e5a 100644 > --- a/include/linux/fanotify.h > +++ b/include/linux/fanotify.h > @@ -27,7 +27,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ > > #define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) > > -#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS) > +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) > > /* > * fanotify_init() flags that require CAP_SYS_ADMIN. > @@ -37,6 +37,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ > */ > #define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \ > FAN_REPORT_TID | \ > + FAN_REPORT_PIDFD | \ > FAN_UNLIMITED_QUEUE | \ > FAN_UNLIMITED_MARKS) > > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h > index fbf9c5c7dd59..36c3bddcf690 100644 > --- a/include/uapi/linux/fanotify.h > +++ b/include/uapi/linux/fanotify.h > @@ -55,6 +55,7 @@ > #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ > #define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ > #define FAN_REPORT_NAME 0x00000800 /* Report events with name */ > +#define FAN_REPORT_PIDFD 0x00001000 /* Report pidfd for event->pid */ > > /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ > #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) > @@ -123,6 +124,7 @@ struct fanotify_event_metadata { > #define FAN_EVENT_INFO_TYPE_FID 1 > #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 > #define FAN_EVENT_INFO_TYPE_DFID 3 > +#define FAN_EVENT_INFO_TYPE_PIDFD 4 > > /* Variable length info record following event metadata */ > struct fanotify_event_info_header { > @@ -148,6 +150,15 @@ struct fanotify_event_info_fid { > unsigned char handle[0]; > }; > > +/* > + * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD. > + * It holds a pidfd for the pid responsible for generating an event. > + */ > +struct fanotify_event_info_pidfd { > + struct fanotify_event_info_header hdr; > + __s32 pidfd; > +}; > + > struct fanotify_response { > __s32 fd; > __u32 response; > @@ -160,6 +171,7 @@ struct fanotify_response { > > /* No fd set in event */ > #define FAN_NOFD -1 > +#define FAN_NOPIDFD FAN_NOFD > > /* Helper functions to deal with fanotify_event_metadata buffers */ > #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata)) > -- > 2.31.1.751.gd2f1c929bd-goog > > /M
On Thu, May 20, 2021 at 11:17 AM Christian Brauner <christian.brauner@ubuntu.com> wrote: > > On Thu, May 20, 2021 at 12:11:51PM +1000, Matthew Bobrowski wrote: > > Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which > > allows userspace applications to control whether a pidfd info record > > containing a pidfd is to be returned with each event. > > > > If FAN_REPORT_PIDFD is enabled for a notification group, an additional > > struct fanotify_event_info_pidfd object will be supplied alongside the > > generic struct fanotify_event_metadata within a single event. This > > functionality is analogous to that of FAN_REPORT_FID in terms of how > > the event structure is supplied to the userspace application. Usage of > > FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is > > permitted, and in this case a struct fanotify_event_info_pidfd object > > will follow any struct fanotify_event_info_fid object. > > > > Usage of FAN_REPORT_TID is not permitted with FAN_REPORT_PIDFD as the > > pidfd API only supports the creation of pidfds for thread-group > > leaders. Attempting to do so will result with a -EINVAL being returned > > when calling fanotify_init(2). > > > > If pidfd creation fails via pidfd_create(), the pidfd field within > > struct fanotify_event_info_pidfd is set to FAN_NOPIDFD. > > > > Signed-off-by: Matthew Bobrowski <repnop@google.com> > > --- > > fs/notify/fanotify/fanotify_user.c | 65 +++++++++++++++++++++++++++--- > > include/linux/fanotify.h | 3 +- > > include/uapi/linux/fanotify.h | 12 ++++++ > > 3 files changed, 74 insertions(+), 6 deletions(-) > > > > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c > > index 1e15f3222eb2..bba61988f4a0 100644 > > --- a/fs/notify/fanotify/fanotify_user.c > > +++ b/fs/notify/fanotify/fanotify_user.c > > @@ -106,6 +106,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; > > #define FANOTIFY_EVENT_ALIGN 4 > > #define FANOTIFY_FID_INFO_HDR_LEN \ > > (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > + sizeof(struct fanotify_event_info_pidfd) > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > { > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > if (fh_len) > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > + if (info_mode & FAN_REPORT_PIDFD) > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > + > > return info_len; > > } > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > return info_len; > > } > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > + char __user *buf, > > + size_t count) > > +{ > > + struct fanotify_event_info_pidfd info = { }; > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > + > > + if (WARN_ON_ONCE(info_len > count)) > > + return -EFAULT; > > + > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > + info.hdr.len = info_len; > > + > > + info.pidfd = pidfd_create(pid, 0); > > + if (info.pidfd < 0) > > + info.pidfd = FAN_NOPIDFD; > > + > > + if (copy_to_user(buf, &info, info_len)) > > + return -EFAULT; > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > fd table and when the copy_to_user() failed they won't know what fd it Good catch! But I prefer to solve it differently, because moving fd_install() to the end of this function does not guarantee that copy_event_to_user() won't return an error one day with dangling pidfd in fd table. It might be simpler to do pidfd_create() next to create_fd() in copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). pidfd can be closed on error along with fd on out_close_fd label. You also forgot to add CAP_SYS_ADMIN check before pidfd_create() (even though fanotify_init() does check for that). Anyway, something like: if (!capable(CAP_SYS_ADMIN) && task_tgid(current) != event->pid) metadata.pid = 0; + else if (pidfd_mode) + pidfd = pidfd_create(pid, 0); [...] + if (pidfd_mode) + ret = copy_pidfd_info_to_user(pidfd, buf, count); return metadata.event_len; out_close_fd: + if (pidfd != FAN_NOPIDFD) { ... And in any case, it wrong to call copy_pidfd_info_to_user() from copy_info_to_user(). It needs to be called once from copy_event_to_user() because copy_pidfd_info_to_user() may be called twice to report both FAN_EVENT_INFO_TYPE_FID and FAN_EVENT_INFO_TYPE_DFID records for the same event. Thanks, Amir.
Hey Amir/Christian, On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > <christian.brauner@ubuntu.com> wrote: > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > { > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > if (fh_len) > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > + > > > return info_len; > > > } > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > return info_len; > > > } > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > + char __user *buf, > > > + size_t count) > > > +{ > > > + struct fanotify_event_info_pidfd info = { }; > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > + > > > + if (WARN_ON_ONCE(info_len > count)) > > > + return -EFAULT; > > > + > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > + info.hdr.len = info_len; > > > + > > > + info.pidfd = pidfd_create(pid, 0); > > > + if (info.pidfd < 0) > > > + info.pidfd = FAN_NOPIDFD; > > > + > > > + if (copy_to_user(buf, &info, info_len)) > > > + return -EFAULT; > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > fd table and when the copy_to_user() failed they won't know what fd it > > Good catch! Super awesome catch Christian, thanks pulling this up! > But I prefer to solve it differently, because moving fd_install() to the > end of this function does not guarantee that copy_event_to_user() > won't return an error one day with dangling pidfd in fd table. I can see the angle you're approaching this from... > It might be simpler to do pidfd_create() next to create_fd() in > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > pidfd can be closed on error along with fd on out_close_fd label. > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > (even though fanotify_init() does check for that). I didn't really understand the need for this check here given that the administrative bits are already being checked for in fanotify_init() i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; thus never walking any of the pidfd_mode paths. Is this just a defense in depth approach here, or is it something else that I'm missing? > Anyway, something like: > > if (!capable(CAP_SYS_ADMIN) && > task_tgid(current) != event->pid) > metadata.pid = 0; > + else if (pidfd_mode) > + pidfd = pidfd_create(pid, 0); > > [...] > > + if (pidfd_mode) > + ret = copy_pidfd_info_to_user(pidfd, buf, count); > > return metadata.event_len; > > out_close_fd: > + if (pidfd != FAN_NOPIDFD) { > ... The early call to pidfd_create() and clean up in copy_event_to_user() makes most sense to me. > And in any case, it wrong to call copy_pidfd_info_to_user() from > copy_info_to_user(). It needs to be called once from copy_event_to_user() > because copy_pidfd_info_to_user() may be called twice to report both > FAN_EVENT_INFO_TYPE_FID and FAN_EVENT_INFO_TYPE_DFID > records for the same event. Right, as mentioned in patch 4 of this series, copy_info_to_user() has been repurposed to account for the double call into copy_fid_info_to_user() when reporting FAN_EVENT_INFO_TYPE_FID and FAN_EVENT_INFO_TYPE_DFID. /M
On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote: > > Hey Amir/Christian, > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > > <christian.brauner@ubuntu.com> wrote: > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > > { > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > > if (fh_len) > > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > + > > > > return info_len; > > > > } > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > > return info_len; > > > > } > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > > + char __user *buf, > > > > + size_t count) > > > > +{ > > > > + struct fanotify_event_info_pidfd info = { }; > > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > + > > > > + if (WARN_ON_ONCE(info_len > count)) > > > > + return -EFAULT; > > > > + > > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > > + info.hdr.len = info_len; > > > > + > > > > + info.pidfd = pidfd_create(pid, 0); > > > > + if (info.pidfd < 0) > > > > + info.pidfd = FAN_NOPIDFD; > > > > + > > > > + if (copy_to_user(buf, &info, info_len)) > > > > + return -EFAULT; > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > > fd table and when the copy_to_user() failed they won't know what fd it > > > > Good catch! > > Super awesome catch Christian, thanks pulling this up! > > > But I prefer to solve it differently, because moving fd_install() to the > > end of this function does not guarantee that copy_event_to_user() > > won't return an error one day with dangling pidfd in fd table. > > I can see the angle you're approaching this from... > > > It might be simpler to do pidfd_create() next to create_fd() in > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > > pidfd can be closed on error along with fd on out_close_fd label. > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > (even though fanotify_init() does check for that). > > I didn't really understand the need for this check here given that the > administrative bits are already being checked for in fanotify_init() > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > thus never walking any of the pidfd_mode paths. Is this just a defense > in depth approach here, or is it something else that I'm missing? > We want to be extra careful not to create privilege escalations, so even if the fanotify fd is leaked or intentionally passed to a less privileged user, it cannot get an open pidfd. IOW, it is *much* easier to be defensive in this case than to prove that the change cannot introduce any privilege escalations. Thanks, Amir.
On Fri 21-05-21 12:41:51, Amir Goldstein wrote: > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote: > > > > Hey Amir/Christian, > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > > > <christian.brauner@ubuntu.com> wrote: > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > > > { > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > > > if (fh_len) > > > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > + > > > > > return info_len; > > > > > } > > > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > > > return info_len; > > > > > } > > > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > > > + char __user *buf, > > > > > + size_t count) > > > > > +{ > > > > > + struct fanotify_event_info_pidfd info = { }; > > > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > + > > > > > + if (WARN_ON_ONCE(info_len > count)) > > > > > + return -EFAULT; > > > > > + > > > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > > > + info.hdr.len = info_len; > > > > > + > > > > > + info.pidfd = pidfd_create(pid, 0); > > > > > + if (info.pidfd < 0) > > > > > + info.pidfd = FAN_NOPIDFD; > > > > > + > > > > > + if (copy_to_user(buf, &info, info_len)) > > > > > + return -EFAULT; > > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > > > fd table and when the copy_to_user() failed they won't know what fd it > > > > > > Good catch! > > > > Super awesome catch Christian, thanks pulling this up! > > > > > But I prefer to solve it differently, because moving fd_install() to the > > > end of this function does not guarantee that copy_event_to_user() > > > won't return an error one day with dangling pidfd in fd table. > > > > I can see the angle you're approaching this from... > > > > > It might be simpler to do pidfd_create() next to create_fd() in > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > > > pidfd can be closed on error along with fd on out_close_fd label. > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > > (even though fanotify_init() does check for that). > > > > I didn't really understand the need for this check here given that the > > administrative bits are already being checked for in fanotify_init() > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > > thus never walking any of the pidfd_mode paths. Is this just a defense > > in depth approach here, or is it something else that I'm missing? > > > > We want to be extra careful not to create privilege escalations, > so even if the fanotify fd is leaked or intentionally passed to a less > privileged user, it cannot get an open pidfd. > > IOW, it is *much* easier to be defensive in this case than to prove > that the change cannot introduce any privilege escalations. I have no problems with being more defensive (it's certainly better than being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged process, that process is also free to update all the passwords. Traditionally permission checks in Unix are performed on open and then who has fd can do whatever that fd allows... I've tried to follow similar philosophy with fanotify as well and e.g. open happening as a result of fanotify path events does not check permissions either. Honza
On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote: > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote: > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote: > > > > > > Hey Amir/Christian, > > > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > > > > <christian.brauner@ubuntu.com> wrote: > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > > > > { > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > > > > if (fh_len) > > > > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > + > > > > > > return info_len; > > > > > > } > > > > > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > > > > return info_len; > > > > > > } > > > > > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > > > > + char __user *buf, > > > > > > + size_t count) > > > > > > +{ > > > > > > + struct fanotify_event_info_pidfd info = { }; > > > > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > + > > > > > > + if (WARN_ON_ONCE(info_len > count)) > > > > > > + return -EFAULT; > > > > > > + > > > > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > > > > + info.hdr.len = info_len; > > > > > > + > > > > > > + info.pidfd = pidfd_create(pid, 0); > > > > > > + if (info.pidfd < 0) > > > > > > + info.pidfd = FAN_NOPIDFD; > > > > > > + > > > > > > + if (copy_to_user(buf, &info, info_len)) > > > > > > + return -EFAULT; > > > > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > > > > fd table and when the copy_to_user() failed they won't know what fd it > > > > > > > > Good catch! > > > > > > Super awesome catch Christian, thanks pulling this up! > > > > > > > But I prefer to solve it differently, because moving fd_install() to the > > > > end of this function does not guarantee that copy_event_to_user() > > > > won't return an error one day with dangling pidfd in fd table. > > > > > > I can see the angle you're approaching this from... > > > > > > > It might be simpler to do pidfd_create() next to create_fd() in > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > > > > pidfd can be closed on error along with fd on out_close_fd label. > > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > > > (even though fanotify_init() does check for that). > > > > > > I didn't really understand the need for this check here given that the > > > administrative bits are already being checked for in fanotify_init() > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > > > thus never walking any of the pidfd_mode paths. Is this just a defense > > > in depth approach here, or is it something else that I'm missing? > > > > > > > We want to be extra careful not to create privilege escalations, > > so even if the fanotify fd is leaked or intentionally passed to a less > > privileged user, it cannot get an open pidfd. > > > > IOW, it is *much* easier to be defensive in this case than to prove > > that the change cannot introduce any privilege escalations. > > I have no problems with being more defensive (it's certainly better than > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged > process, that process is also free to update all the passwords. > Traditionally permission checks in Unix are performed on open and then who > has fd can do whatever that fd allows... I've tried to follow similar > philosophy with fanotify as well and e.g. open happening as a result of > fanotify path events does not check permissions either. > Agreed. However, because we had this issue with no explicit FAN_REPORT_PID we added the CAP_SYS_ADMIN check for reporting event->pid as next best thing. So now that becomes weird if priv process created fanotify fd and passes it to unpriv process, then unpriv process gets events with pidfd but without event->pid. We can change the code to: if (!capable(CAP_SYS_ADMIN) && !pidfd_mode && task_tgid(current) != event->pid) metadata.pid = 0; So the case I decscribed above ends up reporting both pidfd and event->pid to unpriv user, but that is a bit inconsistent... Thanks, Amir.
On Fri 21-05-21 14:10:32, Amir Goldstein wrote: > On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote: > > > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote: > > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote: > > > > > > > > Hey Amir/Christian, > > > > > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > > > > > <christian.brauner@ubuntu.com> wrote: > > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > > > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > > > > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > > > > > { > > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > > > > > if (fh_len) > > > > > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > > > > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > > > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > + > > > > > > > return info_len; > > > > > > > } > > > > > > > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > > > > > return info_len; > > > > > > > } > > > > > > > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > > > > > + char __user *buf, > > > > > > > + size_t count) > > > > > > > +{ > > > > > > > + struct fanotify_event_info_pidfd info = { }; > > > > > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > + > > > > > > > + if (WARN_ON_ONCE(info_len > count)) > > > > > > > + return -EFAULT; > > > > > > > + > > > > > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > > > > > + info.hdr.len = info_len; > > > > > > > + > > > > > > > + info.pidfd = pidfd_create(pid, 0); > > > > > > > + if (info.pidfd < 0) > > > > > > > + info.pidfd = FAN_NOPIDFD; > > > > > > > + > > > > > > > + if (copy_to_user(buf, &info, info_len)) > > > > > > > + return -EFAULT; > > > > > > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > > > > > fd table and when the copy_to_user() failed they won't know what fd it > > > > > > > > > > Good catch! > > > > > > > > Super awesome catch Christian, thanks pulling this up! > > > > > > > > > But I prefer to solve it differently, because moving fd_install() to the > > > > > end of this function does not guarantee that copy_event_to_user() > > > > > won't return an error one day with dangling pidfd in fd table. > > > > > > > > I can see the angle you're approaching this from... > > > > > > > > > It might be simpler to do pidfd_create() next to create_fd() in > > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > > > > > pidfd can be closed on error along with fd on out_close_fd label. > > > > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > > > > (even though fanotify_init() does check for that). > > > > > > > > I didn't really understand the need for this check here given that the > > > > administrative bits are already being checked for in fanotify_init() > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > > > > thus never walking any of the pidfd_mode paths. Is this just a defense > > > > in depth approach here, or is it something else that I'm missing? > > > > > > > > > > We want to be extra careful not to create privilege escalations, > > > so even if the fanotify fd is leaked or intentionally passed to a less > > > privileged user, it cannot get an open pidfd. > > > > > > IOW, it is *much* easier to be defensive in this case than to prove > > > that the change cannot introduce any privilege escalations. > > > > I have no problems with being more defensive (it's certainly better than > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged > > process, that process is also free to update all the passwords. > > Traditionally permission checks in Unix are performed on open and then who > > has fd can do whatever that fd allows... I've tried to follow similar > > philosophy with fanotify as well and e.g. open happening as a result of > > fanotify path events does not check permissions either. > > > > Agreed. > > However, because we had this issue with no explicit FAN_REPORT_PID > we added the CAP_SYS_ADMIN check for reporting event->pid as next > best thing. So now that becomes weird if priv process created fanotify fd > and passes it to unpriv process, then unpriv process gets events with > pidfd but without event->pid. > > We can change the code to: > > if (!capable(CAP_SYS_ADMIN) && !pidfd_mode && > task_tgid(current) != event->pid) > metadata.pid = 0; > > So the case I decscribed above ends up reporting both pidfd > and event->pid to unpriv user, but that is a bit inconsistent... Oh, now I see where you are coming from :) Thanks for explanation. And remind me please, cannot we just have internal FAN_REPORT_PID flag that gets set on notification group when priviledged process creates it and then test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is mostly equivalent but I guess more in the spirit of how fanotify traditionally does things. Also FAN_REPORT_PIDFD could then behave in the same way... Honza
On Fri, May 21, 2021 at 4:19 PM Jan Kara <jack@suse.cz> wrote: > > On Fri 21-05-21 14:10:32, Amir Goldstein wrote: > > On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote: > > > > > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote: > > > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote: > > > > > > > > > > Hey Amir/Christian, > > > > > > > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > > > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > > > > > > <christian.brauner@ubuntu.com> wrote: > > > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > > > > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > > > > > > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > > > > > > { > > > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > > > > > > if (fh_len) > > > > > > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > > > > > > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > > > > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > > + > > > > > > > > return info_len; > > > > > > > > } > > > > > > > > > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > > > > > > return info_len; > > > > > > > > } > > > > > > > > > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > > > > > > + char __user *buf, > > > > > > > > + size_t count) > > > > > > > > +{ > > > > > > > > + struct fanotify_event_info_pidfd info = { }; > > > > > > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > > + > > > > > > > > + if (WARN_ON_ONCE(info_len > count)) > > > > > > > > + return -EFAULT; > > > > > > > > + > > > > > > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > > > > > > + info.hdr.len = info_len; > > > > > > > > + > > > > > > > > + info.pidfd = pidfd_create(pid, 0); > > > > > > > > + if (info.pidfd < 0) > > > > > > > > + info.pidfd = FAN_NOPIDFD; > > > > > > > > + > > > > > > > > + if (copy_to_user(buf, &info, info_len)) > > > > > > > > + return -EFAULT; > > > > > > > > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > > > > > > fd table and when the copy_to_user() failed they won't know what fd it > > > > > > > > > > > > Good catch! > > > > > > > > > > Super awesome catch Christian, thanks pulling this up! > > > > > > > > > > > But I prefer to solve it differently, because moving fd_install() to the > > > > > > end of this function does not guarantee that copy_event_to_user() > > > > > > won't return an error one day with dangling pidfd in fd table. > > > > > > > > > > I can see the angle you're approaching this from... > > > > > > > > > > > It might be simpler to do pidfd_create() next to create_fd() in > > > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > > > > > > pidfd can be closed on error along with fd on out_close_fd label. > > > > > > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > > > > > (even though fanotify_init() does check for that). > > > > > > > > > > I didn't really understand the need for this check here given that the > > > > > administrative bits are already being checked for in fanotify_init() > > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > > > > > thus never walking any of the pidfd_mode paths. Is this just a defense > > > > > in depth approach here, or is it something else that I'm missing? > > > > > > > > > > > > > We want to be extra careful not to create privilege escalations, > > > > so even if the fanotify fd is leaked or intentionally passed to a less > > > > privileged user, it cannot get an open pidfd. > > > > > > > > IOW, it is *much* easier to be defensive in this case than to prove > > > > that the change cannot introduce any privilege escalations. > > > > > > I have no problems with being more defensive (it's certainly better than > > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN > > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged > > > process, that process is also free to update all the passwords. > > > Traditionally permission checks in Unix are performed on open and then who > > > has fd can do whatever that fd allows... I've tried to follow similar > > > philosophy with fanotify as well and e.g. open happening as a result of > > > fanotify path events does not check permissions either. > > > > > > > Agreed. > > > > However, because we had this issue with no explicit FAN_REPORT_PID > > we added the CAP_SYS_ADMIN check for reporting event->pid as next > > best thing. So now that becomes weird if priv process created fanotify fd > > and passes it to unpriv process, then unpriv process gets events with > > pidfd but without event->pid. > > > > We can change the code to: > > > > if (!capable(CAP_SYS_ADMIN) && !pidfd_mode && > > task_tgid(current) != event->pid) > > metadata.pid = 0; > > > > So the case I decscribed above ends up reporting both pidfd > > and event->pid to unpriv user, but that is a bit inconsistent... > > Oh, now I see where you are coming from :) Thanks for explanation. And > remind me please, cannot we just have internal FAN_REPORT_PID flag that > gets set on notification group when priviledged process creates it and then > test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is > mostly equivalent but I guess more in the spirit of how fanotify > traditionally does things. Also FAN_REPORT_PIDFD could then behave in the > same way... Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV as it described the situation better than FAN_REPORT_PID. This happens to be how I implemented it in the initial RFC [1]. It's not easy to follow our entire discussion on this thread, but I think we can resurrect the FANOTIFY_UNPRIV internal flag and use it in this case instead of CAP_SYS_ADMIN. Thanks, Amir. https://lore.kernel.org/linux-fsdevel/20210124184204.899729-3-amir73il@gmail.com/
On Fri 21-05-21 16:52:08, Amir Goldstein wrote: > On Fri, May 21, 2021 at 4:19 PM Jan Kara <jack@suse.cz> wrote: > > > > On Fri 21-05-21 14:10:32, Amir Goldstein wrote: > > > On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote: > > > > > > > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote: > > > > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote: > > > > > > > > > > > > Hey Amir/Christian, > > > > > > > > > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > > > > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > > > > > > > <christian.brauner@ubuntu.com> wrote: > > > > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > > > > > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > > > > > > > > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > > > > > > > { > > > > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > > > > > > > if (fh_len) > > > > > > > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > > > > > > > > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > > > > > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > > > + > > > > > > > > > return info_len; > > > > > > > > > } > > > > > > > > > > > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > > > > > > > return info_len; > > > > > > > > > } > > > > > > > > > > > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > > > > > > > + char __user *buf, > > > > > > > > > + size_t count) > > > > > > > > > +{ > > > > > > > > > + struct fanotify_event_info_pidfd info = { }; > > > > > > > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > > > + > > > > > > > > > + if (WARN_ON_ONCE(info_len > count)) > > > > > > > > > + return -EFAULT; > > > > > > > > > + > > > > > > > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > > > > > > > + info.hdr.len = info_len; > > > > > > > > > + > > > > > > > > > + info.pidfd = pidfd_create(pid, 0); > > > > > > > > > + if (info.pidfd < 0) > > > > > > > > > + info.pidfd = FAN_NOPIDFD; > > > > > > > > > + > > > > > > > > > + if (copy_to_user(buf, &info, info_len)) > > > > > > > > > + return -EFAULT; > > > > > > > > > > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > > > > > > > fd table and when the copy_to_user() failed they won't know what fd it > > > > > > > > > > > > > > Good catch! > > > > > > > > > > > > Super awesome catch Christian, thanks pulling this up! > > > > > > > > > > > > > But I prefer to solve it differently, because moving fd_install() to the > > > > > > > end of this function does not guarantee that copy_event_to_user() > > > > > > > won't return an error one day with dangling pidfd in fd table. > > > > > > > > > > > > I can see the angle you're approaching this from... > > > > > > > > > > > > > It might be simpler to do pidfd_create() next to create_fd() in > > > > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > > > > > > > pidfd can be closed on error along with fd on out_close_fd label. > > > > > > > > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > > > > > > (even though fanotify_init() does check for that). > > > > > > > > > > > > I didn't really understand the need for this check here given that the > > > > > > administrative bits are already being checked for in fanotify_init() > > > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > > > > > > thus never walking any of the pidfd_mode paths. Is this just a defense > > > > > > in depth approach here, or is it something else that I'm missing? > > > > > > > > > > > > > > > > We want to be extra careful not to create privilege escalations, > > > > > so even if the fanotify fd is leaked or intentionally passed to a less > > > > > privileged user, it cannot get an open pidfd. > > > > > > > > > > IOW, it is *much* easier to be defensive in this case than to prove > > > > > that the change cannot introduce any privilege escalations. > > > > > > > > I have no problems with being more defensive (it's certainly better than > > > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN > > > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged > > > > process, that process is also free to update all the passwords. > > > > Traditionally permission checks in Unix are performed on open and then who > > > > has fd can do whatever that fd allows... I've tried to follow similar > > > > philosophy with fanotify as well and e.g. open happening as a result of > > > > fanotify path events does not check permissions either. > > > > > > > > > > Agreed. > > > > > > However, because we had this issue with no explicit FAN_REPORT_PID > > > we added the CAP_SYS_ADMIN check for reporting event->pid as next > > > best thing. So now that becomes weird if priv process created fanotify fd > > > and passes it to unpriv process, then unpriv process gets events with > > > pidfd but without event->pid. > > > > > > We can change the code to: > > > > > > if (!capable(CAP_SYS_ADMIN) && !pidfd_mode && > > > task_tgid(current) != event->pid) > > > metadata.pid = 0; > > > > > > So the case I decscribed above ends up reporting both pidfd > > > and event->pid to unpriv user, but that is a bit inconsistent... > > > > Oh, now I see where you are coming from :) Thanks for explanation. And > > remind me please, cannot we just have internal FAN_REPORT_PID flag that > > gets set on notification group when priviledged process creates it and then > > test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is > > mostly equivalent but I guess more in the spirit of how fanotify > > traditionally does things. Also FAN_REPORT_PIDFD could then behave in the > > same way... > > Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV > as it described the situation better than FAN_REPORT_PID. > This happens to be how I implemented it in the initial RFC [1]. > > It's not easy to follow our entire discussion on this thread, but I think > we can resurrect the FANOTIFY_UNPRIV internal flag and use it > in this case instead of CAP_SYS_ADMIN. I think at that time we were discussing how to handle opening of fds and we decided to not depend on FANOTIFY_UNPRIV and then I didn't see a value of that flag because I forgot about pids... Anyway now I agree to go for that flag. :) Honza
On Fri, May 21, 2021 at 05:14:15PM +0200, Jan Kara wrote: > On Fri 21-05-21 16:52:08, Amir Goldstein wrote: > > On Fri, May 21, 2021 at 4:19 PM Jan Kara <jack@suse.cz> wrote: > > > > > > On Fri 21-05-21 14:10:32, Amir Goldstein wrote: > > > > On Fri, May 21, 2021 at 1:24 PM Jan Kara <jack@suse.cz> wrote: > > > > > > > > > > On Fri 21-05-21 12:41:51, Amir Goldstein wrote: > > > > > > On Fri, May 21, 2021 at 12:22 PM Matthew Bobrowski <repnop@google.com> wrote: > > > > > > > > > > > > > > Hey Amir/Christian, > > > > > > > > > > > > > > On Thu, May 20, 2021 at 04:43:48PM +0300, Amir Goldstein wrote: > > > > > > > > On Thu, May 20, 2021 at 11:17 AM Christian Brauner > > > > > > > > <christian.brauner@ubuntu.com> wrote: > > > > > > > > > > +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ > > > > > > > > > > + sizeof(struct fanotify_event_info_pidfd) > > > > > > > > > > > > > > > > > > > > static int fanotify_fid_info_len(int fh_len, int name_len) > > > > > > > > > > { > > > > > > > > > > @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, > > > > > > > > > > if (fh_len) > > > > > > > > > > info_len += fanotify_fid_info_len(fh_len, dot_len); > > > > > > > > > > > > > > > > > > > > + if (info_mode & FAN_REPORT_PIDFD) > > > > > > > > > > + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > > > > + > > > > > > > > > > return info_len; > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, > > > > > > > > > > return info_len; > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > +static int copy_pidfd_info_to_user(struct pid *pid, > > > > > > > > > > + char __user *buf, > > > > > > > > > > + size_t count) > > > > > > > > > > +{ > > > > > > > > > > + struct fanotify_event_info_pidfd info = { }; > > > > > > > > > > + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; > > > > > > > > > > + > > > > > > > > > > + if (WARN_ON_ONCE(info_len > count)) > > > > > > > > > > + return -EFAULT; > > > > > > > > > > + > > > > > > > > > > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; > > > > > > > > > > + info.hdr.len = info_len; > > > > > > > > > > + > > > > > > > > > > + info.pidfd = pidfd_create(pid, 0); > > > > > > > > > > + if (info.pidfd < 0) > > > > > > > > > > + info.pidfd = FAN_NOPIDFD; > > > > > > > > > > + > > > > > > > > > > + if (copy_to_user(buf, &info, info_len)) > > > > > > > > > > + return -EFAULT; > > > > > > > > > > > > > > > > > > Hm, well this kinda sucks. The caller can end up with a pidfd in their > > > > > > > > > fd table and when the copy_to_user() failed they won't know what fd it > > > > > > > > > > > > > > > > Good catch! > > > > > > > > > > > > > > Super awesome catch Christian, thanks pulling this up! > > > > > > > > > > > > > > > But I prefer to solve it differently, because moving fd_install() to the > > > > > > > > end of this function does not guarantee that copy_event_to_user() > > > > > > > > won't return an error one day with dangling pidfd in fd table. > > > > > > > > > > > > > > I can see the angle you're approaching this from... > > > > > > > > > > > > > > > It might be simpler to do pidfd_create() next to create_fd() in > > > > > > > > copy_event_to_user() and pass pidfd to copy_pidfd_info_to_user(). > > > > > > > > pidfd can be closed on error along with fd on out_close_fd label. > > > > > > > > > > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > > > > > > > (even though fanotify_init() does check for that). > > > > > > > > > > > > > > I didn't really understand the need for this check here given that the > > > > > > > administrative bits are already being checked for in fanotify_init() > > > > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > > > > > > > thus never walking any of the pidfd_mode paths. Is this just a defense > > > > > > > in depth approach here, or is it something else that I'm missing? > > > > > > > > > > > > > > > > > > > We want to be extra careful not to create privilege escalations, > > > > > > so even if the fanotify fd is leaked or intentionally passed to a less > > > > > > privileged user, it cannot get an open pidfd. > > > > > > > > > > > > IOW, it is *much* easier to be defensive in this case than to prove > > > > > > that the change cannot introduce any privilege escalations. > > > > > > > > > > I have no problems with being more defensive (it's certainly better than > > > > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN > > > > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged > > > > > process, that process is also free to update all the passwords. > > > > > Traditionally permission checks in Unix are performed on open and then who > > > > > has fd can do whatever that fd allows... I've tried to follow similar > > > > > philosophy with fanotify as well and e.g. open happening as a result of > > > > > fanotify path events does not check permissions either. > > > > > > > > > > > > > Agreed. > > > > > > > > However, because we had this issue with no explicit FAN_REPORT_PID > > > > we added the CAP_SYS_ADMIN check for reporting event->pid as next > > > > best thing. So now that becomes weird if priv process created fanotify fd > > > > and passes it to unpriv process, then unpriv process gets events with > > > > pidfd but without event->pid. > > > > > > > > We can change the code to: > > > > > > > > if (!capable(CAP_SYS_ADMIN) && !pidfd_mode && > > > > task_tgid(current) != event->pid) > > > > metadata.pid = 0; > > > > > > > > So the case I decscribed above ends up reporting both pidfd > > > > and event->pid to unpriv user, but that is a bit inconsistent... > > > > > > Oh, now I see where you are coming from :) Thanks for explanation. And > > > remind me please, cannot we just have internal FAN_REPORT_PID flag that > > > gets set on notification group when priviledged process creates it and then > > > test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is > > > mostly equivalent but I guess more in the spirit of how fanotify > > > traditionally does things. Also FAN_REPORT_PIDFD could then behave in the > > > same way... > > > > Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV > > as it described the situation better than FAN_REPORT_PID. > > This happens to be how I implemented it in the initial RFC [1]. > > > > It's not easy to follow our entire discussion on this thread, but I think > > we can resurrect the FANOTIFY_UNPRIV internal flag and use it > > in this case instead of CAP_SYS_ADMIN. > > I think at that time we were discussing how to handle opening of fds and > we decided to not depend on FANOTIFY_UNPRIV and then I didn't see a value > of that flag because I forgot about pids... Anyway now I agree to go for > that flag. :) Resurrection of this flag SGTM! However, it also sounds like we need to land that series before this PIDFD series or simply incorporate the UNPRIV flag into this one. Will chat with Amir to get this done. /M
> > > > > > > > > You also forgot to add CAP_SYS_ADMIN check before pidfd_create() > > > > > > > > > (even though fanotify_init() does check for that). > > > > > > > > > > > > > > > > I didn't really understand the need for this check here given that the > > > > > > > > administrative bits are already being checked for in fanotify_init() > > > > > > > > i.e. FAN_REPORT_PIDFD can never be set for an unprivileged listener; > > > > > > > > thus never walking any of the pidfd_mode paths. Is this just a defense > > > > > > > > in depth approach here, or is it something else that I'm missing? > > > > > > > > > > > > > > > > > > > > > > We want to be extra careful not to create privilege escalations, > > > > > > > so even if the fanotify fd is leaked or intentionally passed to a less > > > > > > > privileged user, it cannot get an open pidfd. > > > > > > > > > > > > > > IOW, it is *much* easier to be defensive in this case than to prove > > > > > > > that the change cannot introduce any privilege escalations. > > > > > > > > > > > > I have no problems with being more defensive (it's certainly better than > > > > > > being too lax) but does it really make sence here? I mean if CAP_SYS_ADMIN > > > > > > task opens O_RDWR /etc/passwd and then passes this fd to unpriviledged > > > > > > process, that process is also free to update all the passwords. > > > > > > Traditionally permission checks in Unix are performed on open and then who > > > > > > has fd can do whatever that fd allows... I've tried to follow similar > > > > > > philosophy with fanotify as well and e.g. open happening as a result of > > > > > > fanotify path events does not check permissions either. > > > > > > > > > > > > > > > > Agreed. > > > > > > > > > > However, because we had this issue with no explicit FAN_REPORT_PID > > > > > we added the CAP_SYS_ADMIN check for reporting event->pid as next > > > > > best thing. So now that becomes weird if priv process created fanotify fd > > > > > and passes it to unpriv process, then unpriv process gets events with > > > > > pidfd but without event->pid. > > > > > > > > > > We can change the code to: > > > > > > > > > > if (!capable(CAP_SYS_ADMIN) && !pidfd_mode && > > > > > task_tgid(current) != event->pid) > > > > > metadata.pid = 0; > > > > > > > > > > So the case I decscribed above ends up reporting both pidfd > > > > > and event->pid to unpriv user, but that is a bit inconsistent... > > > > > > > > Oh, now I see where you are coming from :) Thanks for explanation. And > > > > remind me please, cannot we just have internal FAN_REPORT_PID flag that > > > > gets set on notification group when priviledged process creates it and then > > > > test for that instead of CAP_SYS_ADMIN in copy_event_to_user()? It is > > > > mostly equivalent but I guess more in the spirit of how fanotify > > > > traditionally does things. Also FAN_REPORT_PIDFD could then behave in the > > > > same way... > > > > > > Yes, we can. In fact, we should call the internal flag FANOTIFY_UNPRIV > > > as it described the situation better than FAN_REPORT_PID. > > > This happens to be how I implemented it in the initial RFC [1]. > > > > > > It's not easy to follow our entire discussion on this thread, but I think > > > we can resurrect the FANOTIFY_UNPRIV internal flag and use it > > > in this case instead of CAP_SYS_ADMIN. > > > > I think at that time we were discussing how to handle opening of fds and > > we decided to not depend on FANOTIFY_UNPRIV and then I didn't see a value > > of that flag because I forgot about pids... Anyway now I agree to go for > > that flag. :) > > Resurrection of this flag SGTM! However, it also sounds like we need > to land that series before this PIDFD series or simply incorporate the > UNPRIV flag into this one. > > Will chat with Amir to get this done. Let me post this patch as a fix patch to unprivileged group. Thanks, Amir.
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1e15f3222eb2..bba61988f4a0 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -106,6 +106,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ + sizeof(struct fanotify_event_info_pidfd) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -141,6 +143,9 @@ static int fanotify_event_info_len(unsigned int info_mode, if (fh_len) info_len += fanotify_fid_info_len(fh_len, dot_len); + if (info_mode & FAN_REPORT_PIDFD) + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + return info_len; } @@ -401,6 +406,29 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, return info_len; } +static int copy_pidfd_info_to_user(struct pid *pid, + char __user *buf, + size_t count) +{ + struct fanotify_event_info_pidfd info = { }; + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; + info.hdr.len = info_len; + + info.pidfd = pidfd_create(pid, 0); + if (info.pidfd < 0) + info.pidfd = FAN_NOPIDFD; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + static int copy_info_to_user(struct fanotify_event *event, struct fanotify_info *info, unsigned int info_mode, @@ -408,9 +436,12 @@ static int copy_info_to_user(struct fanotify_event *event, { int ret, info_type = 0; unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; /* * Event info records order is as follows: dir fid + name, child fid. + * If FAN_REPORT_PIDFD has been specified, then a pidfd info record + * will follow the fid info records. */ if (fanotify_event_dir_fh_len(event)) { info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : @@ -465,10 +496,18 @@ static int copy_info_to_user(struct fanotify_event *event, } ret = copy_fid_info_to_user(fanotify_event_fsid(event), - fanotify_event_object_fh(event), - info_type, dot, dot_len, - buf, count); - } + fanotify_event_object_fh(event), + info_type, dot, dot_len, + buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + } + + if (pidfd_mode) + return copy_pidfd_info_to_user(event->pid, buf, count); return ret; } @@ -530,6 +569,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fd_install(fd, f); if (info_mode) { + /* + * Complain if FAN_REPORT_PIDFD and FAN_REPORT_TID mutual + * exclusion is ever lifted. At the time of implementing + * FAN_REPORT_PIDFD, the pidfd API only supported the creation + * of pidfds on thread-group leaders. + */ + WARN_ON_ONCE((info_mode & FAN_REPORT_PIDFD) && + FAN_GROUP_FLAG(group, FAN_REPORT_TID)); + ret = copy_info_to_user(event, info, info_mode, buf, count); if (ret < 0) return ret; @@ -1079,6 +1127,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) #endif return -EINVAL; + /* + * A pidfd can only be returned for a thread-group leader; thus + * FAN_REPORT_TID and FAN_REPORT_PIDFD need to be mutually exclusive. + */ + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) + return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1477,7 +1532,7 @@ static int __init fanotify_user_setup(void) max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, FANOTIFY_DEFAULT_MAX_USER_MARKS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index f76c7635efc8..bb2898240e5a 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -27,7 +27,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ #define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) -#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS) +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) /* * fanotify_init() flags that require CAP_SYS_ADMIN. @@ -37,6 +37,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ */ #define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \ FAN_REPORT_TID | \ + FAN_REPORT_PIDFD | \ FAN_UNLIMITED_QUEUE | \ FAN_UNLIMITED_MARKS) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index fbf9c5c7dd59..36c3bddcf690 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -55,6 +55,7 @@ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ #define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ #define FAN_REPORT_NAME 0x00000800 /* Report events with name */ +#define FAN_REPORT_PIDFD 0x00001000 /* Report pidfd for event->pid */ /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) @@ -123,6 +124,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_FID 1 #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 #define FAN_EVENT_INFO_TYPE_DFID 3 +#define FAN_EVENT_INFO_TYPE_PIDFD 4 /* Variable length info record following event metadata */ struct fanotify_event_info_header { @@ -148,6 +150,15 @@ struct fanotify_event_info_fid { unsigned char handle[0]; }; +/* + * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD. + * It holds a pidfd for the pid responsible for generating an event. + */ +struct fanotify_event_info_pidfd { + struct fanotify_event_info_header hdr; + __s32 pidfd; +}; + struct fanotify_response { __s32 fd; __u32 response; @@ -160,6 +171,7 @@ struct fanotify_response { /* No fd set in event */ #define FAN_NOFD -1 +#define FAN_NOPIDFD FAN_NOFD /* Helper functions to deal with fanotify_event_metadata buffers */ #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which allows userspace applications to control whether a pidfd info record containing a pidfd is to be returned with each event. If FAN_REPORT_PIDFD is enabled for a notification group, an additional struct fanotify_event_info_pidfd object will be supplied alongside the generic struct fanotify_event_metadata within a single event. This functionality is analogous to that of FAN_REPORT_FID in terms of how the event structure is supplied to the userspace application. Usage of FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is permitted, and in this case a struct fanotify_event_info_pidfd object will follow any struct fanotify_event_info_fid object. Usage of FAN_REPORT_TID is not permitted with FAN_REPORT_PIDFD as the pidfd API only supports the creation of pidfds for thread-group leaders. Attempting to do so will result with a -EINVAL being returned when calling fanotify_init(2). If pidfd creation fails via pidfd_create(), the pidfd field within struct fanotify_event_info_pidfd is set to FAN_NOPIDFD. Signed-off-by: Matthew Bobrowski <repnop@google.com> --- fs/notify/fanotify/fanotify_user.c | 65 +++++++++++++++++++++++++++--- include/linux/fanotify.h | 3 +- include/uapi/linux/fanotify.h | 12 ++++++ 3 files changed, 74 insertions(+), 6 deletions(-)