diff mbox series

[1/1] userfaultfd: require CAP_SYS_PTRACE for UFFD_FEATURE_EVENT_FORK

Message ID 1572967777-8812-2-git-send-email-rppt@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: require CAP_SYS_PTRACE for UFFD_FEATURE_EVENT_FORK | expand

Commit Message

Mike Rapoport Nov. 5, 2019, 3:29 p.m. UTC
Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file
descriptor table from the read() implementation of uffd, which may have
security implications for unprivileged use of the userfaultfd.

Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have
CAP_SYS_PTRACE.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 fs/userfaultfd.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

Comments

Andrea Arcangeli Nov. 5, 2019, 3:37 p.m. UTC | #1
Hello Mike,

On Tue, Nov 05, 2019 at 05:29:37PM +0200, Mike Rapoport wrote:
> Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file
> descriptor table from the read() implementation of uffd, which may have
> security implications for unprivileged use of the userfaultfd.
> 
> Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have
> CAP_SYS_PTRACE.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  fs/userfaultfd.c | 18 +++++++++++-------
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index f9fd18670e22..d99d166fd892 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1834,13 +1834,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
>  	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
>  		goto out;
>  	features = uffdio_api.features;
> -	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
> -		memset(&uffdio_api, 0, sizeof(uffdio_api));
> -		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
> -			goto out;
> -		ret = -EINVAL;
> -		goto out;
> -	}
> +	ret = -EINVAL;
> +	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
> +		goto err_out;
> +	ret = -EPERM;
> +	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
> +		goto err_out;
>  	/* report all available features and ioctls to userland */
>  	uffdio_api.features = UFFD_API_FEATURES;
>  	uffdio_api.ioctls = UFFD_API_IOCTLS;
> @@ -1853,6 +1852,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
>  	ret = 0;
>  out:
>  	return ret;
> +err_out:
> +	memset(&uffdio_api, 0, sizeof(uffdio_api));
> +	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
> +		ret = -EFAULT;
> +	goto out;
>  }
>  
>  static long userfaultfd_ioctl(struct file *file, unsigned cmd,

Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>

Thanks,
Andrea
Daniel Colascione Nov. 5, 2019, 3:55 p.m. UTC | #2
On Tue, Nov 5, 2019 at 7:29 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
>
> Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file
> descriptor table from the read() implementation of uffd, which may have
> security implications for unprivileged use of the userfaultfd.
>
> Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have
> CAP_SYS_PTRACE.

Thanks. But shouldn't we be doing the capability check at
userfaultfd(2) time (when we do the other permission checks), not
later, in the API ioctl?
Aleksa Sarai Nov. 5, 2019, 3:59 p.m. UTC | #3
On 2019-11-05, Mike Rapoport <rppt@linux.ibm.com> wrote:
> Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file
> descriptor table from the read() implementation of uffd, which may have
> security implications for unprivileged use of the userfaultfd.
> 
> Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have
> CAP_SYS_PTRACE.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  fs/userfaultfd.c | 18 +++++++++++-------
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index f9fd18670e22..d99d166fd892 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1834,13 +1834,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
>  	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
>  		goto out;
>  	features = uffdio_api.features;
> -	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
> -		memset(&uffdio_api, 0, sizeof(uffdio_api));
> -		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
> -			goto out;
> -		ret = -EINVAL;
> -		goto out;
> -	}
> +	ret = -EINVAL;
> +	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
> +		goto err_out;
> +	ret = -EPERM;
> +	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
> +		goto err_out;
>  	/* report all available features and ioctls to userland */
>  	uffdio_api.features = UFFD_API_FEATURES;
>  	uffdio_api.ioctls = UFFD_API_IOCTLS;
> @@ -1853,6 +1852,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
>  	ret = 0;
>  out:
>  	return ret;
> +err_out:
> +	memset(&uffdio_api, 0, sizeof(uffdio_api));
> +	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))

Wouldn't it be simpler to do clear_user()?

> +		ret = -EFAULT;
> +	goto out;
>  }
>  
>  static long userfaultfd_ioctl(struct file *file, unsigned cmd,
Andy Lutomirski Nov. 5, 2019, 4 p.m. UTC | #4
On Tue, Nov 5, 2019 at 7:55 AM Daniel Colascione <dancol@google.com> wrote:
>
> On Tue, Nov 5, 2019 at 7:29 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
> >
> > Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file
> > descriptor table from the read() implementation of uffd, which may have
> > security implications for unprivileged use of the userfaultfd.
> >
> > Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have
> > CAP_SYS_PTRACE.
>
> Thanks. But shouldn't we be doing the capability check at
> userfaultfd(2) time (when we do the other permission checks), not
> later, in the API ioctl?

The ioctl seems reasonable to me.  In particular, if there is anyone
who creates a userfaultfd as root and then drop permissions, a later
ioctl could unexpectedly enable FORK.

This assumes that the code in question is only reachable through
ioctl() and not write().
Daniel Colascione Nov. 5, 2019, 4:06 p.m. UTC | #5
On Tue, Nov 5, 2019 at 8:00 AM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Tue, Nov 5, 2019 at 7:55 AM Daniel Colascione <dancol@google.com> wrote:
> >
> > On Tue, Nov 5, 2019 at 7:29 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
> > >
> > > Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file
> > > descriptor table from the read() implementation of uffd, which may have
> > > security implications for unprivileged use of the userfaultfd.
> > >
> > > Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have
> > > CAP_SYS_PTRACE.
> >
> > Thanks. But shouldn't we be doing the capability check at
> > userfaultfd(2) time (when we do the other permission checks), not
> > later, in the API ioctl?
>
> The ioctl seems reasonable to me.  In particular, if there is anyone
> who creates a userfaultfd as root and then drop permissions, a later
> ioctl could unexpectedly enable FORK.

Sure, but the same argument applies to all the other permission checks
that we do at open time, not at ioctl time. For better or for worse,
the DAC-ish model used in most places is that access checks happen at
file object creation time and anyone who has the FD can perform those
operations later. Confusing the model by doing *some* permission
checks at open time and *some* permission checks at usage time makes
the system harder to understand.
Andrea Arcangeli Nov. 5, 2019, 4:24 p.m. UTC | #6
On Tue, Nov 05, 2019 at 08:00:26AM -0800, Andy Lutomirski wrote:
> On Tue, Nov 5, 2019 at 7:55 AM Daniel Colascione <dancol@google.com> wrote:
> >
> > On Tue, Nov 5, 2019 at 7:29 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
> > >
> > > Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file
> > > descriptor table from the read() implementation of uffd, which may have
> > > security implications for unprivileged use of the userfaultfd.
> > >
> > > Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have
> > > CAP_SYS_PTRACE.
> >
> > Thanks. But shouldn't we be doing the capability check at
> > userfaultfd(2) time (when we do the other permission checks), not
> > later, in the API ioctl?
> 
> The ioctl seems reasonable to me.  In particular, if there is anyone
> who creates a userfaultfd as root and then drop permissions, a later
> ioctl could unexpectedly enable FORK.
> 
> This assumes that the code in question is only reachable through
> ioctl() and not write().

write isn't implemented. Until UFFDIO_API runs, all other implemented
syscalls are disabled (i.e. all other ioctls, poll and read). You can
quickly verify all the 3 blocks by searching for UFFD_STATE_WAIT_API,

UFFDIO_API is the place where the handshake with userland
happens. userland asks for certain features and the kernel
implementation of userlands answers yes or no.

Normally we would only ever return -EINVAL on a request of a feature
that isn't available in the running kernel (equivalent to -ENOSYS if
the syscall is entirely missing on an even older kernel), -EPERM is
more informative as it tells userland the feature is actually in the
kernel just it requires more permissions.

We could have returned -EINVAL too, but it wouldn't have made a
difference to non-privileged CRIU and we're not aware of other users
that could benefit from -EINVAL instead of -EPERM. This the relevant
CRIU userland:

        if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
                pr_perror("Failed to get uffd API");
                goto err;
	}

Unfortunately this is an ABI break, preferred than the clean removal
of the feature, because it's at least not going to break CRIU
deployments running with the PTRACE privilege. The clean removal while
non-ABI breaking, would have prevented all CRIU users to keep running
after a kernel upgrade.

The long term plan is to introduce UFFD_FEATURE_EVENT_FORK2 feature
flag that uses the ioctl to receive the child uffd, it'll consume more
CPU, but it wouldn't require the PTRACE privilege anymore.

Overall any suid or SCM_RIGHTS fd-receiving app, that isn't checking
the retval of open/socket or whatever fd "installing" syscall, is non
robust and is prone to break over time as more people edit the code or
as any library call internally change behavior, so if there's any
practical issue caused by this, it should be fixed in userland
too for higher robustness.

If you stick your userland to std::fs and std::net robustness against
issues like this is enforced by the language.

Thanks,
Andrea
Andrea Arcangeli Nov. 5, 2019, 4:33 p.m. UTC | #7
On Tue, Nov 05, 2019 at 08:06:49AM -0800, Daniel Colascione wrote:
> Sure, but the same argument applies to all the other permission checks
> that we do at open time, not at ioctl time. For better or for worse,
> the DAC-ish model used in most places is that access checks happen at
> file object creation time and anyone who has the FD can perform those
> operations later. Confusing the model by doing *some* permission
> checks at open time and *some* permission checks at usage time makes
> the system harder to understand.

The only case that requires change is if userland requested the
UFFD_FEATURE_EVENT_FORK feature (which AFIK only CRIU does) and that
request is done in the UFFDIO_API call not during the syscall.

Doing the check in the syscall would then break all non privileged
users like if we'd set /proc/sys/vm/unprivileged_userfaultfd to
zero. Qemu for example rightfully never runs with privilege (with a
few exceptions like Kata which should be fixed in fact) and it never
asks for the UFFD_FEATURE_EVENT_FORK feature either.
Daniel Colascione Nov. 5, 2019, 4:39 p.m. UTC | #8
On Tue, Nov 5, 2019 at 8:33 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
>
> On Tue, Nov 05, 2019 at 08:06:49AM -0800, Daniel Colascione wrote:
> > Sure, but the same argument applies to all the other permission checks
> > that we do at open time, not at ioctl time. For better or for worse,
> > the DAC-ish model used in most places is that access checks happen at
> > file object creation time and anyone who has the FD can perform those
> > operations later. Confusing the model by doing *some* permission
> > checks at open time and *some* permission checks at usage time makes
> > the system harder to understand.
>
> The only case that requires change is if userland requested the
> UFFD_FEATURE_EVENT_FORK feature (which AFIK only CRIU does) and that
> request is done in the UFFDIO_API call not during the syscall.
>
> Doing the check in the syscall would then break all non privileged
> users like if we'd set /proc/sys/vm/unprivileged_userfaultfd to
> zero.

I'm not suggesting that we fail userfaultfd(2) without CAP_SYS_PTRACE.
That would, as you point out, break things. I'm talking about
recording *whether* we had CAP_SYS_PTRACE in an internal flag in the
uffd context when we create the thing --- and then, at ioctl time,
checking that flag, not the caller's CAP_SYS_PTRACE, to see whether
UFFD_FEATURE_EVENT_FORK should be made available. This way, the
security check hinges on whether the caller *at create time* was
privileged.
Daniel Colascione Nov. 5, 2019, 4:41 p.m. UTC | #9
On Tue, Nov 5, 2019 at 8:24 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> The long term plan is to introduce UFFD_FEATURE_EVENT_FORK2 feature
> flag that uses the ioctl to receive the child uffd, it'll consume more
> CPU, but it wouldn't require the PTRACE privilege anymore.

Why not just have callers retrieve FDs using recvmsg? This way, you
retrieve the message packet and the file descriptor at the same time
and you don't need any appreciable extra CPU use.
Andrea Arcangeli Nov. 5, 2019, 4:55 p.m. UTC | #10
On Tue, Nov 05, 2019 at 08:39:26AM -0800, Daniel Colascione wrote:
> I'm not suggesting that we fail userfaultfd(2) without CAP_SYS_PTRACE.
> That would, as you point out, break things. I'm talking about
> recording *whether* we had CAP_SYS_PTRACE in an internal flag in the
> uffd context when we create the thing --- and then, at ioctl time,
> checking that flag, not the caller's CAP_SYS_PTRACE, to see whether
> UFFD_FEATURE_EVENT_FORK should be made available. This way, the
> security check hinges on whether the caller *at create time* was
> privileged.

Until now it wasn't clear to me you still wanted to do the permission
check in UFFDIO_API time, and you only intended to move the
"measurement" of the capability to the syscall.

So you're suggesting to add more kernel complexity to code pending for
removal to achieve a theoretically more pure solution in the band-aid
required to defer the removal of the posix-breaking read
implementation of the uffd fork feature?
Daniel Colascione Nov. 5, 2019, 5:02 p.m. UTC | #11
On Tue, Nov 5, 2019 at 8:56 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
>
> On Tue, Nov 05, 2019 at 08:39:26AM -0800, Daniel Colascione wrote:
> > I'm not suggesting that we fail userfaultfd(2) without CAP_SYS_PTRACE.
> > That would, as you point out, break things. I'm talking about
> > recording *whether* we had CAP_SYS_PTRACE in an internal flag in the
> > uffd context when we create the thing --- and then, at ioctl time,
> > checking that flag, not the caller's CAP_SYS_PTRACE, to see whether
> > UFFD_FEATURE_EVENT_FORK should be made available. This way, the
> > security check hinges on whether the caller *at create time* was
> > privileged.
>
> Until now it wasn't clear to me you still wanted to do the permission
> check in UFFDIO_API time, and you only intended to move the
> "measurement" of the capability to the syscall.
>
> So you're suggesting to add more kernel complexity to code pending for
> removal to achieve a theoretically more pure solution in the band-aid
> required to defer the removal of the posix-breaking read
> implementation of the uffd fork feature?

And you're suggesting making a security check work weirdly unlike most
other security checks because you hope it'll get removed one day?
Temporary solutions aren't, and if something goes into the kernel at
all, it's worth getting right. The general rule is that access checks
happen at open time. The kernel has already been bitten by UFFD
exempting itself from the normal rules (e.g., the
read(2)-makes-a-file-descriptor thing) in the name of expediency.
There shouldn't be any more exceptions.
Andrea Arcangeli Nov. 5, 2019, 5:30 p.m. UTC | #12
On Tue, Nov 05, 2019 at 09:02:09AM -0800, Daniel Colascione wrote:
> And you're suggesting making a security check work weirdly unlike most
> other security checks because you hope it'll get removed one day?

I didn't actually suggest that, I was only asking clarifications that
I understood correctly because up until that point you didn't seem to
say that the "permission check" needs to remain in UFFDIO_API.

> Temporary solutions aren't, and if something goes into the kernel at
> all, it's worth getting right. The general rule is that access checks
> happen at open time. The kernel has already been bitten by UFFD
> exempting itself from the normal rules (e.g., the
> read(2)-makes-a-file-descriptor thing) in the name of expediency.
> There shouldn't be any more exceptions.

It didn't occur to me that not doing the measurement in the syscall
that opens an fd is weird.

The posted patch doesn't work any different than fscrypt_ioctl_add_key
in FS_IOC_ADD_ENCRYPTION_KEY of ext4 and others, or
btrfs_ioctl_fssetxattr or a ton of other examples where permissions
are checked directly the in ioctl of the files and the measurement is
also done in the ioctl and not in the open() as you suggest as the
only non-weird solution that should exist in the kernel.

I can surely provide a lot more examples of the exact same paradigm
where the measurement of the capability is done in the ioctl, those
are the first two examples that show up so it's unlikely they're the
only ones.

So overall I didn't think this was something wrong to do, or weird or
something particularly new and I didn't look like we were bitting
anything with it. And more than in the name of expediency this simply
looks preferable to keep the complexity of the kernel low which in
turns it means it's going to be more secure and simpler to
maintain. Especially considering this code is likely to be modified
later.

Said that I've nothing contrary to do the more complex solution if
that's the correct thing to do despite more complex and despite the
code is pending for removal anyway, just I don't see any difference
between the current simple patch to what ext4_ioctl does in
FS_IOC_ADD_ENCRYPTION_KEY + FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR.
Andy Lutomirski Nov. 5, 2019, 10:01 p.m. UTC | #13
> On Nov 5, 2019, at 9:02 AM, Daniel Colascione <dancol@google.com> wrote:
> 
> On Tue, Nov 5, 2019 at 8:56 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
>> 
>>> On Tue, Nov 05, 2019 at 08:39:26AM -0800, Daniel Colascione wrote:
>>> I'm not suggesting that we fail userfaultfd(2) without CAP_SYS_PTRACE.
>>> That would, as you point out, break things. I'm talking about
>>> recording *whether* we had CAP_SYS_PTRACE in an internal flag in the
>>> uffd context when we create the thing --- and then, at ioctl time,
>>> checking that flag, not the caller's CAP_SYS_PTRACE, to see whether
>>> UFFD_FEATURE_EVENT_FORK should be made available. This way, the
>>> security check hinges on whether the caller *at create time* was
>>> privileged.
>> 
>> Until now it wasn't clear to me you still wanted to do the permission
>> check in UFFDIO_API time, and you only intended to move the
>> "measurement" of the capability to the syscall.
>> 
>> So you're suggesting to add more kernel complexity to code pending for
>> removal to achieve a theoretically more pure solution in the band-aid
>> required to defer the removal of the posix-breaking read
>> implementation of the uffd fork feature?
> 
> And you're suggesting making a security check work weirdly unlike most
> other security checks because you hope it'll get removed one day?
> Temporary solutions aren't, and if something goes into the kernel at
> all, it's worth getting right. The general rule is that access checks
> happen at open time. The kernel has already been bitten by UFFD
> exempting itself from the normal rules (e.g., the
> read(2)-makes-a-file-descriptor thing) in the name of expediency.
> There shouldn't be any more exceptions.

I don’t think ioctl() checking permission is particularly unusual. In principle, it’s better than open for a retrofit — open didn’t capture this permission in the past, so adding it makes an existing capability stronger than it was, which isn’t fantastic.
Daniel Colascione Nov. 5, 2019, 10:10 p.m. UTC | #14
On Tue, Nov 5, 2019 at 2:01 PM Andy Lutomirski <luto@amacapital.net> wrote:
> > On Nov 5, 2019, at 9:02 AM, Daniel Colascione <dancol@google.com> wrote:
> >
> > On Tue, Nov 5, 2019 at 8:56 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> >>
> >>> On Tue, Nov 05, 2019 at 08:39:26AM -0800, Daniel Colascione wrote:
> >>> I'm not suggesting that we fail userfaultfd(2) without CAP_SYS_PTRACE.
> >>> That would, as you point out, break things. I'm talking about
> >>> recording *whether* we had CAP_SYS_PTRACE in an internal flag in the
> >>> uffd context when we create the thing --- and then, at ioctl time,
> >>> checking that flag, not the caller's CAP_SYS_PTRACE, to see whether
> >>> UFFD_FEATURE_EVENT_FORK should be made available. This way, the
> >>> security check hinges on whether the caller *at create time* was
> >>> privileged.
> >>
> >> Until now it wasn't clear to me you still wanted to do the permission
> >> check in UFFDIO_API time, and you only intended to move the
> >> "measurement" of the capability to the syscall.
> >>
> >> So you're suggesting to add more kernel complexity to code pending for
> >> removal to achieve a theoretically more pure solution in the band-aid
> >> required to defer the removal of the posix-breaking read
> >> implementation of the uffd fork feature?
> >
> > And you're suggesting making a security check work weirdly unlike most
> > other security checks because you hope it'll get removed one day?
> > Temporary solutions aren't, and if something goes into the kernel at
> > all, it's worth getting right. The general rule is that access checks
> > happen at open time. The kernel has already been bitten by UFFD
> > exempting itself from the normal rules (e.g., the
> > read(2)-makes-a-file-descriptor thing) in the name of expediency.
> > There shouldn't be any more exceptions.
>
> I don’t think ioctl() checking permission is particularly unusual. In principle, it’s better than open for a retrofit — open didn’t capture this permission in the past, so adding it makes an existing capability stronger than it was, which isn’t fantastic.

All right, let's do it the way the OP's patch does it then.
Mike Rapoport Nov. 7, 2019, 8:39 a.m. UTC | #15
Hi Daniel,

On Tue, Nov 05, 2019 at 08:41:18AM -0800, Daniel Colascione wrote:
> On Tue, Nov 5, 2019 at 8:24 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> > The long term plan is to introduce UFFD_FEATURE_EVENT_FORK2 feature
> > flag that uses the ioctl to receive the child uffd, it'll consume more
> > CPU, but it wouldn't require the PTRACE privilege anymore.
> 
> Why not just have callers retrieve FDs using recvmsg? This way, you
> retrieve the message packet and the file descriptor at the same time
> and you don't need any appreciable extra CPU use.

I don't follow you here. Can you elaborate on how recvmsg would be used in
this case?
Daniel Colascione Nov. 7, 2019, 8:54 a.m. UTC | #16
On Thu, Nov 7, 2019 at 12:39 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
> On Tue, Nov 05, 2019 at 08:41:18AM -0800, Daniel Colascione wrote:
> > On Tue, Nov 5, 2019 at 8:24 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> > > The long term plan is to introduce UFFD_FEATURE_EVENT_FORK2 feature
> > > flag that uses the ioctl to receive the child uffd, it'll consume more
> > > CPU, but it wouldn't require the PTRACE privilege anymore.
> >
> > Why not just have callers retrieve FDs using recvmsg? This way, you
> > retrieve the message packet and the file descriptor at the same time
> > and you don't need any appreciable extra CPU use.
>
> I don't follow you here. Can you elaborate on how recvmsg would be used in
> this case?

Imagine an AF_UNIX SOCK_DGRAM socket. You call recvmsg(). You get a
blob of regular data along with some ancillary data. The ancillary
data may include some file descriptors or it may not. Isn't the UFFD
message model the same thing? You'd call recvmsg() on a UFFD and get
back a uffd_msg data structure. If that uffd_msg came with file
descriptors, these descriptors would be in ancillary data. If you
didn't reserve enough space for the message or enough space for its
ancillary data, the recvmsg() call would fail cleanly with MSG_TRUNC
or MSG_CTRUNC.

The nice thing about using recvmsg() for this purpose is that there's
tons of existing code for dealing with recvmsg()'s calling convention
and its ancillary data. You can, for example, use recvmsg out of the
box in a Python script. You could make an ioctl that also returned a
data blob plus some optional file descriptors, but if recvmsg already
does exactly that job and it's well-understood, why not just reuse the
recvmsg interface?

How practical is it to actually support recvmsg without being a
socket? How hard would it be to just become a socket? I don't know. My
point is only that *from a userspace API* point of view, recvmsg()
seems ideal.
Andrea Arcangeli Nov. 7, 2019, 3:38 p.m. UTC | #17
Hello,

On Thu, Nov 07, 2019 at 12:54:59AM -0800, Daniel Colascione wrote:
> On Thu, Nov 7, 2019 at 12:39 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
> > On Tue, Nov 05, 2019 at 08:41:18AM -0800, Daniel Colascione wrote:
> > > On Tue, Nov 5, 2019 at 8:24 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> > > > The long term plan is to introduce UFFD_FEATURE_EVENT_FORK2 feature
> > > > flag that uses the ioctl to receive the child uffd, it'll consume more
> > > > CPU, but it wouldn't require the PTRACE privilege anymore.
> > >
> > > Why not just have callers retrieve FDs using recvmsg? This way, you
> > > retrieve the message packet and the file descriptor at the same time
> > > and you don't need any appreciable extra CPU use.
> >
> > I don't follow you here. Can you elaborate on how recvmsg would be used in
> > this case?
> 
> Imagine an AF_UNIX SOCK_DGRAM socket. You call recvmsg(). You get a
> blob of regular data along with some ancillary data. The ancillary
> data may include some file descriptors or it may not. Isn't the UFFD
> message model the same thing? You'd call recvmsg() on a UFFD and get
> back a uffd_msg data structure. If that uffd_msg came with file
> descriptors, these descriptors would be in ancillary data. If you
> didn't reserve enough space for the message or enough space for its
> ancillary data, the recvmsg() call would fail cleanly with MSG_TRUNC
> or MSG_CTRUNC.

Having to check for truncation is just a slowdown doesn't sound a
feature here but just a complication and unnecessary branches. You can
already read as much as you want in multiples of the uffd size.

> The nice thing about using recvmsg() for this purpose is that there's
> tons of existing code for dealing with recvmsg()'s calling convention
> and its ancillary data. You can, for example, use recvmsg out of the
> box in a Python script. You could make an ioctl that also returned a
> data blob plus some optional file descriptors, but if recvmsg already
> does exactly that job and it's well-understood, why not just reuse the
> recvmsg interface?

uffd can't become an plain AF_UNIX because on the other end there's no
other process but the kernel. Even if it could the fact it'd
facilitate a pure python backend isn't relevant because handling page
faults is a performance critical system activity, and rust can do the
ioctl like it can do poll/epoll without mio/tokyo by just calling
glibc. We can't write kernel code in python either for the same
reason.

> How practical is it to actually support recvmsg without being a
> socket? How hard would it be to just become a socket? I don't know. My

AF_UINIX has more features than we need (credentials) and dealing with
skbs and truncation would slow down the protocol. The objective is to
get the highest performance possible out of the uffd API so that it
performs as close as possible to running page faults in the kernel.

So even if we could avoid a syscall in CRIU, but we'd be slowing down
QEMU and all other normal cooperative usages if we made uffd a
socket. So overall it would be a net loss.

> point is only that *from a userspace API* point of view, recvmsg()
> seems ideal.

Now thinking about this, the semantics of the ancillary data seems to
be per socket family. So what does prevent you to create an AF_UNIX
socket, send it to a SCM_RIGHTS receiving daemon unaware that it is
getting an AF_UNIX socket. The daemon is calling recvmsg on the fd it
receives from SCM_RIGHTS in order to receive ancillary data from
another non-AF_UNIX family instead (it is irrelevant what the
semantics of the ancillary data are but they're not AF_UNIX). So the
daemon calls recvmsg and it will not understand that the fd in the
ancillary data represents an installed "fd" in the fd space and in
turn still gets the fd involuntary installed with the exact same side
effects of what we're fixing in the uffd fork event read?

I guess there shall be something somewhere that prevents recvmsg to
run on anything but an AF_UNIX if msg_control isn't NULL and
msg_controllen > 0? Otherwise even if we implemented the uffd fork
event with recvmsg, we would be back to square one.

As a corollary this could also imply we don't need the ptrace check
after all if the same thing can happen already to SCM_RIGHTS receiving
daemon expecting to receive ancillary data from AF_SOMETHING but
getting an AF_UNIX instead through SCM_RIGHTS (just like the uffd
example was expecting to call read() on a normal fd and instead it got
an uffd).

I'm sure there's something stopping SCM_RIGHTS to have the same
pitfalls of uffd event fork and that makes recvmsg safe unlike read()
but then it's not immediately clear what it is.

Thanks,
Andrea
Daniel Colascione Nov. 7, 2019, 4:15 p.m. UTC | #18
On Thu, Nov 7, 2019 at 7:38 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Thu, Nov 07, 2019 at 12:54:59AM -0800, Daniel Colascione wrote:
> > On Thu, Nov 7, 2019 at 12:39 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
> > > On Tue, Nov 05, 2019 at 08:41:18AM -0800, Daniel Colascione wrote:
> > > > On Tue, Nov 5, 2019 at 8:24 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> > > > > The long term plan is to introduce UFFD_FEATURE_EVENT_FORK2 feature
> > > > > flag that uses the ioctl to receive the child uffd, it'll consume more
> > > > > CPU, but it wouldn't require the PTRACE privilege anymore.
> > > >
> > > > Why not just have callers retrieve FDs using recvmsg? This way, you
> > > > retrieve the message packet and the file descriptor at the same time
> > > > and you don't need any appreciable extra CPU use.
> > >
> > > I don't follow you here. Can you elaborate on how recvmsg would be used in
> > > this case?
> >
> > Imagine an AF_UNIX SOCK_DGRAM socket. You call recvmsg(). You get a
> > blob of regular data along with some ancillary data. The ancillary
> > data may include some file descriptors or it may not. Isn't the UFFD
> > message model the same thing? You'd call recvmsg() on a UFFD and get
> > back a uffd_msg data structure. If that uffd_msg came with file
> > descriptors, these descriptors would be in ancillary data. If you
> > didn't reserve enough space for the message or enough space for its
> > ancillary data, the recvmsg() call would fail cleanly with MSG_TRUNC
> > or MSG_CTRUNC.
>
> Having to check for truncation is just a slowdown doesn't sound a
> feature here but just a complication and unnecessary branches. You can
> already read as much as you want in multiples of the uffd size.

You're already paying for bounds checking. Receiving a message via a
datagram socket is basically the same thing as what UFFD's read is
doing anyway.

> > The nice thing about using recvmsg() for this purpose is that there's
> > tons of existing code for dealing with recvmsg()'s calling convention
> > and its ancillary data. You can, for example, use recvmsg out of the
> > box in a Python script. You could make an ioctl that also returned a
> > data blob plus some optional file descriptors, but if recvmsg already
> > does exactly that job and it's well-understood, why not just reuse the
> > recvmsg interface?
>
> uffd can't become an plain AF_UNIX because on the other end there's no
> other process but the kernel. Even if it could the fact it'd
> facilitate a pure python backend isn't relevant because handling page
> faults is a performance critical system activity, and rust can do the
> ioctl like it can do poll/epoll without mio/tokyo by just calling
> glibc. We can't write kernel code in python either for the same
> reason.

My point isn't "hey, you should write this in Python". (Although for
prototyping, why not?) My point is that where there's an existing
kernel interface for exactly the functionality you want, you should
use it instead of inventing some new thing, because when we use the
same interface for things have the same shape and purpose, we not only
get to reuse code, but also the knowledge in people's heads.

> > point is only that *from a userspace API* point of view, recvmsg()
> > seems ideal.
>
> Now thinking about this, the semantics of the ancillary data seems to
> be per socket family. So what does prevent you to create an AF_UNIX
> socket, send it to a SCM_RIGHTS receiving daemon unaware that it is
> getting an AF_UNIX socket. The daemon is calling recvmsg on the fd it
> receives from SCM_RIGHTS in order to receive ancillary data from
> another non-AF_UNIX family instead (it is irrelevant what the
> semantics of the ancillary data are but they're not AF_UNIX). So the
> daemon calls recvmsg and it will not understand that the fd in the
> ancillary data represents an installed "fd" in the fd space and in
> turn still gets the fd involuntary installed with the exact same side
> effects of what we're fixing in the uffd fork event read?

SCM_RIGHTS (AFAIK) is the only bit of ancillary data which indicates
that the kernel has created a file descriptor in the process doing the
recvmsg.

> I guess there shall be something somewhere that prevents recvmsg to
> run on anything but an AF_UNIX if msg_control isn't NULL and
> msg_controllen > 0? Otherwise even if we implemented the uffd fork
> event with recvmsg, we would be back to square one.

Why would we limit recvmsg to AF_UNIX? We can receive ancillary data
on other sockets, e.g., netlink. SCM_RIGHTS works only with AF_UNIX
right now, but this limitation isn't written in stone.

> As a corollary this could also imply we don't need the ptrace check
> after all if the same thing can happen already to SCM_RIGHTS receiving
> daemon expecting to receive ancillary data from AF_SOMETHING but
> getting an AF_UNIX instead through SCM_RIGHTS (just like the uffd
> example was expecting to call read() on a normal fd and instead it got
> an uffd).

Programs generally don't go calling recvmsg() on random FDs they get
from the outside world. They do call read() on those FDs, which is why
read() having unexpected side effects is terrible.

> I'm sure there's something stopping SCM_RIGHTS to have the same
> pitfalls of uffd event fork and that makes recvmsg safe unlike read()
> but then it's not immediately clear what it is.

If you call it with a non-empty ancillary data buffer, you know to
react to what you get. You're *opting into* the possibility of getting
file descriptors. Sure, it's theoretically possible that a program
calls recvmsg on random FDs it gets from unknown sources, sees
SCM_RIGHTS unexpectedly, and just the SCM_RIGHTS message and its FD
payload, but that's an outright bug, while calling read() on stdin is
no bug.

Anyway, IMHO, UFFD should be a netlink-like SOCK_DGRAM socket that
sends FDs with SCM_RIGHTS. This interface is already very efficient --
people have been optimizing the hell out of AF_UNIX for decades ---
and this interface provides exactly the right interface semantics for
what UFFD needs to do.
Andrea Arcangeli Nov. 7, 2019, 6:22 p.m. UTC | #19
On Thu, Nov 07, 2019 at 08:15:53AM -0800, Daniel Colascione wrote:
> You're already paying for bounds checking. Receiving a message via a
> datagram socket is basically the same thing as what UFFD's read is
> doing anyway.

Except it's synchronous and there are no dynamic allocations required
in uffd, while af_netlink and af_unix both all deal with queue of
events in skbs dynamically allocated.

Ultimately if we strip away the skbs for performance reasons, there
wouldn't be much code to share, so if the only benefit would be to
call recvmsg which would still be as insecure as read for the "worse"
case than suid, so I don't see the point.

And should then eventfd also become a netlink then? I mean uffd was
supposed to work like eventfd except it requires specialized events.

> Programs generally don't go calling recvmsg() on random FDs they get
> from the outside world. They do call read() on those FDs, which is why

That programs generally don't do something only means the attack is
less probable.

Programs generally aren't suid. Programs generally don't use
SCM_RIGHTS. Programs generally don't ignore the retval of
open/socket/uffd syscalls. Programs generally don't make assumptions
on the fd ID after one of those syscalls that install fds.

If all programs generally do the right thing (where the most important
is to not make assumptions on the fd IDs and to check all syscall
retvals), there was never an issue to begin with even in uffd API.

> read() having unexpected side effects is terrible.

If having unexpected side effects in read is "terrible" (i.e. I
personally prefer to use terms like terrible when there's at least
something that can be exploited in practice, not for theoretical
issues) for an SCM_RIGHTS receiving daemon, I just don't see how the
exact same unexpected (still theoretical) side effects in recvmsg with
an unexpected nested cmsg->cmsg_type == SCM_RIGHTS message being
returned, isn't terrible too.

> If you call it with a non-empty ancillary data buffer, you know to
> react to what you get. You're *opting into* the possibility of getting
> file descriptors. Sure, it's theoretically possible that a program
> calls recvmsg on random FDs it gets from unknown sources, sees
> SCM_RIGHTS unexpectedly, and just the SCM_RIGHTS message and its FD
> payload, but that's an outright bug, while calling read() on stdin is
> no bug.

I'm not talking about stdin and suid. recvmsg might mitigate the
concern for suid (not certain, depends on the suid, if it's generally
doing what you expect most suid to be doing or not), I was talking
about the SCM_RIGHTS receiving daemon instead, the "worse" more
concerning case than the suid.

I quote below Andy's relevant email:

======
It's worse if SCM_RIGHTS is involved.
======

Not all software will do this after calling recvmsg:

    if (cmsg->cmsg_type == SCM_RIGHTS) {
      /* oops we got attacked and an fd was involountarily installed
         because we received another AF_UNIX from a malicious attacker
         in control of the other end of the SCM_RIGHTS-receiving
         AF_UNIX connection instead of our expected socket family
         which doesn't even support SCM_RIGHTS so we would never have
         noticed an fd was installed after recvmsg */
    }
Daniel Colascione Nov. 7, 2019, 6:50 p.m. UTC | #20
On Thu, Nov 7, 2019 at 10:23 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
>
> On Thu, Nov 07, 2019 at 08:15:53AM -0800, Daniel Colascione wrote:
> > You're already paying for bounds checking. Receiving a message via a
> > datagram socket is basically the same thing as what UFFD's read is
> > doing anyway.
>
> Except it's synchronous and there are no dynamic allocations required
> in uffd, while af_netlink and af_unix both all deal with queue of
> events in skbs dynamically allocated.

Do you have any evidence that skb allocation is a significant cost
compared to a page fault and schedule? Regardless: if you don't want
to use skbs, don't. My point is that recvmsg is the ideal interface
for UFFD and i'm agnostic on the implementation of this interface.

> And should then eventfd also become a netlink then? I mean uffd was
> supposed to work like eventfd except it requires specialized events.

You've raised eventfd as a model for UFFD on several occasions. I
don't think eventfd is a good reference point. An eventfd is a single
object with 64 bits of state. It can notify interested parties in
changes to that state. Eventfd does not provide a queue. UFFD,
however, *is* a queue. It provides an arbitrary number of state change
notifications to a reader. In this way, UFFD is much more like a
socket than it's like eventfd. That is, eventfd is about level-change
notifications, but UFFD is about sending messages.

> > Programs generally don't go calling recvmsg() on random FDs they get
> > from the outside world. They do call read() on those FDs, which is why
>
> That programs generally don't do something only means the attack is
> less probable.
>
> Programs generally aren't suid. Programs generally don't use
> SCM_RIGHTS. Programs generally don't ignore the retval of
> open/socket/uffd syscalls. Programs generally don't make assumptions
> on the fd ID after one of those syscalls that install fds.
>
> If all programs generally do the right thing (where the most important
> is to not make assumptions on the fd IDs and to check all syscall
> retvals), there was never an issue to begin with even in uffd API.

"The right thing" is a matter of contracts. If a program calls read()
and behaves as if read() has  only the effects read() is documented to
have, that means that from the kernel's point of view, the program is
doing the right thing. That you think certain practices are more
prudent than others is irrelevant here. UFFD is a violation of
read()'s *contract* and so if programs break after calling read(),
it's the *kernel*'s fault.

> > read() having unexpected side effects is terrible.
>
> If having unexpected side effects in read is "terrible" (i.e. I
> personally prefer to use terms like terrible when there's at least
> something that can be exploited in practice, not for theoretical
> issues) for an SCM_RIGHTS receiving daemon, I just don't see how the
> exact same unexpected (still theoretical) side effects in recvmsg with
> an unexpected nested cmsg->cmsg_type == SCM_RIGHTS message being
> returned, isn't terrible too.

If a program calls recvmsg on an FD of unknown provenance, it *must*
be prepared to receive file descriptors via SCM_RIGHTS. If it doesn't,
it's a bug. The contract the kernel makes with userspace for recvmsg()
includes the possibility of creating file descriptors. The contract
the kernel makes with userspace for read() does not ordinarily involve
creating file descriptors, so if the kernel does in fact do that, it's
the kernel's problem.

> > If you call it with a non-empty ancillary data buffer, you know to
> > react to what you get. You're *opting into* the possibility of getting
> > file descriptors. Sure, it's theoretically possible that a program
> > calls recvmsg on random FDs it gets from unknown sources, sees
> > SCM_RIGHTS unexpectedly, and just the SCM_RIGHTS message and its FD
> > payload, but that's an outright bug, while calling read() on stdin is
> > no bug.
>
> I'm not talking about stdin and suid. recvmsg might mitigate the
> concern for suid (not certain, depends on the suid, if it's generally
> doing what you expect most suid to be doing or not), I was talking
> about the SCM_RIGHTS receiving daemon instead, the "worse" more
> concerning case than the suid.
>
> I quote below Andy's relevant email:
>
> ======
> It's worse if SCM_RIGHTS is involved.
> ======
>
> Not all software will do this after calling recvmsg:
>
>     if (cmsg->cmsg_type == SCM_RIGHTS) {
>       /* oops we got attacked and an fd was involountarily installed
>          because we received another AF_UNIX from a malicious attacker
>          in control of the other end of the SCM_RIGHTS-receiving
>          AF_UNIX connection instead of our expected socket family
>          which doesn't even support SCM_RIGHTS so we would never have
>          noticed an fd was installed after recvmsg */
>     }

If a program omits this code after calling recvmsg on a file
descriptor of unknown provenance and the program breaks, it's the
program's fault. It's reasonable to epect that recvmsg might create
file descriptors if you call it on an unknown FD. It's unreasonable to
expect a program to consider the possibility of read() creating file
descriptors because read isn't documented to do that.

>
Andrea Arcangeli Nov. 7, 2019, 7:27 p.m. UTC | #21
On Thu, Nov 07, 2019 at 10:50:26AM -0800, Daniel Colascione wrote:
> On Thu, Nov 7, 2019 at 10:23 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
> > Not all software will do this after calling recvmsg:
> >
> >     if (cmsg->cmsg_type == SCM_RIGHTS) {
> >       /* oops we got attacked and an fd was involountarily installed
> >          because we received another AF_UNIX from a malicious attacker
> >          in control of the other end of the SCM_RIGHTS-receiving
> >          AF_UNIX connection instead of our expected socket family
> >          which doesn't even support SCM_RIGHTS so we would never have
> >          noticed an fd was installed after recvmsg */
> >     }
> 
> If a program omits this code after calling recvmsg on a file
> descriptor of unknown provenance and the program breaks, it's the
> program's fault. [..]

Hmm, ok, would it be possible to do a research effort and check how
much software that receives an fd through SCM_RIGHTS and then calls
recvmsg on it, always remembers to follow the recvsmg with a if
(cmsg->cmsg_type == SCM_RIGHTS) path that closes the involuntarily
opened fd?

Frankly until today, I didn't realize that not adding a specialized
"cmsg->cmsg_type == SCM_RIGHTS" check after every recvmsg executed on
any fd received with SCM_RIGHTS, was a program's fault and it made the
program vulnerable (non robust) from an attack from the other end of
the AF_UNIX pipe just like if the program issued a read() with uffd
event fork being sent through SCM_RIGHTS (except in that case it
wasn't program's fault because read had not to install the fd while
recvmsg can always install fd if cmsg_type == SCM_RIGHTS is returned).

The main argument in favor of recvmsg would be that even if we use it
for uffd too, it won't make recvmsg on an fd received from SCM_RIGHTS
any less secure than it already is, but it's not exactly an example of
robustness in terms of API if it's a program's fault if it doesn't
follow every recvmsg with the above quoted check.

The ioctl is much more robust because there's no chance that somebody
can be attacked by forgetting a specialized non intuitive check after
calling the specialized ioctl that installs the fd.

Thanks,
Andrea
Andy Lutomirski Nov. 10, 2019, 5:02 p.m. UTC | #22
On Thu, Nov 7, 2019 at 10:23 AM Andrea Arcangeli <aarcange@redhat.com> wrote:
>
> On Thu, Nov 07, 2019 at 08:15:53AM -0800, Daniel Colascione wrote:
> > You're already paying for bounds checking. Receiving a message via a
> > datagram socket is basically the same thing as what UFFD's read is
> > doing anyway.
>
> Except it's synchronous and there are no dynamic allocations required
> in uffd, while af_netlink and af_unix both all deal with queue of
> events in skbs dynamically allocated.
>
> Ultimately if we strip away the skbs for performance reasons, there
> wouldn't be much code to share, so if the only benefit would be to
> call recvmsg which would still be as insecure as read for the "worse"
> case than suid, so I don't see the point.

Not sure what you mean.

>
> And should then eventfd also become a netlink then? I mean uffd was
> supposed to work like eventfd except it requires specialized events.

No. None of this even means that these objects should be sockets per
se.  The point is that anyone who calls recvmsg() and passes a control
buf *must* handle SCM_RIGHTS because even very old Unixes can
materialize file descriptors.  The only exception is if the program
knows a priori that the fd refers to a socket that can't use
SCM_RIGHTS.

In other words, failing to handle file descriptors returned by
recvmsg() is an application bug.  Failing to handle file descriptors
returned by read() is not an application bug -- it's a kernel bug.

> > If you call it with a non-empty ancillary data buffer, you know to
> > react to what you get. You're *opting into* the possibility of getting
> > file descriptors. Sure, it's theoretically possible that a program
> > calls recvmsg on random FDs it gets from unknown sources, sees
> > SCM_RIGHTS unexpectedly, and just the SCM_RIGHTS message and its FD
> > payload, but that's an outright bug, while calling read() on stdin is
> > no bug.
>
> I'm not talking about stdin and suid. recvmsg might mitigate the
> concern for suid (not certain, depends on the suid, if it's generally
> doing what you expect most suid to be doing or not), I was talking
> about the SCM_RIGHTS receiving daemon instead, the "worse" more
> concerning case than the suid.
>
> I quote below Andy's relevant email:
>
> ======
> It's worse if SCM_RIGHTS is involved.
> ======
>
> Not all software will do this after calling recvmsg:
>
>     if (cmsg->cmsg_type == SCM_RIGHTS) {
>       /* oops we got attacked and an fd was involountarily installed
>          because we received another AF_UNIX from a malicious attacker
>          in control of the other end of the SCM_RIGHTS-receiving
>          AF_UNIX connection instead of our expected socket family
>          which doesn't even support SCM_RIGHTS so we would never have
>          noticed an fd was installed after recvmsg */
>     }
>

You've misunderstood what you're quoting me as saying.  I'm saying
that the issue is worse if you pass the userfaultfd via SCM_RIGHTS to
an unsuspecting program.  It is perfectly valid to receive a file
descriptor via SCM_RIGHTS and then call read(), at least so long as
you are okay with potentially blocking.

If you receive a fd to a socket using SCM_RIGHTS and then you fail to
check cmsg_type as above, then you have a bug regardless of
userfaultfd.

--Andy
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f9fd18670e22..d99d166fd892 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1834,13 +1834,12 @@  static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
 		goto out;
 	features = uffdio_api.features;
-	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
-		memset(&uffdio_api, 0, sizeof(uffdio_api));
-		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
-			goto out;
-		ret = -EINVAL;
-		goto out;
-	}
+	ret = -EINVAL;
+	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+		goto err_out;
+	ret = -EPERM;
+	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
+		goto err_out;
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
@@ -1853,6 +1852,11 @@  static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	ret = 0;
 out:
 	return ret;
+err_out:
+	memset(&uffdio_api, 0, sizeof(uffdio_api));
+	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+		ret = -EFAULT;
+	goto out;
 }
 
 static long userfaultfd_ioctl(struct file *file, unsigned cmd,