diff mbox series

[3/3] util/userfaultfd: Support /dev/userfaultfd

Message ID 20230125224016.212529-4-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series util/userfaultfd: Support /dev/userfaultfd | expand

Commit Message

Peter Xu Jan. 25, 2023, 10:40 p.m. UTC
Teach QEMU to use /dev/userfaultfd when it exists, and fall back to the
system call if either it's not there or QEMU doesn't have enough permission.

Firstly, as long as the app has permission to access /dev/userfaultfd, it
always has the ability to trap kernel faults, which QEMU mostly wants.
Meanwhile, in some contexts (e.g. containers) the userfaultfd syscall can be
forbidden, so /dev/userfaultfd can be the main way to use postcopy in a
restricted environment with a strict seccomp setup.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 util/trace-events  |  1 +
 util/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

Comments

Philippe Mathieu-Daudé Jan. 25, 2023, 11:08 p.m. UTC | #1
On 25/1/23 23:40, Peter Xu wrote:
> Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> system call if either it's not there or doesn't have enough permission.
> 
> Firstly, as long as the app has permission to access /dev/userfaultfd, it
> always have the ability to trap kernel faults which QEMU mostly wants.
> Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> forbidden, so it can be the major way to use postcopy in a restricted
> environment with strict seccomp setup.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   util/trace-events  |  1 +
>   util/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++++++
>   2 files changed, 37 insertions(+)
> 
> diff --git a/util/trace-events b/util/trace-events
> index c8f53d7d9f..16f78d8fe5 100644
> --- a/util/trace-events
> +++ b/util/trace-events
> @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
>   qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
>   
>   #userfaultfd.c
> +uffd_detect_open_mode(int mode) "%d"
>   uffd_query_features_nosys(int err) "errno: %i"
>   uffd_query_features_api_failed(int err) "errno: %i"
>   uffd_create_fd_nosys(int err) "errno: %i"
> diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> index 9845a2ec81..360ecf8084 100644
> --- a/util/userfaultfd.c
> +++ b/util/userfaultfd.c
> @@ -18,10 +18,46 @@
>   #include <poll.h>
>   #include <sys/syscall.h>
>   #include <sys/ioctl.h>
> +#include <fcntl.h>
> +
> +typedef enum {
> +    UFFD_UNINITIALIZED = 0,
> +    UFFD_USE_DEV_PATH,
> +    UFFD_USE_SYSCALL,
> +} uffd_open_mode;
> +
> +static uffd_open_mode open_mode;

'open_mode' could be reduced to uffd_detect_open_mode()'s
scope.

> +static int uffd_dev;
> +
> +static uffd_open_mode uffd_detect_open_mode(void)
> +{
> +    if (open_mode == UFFD_UNINITIALIZED) {
> +        /*
> +         * Make /dev/userfaultfd the default approach because it has better
> +         * permission controls, meanwhile allows kernel faults without any
> +         * privilege requirement (e.g. SYS_CAP_PTRACE).
> +         */
> +        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> +        if (uffd_dev >= 0) {
> +            open_mode = UFFD_USE_DEV_PATH;
> +        } else {
> +            /* Fallback to the system call */
> +            open_mode = UFFD_USE_SYSCALL;
> +        }
> +        trace_uffd_detect_open_mode(open_mode);
> +    }
> +
> +    return open_mode;

If 'open_mode' isn't relevant, this function could return uffd_dev/-1 
instead. Not really an improvement :)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>

> +}
>   
>   int uffd_open(int flags)
>   {
>   #if defined(__linux__) && defined(__NR_userfaultfd)
> +    if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) {
> +        assert(uffd_dev >= 0);
> +        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
> +    }
> +
>       return syscall(__NR_userfaultfd, flags);
>   #else
>       return -EINVAL;
Daniel P. Berrangé Jan. 26, 2023, 9:02 a.m. UTC | #2
On Wed, Jan 25, 2023 at 05:40:16PM -0500, Peter Xu wrote:
> Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> system call if either it's not there or doesn't have enough permission.
> 
> Firstly, as long as the app has permission to access /dev/userfaultfd, it
> always have the ability to trap kernel faults which QEMU mostly wants.
> Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> forbidden, so it can be the major way to use postcopy in a restricted
> environment with strict seccomp setup.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  util/trace-events  |  1 +
>  util/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 37 insertions(+)
> 
> diff --git a/util/trace-events b/util/trace-events
> index c8f53d7d9f..16f78d8fe5 100644
> --- a/util/trace-events
> +++ b/util/trace-events
> @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
>  qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
>  
>  #userfaultfd.c
> +uffd_detect_open_mode(int mode) "%d"
>  uffd_query_features_nosys(int err) "errno: %i"
>  uffd_query_features_api_failed(int err) "errno: %i"
>  uffd_create_fd_nosys(int err) "errno: %i"
> diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> index 9845a2ec81..360ecf8084 100644
> --- a/util/userfaultfd.c
> +++ b/util/userfaultfd.c
> @@ -18,10 +18,46 @@
>  #include <poll.h>
>  #include <sys/syscall.h>
>  #include <sys/ioctl.h>
> +#include <fcntl.h>
> +
> +typedef enum {
> +    UFFD_UNINITIALIZED = 0,
> +    UFFD_USE_DEV_PATH,
> +    UFFD_USE_SYSCALL,
> +} uffd_open_mode;
> +
> +static uffd_open_mode open_mode;
> +static int uffd_dev;
> +
> +static uffd_open_mode uffd_detect_open_mode(void)
> +{
> +    if (open_mode == UFFD_UNINITIALIZED) {
> +        /*
> +         * Make /dev/userfaultfd the default approach because it has better
> +         * permission controls, meanwhile allows kernel faults without any
> +         * privilege requirement (e.g. SYS_CAP_PTRACE).
> +         */
> +        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);

qemu_open(), otherwise FD passing from the mgmt app won't work.

> +        if (uffd_dev >= 0) {
> +            open_mode = UFFD_USE_DEV_PATH;
> +        } else {
> +            /* Fallback to the system call */
> +            open_mode = UFFD_USE_SYSCALL;
> +        }
> +        trace_uffd_detect_open_mode(open_mode);
> +    }
> +
> +    return open_mode;
> +}

This leaves the /dev/userfaultfd FD open forever after its first use.
Is this really needed? IIUC, the place where we call this is
not going to be impacted if we open + close it every time we need to
create a new FD, and it'll simplify this code right down.

>  
>  int uffd_open(int flags)
>  {
>  #if defined(__linux__) && defined(__NR_userfaultfd)
> +    if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) {
> +        assert(uffd_dev >= 0);
> +        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
> +    }
> +
>      return syscall(__NR_userfaultfd, flags);
>  #else
>      return -EINVAL;
> -- 
> 2.37.3
> 
> 

With regards,
Daniel
Daniel P. Berrangé Jan. 26, 2023, 9:05 a.m. UTC | #3
On Thu, Jan 26, 2023 at 09:02:09AM +0000, Daniel P. Berrangé wrote:
> On Wed, Jan 25, 2023 at 05:40:16PM -0500, Peter Xu wrote:
> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> > system call if either it's not there or doesn't have enough permission.
> > 
> > Firstly, as long as the app has permission to access /dev/userfaultfd, it
> > always have the ability to trap kernel faults which QEMU mostly wants.
> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> > forbidden, so it can be the major way to use postcopy in a restricted
> > environment with strict seccomp setup.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  util/trace-events  |  1 +
> >  util/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++++++
> >  2 files changed, 37 insertions(+)
> > 
> > diff --git a/util/trace-events b/util/trace-events
> > index c8f53d7d9f..16f78d8fe5 100644
> > --- a/util/trace-events
> > +++ b/util/trace-events
> > @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
> >  qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
> >  
> >  #userfaultfd.c
> > +uffd_detect_open_mode(int mode) "%d"
> >  uffd_query_features_nosys(int err) "errno: %i"
> >  uffd_query_features_api_failed(int err) "errno: %i"
> >  uffd_create_fd_nosys(int err) "errno: %i"
> > diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> > index 9845a2ec81..360ecf8084 100644
> > --- a/util/userfaultfd.c
> > +++ b/util/userfaultfd.c
> > @@ -18,10 +18,46 @@
> >  #include <poll.h>
> >  #include <sys/syscall.h>
> >  #include <sys/ioctl.h>
> > +#include <fcntl.h>
> > +
> > +typedef enum {
> > +    UFFD_UNINITIALIZED = 0,
> > +    UFFD_USE_DEV_PATH,
> > +    UFFD_USE_SYSCALL,
> > +} uffd_open_mode;
> > +
> > +static uffd_open_mode open_mode;
> > +static int uffd_dev;
> > +
> > +static uffd_open_mode uffd_detect_open_mode(void)
> > +{
> > +    if (open_mode == UFFD_UNINITIALIZED) {
> > +        /*
> > +         * Make /dev/userfaultfd the default approach because it has better
> > +         * permission controls, meanwhile allows kernel faults without any
> > +         * privilege requirement (e.g. SYS_CAP_PTRACE).
> > +         */
> > +        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> 
> qemu_open(), otherwise FD passing from the mgmt app won't work.
> 
> > +        if (uffd_dev >= 0) {
> > +            open_mode = UFFD_USE_DEV_PATH;
> > +        } else {
> > +            /* Fallback to the system call */
> > +            open_mode = UFFD_USE_SYSCALL;
> > +        }
> > +        trace_uffd_detect_open_mode(open_mode);
> > +    }
> > +
> > +    return open_mode;
> > +}
> 
> This leaves the /dev/userfaultfd FD open forever once it has been used
> once. Is this really needed ? IIUC, the place where we call this is
> not going to be impacted if we open + close it every time we need to
> create a new FD, and it'll simplify this code right down.

Having said that, if we want to support passing the FD in from the
mgmt app, we need to keep it open persistently.

> 
> >  
> >  int uffd_open(int flags)
> >  {
> >  #if defined(__linux__) && defined(__NR_userfaultfd)
> > +    if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) {
> > +        assert(uffd_dev >= 0);
> > +        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
> > +    }
> > +
> >      return syscall(__NR_userfaultfd, flags);
> >  #else
> >      return -EINVAL;
> > -- 
> > 2.37.3
> > 
> > 
> 
> With regards,
> Daniel
> -- 
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
> 
> 

With regards,
Daniel
Peter Xu Jan. 26, 2023, 5:33 p.m. UTC | #4
On Thu, Jan 26, 2023 at 12:08:33AM +0100, Philippe Mathieu-Daudé wrote:
> On 25/1/23 23:40, Peter Xu wrote:
> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> > system call if either it's not there or doesn't have enough permission.
> > 
> > Firstly, as long as the app has permission to access /dev/userfaultfd, it
> > always have the ability to trap kernel faults which QEMU mostly wants.
> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> > forbidden, so it can be the major way to use postcopy in a restricted
> > environment with strict seccomp setup.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >   util/trace-events  |  1 +
> >   util/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++++++
> >   2 files changed, 37 insertions(+)
> > 
> > diff --git a/util/trace-events b/util/trace-events
> > index c8f53d7d9f..16f78d8fe5 100644
> > --- a/util/trace-events
> > +++ b/util/trace-events
> > @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
> >   qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
> >   #userfaultfd.c
> > +uffd_detect_open_mode(int mode) "%d"
> >   uffd_query_features_nosys(int err) "errno: %i"
> >   uffd_query_features_api_failed(int err) "errno: %i"
> >   uffd_create_fd_nosys(int err) "errno: %i"
> > diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> > index 9845a2ec81..360ecf8084 100644
> > --- a/util/userfaultfd.c
> > +++ b/util/userfaultfd.c
> > @@ -18,10 +18,46 @@
> >   #include <poll.h>
> >   #include <sys/syscall.h>
> >   #include <sys/ioctl.h>
> > +#include <fcntl.h>
> > +
> > +typedef enum {
> > +    UFFD_UNINITIALIZED = 0,
> > +    UFFD_USE_DEV_PATH,
> > +    UFFD_USE_SYSCALL,
> > +} uffd_open_mode;
> > +
> > +static uffd_open_mode open_mode;
> 
> 'open_mode' could be reduced to uffd_detect_open_mode()'s
> scope.

Yes, will do.

> 
> > +static int uffd_dev;
> > +
> > +static uffd_open_mode uffd_detect_open_mode(void)
> > +{
> > +    if (open_mode == UFFD_UNINITIALIZED) {
> > +        /*
> > +         * Make /dev/userfaultfd the default approach because it has better
> > +         * permission controls, meanwhile allows kernel faults without any
> > +         * privilege requirement (e.g. SYS_CAP_PTRACE).
> > +         */
> > +        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> > +        if (uffd_dev >= 0) {
> > +            open_mode = UFFD_USE_DEV_PATH;
> > +        } else {
> > +            /* Fallback to the system call */
> > +            open_mode = UFFD_USE_SYSCALL;
> > +        }
> > +        trace_uffd_detect_open_mode(open_mode);
> > +    }
> > +
> > +    return open_mode;
> 
> If 'open_mode' isn't relevant, this function could return uffd_dev/-1
> instead. Not really an improvement :)

Logically I think the two variables can be squashed into one.  I kept both
for clarity, just to make it easy to tell apart e.g. uffd_dev not being
chosen for use from uffd_dev failing to open.

> 
> Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>

Thanks, Phil.
Peter Xu Jan. 26, 2023, 8:03 p.m. UTC | #5
On Thu, Jan 26, 2023 at 09:05:01AM +0000, Daniel P. Berrangé wrote:
> On Thu, Jan 26, 2023 at 09:02:09AM +0000, Daniel P. Berrangé wrote:
> > On Wed, Jan 25, 2023 at 05:40:16PM -0500, Peter Xu wrote:
> > > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> > > system call if either it's not there or doesn't have enough permission.
> > > 
> > > Firstly, as long as the app has permission to access /dev/userfaultfd, it
> > > always have the ability to trap kernel faults which QEMU mostly wants.
> > > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> > > forbidden, so it can be the major way to use postcopy in a restricted
> > > environment with strict seccomp setup.
> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  util/trace-events  |  1 +
> > >  util/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 37 insertions(+)
> > > 
> > > diff --git a/util/trace-events b/util/trace-events
> > > index c8f53d7d9f..16f78d8fe5 100644
> > > --- a/util/trace-events
> > > +++ b/util/trace-events
> > > @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
> > >  qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
> > >  
> > >  #userfaultfd.c
> > > +uffd_detect_open_mode(int mode) "%d"
> > >  uffd_query_features_nosys(int err) "errno: %i"
> > >  uffd_query_features_api_failed(int err) "errno: %i"
> > >  uffd_create_fd_nosys(int err) "errno: %i"
> > > diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> > > index 9845a2ec81..360ecf8084 100644
> > > --- a/util/userfaultfd.c
> > > +++ b/util/userfaultfd.c
> > > @@ -18,10 +18,46 @@
> > >  #include <poll.h>
> > >  #include <sys/syscall.h>
> > >  #include <sys/ioctl.h>
> > > +#include <fcntl.h>
> > > +
> > > +typedef enum {
> > > +    UFFD_UNINITIALIZED = 0,
> > > +    UFFD_USE_DEV_PATH,
> > > +    UFFD_USE_SYSCALL,
> > > +} uffd_open_mode;
> > > +
> > > +static uffd_open_mode open_mode;
> > > +static int uffd_dev;
> > > +
> > > +static uffd_open_mode uffd_detect_open_mode(void)
> > > +{
> > > +    if (open_mode == UFFD_UNINITIALIZED) {
> > > +        /*
> > > +         * Make /dev/userfaultfd the default approach because it has better
> > > +         * permission controls, meanwhile allows kernel faults without any
> > > +         * privilege requirement (e.g. SYS_CAP_PTRACE).
> > > +         */
> > > +        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> > 
> > qemu_open(), otherwise FD passing from the mgmt app won't work.

[I've followed this up in the other thread on interfacing libvirt, so will
 skip here] 

> > 
> > > +        if (uffd_dev >= 0) {
> > > +            open_mode = UFFD_USE_DEV_PATH;
> > > +        } else {
> > > +            /* Fallback to the system call */
> > > +            open_mode = UFFD_USE_SYSCALL;
> > > +        }
> > > +        trace_uffd_detect_open_mode(open_mode);
> > > +    }
> > > +
> > > +    return open_mode;
> > > +}
> > 
> > This leaves the /dev/userfaultfd FD open forever once it has been used
> > once. Is this really needed ? IIUC, the place where we call this is
> > not going to be impacted if we open + close it every time we need to
> > create a new FD, and it'll simplify this code right down.
> 
> Having said that, if we want to support passing the FD in from the
> mgmt app, we need to keep it open persistently.

Right, since the plan is to further support libvirt, I'll keep it as is.

Meanwhile, right now QEMU detects uffd features by creating a uffd and
quickly closing it, so it's also more efficient to keep the device fd open
once it has been opened.  IIRC for each postcopy procedure we'll open uffd
at least three times during different phases.

Thanks,
diff mbox series

Patch

diff --git a/util/trace-events b/util/trace-events
index c8f53d7d9f..16f78d8fe5 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -93,6 +93,7 @@  qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
 qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
 
 #userfaultfd.c
+uffd_detect_open_mode(int mode) "%d"
 uffd_query_features_nosys(int err) "errno: %i"
 uffd_query_features_api_failed(int err) "errno: %i"
 uffd_create_fd_nosys(int err) "errno: %i"
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
index 9845a2ec81..360ecf8084 100644
--- a/util/userfaultfd.c
+++ b/util/userfaultfd.c
@@ -18,10 +18,46 @@ 
 #include <poll.h>
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
+#include <fcntl.h>
+
+typedef enum {
+    UFFD_UNINITIALIZED = 0,
+    UFFD_USE_DEV_PATH,
+    UFFD_USE_SYSCALL,
+} uffd_open_mode;
+
+static uffd_open_mode open_mode;
+static int uffd_dev;
+
+static uffd_open_mode uffd_detect_open_mode(void)
+{
+    if (open_mode == UFFD_UNINITIALIZED) {
+        /*
+         * Make /dev/userfaultfd the default approach because it has better
+         * permission controls, meanwhile allows kernel faults without any
+         * privilege requirement (e.g. CAP_SYS_PTRACE).
+         */
+        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
+        if (uffd_dev >= 0) {
+            open_mode = UFFD_USE_DEV_PATH;
+        } else {
+            /* Fallback to the system call */
+            open_mode = UFFD_USE_SYSCALL;
+        }
+        trace_uffd_detect_open_mode(open_mode);
+    }
+
+    return open_mode;
+}
 
 int uffd_open(int flags)
 {
 #if defined(__linux__) && defined(__NR_userfaultfd)
+    if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) {
+        assert(uffd_dev >= 0);
+        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
+    }
+
     return syscall(__NR_userfaultfd, flags);
 #else
     return -EINVAL;