
[v4,1/5] extend libvhost to support IOThread and coroutine

Message ID 20200218050711.8133-2-coiby.xu@gmail.com (mailing list archive)
State New, archived
Series vhost-user block device backend implementation

Commit Message

Coiby Xu Feb. 18, 2020, 5:07 a.m. UTC
Previously, libvhost dispatched events in its own GMainContext. Now the
vhost-user client's kick events can be dispatched in the block device
driver's AioContext, so IOThread is supported. Also allow vu_message_read
and vu_kick_cb to be replaced so that QEMU can run them as coroutines.

Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
---
 contrib/libvhost-user/libvhost-user.c | 54 ++++++++++++++++++++++++---
 contrib/libvhost-user/libvhost-user.h | 38 ++++++++++++++++++-
 2 files changed, 85 insertions(+), 7 deletions(-)

Comments

Stefan Hajnoczi Feb. 19, 2020, 4:33 p.m. UTC | #1
On Tue, Feb 18, 2020 at 01:07:07PM +0800, Coiby Xu wrote:
> +static bool vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
> +{
> +    vu_read_msg_cb read_msg;
> +    if (dev->co_iface) {
> +        read_msg = dev->co_iface->read_msg;
> +    } else {
> +        read_msg = vu_message_read_;
> +    }
> +    return read_msg(dev, conn_fd, vmsg);
> +}

libvhost-user is already partially asynchronous (->set/remove_watch()
are used for eventfds).

Can you make the vhost-user socket I/O asynchronous too?
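
Purely as an untested sketch (reusing the existing vu_watch_cb signature,
vu_dispatch() and dev->sock, and still assuming a full message is available
once the fd becomes readable):

static void vu_sock_watch_cb(VuDev *dev, int condition, void *data)
{
    (void)condition;
    (void)data;

    /* read and handle one vhost-user message from dev->sock */
    if (!vu_dispatch(dev)) {
        dev->panic(dev, "vhost-user: disconnect or protocol error");
    }
}

/* registered once after vu_init(), next to the eventfd watches */
dev->set_watch(dev, dev->sock, VU_WATCH_IN, vu_sock_watch_cb, NULL);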

> +
>  static bool
>  vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
>  {
> @@ -1075,9 +1084,14 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>      }
>  
>      if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
> +        if (dev->set_watch_packed_data) {
> +            dev->set_watch_packed_data(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
> +                                       dev->co_iface->kick_callback,
> +                                       (void *)(long)index);
> +        } else {
>          dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
>                         vu_kick_cb, (void *)(long)index);

Why is it necessary to replace vu_kick_cb()?

The user can set a custom vq->handler() function with
vu_set_queue_handler().

Coroutines aren't needed for this part since eventfd_read() always
returns right away.
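
For example (handler name made up, just to illustrate the existing API):

static void my_vq_handler(VuDev *dev, int qidx)
{
    VuVirtq *vq = vu_get_queue(dev, qidx);

    /* the library has already drained the kick eventfd in vu_kick_cb()
     * before invoking this handler; just pop and process requests */
    (void)vq;
}

vu_set_queue_handler(dev, vu_get_queue(dev, 0), my_vq_handler);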

> @@ -1627,6 +1647,12 @@ vu_deinit(VuDev *dev)
>          }
>  
>          if (vq->kick_fd != -1) {
> +            /* remove watch for kick_fd
> +             * When client process is running in gdb and
> +             * quit command is run in gdb, QEMU will still dispatch the event
> +             * which will cause segment fault in the callback function
> +             */

Code and comments in libvhost-user should not refer to QEMU specifics.
Removing the watch is a good idea regardless of the application or event
loop implementation.  No comment is needed here.

> +            dev->remove_watch(dev, vq->kick_fd);
>              close(vq->kick_fd);
>              vq->kick_fd = -1;
>          }
> @@ -1682,7 +1708,7 @@ vu_init(VuDev *dev,
>  
>      assert(max_queues > 0);
>      assert(socket >= 0);
> -    assert(set_watch);
> +    /* assert(set_watch); */

?

> @@ -372,7 +389,8 @@ struct VuDev {
>      /* @set_watch: add or update the given fd to the watch set,
>       * call cb when condition is met */
>      vu_set_watch_cb set_watch;
> -
> +    /* AIO dispatch will only one data pointer to callback function */

I don't understand what this comment is saying.
Kevin Wolf Feb. 25, 2020, 2:55 p.m. UTC | #2
Am 18.02.2020 um 06:07 hat Coiby Xu geschrieben:
> Previously, libvhost dispatched events in its own GMainContext. Now the
> vhost-user client's kick events can be dispatched in the block device
> driver's AioContext, so IOThread is supported. Also allow vu_message_read
> and vu_kick_cb to be replaced so that QEMU can run them as coroutines.
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> ---
>  contrib/libvhost-user/libvhost-user.c | 54 ++++++++++++++++++++++++---
>  contrib/libvhost-user/libvhost-user.h | 38 ++++++++++++++++++-
>  2 files changed, 85 insertions(+), 7 deletions(-)
> 
> diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
> index 5cb7708559..6aadeaa0f2 100644
> --- a/contrib/libvhost-user/libvhost-user.h
> +++ b/contrib/libvhost-user/libvhost-user.h
> @@ -30,6 +30,8 @@
>  
>  #define VHOST_MEMORY_MAX_NREGIONS 8
>  
> +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
> +
>  typedef enum VhostSetConfigType {
>      VHOST_SET_CONFIG_TYPE_MASTER = 0,
>      VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
> @@ -201,6 +203,7 @@ typedef uint64_t (*vu_get_features_cb) (VuDev *dev);
>  typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features);
>  typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg,
>                                    int *do_reply);
> +typedef bool (*vu_read_msg_cb) (VuDev *dev, int sock, VhostUserMsg *vmsg);
>  typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started);
>  typedef bool (*vu_queue_is_processed_in_order_cb) (VuDev *dev, int qidx);
>  typedef int (*vu_get_config_cb) (VuDev *dev, uint8_t *config, uint32_t len);
> @@ -208,6 +211,20 @@ typedef int (*vu_set_config_cb) (VuDev *dev, const uint8_t *data,
>                                   uint32_t offset, uint32_t size,
>                                   uint32_t flags);
>  
> +typedef void (*vu_watch_cb_packed_data) (void *packed_data);
> +
> +typedef void (*vu_set_watch_cb_packed_data) (VuDev *dev, int fd, int condition,
> +                                             vu_watch_cb_packed_data cb,
> +                                             void *data);
> +/*
> + * allowing vu_read_msg_cb and kick_callback to be replaced so QEMU
> + * can run them as coroutines
> + */
> +typedef struct CoIface {
> +    vu_read_msg_cb read_msg;
> +    vu_watch_cb_packed_data kick_callback;
> +} CoIface;

I think this should be part of VuDevIface, so that it becomes a properly
integrated part of the design instead of an adapter hacked on top.
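
Roughly something like this (field names are only suggestions, and the
kick callback is assumed to keep the vu_watch_cb signature):

typedef struct VuDevIface {
    ...
    /* optional: override the built-in blocking socket read */
    vu_read_msg_cb read_msg;
    /* optional: override vu_kick_cb as the kick fd handler */
    vu_watch_cb kick_callback;
} VuDevIface;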

>  typedef struct VuDevIface {
>      /* called by VHOST_USER_GET_FEATURES to get the features bitmask */
>      vu_get_features_cb get_features;
> @@ -372,7 +389,8 @@ struct VuDev {
>      /* @set_watch: add or update the given fd to the watch set,
>       * call cb when condition is met */
>      vu_set_watch_cb set_watch;
> -
> +    /* AIO dispatch will only one data pointer to callback function */
> +    vu_set_watch_cb_packed_data set_watch_packed_data;
>      /* @remove_watch: remove the given fd from the watch set */
>      vu_remove_watch_cb remove_watch;
>  
> @@ -380,7 +398,7 @@ struct VuDev {
>       * re-initialize */
>      vu_panic_cb panic;
>      const VuDevIface *iface;
> -
> +    const CoIface *co_iface;
>      /* Postcopy data */
>      int postcopy_ufd;
>      bool postcopy_listening;
> @@ -417,6 +435,22 @@ bool vu_init(VuDev *dev,
>               const VuDevIface *iface);
>  
>  
> +/**
> + * vu_init_packed_data:
> + * Same as vu_init except for set_watch_packed_data which will pack
> + * two parameters into a struct

Be specific: Which two parameters and which struct?

I think it would be more helpful to name the function after the
additional piece of information that it uses rather than the fact that
it stores it internally in a struct.

We have:

typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
                                 vu_watch_cb cb, void *data);
typedef void (*vu_set_watch_cb_packed_data) (VuDev *dev, int fd, int condition,
                                             vu_watch_cb_packed_data cb,
                                             void *data);

Without looking at the implementation, they have the same set of
parameters. I suspect that the difference is in the content of *data,
but since it is declared void*, I suppose it's treated as an opaque data
type and will only be passed unchanged (and uninspected) to cb.

If so, there is no difference between the two types.

> thus QEMU aio_dispatch can pass the
> + * required data to callback function.
> + *
> + * Returns: true on success, false on failure.
> + **/
> +bool vu_init_packed_data(VuDev *dev,
> +                         uint16_t max_queues,
> +                         int socket,
> +                         vu_panic_cb panic,
> +                         vu_set_watch_cb_packed_data set_watch_packed_data,
> +                         vu_remove_watch_cb remove_watch,
> +                         const VuDevIface *iface,
> +                         const CoIface *co_iface);
>  /**
>   * vu_deinit:
>   * @dev: a VuDev context
> -- 
> 2.25.0
> 
> diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> index b89bf18501..f95664bb22 100644
> --- a/contrib/libvhost-user/libvhost-user.c
> +++ b/contrib/libvhost-user/libvhost-user.c
> @@ -67,8 +67,6 @@
>  /* The version of inflight buffer */
>  #define INFLIGHT_VERSION 1
>  
> -#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
> -
>  /* The version of the protocol we support */
>  #define VHOST_USER_VERSION 1
>  #define LIBVHOST_USER_DEBUG 0
> @@ -260,7 +258,7 @@ have_userfault(void)
>  }
>  
>  static bool
> -vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
> +vu_message_read_(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)

Just adding a trailing underscore isn't a good name. It doesn't tell the
reader what the difference between vu_message_read_ and vu_message_read
is.

>  {
>      char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
>      struct iovec iov = {
> @@ -328,6 +326,17 @@ fail:
>      return false;
>  }
>  
> +static bool vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
> +{
> +    vu_read_msg_cb read_msg;
> +    if (dev->co_iface) {
> +        read_msg = dev->co_iface->read_msg;
> +    } else {
> +        read_msg = vu_message_read_;
> +    }
> +    return read_msg(dev, conn_fd, vmsg);
> +}

If you change VuDevIface so that it contains the fields of CoIface
directly, you can just initialise dev->iface->read_msg with what is
called vu_message_read_() now for the non-QEMU case, and this whole
wrapper becomes unnecessary because the code path is the same for both
cases.
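
Roughly, with read_msg as a VuDevIface field and vu_init() copying it once
into the device (names are only suggestions):

    dev->read_msg = iface->read_msg ? iface->read_msg
                                    : vu_message_read_; /* today's code */

so that every caller simply does:

    if (!dev->read_msg(dev, dev->sock, &vmsg)) {
        goto end;
    }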

>  static bool
>  vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
>  {
> @@ -1075,9 +1084,14 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>      }
>  
>      if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
> +        if (dev->set_watch_packed_data) {
> +            dev->set_watch_packed_data(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
> +                                       dev->co_iface->kick_callback,
> +                                       (void *)(long)index);
> +        } else {
>          dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
>                         vu_kick_cb, (void *)(long)index);
> -
> +        }

Indentation is off here.

Also, this is almost exactly the same code for both cases. If you
generalise things to have a dev->iface->kick_callback that can be
initialised with vu_kick_cb in the non-QEMU case, you get rid of this
duplication, too.
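
That is, assuming dev->iface->kick_callback keeps the vu_watch_cb signature
and may be left NULL by non-QEMU users:

        vu_watch_cb kick_cb = dev->iface->kick_callback ?: vu_kick_cb;

        dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
                       kick_cb, (void *)(long)index);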

>          DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
>                 dev->vq[index].kick_fd, index);
>      }
> @@ -1097,8 +1111,14 @@ void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
>      vq->handler = handler;
>      if (vq->kick_fd >= 0) {
>          if (handler) {
> +            if (dev->set_watch_packed_data) {
> +                dev->set_watch_packed_data(dev, vq->kick_fd, VU_WATCH_IN,
> +                                           dev->co_iface->kick_callback,
> +                                           (void *)(long)qidx);
> +            } else {
>              dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
>                             vu_kick_cb, (void *)(long)qidx);
> +            }

Same as above. (Indentation and duplicated code.)

>          } else {
>              dev->remove_watch(dev, vq->kick_fd);
>          }
> @@ -1627,6 +1647,12 @@ vu_deinit(VuDev *dev)
>          }
>  
>          if (vq->kick_fd != -1) {
> +            /* remove watch for kick_fd
> +             * When client process is running in gdb and
> +             * quit command is run in gdb, QEMU will still dispatch the event
> +             * which will cause segment fault in the callback function
> +             */

Reformat this comment to use a consistent line width, maybe like this:

            /*
             * remove watch for kick_fd.
             *
             * When client process is running in gdb and quit command is
             * run in gdb, QEMU will still dispatch the event which will
             * cause segment fault in the callback function
             */

I'm not sure what the comment wants to tell me: Is this an existing
problem in the code that we can run into segfaults, or do we remove the
watch to avoid segfaults?

> +            dev->remove_watch(dev, vq->kick_fd);
>              close(vq->kick_fd);
>              vq->kick_fd = -1;
>          }
> @@ -1682,7 +1708,7 @@ vu_init(VuDev *dev,
>  
>      assert(max_queues > 0);
>      assert(socket >= 0);
> -    assert(set_watch);
> +    /* assert(set_watch); */

Don't leave commented code around. Either leave it in, or remove it
completely.

I think this one should be left in. If you integrate CoIface into
VuDevIface, the assertion will hold true again.

>      assert(remove_watch);
>      assert(iface);
>      assert(panic);
> @@ -1715,6 +1741,24 @@ vu_init(VuDev *dev,
>      return true;
>  }
>  
> +bool
> +vu_init_packed_data(VuDev *dev,
> +        uint16_t max_queues,
> +        int socket,
> +        vu_panic_cb panic,
> +        vu_set_watch_cb_packed_data set_watch_packed_data,
> +        vu_remove_watch_cb remove_watch,
> +        const VuDevIface *iface,
> +        const CoIface *co_iface)
> +{
> +    if (vu_init(dev, max_queues, socket, panic, NULL, remove_watch, iface)) {
> +        dev->set_watch_packed_data = set_watch_packed_data;
> +        dev->co_iface = co_iface;
> +        return true;
> +    }
> +    return false;
> +}

With the integrated VuDevIface, this wrapper becomes unnecessary.

Kevin

Patch

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index b89bf18501..f95664bb22 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -67,8 +67,6 @@ 
 /* The version of inflight buffer */
 #define INFLIGHT_VERSION 1
 
-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
-
 /* The version of the protocol we support */
 #define VHOST_USER_VERSION 1
 #define LIBVHOST_USER_DEBUG 0
@@ -260,7 +258,7 @@  have_userfault(void)
 }
 
 static bool
-vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
+vu_message_read_(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 {
     char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
     struct iovec iov = {
@@ -328,6 +326,17 @@  fail:
     return false;
 }
 
+static bool vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
+{
+    vu_read_msg_cb read_msg;
+    if (dev->co_iface) {
+        read_msg = dev->co_iface->read_msg;
+    } else {
+        read_msg = vu_message_read_;
+    }
+    return read_msg(dev, conn_fd, vmsg);
+}
+
 static bool
 vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 {
@@ -1075,9 +1084,14 @@  vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
     }
 
     if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
+        if (dev->set_watch_packed_data) {
+            dev->set_watch_packed_data(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
+                                       dev->co_iface->kick_callback,
+                                       (void *)(long)index);
+        } else {
         dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
                        vu_kick_cb, (void *)(long)index);
-
+        }
         DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
                dev->vq[index].kick_fd, index);
     }
@@ -1097,8 +1111,14 @@  void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
     vq->handler = handler;
     if (vq->kick_fd >= 0) {
         if (handler) {
+            if (dev->set_watch_packed_data) {
+                dev->set_watch_packed_data(dev, vq->kick_fd, VU_WATCH_IN,
+                                           dev->co_iface->kick_callback,
+                                           (void *)(long)qidx);
+            } else {
             dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
                            vu_kick_cb, (void *)(long)qidx);
+            }
         } else {
             dev->remove_watch(dev, vq->kick_fd);
         }
@@ -1627,6 +1647,12 @@  vu_deinit(VuDev *dev)
         }
 
         if (vq->kick_fd != -1) {
+            /* remove watch for kick_fd
+             * When client process is running in gdb and
+             * quit command is run in gdb, QEMU will still dispatch the event
+             * which will cause segment fault in the callback function
+             */
+            dev->remove_watch(dev, vq->kick_fd);
             close(vq->kick_fd);
             vq->kick_fd = -1;
         }
@@ -1682,7 +1708,7 @@  vu_init(VuDev *dev,
 
     assert(max_queues > 0);
     assert(socket >= 0);
-    assert(set_watch);
+    /* assert(set_watch); */
     assert(remove_watch);
     assert(iface);
     assert(panic);
@@ -1715,6 +1741,24 @@  vu_init(VuDev *dev,
     return true;
 }
 
+bool
+vu_init_packed_data(VuDev *dev,
+        uint16_t max_queues,
+        int socket,
+        vu_panic_cb panic,
+        vu_set_watch_cb_packed_data set_watch_packed_data,
+        vu_remove_watch_cb remove_watch,
+        const VuDevIface *iface,
+        const CoIface *co_iface)
+{
+    if (vu_init(dev, max_queues, socket, panic, NULL, remove_watch, iface)) {
+        dev->set_watch_packed_data = set_watch_packed_data;
+        dev->co_iface = co_iface;
+        return true;
+    }
+    return false;
+}
+
 VuVirtq *
 vu_get_queue(VuDev *dev, int qidx)
 {
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 5cb7708559..6aadeaa0f2 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -30,6 +30,8 @@ 
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
 typedef enum VhostSetConfigType {
     VHOST_SET_CONFIG_TYPE_MASTER = 0,
     VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
@@ -201,6 +203,7 @@  typedef uint64_t (*vu_get_features_cb) (VuDev *dev);
 typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features);
 typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg,
                                   int *do_reply);
+typedef bool (*vu_read_msg_cb) (VuDev *dev, int sock, VhostUserMsg *vmsg);
 typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started);
 typedef bool (*vu_queue_is_processed_in_order_cb) (VuDev *dev, int qidx);
 typedef int (*vu_get_config_cb) (VuDev *dev, uint8_t *config, uint32_t len);
@@ -208,6 +211,20 @@  typedef int (*vu_set_config_cb) (VuDev *dev, const uint8_t *data,
                                  uint32_t offset, uint32_t size,
                                  uint32_t flags);
 
+typedef void (*vu_watch_cb_packed_data) (void *packed_data);
+
+typedef void (*vu_set_watch_cb_packed_data) (VuDev *dev, int fd, int condition,
+                                             vu_watch_cb_packed_data cb,
+                                             void *data);
+/*
+ * allowing vu_read_msg_cb and kick_callback to be replaced so QEMU
+ * can run them as coroutines
+ */
+typedef struct CoIface {
+    vu_read_msg_cb read_msg;
+    vu_watch_cb_packed_data kick_callback;
+} CoIface;
+
 typedef struct VuDevIface {
     /* called by VHOST_USER_GET_FEATURES to get the features bitmask */
     vu_get_features_cb get_features;
@@ -372,7 +389,8 @@  struct VuDev {
     /* @set_watch: add or update the given fd to the watch set,
      * call cb when condition is met */
     vu_set_watch_cb set_watch;
-
+    /* AIO dispatch will only one data pointer to callback function */
+    vu_set_watch_cb_packed_data set_watch_packed_data;
     /* @remove_watch: remove the given fd from the watch set */
     vu_remove_watch_cb remove_watch;
 
@@ -380,7 +398,7 @@  struct VuDev {
      * re-initialize */
     vu_panic_cb panic;
     const VuDevIface *iface;
-
+    const CoIface *co_iface;
     /* Postcopy data */
     int postcopy_ufd;
     bool postcopy_listening;
@@ -417,6 +435,22 @@  bool vu_init(VuDev *dev,
              const VuDevIface *iface);
 
 
+/**
+ * vu_init_packed_data:
+ * Same as vu_init except for set_watch_packed_data which will pack
+ * two parameters into a struct thus QEMU aio_dispatch can pass the
+ * required data to callback function.
+ *
+ * Returns: true on success, false on failure.
+ **/
+bool vu_init_packed_data(VuDev *dev,
+                         uint16_t max_queues,
+                         int socket,
+                         vu_panic_cb panic,
+                         vu_set_watch_cb_packed_data set_watch_packed_data,
+                         vu_remove_watch_cb remove_watch,
+                         const VuDevIface *iface,
+                         const CoIface *co_iface);
 /**
  * vu_deinit:
  * @dev: a VuDev context