diff mbox series

[RFC,13/27] vhost: Send buffers to device

Message ID 20201120185105.279030-14-eperezma@redhat.com (mailing list archive)
State New, archived
Headers show
Series vDPA software assisted live migration | expand

Commit Message

Eugenio Perez Martin Nov. 20, 2020, 6:50 p.m. UTC
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 hw/virtio/vhost-sw-lm-ring.h |   3 +
 hw/virtio/vhost-sw-lm-ring.c | 134 +++++++++++++++++++++++++++++++++--
 hw/virtio/vhost.c            |  59 ++++++++++++++-
 3 files changed, 189 insertions(+), 7 deletions(-)

Comments

Stefan Hajnoczi Dec. 8, 2020, 8:16 a.m. UTC | #1
On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> -static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
> +static bool vhost_vring_should_kick_rcu(VhostShadowVirtqueue *vq)

"vhost_vring_" is a confusing name. This is not related to
vhost_virtqueue or the vhost_vring_* structs.

vhost_shadow_vq_should_kick_rcu()?

>  {
> -    return virtio_queue_get_used_notify_split(vq->vq);
> +    VirtIODevice *vdev = vq->vdev;
> +    vq->num_added = 0;

I'm surprised that a bool function modifies state. Is this assignment
intentional?

> +/* virtqueue_add:
> + * @vq: The #VirtQueue
> + * @elem: The #VirtQueueElement

The copy-pasted doc comment doesn't match this function.

> +int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem)
> +{
> +    int host_head = vhost_vring_add_split(vq, elem);
> +    if (vq->ring_id_maps[host_head]) {
> +        g_free(vq->ring_id_maps[host_head]);
> +    }

VirtQueueElement is freed lazily? Is there a reason for this approach? I
would have expected it to be freed when the used element is process by
the kick fd handler.

> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 9352c56bfa..304e0baa61 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -956,8 +956,34 @@ static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq)
>      uint16_t idx = virtio_get_queue_index(vq);
>  
>      VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx];
> +    VirtQueueElement *elem;
>  
> -    vhost_vring_kick(svq);
> +    /*
> +     * Make available all buffers as possible.
> +     */
> +    do {
> +        if (virtio_queue_get_notification(vq)) {
> +            virtio_queue_set_notification(vq, false);
> +        }
> +
> +        while (true) {
> +            int r;
> +            if (virtio_queue_full(vq)) {
> +                break;
> +            }

Why is this check necessary? The guest cannot provide more descriptors
than there is ring space. If that happens somehow then it's a driver
error that is already reported in virtqueue_pop() below.

I wonder if you added this because the vring implementation above
doesn't support indirect descriptors? It's easy to exhaust the vhost
hdev vring while there is still room in the VirtIODevice's VirtQueue
vring.
Eugenio Perez Martin Dec. 9, 2020, 6:41 p.m. UTC | #2
On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > -static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
> > +static bool vhost_vring_should_kick_rcu(VhostShadowVirtqueue *vq)
>
> "vhost_vring_" is a confusing name. This is not related to
> vhost_virtqueue or the vhost_vring_* structs.
>
> vhost_shadow_vq_should_kick_rcu()?
>
> >  {
> > -    return virtio_queue_get_used_notify_split(vq->vq);
> > +    VirtIODevice *vdev = vq->vdev;
> > +    vq->num_added = 0;
>
> I'm surprised that a bool function modifies state. Is this assignment
> intentional?
>

It's from the kernel code, virtqueue_kick_prepare_split function. The
num_added member is internal (mutable) state, counting for the batch
so the driver sends a notification in case of uint16_t wrapping in
vhost_vring_add_split with no notification in between. I don't know if
some actual virtio devices could be actually affected from this, since
actual vqs are smaller than (uint16_t)-1 so they should be aware that
more buffers have been added anyway.

> > +/* virtqueue_add:
> > + * @vq: The #VirtQueue
> > + * @elem: The #VirtQueueElement
>
> The copy-pasted doc comment doesn't match this function.
>

Right, I will fix it.

> > +int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem)
> > +{
> > +    int host_head = vhost_vring_add_split(vq, elem);
> > +    if (vq->ring_id_maps[host_head]) {
> > +        g_free(vq->ring_id_maps[host_head]);
> > +    }
>
> VirtQueueElement is freed lazily? Is there a reason for this approach? I
> would have expected it to be freed when the used element is process by
> the kick fd handler.
>

Maybe it has more sense to free immediately in this commit and
introduce ring_id_maps in later commits, yes.

> > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > index 9352c56bfa..304e0baa61 100644
> > --- a/hw/virtio/vhost.c
> > +++ b/hw/virtio/vhost.c
> > @@ -956,8 +956,34 @@ static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq)
> >      uint16_t idx = virtio_get_queue_index(vq);
> >
> >      VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx];
> > +    VirtQueueElement *elem;
> >
> > -    vhost_vring_kick(svq);
> > +    /*
> > +     * Make available all buffers as possible.
> > +     */
> > +    do {
> > +        if (virtio_queue_get_notification(vq)) {
> > +            virtio_queue_set_notification(vq, false);
> > +        }
> > +
> > +        while (true) {
> > +            int r;
> > +            if (virtio_queue_full(vq)) {
> > +                break;
> > +            }
>
> Why is this check necessary? The guest cannot provide more descriptors
> than there is ring space. If that happens somehow then it's a driver
> error that is already reported in virtqueue_pop() below.
>

It's just checked because virtqueue_pop prints an error on that case,
and there is no way to tell the difference between a regular error and
another caused by other causes. Maybe the right thing to do is just to
not to print that error? Caller should do the error printing in that
case. Should we return an error code?

> I wonder if you added this because the vring implementation above
> doesn't support indirect descriptors? It's easy to exhaust the vhost
> hdev vring while there is still room in the VirtIODevice's VirtQueue
> vring.
Stefan Hajnoczi Dec. 10, 2020, 11:55 a.m. UTC | #3
On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > +        while (true) {
> > > +            int r;
> > > +            if (virtio_queue_full(vq)) {
> > > +                break;
> > > +            }
> >
> > Why is this check necessary? The guest cannot provide more descriptors
> > than there is ring space. If that happens somehow then it's a driver
> > error that is already reported in virtqueue_pop() below.
> >
> 
> It's just checked because virtqueue_pop prints an error on that case,
> and there is no way to tell the difference between a regular error and
> another caused by other causes. Maybe the right thing to do is just to
> not to print that error? Caller should do the error printing in that
> case. Should we return an error code?

The reason an error is printed today is because it's a guest error that
never happens with correct guest drivers. Something is broken and the
user should know about it.

Why is "virtio_queue_full" (I already forgot what that actually means,
it's not clear whether this is referring to avail elements or used
elements) a condition that should be silently ignored in shadow vqs?

Stefan
Eugenio Perez Martin Jan. 22, 2021, 6:18 p.m. UTC | #4
On Thu, Dec 10, 2020 at 12:55 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> > On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > > +        while (true) {
> > > > +            int r;
> > > > +            if (virtio_queue_full(vq)) {
> > > > +                break;
> > > > +            }
> > >
> > > Why is this check necessary? The guest cannot provide more descriptors
> > > than there is ring space. If that happens somehow then it's a driver
> > > error that is already reported in virtqueue_pop() below.
> > >
> >
> > It's just checked because virtqueue_pop prints an error on that case,
> > and there is no way to tell the difference between a regular error and
> > another caused by other causes. Maybe the right thing to do is just to
> > not to print that error? Caller should do the error printing in that
> > case. Should we return an error code?
>
> The reason an error is printed today is because it's a guest error that
> never happens with correct guest drivers. Something is broken and the
> user should know about it.
>
> Why is "virtio_queue_full" (I already forgot what that actually means,
> it's not clear whether this is referring to avail elements or used
> elements) a condition that should be silently ignored in shadow vqs?
>

TL;DR: It can be changed to a check of the number of available
descriptors in shadow vq, instead of returning as a regular operation.
However, I think that making it a special return of virtqueue_pop
could help in devices that run to completion, avoiding having to
duplicate the count logic in them.

The function virtio_queue_empty checks if the vq has all descriptors
available, so the device has no more work to do until the driver makes
another descriptor available. I can see how it can be a bad name
choice, but virtio_queue_full means the opposite: device has pop()
every descriptor available, and it has not returned any, so the driver
cannot progress until the device marks some descriptors as used.

As I understand, if vq->in_use >vq->num would mean we have a bug in
the device vq code, not in the driver. virtio_queue_full could even be
changed to "assert(vq->inuse <= vq->vring.num); return vq->inuse ==
vq->vring.num", as long as vq->in_use is operated right.

If we hit vq->in_use == vq->num in virtqueue_pop it means the device
tried to pop() one more buffer after having all of them available and
pop'ed. This would be invalid if the device is counting right the
number of in_use descriptors, but then we are duplicating that logic
in the device and the vq.

In shadow vq this situation happens with the correct guest network
driver, since the rx queue is filled for the device to write. Network
device in qemu fetch descriptors on demand, but shadow vq fetch all
available in batching. If the driver just happens to fill the queue of
available descriptors, the log will raise, so we need to check in
handle_sw_lm_vq before calling pop(). Of course the shadow vq can
duplicate guest_vq->in_use instead of checking virtio_queue_full, but
then it needs to check two things for every virtqueue_pop() [1].

Having said that, would you prefer a checking of available slots in
the shadow vq?

Thanks!

[1] if we don't change virtqueue_pop code.
diff mbox series

Patch

diff --git a/hw/virtio/vhost-sw-lm-ring.h b/hw/virtio/vhost-sw-lm-ring.h
index 86dc081b93..29d21feaf4 100644
--- a/hw/virtio/vhost-sw-lm-ring.h
+++ b/hw/virtio/vhost-sw-lm-ring.h
@@ -18,6 +18,9 @@ 
 typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
 
 bool vhost_vring_kick(VhostShadowVirtqueue *vq);
+int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem);
+void vhost_vring_write_addr(const VhostShadowVirtqueue *vq,
+	                    struct vhost_vring_addr *addr);
 
 VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx);
 
diff --git a/hw/virtio/vhost-sw-lm-ring.c b/hw/virtio/vhost-sw-lm-ring.c
index cd7b5ba772..aed005c2d9 100644
--- a/hw/virtio/vhost-sw-lm-ring.c
+++ b/hw/virtio/vhost-sw-lm-ring.c
@@ -9,6 +9,7 @@ 
 
 #include "hw/virtio/vhost-sw-lm-ring.h"
 #include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio-access.h"
 
 #include "standard-headers/linux/vhost_types.h"
 #include "standard-headers/linux/virtio_ring.h"
@@ -19,21 +20,140 @@  typedef struct VhostShadowVirtqueue {
     struct vring vring;
     EventNotifier hdev_notifier;
     VirtQueue *vq;
+    VirtIODevice *vdev;
+
+    /* Map for returning guest's descriptors */
+    VirtQueueElement **ring_id_maps;
+
+    /* Next head to expose to device */
+    uint16_t avail_idx_shadow;
+
+    /* Number of descriptors added since last notification */
+    uint16_t num_added;
+
+    /* Next free descriptor */
+    uint16_t free_head;
 
     vring_desc_t descs[];
 } VhostShadowVirtqueue;
 
-static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
+static bool vhost_vring_should_kick_rcu(VhostShadowVirtqueue *vq)
 {
-    return virtio_queue_get_used_notify_split(vq->vq);
+    VirtIODevice *vdev = vq->vdev;
+    vq->num_added = 0;
+
+    smp_rmb();
+    return !(vq->vring.used->flags
+             & virtio_tswap16(vdev, VRING_USED_F_NO_NOTIFY));
 }
 
+static bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
+{
+    RCU_READ_LOCK_GUARD();
+    return vhost_vring_should_kick_rcu(vq);
+}
+
+
 bool vhost_vring_kick(VhostShadowVirtqueue *vq)
 {
     return vhost_vring_should_kick(vq) ? event_notifier_set(&vq->hdev_notifier)
                                        : true;
 }
 
+static void vhost_vring_write_descs(VhostShadowVirtqueue *vq,
+                                    const struct iovec *iovec,
+                                    size_t num, bool more_descs, bool write)
+{
+    uint16_t i = vq->free_head, last = vq->free_head;
+    unsigned n;
+    const VirtIODevice *vdev = vq->vdev;
+    uint16_t flags = write ? virtio_tswap16(vdev, VRING_DESC_F_WRITE) : 0;
+    vring_desc_t *descs = vq->vring.desc;
+
+    if (num == 0) {
+        return;
+    }
+
+    for (n = 0; n < num; n++) {
+        if (more_descs || (n + 1 < num)) {
+            descs[i].flags = flags | virtio_tswap16(vdev, VRING_DESC_F_NEXT);
+        } else {
+            descs[i].flags = flags;
+        }
+        descs[i].addr = virtio_tswap64(vdev, (hwaddr)iovec[n].iov_base);
+        descs[i].len = virtio_tswap32(vdev, iovec[n].iov_len);
+
+        last = i;
+        i = virtio_tswap16(vdev, descs[i].next);
+    }
+
+    vq->free_head = virtio_tswap16(vdev, descs[last].next);
+}
+
+/* virtqueue_add:
+ * @vq: The #VirtQueue
+ * @elem: The #VirtQueueElement
+ *
+ * Add an avail element to a virtqueue.
+ */
+static int vhost_vring_add_split(VhostShadowVirtqueue *vq,
+                                 const VirtQueueElement *elem)
+{
+    int head;
+    unsigned avail_idx;
+    const VirtIODevice *vdev;
+    vring_avail_t *avail;
+
+    RCU_READ_LOCK_GUARD();
+    vdev = vq->vdev;
+    avail = vq->vring.avail;
+
+    head = vq->free_head;
+
+    /* We need some descriptors here */
+    assert(elem->out_num || elem->in_num);
+
+    vhost_vring_write_descs(vq, elem->out_sg, elem->out_num,
+                   elem->in_num > 0, false);
+    vhost_vring_write_descs(vq, elem->in_sg, elem->in_num, false, true);
+
+    /* Put entry in available array (but don't update avail->idx until they
+     * do sync). */
+    avail_idx = vq->avail_idx_shadow & (vq->vring.num - 1);
+    avail->ring[avail_idx] = virtio_tswap16(vdev, head);
+    vq->avail_idx_shadow++;
+
+    /* Expose descriptors to device */
+    smp_wmb();
+    avail->idx = virtio_tswap16(vdev, vq->avail_idx_shadow);
+
+    /* threoretically possible. Kick just in case */
+    if (unlikely(vq->num_added++ == (uint16_t)-1)) {
+        vhost_vring_kick(vq);
+    }
+
+    return head;
+}
+
+int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem)
+{
+    int host_head = vhost_vring_add_split(vq, elem);
+    if (vq->ring_id_maps[host_head]) {
+        g_free(vq->ring_id_maps[host_head]);
+    }
+
+    vq->ring_id_maps[host_head] = elem;
+    return 0;
+}
+
+void vhost_vring_write_addr(const VhostShadowVirtqueue *vq,
+                            struct vhost_vring_addr *addr)
+{
+    addr->desc_user_addr = (uint64_t)vq->vring.desc;
+    addr->avail_user_addr = (uint64_t)vq->vring.avail;
+    addr->used_user_addr = (uint64_t)vq->vring.used;
+}
+
 VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
 {
     struct vhost_vring_file file = {
@@ -43,9 +163,11 @@  VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
     unsigned num = virtio_queue_get_num(dev->vdev, idx);
     size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE);
     VhostShadowVirtqueue *svq;
-    int r;
+    int r, i;
 
     svq = g_malloc0(sizeof(*svq) + ring_size);
+    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
+    svq->vdev = dev->vdev;
     svq->vq = vq;
 
     r = event_notifier_init(&svq->hdev_notifier, 0);
@@ -55,8 +177,9 @@  VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
     assert(r == 0);
 
-    vhost_virtqueue_mask(dev, dev->vdev, idx, true);
-    vhost_virtqueue_pending(dev, idx);
+    vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE);
+    for (i = 0; i < num - 1; i++)
+        svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1);
 
     return svq;
 }
@@ -64,5 +187,6 @@  VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
 void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq)
 {
     event_notifier_cleanup(&vq->hdev_notifier);
+    g_free(vq->ring_id_maps);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 9352c56bfa..304e0baa61 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -956,8 +956,34 @@  static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq)
     uint16_t idx = virtio_get_queue_index(vq);
 
     VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx];
+    VirtQueueElement *elem;
 
-    vhost_vring_kick(svq);
+    /*
+     * Make available all buffers as possible.
+     */
+    do {
+        if (virtio_queue_get_notification(vq)) {
+            virtio_queue_set_notification(vq, false);
+        }
+
+        while (true) {
+            int r;
+            if (virtio_queue_full(vq)) {
+                break;
+            }
+
+            elem = virtqueue_pop(vq, sizeof(*elem));
+            if (!elem) {
+                break;
+            }
+
+            r = vhost_vring_add(svq, elem);
+            assert(r >= 0);
+            vhost_vring_kick(svq);
+        }
+
+        virtio_queue_set_notification(vq, true);
+    } while(!virtio_queue_empty(vq));
 }
 
 static void vhost_handle_call(EventNotifier *n)
@@ -975,6 +1001,11 @@  static void vhost_handle_call(EventNotifier *n)
     }
 }
 
+static void vhost_virtqueue_stop(struct vhost_dev *dev,
+                                 struct VirtIODevice *vdev,
+                                 struct vhost_virtqueue *vq,
+                                 unsigned idx);
+
 static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
 {
     int idx;
@@ -991,17 +1022,41 @@  static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
 
 static int vhost_sw_live_migration_start(struct vhost_dev *dev)
 {
-    int idx;
+    int idx, r;
+
+    assert(dev->vhost_ops->vhost_set_vring_enable);
+    dev->vhost_ops->vhost_set_vring_enable(dev, false);
 
     for (idx = 0; idx < dev->nvqs; ++idx) {
         struct vhost_virtqueue *vq = &dev->vqs[idx];
+        struct vhost_vring_addr addr = {
+            .index = idx,
+        };
+        struct vhost_vring_state s = {
+            .index = idx,
+        };
+
+        vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], idx);
 
         dev->sw_lm_shadow_vq[idx] = vhost_sw_lm_shadow_vq(dev, idx);
         event_notifier_set_handler(&vq->masked_notifier, vhost_handle_call);
+
+        vhost_vring_write_addr(dev->sw_lm_shadow_vq[idx], &addr);
+        r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
+        assert(r == 0);
+
+        r = dev->vhost_ops->vhost_set_vring_base(dev, &s);
+        assert(r == 0);
     }
 
+    dev->vhost_ops->vhost_set_vring_enable(dev, true);
     vhost_dev_disable_notifiers(dev, dev->vdev);
 
+    for (idx = 0; idx < dev->nvqs; ++idx) {
+        vhost_virtqueue_mask(dev, dev->vdev, idx, true);
+        vhost_virtqueue_pending(dev, idx);
+    }
+
     return 0;
 }