Message ID | 20210519162903.1172366-18-eperezma@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vDPA software assisted live migration | expand |
在 2021/5/20 上午12:28, Eugenio Pérez 写道: > Initial version of shadow virtqueue that actually forward buffers. The > exposed addresses are the qemu's virtual address, so devices with IOMMU > that does not allow full mapping of qemu's address space does not work > at the moment. > > Also for simplicity it only supports modern devices, that expects vring > in little endian, with split ring and no event idx or indirect > descriptors. > > It reuses the VirtQueue code for the device part. The driver part is > based on Linux's virtio_ring driver, but with stripped functionality > and optimizations so it's easier to review. > > Later commits will solve some of these concerns. It would be more more easier to review if you squash those "enhancements" into this patch. > > Code also need to map used ring (device part) as RW in, and only in, > vhost-net. To map (or call vhost_device_iotlb_miss) inconditionally > would print an error in case of vhost devices with its own mapping > (vdpa). I think we should not depend on the IOTLB miss. Instead, we should program the device IOTLB before starting the svq. Or is there anything that prevent you from doing this? > To know if this call is needed, vhost_sw_live_migration_start_vq and > vhost_sw_live_migration_stop copy the test performed in > vhost_dev_start. Testing for the actual backend type could be cleaner, > or checking for non-NULL vhost_force_iommu, enable_custom_iommu, or > another vhostOp. We could extract this test in its own function too, > so its name could give a better hint. Just copy the vhost_dev_start > check at the moment. > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > --- > hw/virtio/vhost-shadow-virtqueue.c | 205 +++++++++++++++++++++++++++-- > hw/virtio/vhost.c | 134 ++++++++++++++++++- > 2 files changed, 325 insertions(+), 14 deletions(-) > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > index ff50f12410..6d767fe248 100644 > --- a/hw/virtio/vhost-shadow-virtqueue.c > +++ b/hw/virtio/vhost-shadow-virtqueue.c > @@ -9,6 +9,7 @@ > > #include "hw/virtio/vhost-shadow-virtqueue.h" > #include "hw/virtio/vhost.h" > +#include "hw/virtio/virtio-access.h" > > #include "standard-headers/linux/vhost_types.h" > > @@ -48,9 +49,93 @@ typedef struct VhostShadowVirtqueue { > > /* Virtio device */ > VirtIODevice *vdev; > + > + /* Map for returning guest's descriptors */ > + VirtQueueElement **ring_id_maps; > + > + /* Next head to expose to device */ > + uint16_t avail_idx_shadow; > + > + /* Next free descriptor */ > + uint16_t free_head; > + > + /* Last seen used idx */ > + uint16_t shadow_used_idx; > + > + /* Next head to consume from device */ > + uint16_t used_idx; > } VhostShadowVirtqueue; > > -/* Forward guest notifications */ > +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > + const struct iovec *iovec, > + size_t num, bool more_descs, bool write) > +{ > + uint16_t i = svq->free_head, last = svq->free_head; > + unsigned n; > + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; > + vring_desc_t *descs = svq->vring.desc; > + > + if (num == 0) { > + return; > + } > + > + for (n = 0; n < num; n++) { > + if (more_descs || (n + 1 < num)) { > + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); > + } else { > + descs[i].flags = flags; > + } > + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); > + descs[i].len = cpu_to_le32(iovec[n].iov_len); > + > + last = i; > + i = cpu_to_le16(descs[i].next); > + } > + > + svq->free_head = le16_to_cpu(descs[last].next); > +} > + > +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > + VirtQueueElement *elem) > +{ > + int head; > + unsigned avail_idx; > + vring_avail_t *avail = svq->vring.avail; > + > + head = svq->free_head; > + > + /* We need some descriptors here */ > + assert(elem->out_num || elem->in_num); > + > + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > + elem->in_num > 0, false); > + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > + > + /* > + * Put entry in available array (but don't update avail->idx until they > + * do sync). > + */ > + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > + avail->ring[avail_idx] = cpu_to_le16(head); > + svq->avail_idx_shadow++; > + > + /* Expose descriptors to device */ It's better to describe the detail order. E.g "Update avail index after the descriptor is wrote" > + smp_wmb(); > + avail->idx = cpu_to_le16(svq->avail_idx_shadow); > + > + return head; > + > +} > + > +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > + VirtQueueElement *elem) > +{ > + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > + > + svq->ring_id_maps[qemu_head] = elem; > +} > + > +/* Handle guest->device notifications */ > static void vhost_handle_guest_kick(EventNotifier *n) > { > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > @@ -60,7 +145,67 @@ static void vhost_handle_guest_kick(EventNotifier *n) > return; > } > > - event_notifier_set(&svq->kick_notifier); > + /* Make available as many buffers as possible */ > + do { > + if (virtio_queue_get_notification(svq->vq)) { > + /* No more notifications until process all available */ > + virtio_queue_set_notification(svq->vq, false); > + } > + > + while (true) { > + VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem)); > + if (!elem) { > + break; > + } > + > + vhost_shadow_vq_add(svq, elem); > + event_notifier_set(&svq->kick_notifier); > + } > + > + virtio_queue_set_notification(svq->vq, true); > + } while (!virtio_queue_empty(svq->vq)); > +} > + > +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > +{ > + if (svq->used_idx != svq->shadow_used_idx) { > + return true; > + } > + > + /* Get used idx must not be reordered */ > + smp_rmb(); > + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); > + > + return svq->used_idx != svq->shadow_used_idx; > +} > + > +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > +{ > + vring_desc_t *descs = svq->vring.desc; > + const vring_used_t *used = svq->vring.used; > + vring_used_elem_t used_elem; > + uint16_t last_used; > + > + if (!vhost_shadow_vq_more_used(svq)) { > + return NULL; > + } > + > + last_used = svq->used_idx & (svq->vring.num - 1); > + used_elem.id = le32_to_cpu(used->ring[last_used].id); > + used_elem.len = le32_to_cpu(used->ring[last_used].len); > + > + if (unlikely(used_elem.id >= svq->vring.num)) { > + error_report("Device %s says index %u is available", svq->vdev->name, > + used_elem.id); > + return NULL; > + } > + > + descs[used_elem.id].next = svq->free_head; > + svq->free_head = used_elem.id; > + > + svq->used_idx++; > + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > } > > /* Forward vhost notifications */ > @@ -69,17 +214,33 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > call_notifier); > EventNotifier *masked_notifier; > + VirtQueue *vq = svq->vq; > > masked_notifier = svq->masked_notifier.n; > > - if (!masked_notifier) { > - unsigned n = virtio_get_queue_index(svq->vq); > - virtio_queue_invalidate_signalled_used(svq->vdev, n); > - virtio_notify_irqfd(svq->vdev, svq->vq); > - } else if (!svq->masked_notifier.signaled) { > - svq->masked_notifier.signaled = true; > - event_notifier_set(svq->masked_notifier.n); > - } > + /* Make as many buffers as possible used. */ > + do { > + unsigned i = 0; > + > + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > + while (true) { > + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > + if (!elem) { > + break; > + } > + > + assert(i < svq->vring.num); > + virtqueue_fill(vq, elem, elem->len, i++); > + } > + > + virtqueue_flush(vq, i); > + if (!masked_notifier) { > + virtio_notify_irqfd(svq->vdev, svq->vq); Any reason that you don't use virtio_notify() here? > + } else if (!svq->masked_notifier.signaled) { > + svq->masked_notifier.signaled = true; > + event_notifier_set(svq->masked_notifier.n); > + } This is an example of the extra complexity if you do shadow virtqueue at virtio level. If you do everything at e.g vhost-vdpa, you don't need to care about masked_notifer at all. Thanks
On Wed, Jun 2, 2021 at 11:51 AM Jason Wang <jasowang@redhat.com> wrote: > > > 在 2021/5/20 上午12:28, Eugenio Pérez 写道: > > Initial version of shadow virtqueue that actually forward buffers. The > > exposed addresses are the qemu's virtual address, so devices with IOMMU > > that does not allow full mapping of qemu's address space does not work > > at the moment. > > > > Also for simplicity it only supports modern devices, that expects vring > > in little endian, with split ring and no event idx or indirect > > descriptors. > > > > It reuses the VirtQueue code for the device part. The driver part is > > based on Linux's virtio_ring driver, but with stripped functionality > > and optimizations so it's easier to review. > > > > Later commits will solve some of these concerns. > > > It would be more more easier to review if you squash those > "enhancements" into this patch. > Ok, they will be in the same commit for the next version. > > > > > Code also need to map used ring (device part) as RW in, and only in, > > vhost-net. To map (or call vhost_device_iotlb_miss) inconditionally > > would print an error in case of vhost devices with its own mapping > > (vdpa). > > > I think we should not depend on the IOTLB miss. Instead, we should > program the device IOTLB before starting the svq. Or is there anything > that prevent you from doing this? > Sorry for being unclear, that is what I meant in the message: No other device than kernel vhost needs the map (as "sent iotlb miss ahead"), so we must make it conditional. Doing it unconditionally would make nothing but an error appear on the screen, but it is incorrect anyway. Is it clearer this way? > > > To know if this call is needed, vhost_sw_live_migration_start_vq and > > vhost_sw_live_migration_stop copy the test performed in > > vhost_dev_start. Testing for the actual backend type could be cleaner, > > or checking for non-NULL vhost_force_iommu, enable_custom_iommu, or > > another vhostOp. We could extract this test in its own function too, > > so its name could give a better hint. Just copy the vhost_dev_start > > check at the moment. > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > --- > > hw/virtio/vhost-shadow-virtqueue.c | 205 +++++++++++++++++++++++++++-- > > hw/virtio/vhost.c | 134 ++++++++++++++++++- > > 2 files changed, 325 insertions(+), 14 deletions(-) > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > > index ff50f12410..6d767fe248 100644 > > --- a/hw/virtio/vhost-shadow-virtqueue.c > > +++ b/hw/virtio/vhost-shadow-virtqueue.c > > @@ -9,6 +9,7 @@ > > > > #include "hw/virtio/vhost-shadow-virtqueue.h" > > #include "hw/virtio/vhost.h" > > +#include "hw/virtio/virtio-access.h" > > > > #include "standard-headers/linux/vhost_types.h" > > > > @@ -48,9 +49,93 @@ typedef struct VhostShadowVirtqueue { > > > > /* Virtio device */ > > VirtIODevice *vdev; > > + > > + /* Map for returning guest's descriptors */ > > + VirtQueueElement **ring_id_maps; > > + > > + /* Next head to expose to device */ > > + uint16_t avail_idx_shadow; > > + > > + /* Next free descriptor */ > > + uint16_t free_head; > > + > > + /* Last seen used idx */ > > + uint16_t shadow_used_idx; > > + > > + /* Next head to consume from device */ > > + uint16_t used_idx; > > } VhostShadowVirtqueue; > > > > -/* Forward guest notifications */ > > +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > > + const struct iovec *iovec, > > + size_t num, bool more_descs, bool write) > > +{ > > + uint16_t i = svq->free_head, last = svq->free_head; > > + unsigned n; > > + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; > > + vring_desc_t *descs = svq->vring.desc; > > + > > + if (num == 0) { > > + return; > > + } > > + > > + for (n = 0; n < num; n++) { > > + if (more_descs || (n + 1 < num)) { > > + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); > > + } else { > > + descs[i].flags = flags; > > + } > > + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); > > + descs[i].len = cpu_to_le32(iovec[n].iov_len); > > + > > + last = i; > > + i = cpu_to_le16(descs[i].next); > > + } > > + > > + svq->free_head = le16_to_cpu(descs[last].next); > > +} > > + > > +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > > + VirtQueueElement *elem) > > +{ > > + int head; > > + unsigned avail_idx; > > + vring_avail_t *avail = svq->vring.avail; > > + > > + head = svq->free_head; > > + > > + /* We need some descriptors here */ > > + assert(elem->out_num || elem->in_num); > > + > > + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > > + elem->in_num > 0, false); > > + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > > + > > + /* > > + * Put entry in available array (but don't update avail->idx until they > > + * do sync). > > + */ > > + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > > + avail->ring[avail_idx] = cpu_to_le16(head); > > + svq->avail_idx_shadow++; > > + > > + /* Expose descriptors to device */ > > > It's better to describe the detail order. > > E.g "Update avail index after the descriptor is wrote" > Agree, I will replace it with your wording. > > > + smp_wmb(); > > + avail->idx = cpu_to_le16(svq->avail_idx_shadow); > > + > > + return head; > > + > > +} > > + > > +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > > + VirtQueueElement *elem) > > +{ > > + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > > + > > + svq->ring_id_maps[qemu_head] = elem; > > +} > > + > > +/* Handle guest->device notifications */ > > static void vhost_handle_guest_kick(EventNotifier *n) > > { > > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > > @@ -60,7 +145,67 @@ static void vhost_handle_guest_kick(EventNotifier *n) > > return; > > } > > > > - event_notifier_set(&svq->kick_notifier); > > + /* Make available as many buffers as possible */ > > + do { > > + if (virtio_queue_get_notification(svq->vq)) { > > + /* No more notifications until process all available */ > > + virtio_queue_set_notification(svq->vq, false); > > + } > > + > > + while (true) { > > + VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem)); > > + if (!elem) { > > + break; > > + } > > + > > + vhost_shadow_vq_add(svq, elem); > > + event_notifier_set(&svq->kick_notifier); > > + } > > + > > + virtio_queue_set_notification(svq->vq, true); > > + } while (!virtio_queue_empty(svq->vq)); > > +} > > + > > +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > > +{ > > + if (svq->used_idx != svq->shadow_used_idx) { > > + return true; > > + } > > + > > + /* Get used idx must not be reordered */ > > + smp_rmb(); > > + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); > > + > > + return svq->used_idx != svq->shadow_used_idx; > > +} > > + > > +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > > +{ > > + vring_desc_t *descs = svq->vring.desc; > > + const vring_used_t *used = svq->vring.used; > > + vring_used_elem_t used_elem; > > + uint16_t last_used; > > + > > + if (!vhost_shadow_vq_more_used(svq)) { > > + return NULL; > > + } > > + > > + last_used = svq->used_idx & (svq->vring.num - 1); > > + used_elem.id = le32_to_cpu(used->ring[last_used].id); > > + used_elem.len = le32_to_cpu(used->ring[last_used].len); > > + > > + if (unlikely(used_elem.id >= svq->vring.num)) { > > + error_report("Device %s says index %u is available", svq->vdev->name, > > + used_elem.id); > > + return NULL; > > + } > > + > > + descs[used_elem.id].next = svq->free_head; > > + svq->free_head = used_elem.id; > > + > > + svq->used_idx++; > > + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > > + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > > } > > > > /* Forward vhost notifications */ > > @@ -69,17 +214,33 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > > call_notifier); > > EventNotifier *masked_notifier; > > + VirtQueue *vq = svq->vq; > > > > masked_notifier = svq->masked_notifier.n; > > > > - if (!masked_notifier) { > > - unsigned n = virtio_get_queue_index(svq->vq); > > - virtio_queue_invalidate_signalled_used(svq->vdev, n); > > - virtio_notify_irqfd(svq->vdev, svq->vq); > > - } else if (!svq->masked_notifier.signaled) { > > - svq->masked_notifier.signaled = true; > > - event_notifier_set(svq->masked_notifier.n); > > - } > > + /* Make as many buffers as possible used. */ > > + do { > > + unsigned i = 0; > > + > > + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > > + while (true) { > > + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > > + if (!elem) { > > + break; > > + } > > + > > + assert(i < svq->vring.num); > > + virtqueue_fill(vq, elem, elem->len, i++); > > + } > > + > > + virtqueue_flush(vq, i); > > + if (!masked_notifier) { > > + virtio_notify_irqfd(svq->vdev, svq->vq); > > > Any reason that you don't use virtio_notify() here? > No reason but to make sure guest_notifier is used. I'm not sure if this is an implementation detail though. I can test to switch to virtio_notify, what would be the advantage here? > > > + } else if (!svq->masked_notifier.signaled) { > > + svq->masked_notifier.signaled = true; > > + event_notifier_set(svq->masked_notifier.n); > > + } > > > This is an example of the extra complexity if you do shadow virtqueue at .> virtio level. > > If you do everything at e.g vhost-vdpa, you don't need to care about > masked_notifer at all. > Correct me if I'm wrong, what you mean is to use the backend vhost_set_vring_call to set the guest notifier (from SVQ point of view), and then set it unconditionally. The function vhost_virtqueue_mask would set the masked one by itself, no modification is needed here. Backend would still need a conditional checking if SVQ is enabled, so it either sends call_fd to backend or to SVQ. The call to virtqueue_fill, would still be needed if we don't want to duplicate all the device virtio's logic in the vhost-vdpa backend. Another possibility would be to just store guest_notifier in SVQ, and replace it with masked notifier and back. I think this is more aligned with what you have in mind, but it still needs changes to vhost_virtqueue_mask. Note that the boolean store masked_notifier.signaled is just a (maybe premature) optimization to skip the unneeded write syscall, but it could be omitted for brevity. Or maybe a cleaner solution is to use io_uring for this write? :). Thanks! > Thanks >
在 2021/6/3 上午1:18, Eugenio Perez Martin 写道: > On Wed, Jun 2, 2021 at 11:51 AM Jason Wang <jasowang@redhat.com> wrote: >> >> 在 2021/5/20 上午12:28, Eugenio Pérez 写道: >>> Initial version of shadow virtqueue that actually forward buffers. The >>> exposed addresses are the qemu's virtual address, so devices with IOMMU >>> that does not allow full mapping of qemu's address space does not work >>> at the moment. >>> >>> Also for simplicity it only supports modern devices, that expects vring >>> in little endian, with split ring and no event idx or indirect >>> descriptors. >>> >>> It reuses the VirtQueue code for the device part. The driver part is >>> based on Linux's virtio_ring driver, but with stripped functionality >>> and optimizations so it's easier to review. >>> >>> Later commits will solve some of these concerns. >> >> It would be more more easier to review if you squash those >> "enhancements" into this patch. >> > Ok, they will be in the same commit for the next version. > >>> Code also need to map used ring (device part) as RW in, and only in, >>> vhost-net. To map (or call vhost_device_iotlb_miss) inconditionally >>> would print an error in case of vhost devices with its own mapping >>> (vdpa). >> >> I think we should not depend on the IOTLB miss. Instead, we should >> program the device IOTLB before starting the svq. Or is there anything >> that prevent you from doing this? >> > Sorry for being unclear, that is what I meant in the message: No other > device than kernel vhost needs the map (as "sent iotlb miss ahead"), > so we must make it conditional. Doing it unconditionally would make > nothing but an error appear on the screen, but it is incorrect anyway. > > Is it clearer this way? So what I'm worrying is the following code: int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) { ... if (dev->shadow_vqs_enabled) { /* Shadow virtqueue translations in its Virtual Address Space */ const VhostDMAMap *result; const VhostDMAMap needle = { .iova = iova, }; result = vhost_iova_tree_find_taddr(&dev->iova_map, &needle); ... } I wonder the reason for doing that (sorry if I've asked this in the previous version). I think the correct way is to use map those in the device IOTLB before, instead of using the miss. Then we can have a unified code for vhost-vDPA and vhost-kernel. > >>> To know if this call is needed, vhost_sw_live_migration_start_vq and >>> vhost_sw_live_migration_stop copy the test performed in >>> vhost_dev_start. Testing for the actual backend type could be cleaner, >>> or checking for non-NULL vhost_force_iommu, enable_custom_iommu, or >>> another vhostOp. We could extract this test in its own function too, >>> so its name could give a better hint. Just copy the vhost_dev_start >>> check at the moment. >>> >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> >>> --- >>> hw/virtio/vhost-shadow-virtqueue.c | 205 +++++++++++++++++++++++++++-- >>> hw/virtio/vhost.c | 134 ++++++++++++++++++- >>> 2 files changed, 325 insertions(+), 14 deletions(-) >>> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c >>> index ff50f12410..6d767fe248 100644 >>> --- a/hw/virtio/vhost-shadow-virtqueue.c >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c >>> @@ -9,6 +9,7 @@ >>> >>> #include "hw/virtio/vhost-shadow-virtqueue.h" >>> #include "hw/virtio/vhost.h" >>> +#include "hw/virtio/virtio-access.h" >>> >>> #include "standard-headers/linux/vhost_types.h" >>> >>> @@ -48,9 +49,93 @@ typedef struct VhostShadowVirtqueue { >>> >>> /* Virtio device */ >>> VirtIODevice *vdev; >>> + >>> + /* Map for returning guest's descriptors */ >>> + VirtQueueElement **ring_id_maps; >>> + >>> + /* Next head to expose to device */ >>> + uint16_t avail_idx_shadow; >>> + >>> + /* Next free descriptor */ >>> + uint16_t free_head; >>> + >>> + /* Last seen used idx */ >>> + uint16_t shadow_used_idx; >>> + >>> + /* Next head to consume from device */ >>> + uint16_t used_idx; >>> } VhostShadowVirtqueue; >>> >>> -/* Forward guest notifications */ >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, >>> + const struct iovec *iovec, >>> + size_t num, bool more_descs, bool write) >>> +{ >>> + uint16_t i = svq->free_head, last = svq->free_head; >>> + unsigned n; >>> + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; >>> + vring_desc_t *descs = svq->vring.desc; >>> + >>> + if (num == 0) { >>> + return; >>> + } >>> + >>> + for (n = 0; n < num; n++) { >>> + if (more_descs || (n + 1 < num)) { >>> + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); >>> + } else { >>> + descs[i].flags = flags; >>> + } >>> + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); >>> + descs[i].len = cpu_to_le32(iovec[n].iov_len); >>> + >>> + last = i; >>> + i = cpu_to_le16(descs[i].next); >>> + } >>> + >>> + svq->free_head = le16_to_cpu(descs[last].next); >>> +} >>> + >>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, >>> + VirtQueueElement *elem) >>> +{ >>> + int head; >>> + unsigned avail_idx; >>> + vring_avail_t *avail = svq->vring.avail; >>> + >>> + head = svq->free_head; >>> + >>> + /* We need some descriptors here */ >>> + assert(elem->out_num || elem->in_num); >>> + >>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, >>> + elem->in_num > 0, false); >>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); >>> + >>> + /* >>> + * Put entry in available array (but don't update avail->idx until they >>> + * do sync). >>> + */ >>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); >>> + avail->ring[avail_idx] = cpu_to_le16(head); >>> + svq->avail_idx_shadow++; >>> + >>> + /* Expose descriptors to device */ >> >> It's better to describe the detail order. >> >> E.g "Update avail index after the descriptor is wrote" >> > Agree, I will replace it with your wording. > >>> + smp_wmb(); >>> + avail->idx = cpu_to_le16(svq->avail_idx_shadow); >>> + >>> + return head; >>> + >>> +} >>> + >>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, >>> + VirtQueueElement *elem) >>> +{ >>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); >>> + >>> + svq->ring_id_maps[qemu_head] = elem; >>> +} >>> + >>> +/* Handle guest->device notifications */ >>> static void vhost_handle_guest_kick(EventNotifier *n) >>> { >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>> @@ -60,7 +145,67 @@ static void vhost_handle_guest_kick(EventNotifier *n) >>> return; >>> } >>> >>> - event_notifier_set(&svq->kick_notifier); >>> + /* Make available as many buffers as possible */ >>> + do { >>> + if (virtio_queue_get_notification(svq->vq)) { >>> + /* No more notifications until process all available */ >>> + virtio_queue_set_notification(svq->vq, false); >>> + } >>> + >>> + while (true) { >>> + VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem)); >>> + if (!elem) { >>> + break; >>> + } >>> + >>> + vhost_shadow_vq_add(svq, elem); >>> + event_notifier_set(&svq->kick_notifier); >>> + } >>> + >>> + virtio_queue_set_notification(svq->vq, true); >>> + } while (!virtio_queue_empty(svq->vq)); >>> +} >>> + >>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) >>> +{ >>> + if (svq->used_idx != svq->shadow_used_idx) { >>> + return true; >>> + } >>> + >>> + /* Get used idx must not be reordered */ >>> + smp_rmb(); >>> + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); >>> + >>> + return svq->used_idx != svq->shadow_used_idx; >>> +} >>> + >>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) >>> +{ >>> + vring_desc_t *descs = svq->vring.desc; >>> + const vring_used_t *used = svq->vring.used; >>> + vring_used_elem_t used_elem; >>> + uint16_t last_used; >>> + >>> + if (!vhost_shadow_vq_more_used(svq)) { >>> + return NULL; >>> + } >>> + >>> + last_used = svq->used_idx & (svq->vring.num - 1); >>> + used_elem.id = le32_to_cpu(used->ring[last_used].id); >>> + used_elem.len = le32_to_cpu(used->ring[last_used].len); >>> + >>> + if (unlikely(used_elem.id >= svq->vring.num)) { >>> + error_report("Device %s says index %u is available", svq->vdev->name, >>> + used_elem.id); >>> + return NULL; >>> + } >>> + >>> + descs[used_elem.id].next = svq->free_head; >>> + svq->free_head = used_elem.id; >>> + >>> + svq->used_idx++; >>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; >>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); >>> } >>> >>> /* Forward vhost notifications */ >>> @@ -69,17 +214,33 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>> call_notifier); >>> EventNotifier *masked_notifier; >>> + VirtQueue *vq = svq->vq; >>> >>> masked_notifier = svq->masked_notifier.n; >>> >>> - if (!masked_notifier) { >>> - unsigned n = virtio_get_queue_index(svq->vq); >>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); >>> - virtio_notify_irqfd(svq->vdev, svq->vq); >>> - } else if (!svq->masked_notifier.signaled) { >>> - svq->masked_notifier.signaled = true; >>> - event_notifier_set(svq->masked_notifier.n); >>> - } >>> + /* Make as many buffers as possible used. */ >>> + do { >>> + unsigned i = 0; >>> + >>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ >>> + while (true) { >>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); >>> + if (!elem) { >>> + break; >>> + } >>> + >>> + assert(i < svq->vring.num); >>> + virtqueue_fill(vq, elem, elem->len, i++); >>> + } >>> + >>> + virtqueue_flush(vq, i); >>> + if (!masked_notifier) { >>> + virtio_notify_irqfd(svq->vdev, svq->vq); >> >> Any reason that you don't use virtio_notify() here? >> > No reason but to make sure guest_notifier is used. I'm not sure if > this is an implementation detail though. The difference is that virtio_notify() will go through the memory API which will finally go to irqfd in this case. > > I can test to switch to virtio_notify, what would be the advantage here? Probably no. > >>> + } else if (!svq->masked_notifier.signaled) { >>> + svq->masked_notifier.signaled = true; >>> + event_notifier_set(svq->masked_notifier.n); >>> + } >> >> This is an example of the extra complexity if you do shadow virtqueue at > .> virtio level. >> If you do everything at e.g vhost-vdpa, you don't need to care about >> masked_notifer at all. >> > Correct me if I'm wrong, what you mean is to use the backend > vhost_set_vring_call to set the guest notifier (from SVQ point of > view), and then set it unconditionally. The function > vhost_virtqueue_mask would set the masked one by itself, no > modification is needed here. Something like this, from the point of vhost, it doesn't even need to know whether or not the notifier is masked or not. All it needs is to write to the eventfd set via vq call. > > Backend would still need a conditional checking if SVQ is enabled, so > it either sends call_fd to backend or to SVQ. Yes. > The call to > virtqueue_fill, would still be needed if we don't want to duplicate > all the device virtio's logic in the vhost-vdpa backend. Yes, you can make the buffer forwarding a common library then it could be used other vhost backend in the future. The point is to avoid touching vhost protocols to reduce the changeset and have someting minimal for our requirements (vhost-vDPA mainly). > > Another possibility would be to just store guest_notifier in SVQ, and > replace it with masked notifier and back. I think this is more aligned > with what you have in mind, but it still needs changes to > vhost_virtqueue_mask. Note that the boolean store > masked_notifier.signaled is just a (maybe premature) optimization to > skip the unneeded write syscall, but it could be omitted for brevity. > Or maybe a cleaner solution is to use io_uring for this write? :). Looks like not what I meant :) To clarify, it works like: 1) record the vq call fd1 during vhost_vdpa_set_vring_call 2) when svq is not enabled, set this fd1 to vhost-VDPA via VHOST_SET_VRING_CALL 3) when svq is enabled, initialize and set fd2 to vhost-vDPA, poll and handle guest kick via fd1 and rely fd1 to fd2 So we don't need to care much about the masking, in the svq codes, we just stick to use the fd that is set via recent vhost_vdpa_set_vring_call(). That means, if virtqueue is masked, we're using mased_notifier actually, but it's totally transparent to us. So the idea is behave like a normal vhost-vDPA backend, and hide the shadowing from the virtio codes. Thanks > > Thanks! > >> Thanks >>
On Thu, Jun 3, 2021 at 5:35 AM Jason Wang <jasowang@redhat.com> wrote: > > > 在 2021/6/3 上午1:18, Eugenio Perez Martin 写道: > > On Wed, Jun 2, 2021 at 11:51 AM Jason Wang <jasowang@redhat.com> wrote: > >> > >> 在 2021/5/20 上午12:28, Eugenio Pérez 写道: > >>> Initial version of shadow virtqueue that actually forward buffers. The > >>> exposed addresses are the qemu's virtual address, so devices with IOMMU > >>> that does not allow full mapping of qemu's address space does not work > >>> at the moment. > >>> > >>> Also for simplicity it only supports modern devices, that expects vring > >>> in little endian, with split ring and no event idx or indirect > >>> descriptors. > >>> > >>> It reuses the VirtQueue code for the device part. The driver part is > >>> based on Linux's virtio_ring driver, but with stripped functionality > >>> and optimizations so it's easier to review. > >>> > >>> Later commits will solve some of these concerns. > >> > >> It would be more more easier to review if you squash those > >> "enhancements" into this patch. > >> > > Ok, they will be in the same commit for the next version. > > > >>> Code also need to map used ring (device part) as RW in, and only in, > >>> vhost-net. To map (or call vhost_device_iotlb_miss) inconditionally > >>> would print an error in case of vhost devices with its own mapping > >>> (vdpa). > >> > >> I think we should not depend on the IOTLB miss. Instead, we should > >> program the device IOTLB before starting the svq. Or is there anything > >> that prevent you from doing this? > >> > > Sorry for being unclear, that is what I meant in the message: No other > > device than kernel vhost needs the map (as "sent iotlb miss ahead"), > > so we must make it conditional. Doing it unconditionally would make > > nothing but an error appear on the screen, but it is incorrect anyway. > > > > Is it clearer this way? > > > So what I'm worrying is the following code: > > int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) > { > ... > > if (dev->shadow_vqs_enabled) { > /* Shadow virtqueue translations in its Virtual Address Space */ > const VhostDMAMap *result; > const VhostDMAMap needle = { > .iova = iova, > }; > > result = vhost_iova_tree_find_taddr(&dev->iova_map, &needle); > ... > > } > > I wonder the reason for doing that (sorry if I've asked this in the > previous version). > > I think the correct way is to use map those in the device IOTLB before, > instead of using the miss. > > Then we can have a unified code for vhost-vDPA and vhost-kernel. > Sure we can do it that way, this just followed the usual vhost + IOMMU way of working. Since in this case we are using vIOMMU, the code should also take care of guest's updates. However, I agree it's better to leave this use case for a future patch, and start just taking into account vhost-vdpa map/unmap. > > > > >>> To know if this call is needed, vhost_sw_live_migration_start_vq and > >>> vhost_sw_live_migration_stop copy the test performed in > >>> vhost_dev_start. Testing for the actual backend type could be cleaner, > >>> or checking for non-NULL vhost_force_iommu, enable_custom_iommu, or > >>> another vhostOp. We could extract this test in its own function too, > >>> so its name could give a better hint. Just copy the vhost_dev_start > >>> check at the moment. > >>> > >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > >>> --- > >>> hw/virtio/vhost-shadow-virtqueue.c | 205 +++++++++++++++++++++++++++-- > >>> hw/virtio/vhost.c | 134 ++++++++++++++++++- > >>> 2 files changed, 325 insertions(+), 14 deletions(-) > >>> > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > >>> index ff50f12410..6d767fe248 100644 > >>> --- a/hw/virtio/vhost-shadow-virtqueue.c > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c > >>> @@ -9,6 +9,7 @@ > >>> > >>> #include "hw/virtio/vhost-shadow-virtqueue.h" > >>> #include "hw/virtio/vhost.h" > >>> +#include "hw/virtio/virtio-access.h" > >>> > >>> #include "standard-headers/linux/vhost_types.h" > >>> > >>> @@ -48,9 +49,93 @@ typedef struct VhostShadowVirtqueue { > >>> > >>> /* Virtio device */ > >>> VirtIODevice *vdev; > >>> + > >>> + /* Map for returning guest's descriptors */ > >>> + VirtQueueElement **ring_id_maps; > >>> + > >>> + /* Next head to expose to device */ > >>> + uint16_t avail_idx_shadow; > >>> + > >>> + /* Next free descriptor */ > >>> + uint16_t free_head; > >>> + > >>> + /* Last seen used idx */ > >>> + uint16_t shadow_used_idx; > >>> + > >>> + /* Next head to consume from device */ > >>> + uint16_t used_idx; > >>> } VhostShadowVirtqueue; > >>> > >>> -/* Forward guest notifications */ > >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > >>> + const struct iovec *iovec, > >>> + size_t num, bool more_descs, bool write) > >>> +{ > >>> + uint16_t i = svq->free_head, last = svq->free_head; > >>> + unsigned n; > >>> + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; > >>> + vring_desc_t *descs = svq->vring.desc; > >>> + > >>> + if (num == 0) { > >>> + return; > >>> + } > >>> + > >>> + for (n = 0; n < num; n++) { > >>> + if (more_descs || (n + 1 < num)) { > >>> + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); > >>> + } else { > >>> + descs[i].flags = flags; > >>> + } > >>> + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); > >>> + descs[i].len = cpu_to_le32(iovec[n].iov_len); > >>> + > >>> + last = i; > >>> + i = cpu_to_le16(descs[i].next); > >>> + } > >>> + > >>> + svq->free_head = le16_to_cpu(descs[last].next); > >>> +} > >>> + > >>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > >>> + VirtQueueElement *elem) > >>> +{ > >>> + int head; > >>> + unsigned avail_idx; > >>> + vring_avail_t *avail = svq->vring.avail; > >>> + > >>> + head = svq->free_head; > >>> + > >>> + /* We need some descriptors here */ > >>> + assert(elem->out_num || elem->in_num); > >>> + > >>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > >>> + elem->in_num > 0, false); > >>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > >>> + > >>> + /* > >>> + * Put entry in available array (but don't update avail->idx until they > >>> + * do sync). > >>> + */ > >>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > >>> + avail->ring[avail_idx] = cpu_to_le16(head); > >>> + svq->avail_idx_shadow++; > >>> + > >>> + /* Expose descriptors to device */ > >> > >> It's better to describe the detail order. > >> > >> E.g "Update avail index after the descriptor is wrote" > >> > > Agree, I will replace it with your wording. > > > >>> + smp_wmb(); > >>> + avail->idx = cpu_to_le16(svq->avail_idx_shadow); > >>> + > >>> + return head; > >>> + > >>> +} > >>> + > >>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > >>> + VirtQueueElement *elem) > >>> +{ > >>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > >>> + > >>> + svq->ring_id_maps[qemu_head] = elem; > >>> +} > >>> + > >>> +/* Handle guest->device notifications */ > >>> static void vhost_handle_guest_kick(EventNotifier *n) > >>> { > >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>> @@ -60,7 +145,67 @@ static void vhost_handle_guest_kick(EventNotifier *n) > >>> return; > >>> } > >>> > >>> - event_notifier_set(&svq->kick_notifier); > >>> + /* Make available as many buffers as possible */ > >>> + do { > >>> + if (virtio_queue_get_notification(svq->vq)) { > >>> + /* No more notifications until process all available */ > >>> + virtio_queue_set_notification(svq->vq, false); > >>> + } > >>> + > >>> + while (true) { > >>> + VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem)); > >>> + if (!elem) { > >>> + break; > >>> + } > >>> + > >>> + vhost_shadow_vq_add(svq, elem); > >>> + event_notifier_set(&svq->kick_notifier); > >>> + } > >>> + > >>> + virtio_queue_set_notification(svq->vq, true); > >>> + } while (!virtio_queue_empty(svq->vq)); > >>> +} > >>> + > >>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > >>> +{ > >>> + if (svq->used_idx != svq->shadow_used_idx) { > >>> + return true; > >>> + } > >>> + > >>> + /* Get used idx must not be reordered */ > >>> + smp_rmb(); > >>> + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); > >>> + > >>> + return svq->used_idx != svq->shadow_used_idx; > >>> +} > >>> + > >>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > >>> +{ > >>> + vring_desc_t *descs = svq->vring.desc; > >>> + const vring_used_t *used = svq->vring.used; > >>> + vring_used_elem_t used_elem; > >>> + uint16_t last_used; > >>> + > >>> + if (!vhost_shadow_vq_more_used(svq)) { > >>> + return NULL; > >>> + } > >>> + > >>> + last_used = svq->used_idx & (svq->vring.num - 1); > >>> + used_elem.id = le32_to_cpu(used->ring[last_used].id); > >>> + used_elem.len = le32_to_cpu(used->ring[last_used].len); > >>> + > >>> + if (unlikely(used_elem.id >= svq->vring.num)) { > >>> + error_report("Device %s says index %u is available", svq->vdev->name, > >>> + used_elem.id); > >>> + return NULL; > >>> + } > >>> + > >>> + descs[used_elem.id].next = svq->free_head; > >>> + svq->free_head = used_elem.id; > >>> + > >>> + svq->used_idx++; > >>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > >>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > >>> } > >>> > >>> /* Forward vhost notifications */ > >>> @@ -69,17 +214,33 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>> call_notifier); > >>> EventNotifier *masked_notifier; > >>> + VirtQueue *vq = svq->vq; > >>> > >>> masked_notifier = svq->masked_notifier.n; > >>> > >>> - if (!masked_notifier) { > >>> - unsigned n = virtio_get_queue_index(svq->vq); > >>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); > >>> - virtio_notify_irqfd(svq->vdev, svq->vq); > >>> - } else if (!svq->masked_notifier.signaled) { > >>> - svq->masked_notifier.signaled = true; > >>> - event_notifier_set(svq->masked_notifier.n); > >>> - } > >>> + /* Make as many buffers as possible used. */ > >>> + do { > >>> + unsigned i = 0; > >>> + > >>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > >>> + while (true) { > >>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > >>> + if (!elem) { > >>> + break; > >>> + } > >>> + > >>> + assert(i < svq->vring.num); > >>> + virtqueue_fill(vq, elem, elem->len, i++); > >>> + } > >>> + > >>> + virtqueue_flush(vq, i); > >>> + if (!masked_notifier) { > >>> + virtio_notify_irqfd(svq->vdev, svq->vq); > >> > >> Any reason that you don't use virtio_notify() here? > >> > > No reason but to make sure guest_notifier is used. I'm not sure if > > this is an implementation detail though. > > > The difference is that virtio_notify() will go through the memory API > which will finally go to irqfd in this case. > > > > > > I can test to switch to virtio_notify, what would be the advantage here? > > > Probably no. > > > > > >>> + } else if (!svq->masked_notifier.signaled) { > >>> + svq->masked_notifier.signaled = true; > >>> + event_notifier_set(svq->masked_notifier.n); > >>> + } > >> > >> This is an example of the extra complexity if you do shadow virtqueue at > > .> virtio level. > >> If you do everything at e.g vhost-vdpa, you don't need to care about > >> masked_notifer at all. > >> > > Correct me if I'm wrong, what you mean is to use the backend > > vhost_set_vring_call to set the guest notifier (from SVQ point of > > view), and then set it unconditionally. The function > > vhost_virtqueue_mask would set the masked one by itself, no > > modification is needed here. > > > Something like this, from the point of vhost, it doesn't even need to > know whether or not the notifier is masked or not. All it needs is to > write to the eventfd set via vq call. > > > > > > Backend would still need a conditional checking if SVQ is enabled, so > > it either sends call_fd to backend or to SVQ. > > > Yes. > > > > The call to > > virtqueue_fill, would still be needed if we don't want to duplicate > > all the device virtio's logic in the vhost-vdpa backend. > > > Yes, you can make the buffer forwarding a common library then it could > be used other vhost backend in the future. > > The point is to avoid touching vhost protocols to reduce the changeset > and have someting minimal for our requirements (vhost-vDPA mainly). > > > > > > Another possibility would be to just store guest_notifier in SVQ, and > > replace it with masked notifier and back. I think this is more aligned > > with what you have in mind, but it still needs changes to > > vhost_virtqueue_mask. Note that the boolean store > > masked_notifier.signaled is just a (maybe premature) optimization to > > skip the unneeded write syscall, but it could be omitted for brevity. > > Or maybe a cleaner solution is to use io_uring for this write? :). > > > Looks like not what I meant :) > > To clarify, it works like: > > 1) record the vq call fd1 during vhost_vdpa_set_vring_call > 2) when svq is not enabled, set this fd1 to vhost-VDPA via > VHOST_SET_VRING_CALL > 3) when svq is enabled, initialize and set fd2 to vhost-vDPA, poll and > handle guest kick via fd1 and rely fd1 to fd2 > > So we don't need to care much about the masking, in the svq codes, we > just stick to use the fd that is set via recent vhost_vdpa_set_vring_call(). > > That means, if virtqueue is masked, we're using mased_notifier actually, > but it's totally transparent to us. > > So the idea is behave like a normal vhost-vDPA backend, and hide the > shadowing from the virtio codes. > > Thanks > I'm fine with that approach. It could write many times to masked_notifier if the guest does mask the device and active poll the ring, but 1) I'm not sure if any driver relies on it actually, and they should also indicate that they don't want to be notified through the avail ring flags. 2) Actual devices cannot do these optimizations, it would write repeatedly to masked_notifier. So even if a synthetic test proves that it is beneficial, it probably is of no use in real use cases. > > > > > Thanks! > > > >> Thanks > >> >
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index ff50f12410..6d767fe248 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -9,6 +9,7 @@ #include "hw/virtio/vhost-shadow-virtqueue.h" #include "hw/virtio/vhost.h" +#include "hw/virtio/virtio-access.h" #include "standard-headers/linux/vhost_types.h" @@ -48,9 +49,93 @@ typedef struct VhostShadowVirtqueue { /* Virtio device */ VirtIODevice *vdev; + + /* Map for returning guest's descriptors */ + VirtQueueElement **ring_id_maps; + + /* Next head to expose to device */ + uint16_t avail_idx_shadow; + + /* Next free descriptor */ + uint16_t free_head; + + /* Last seen used idx */ + uint16_t shadow_used_idx; + + /* Next head to consume from device */ + uint16_t used_idx; } VhostShadowVirtqueue; -/* Forward guest notifications */ +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, + const struct iovec *iovec, + size_t num, bool more_descs, bool write) +{ + uint16_t i = svq->free_head, last = svq->free_head; + unsigned n; + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; + vring_desc_t *descs = svq->vring.desc; + + if (num == 0) { + return; + } + + for (n = 0; n < num; n++) { + if (more_descs || (n + 1 < num)) { + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); + } else { + descs[i].flags = flags; + } + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); + descs[i].len = cpu_to_le32(iovec[n].iov_len); + + last = i; + i = cpu_to_le16(descs[i].next); + } + + svq->free_head = le16_to_cpu(descs[last].next); +} + +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, + VirtQueueElement *elem) +{ + int head; + unsigned avail_idx; + vring_avail_t *avail = svq->vring.avail; + + head = svq->free_head; + + /* We need some descriptors here */ + assert(elem->out_num || elem->in_num); + + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, + elem->in_num > 0, false); + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); + + /* + * Put entry in available array (but don't update avail->idx until they + * do sync). + */ + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); + avail->ring[avail_idx] = cpu_to_le16(head); + svq->avail_idx_shadow++; + + /* Expose descriptors to device */ + smp_wmb(); + avail->idx = cpu_to_le16(svq->avail_idx_shadow); + + return head; + +} + +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, + VirtQueueElement *elem) +{ + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); + + svq->ring_id_maps[qemu_head] = elem; +} + +/* Handle guest->device notifications */ static void vhost_handle_guest_kick(EventNotifier *n) { VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, @@ -60,7 +145,67 @@ static void vhost_handle_guest_kick(EventNotifier *n) return; } - event_notifier_set(&svq->kick_notifier); + /* Make available as many buffers as possible */ + do { + if (virtio_queue_get_notification(svq->vq)) { + /* No more notifications until process all available */ + virtio_queue_set_notification(svq->vq, false); + } + + while (true) { + VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem)); + if (!elem) { + break; + } + + vhost_shadow_vq_add(svq, elem); + event_notifier_set(&svq->kick_notifier); + } + + virtio_queue_set_notification(svq->vq, true); + } while (!virtio_queue_empty(svq->vq)); +} + +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) +{ + if (svq->used_idx != svq->shadow_used_idx) { + return true; + } + + /* Get used idx must not be reordered */ + smp_rmb(); + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); + + return svq->used_idx != svq->shadow_used_idx; +} + +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) +{ + vring_desc_t *descs = svq->vring.desc; + const vring_used_t *used = svq->vring.used; + vring_used_elem_t used_elem; + uint16_t last_used; + + if (!vhost_shadow_vq_more_used(svq)) { + return NULL; + } + + last_used = svq->used_idx & (svq->vring.num - 1); + used_elem.id = le32_to_cpu(used->ring[last_used].id); + used_elem.len = le32_to_cpu(used->ring[last_used].len); + + if (unlikely(used_elem.id >= svq->vring.num)) { + error_report("Device %s says index %u is available", svq->vdev->name, + used_elem.id); + return NULL; + } + + descs[used_elem.id].next = svq->free_head; + svq->free_head = used_elem.id; + + svq->used_idx++; + svq->ring_id_maps[used_elem.id]->len = used_elem.len; + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); } /* Forward vhost notifications */ @@ -69,17 +214,33 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, call_notifier); EventNotifier *masked_notifier; + VirtQueue *vq = svq->vq; masked_notifier = svq->masked_notifier.n; - if (!masked_notifier) { - unsigned n = virtio_get_queue_index(svq->vq); - virtio_queue_invalidate_signalled_used(svq->vdev, n); - virtio_notify_irqfd(svq->vdev, svq->vq); - } else if (!svq->masked_notifier.signaled) { - svq->masked_notifier.signaled = true; - event_notifier_set(svq->masked_notifier.n); - } + /* Make as many buffers as possible used. */ + do { + unsigned i = 0; + + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ + while (true) { + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); + if (!elem) { + break; + } + + assert(i < svq->vring.num); + virtqueue_fill(vq, elem, elem->len, i++); + } + + virtqueue_flush(vq, i); + if (!masked_notifier) { + virtio_notify_irqfd(svq->vdev, svq->vq); + } else if (!svq->masked_notifier.signaled) { + svq->masked_notifier.signaled = true; + event_notifier_set(svq->masked_notifier.n); + } + } while (vhost_shadow_vq_more_used(svq)); } static void vhost_shadow_vq_handle_call(EventNotifier *n) @@ -243,7 +404,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, unsigned idx, VhostShadowVirtqueue *svq) { + int i; int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); + + assert(!dev->shadow_vqs_enabled); + if (unlikely(r < 0)) { error_report("Couldn't restore vq kick fd: %s", strerror(-r)); } @@ -255,6 +420,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, /* Restore vhost call */ vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, dev->vqs[idx].notifier_is_masked); + + + for (i = 0; i < svq->vring.num; ++i) { + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; + /* + * Although the doc says we must unpop in order, it's ok to unpop + * everything. + */ + if (elem) { + virtqueue_unpop(svq->vq, elem, elem->len); + } + } } /* @@ -269,7 +446,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) size_t driver_size; size_t device_size; g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); - int r; + int r, i; r = event_notifier_init(&svq->kick_notifier, 0); if (r != 0) { @@ -295,6 +472,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) memset(svq->vring.desc, 0, driver_size); svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size); memset(svq->vring.used, 0, device_size); + for (i = 0; i < num - 1; i++) { + svq->vring.desc[i].next = cpu_to_le16(i + 1); + } + + svq->ring_id_maps = g_new0(VirtQueueElement *, num); event_notifier_set_handler(&svq->call_notifier, vhost_shadow_vq_handle_call); return g_steal_pointer(&svq); @@ -314,6 +496,7 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) event_notifier_cleanup(&vq->kick_notifier); event_notifier_set_handler(&vq->call_notifier, NULL); event_notifier_cleanup(&vq->call_notifier); + g_free(vq->ring_id_maps); qemu_vfree(vq->vring.desc); qemu_vfree(vq->vring.used); g_free(vq); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 333877ca3b..5b5001a08a 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) trace_vhost_iotlb_miss(dev, 1); + if (dev->shadow_vqs_enabled) { + uaddr = iova; + len = 4096; + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, + IOMMU_RW); + if (ret) { + trace_vhost_iotlb_miss(dev, 2); + error_report("Fail to update device iotlb"); + } + + return ret; + } + iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, iova, write, MEMTXATTRS_UNSPECIFIED); @@ -1222,12 +1235,37 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev, static int vhost_sw_live_migration_stop(struct vhost_dev *dev) { - int idx; + int idx, r; dev->shadow_vqs_enabled = false; + r = dev->vhost_ops->vhost_vring_pause(dev); + assert(r == 0); + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { + error_report("Fail to invalidate device iotlb"); + } + for (idx = 0; idx < dev->nvqs; ++idx) { + struct vhost_virtqueue *vq = dev->vqs + idx; + if (vhost_dev_has_iommu(dev) && + dev->vhost_ops->vhost_set_iotlb_callback) { + /* + * Update used ring information for IOTLB to work correctly, + * vhost-kernel code requires for this. + */ + vhost_device_iotlb_miss(dev, vq->used_phys, true); + } + vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], + dev->vq_index + idx); + } + + /* Enable guest's vq vring */ + r = dev->vhost_ops->vhost_dev_start(dev, true); + assert(r == 0); + + for (idx = 0; idx < dev->nvqs; ++idx) { vhost_shadow_vq_free(dev->shadow_vqs[idx]); } @@ -1236,9 +1274,64 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) return 0; } +/* + * Start shadow virtqueue in a given queue. + * In failure case, this function leaves queue working as regular vhost mode. + */ +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, + unsigned idx) +{ + struct vhost_vring_addr addr = { + .index = idx, + }; + struct vhost_vring_state s = { + .index = idx, + }; + int r; + bool ok; + + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); + if (unlikely(!ok)) { + return false; + } + + /* From this point, vhost_virtqueue_start can reset these changes */ + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); + if (unlikely(r != 0)) { + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); + goto err; + } + + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); + if (unlikely(r != 0)) { + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); + goto err; + } + + if (vhost_dev_has_iommu(dev) && dev->vhost_ops->vhost_set_iotlb_callback) { + /* + * Update used ring information for IOTLB to work correctly, + * vhost-kernel code requires for this. + */ + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); + if (unlikely(r != 0)) { + /* Debug message already printed */ + goto err; + } + } + + return true; + +err: + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); + return false; +} + static int vhost_sw_live_migration_start(struct vhost_dev *dev) { - int idx, stop_idx; + int r, idx, stop_idx; dev->shadow_vqs = g_new0(VhostShadowVirtqueue *, dev->nvqs); for (idx = 0; idx < dev->nvqs; ++idx) { @@ -1248,23 +1341,37 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) } } + r = dev->vhost_ops->vhost_vring_pause(dev); + assert(r == 0); + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { + error_report("Fail to invalidate device iotlb"); + } + + /* Can be read by vhost_virtqueue_mask, from vm exit */ dev->shadow_vqs_enabled = true; for (idx = 0; idx < dev->nvqs; ++idx) { - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); + bool ok = vhost_sw_live_migration_start_vq(dev, idx); if (unlikely(!ok)) { goto err_start; } } + /* Enable shadow vq vring */ + r = dev->vhost_ops->vhost_dev_start(dev, true); + assert(r == 0); return 0; err_start: dev->shadow_vqs_enabled = false; for (stop_idx = 0; stop_idx < idx; stop_idx++) { vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], + dev->vq_index + stop_idx); } err_new: + /* Enable guest's vring */ + dev->vhost_ops->vhost_set_vring_enable(dev, true); for (idx = 0; idx < dev->nvqs; ++idx) { vhost_shadow_vq_free(dev->shadow_vqs[idx]); } @@ -1979,6 +2086,27 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) if (!hdev->started) { err_cause = "Device is not started"; + } else if (!vhost_dev_has_iommu(hdev)) { + err_cause = "Does not support iommu"; + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + err_cause = "Is packed"; + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { + err_cause = "Have event idx"; + } else if (hdev->acked_features & + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { + err_cause = "Supports indirect descriptors"; + } else if (!hdev->vhost_ops->vhost_vring_pause || + !hdev->vhost_ops->vhost_dev_start) { + err_cause = "Cannot pause device"; + } else if (hdev->vhost_ops->vhost_get_iova_range) { + err_cause = "Device may not support all iova range"; + } else if (hdev->vhost_ops->vhost_enable_custom_iommu) { + err_cause = "Device does not use regular IOMMU"; + } else if (!virtio_vdev_has_feature(hdev->vdev, VIRTIO_F_VERSION_1)) { + err_cause = "Legacy VirtIO device"; + } + + if (err_cause) { goto err; }
Initial version of shadow virtqueue that actually forward buffers. The exposed addresses are the qemu's virtual address, so devices with IOMMU that does not allow full mapping of qemu's address space does not work at the moment. Also for simplicity it only supports modern devices, that expects vring in little endian, with split ring and no event idx or indirect descriptors. It reuses the VirtQueue code for the device part. The driver part is based on Linux's virtio_ring driver, but with stripped functionality and optimizations so it's easier to review. Later commits will solve some of these concerns. Code also need to map used ring (device part) as RW in, and only in, vhost-net. To map (or call vhost_device_iotlb_miss) inconditionally would print an error in case of vhost devices with its own mapping (vdpa). To know if this call is needed, vhost_sw_live_migration_start_vq and vhost_sw_live_migration_stop copy the test performed in vhost_dev_start. Testing for the actual backend type could be cleaner, or checking for non-NULL vhost_force_iommu, enable_custom_iommu, or another vhostOp. We could extract this test in its own function too, so its name could give a better hint. Just copy the vhost_dev_start check at the moment. Signed-off-by: Eugenio Pérez <eperezma@redhat.com> --- hw/virtio/vhost-shadow-virtqueue.c | 205 +++++++++++++++++++++++++++-- hw/virtio/vhost.c | 134 ++++++++++++++++++- 2 files changed, 325 insertions(+), 14 deletions(-)