diff mbox series

[3/6] vhost: Do not reset suspended devices on stop

Message ID 20230711155230.64277-4-hreitz@redhat.com (mailing list archive)
State New, archived
Headers show
Series vhost-user: Add suspend/resume | expand

Commit Message

Hanna Czenczek July 11, 2023, 3:52 p.m. UTC
Move the `suspended` field from vhost_vdpa into the global vhost_dev
struct, so vhost_dev_stop() can check whether the back-end has been
suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
there is no need to reset it; the reset is just a fall-back to stop
device operations for back-ends that do not support suspend.

Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
when the device is re-started, we still have to do the reset to have it
un-suspend.

Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
---
 include/hw/virtio/vhost-vdpa.h |  2 --
 include/hw/virtio/vhost.h      |  8 ++++++++
 hw/virtio/vhost-vdpa.c         | 11 +++++++----
 hw/virtio/vhost.c              |  8 +++++++-
 4 files changed, 22 insertions(+), 7 deletions(-)

Comments

Stefan Hajnoczi July 18, 2023, 2:33 p.m. UTC | #1
On Tue, Jul 11, 2023 at 05:52:25PM +0200, Hanna Czenczek wrote:
> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> struct, so vhost_dev_stop() can check whether the back-end has been
> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> there is no need to reset it; the reset is just a fall-back to stop
> device operations for back-ends that do not support suspend.
> 
> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> when the device is re-started, we still have to do the reset to have it
> un-suspend.
> 
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  include/hw/virtio/vhost-vdpa.h |  2 --
>  include/hw/virtio/vhost.h      |  8 ++++++++
>  hw/virtio/vhost-vdpa.c         | 11 +++++++----
>  hw/virtio/vhost.c              |  8 +++++++-
>  4 files changed, 22 insertions(+), 7 deletions(-)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Eugenio Perez Martin July 21, 2023, 3:25 p.m. UTC | #2
On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>
> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> struct, so vhost_dev_stop() can check whether the back-end has been
> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> there is no need to reset it; the reset is just a fall-back to stop
> device operations for back-ends that do not support suspend.
>
> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> when the device is re-started, we still have to do the reset to have it
> un-suspend.
>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  include/hw/virtio/vhost-vdpa.h |  2 --
>  include/hw/virtio/vhost.h      |  8 ++++++++
>  hw/virtio/vhost-vdpa.c         | 11 +++++++----
>  hw/virtio/vhost.c              |  8 +++++++-
>  4 files changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> index e64bfc7f98..72c3686b7f 100644
> --- a/include/hw/virtio/vhost-vdpa.h
> +++ b/include/hw/virtio/vhost-vdpa.h
> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
>      bool shadow_vqs_enabled;
>      /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
>      bool shadow_data;
> -    /* Device suspended successfully */
> -    bool suspended;
>      /* IOVA mapping used by the Shadow Virtqueue */
>      VhostIOVATree *iova_tree;
>      GPtrArray *shadow_vqs;
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 6a173cb9fa..69bf59d630 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -120,6 +120,14 @@ struct vhost_dev {
>      uint64_t backend_cap;
>      /* @started: is the vhost device started? */
>      bool started;
> +    /**
> +     * @suspended: Whether the vhost device is currently suspended.  Set
> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
> +     * are supposed to automatically suspend/resume in their
> +     * vhost_dev_start handlers as required.  Must also be cleared when
> +     * the device is reset.
> +     */
> +    bool suspended;
>      bool log_enabled;
>      uint64_t log_size;
>      Error *migration_blocker;
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 7b7dee468e..f7fd19a203 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
>
>  static int vhost_vdpa_reset_device(struct vhost_dev *dev)
>  {
> -    struct vhost_vdpa *v = dev->opaque;
>      int ret;
>      uint8_t status = 0;
>
>      ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
>      trace_vhost_vdpa_reset_device(dev);
> -    v->suspended = false;
> +    dev->suspended = false;
>      return ret;
>  }
>
> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
>          if (unlikely(r)) {
>              error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
>          } else {
> -            v->suspended = true;
> +            dev->suspended = true;
>              return;
>          }
>      }
> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>              return -1;
>          }
>          vhost_vdpa_set_vring_ready(dev);
> +        if (dev->suspended) {
> +            /* TODO: When RESUME is available, use it instead of resetting */
> +            vhost_vdpa_reset_status(dev);

How is that we reset the status at each vhost_vdpa_dev_start? That
will clean all the vqs configured, features negotiated, etc. in the
vDPA device. Or am I missing something?

I'm totally ok with the move of "suspended" member.

Thanks!


> +        }
>      } else {
>          vhost_vdpa_suspend(dev);
>          vhost_vdpa_svqs_stop(dev);
> @@ -1400,7 +1403,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
>          return 0;
>      }
>
> -    if (!v->suspended) {
> +    if (!dev->suspended) {
>          /*
>           * Cannot trust in value returned by device, let vhost recover used
>           * idx from guest.
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index abf0d03c8d..2e28e58da7 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -2059,7 +2059,13 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
>                               hdev->vqs + i,
>                               hdev->vq_index + i);
>      }
> -    if (hdev->vhost_ops->vhost_reset_status) {
> +
> +    /*
> +     * If we failed to successfully stop the device via SUSPEND (should have
> +     * been attempted by `vhost_dev_start(hdev, false)`), reset it to stop it.
> +     * Stateful devices where this would be problematic must implement SUSPEND.
> +     */
> +    if (!hdev->suspended && hdev->vhost_ops->vhost_reset_status) {
>          hdev->vhost_ops->vhost_reset_status(hdev);
>      }
>
> --
> 2.41.0
>
Hanna Czenczek July 21, 2023, 4:07 p.m. UTC | #3
On 21.07.23 17:25, Eugenio Perez Martin wrote:
> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
>> struct, so vhost_dev_stop() can check whether the back-end has been
>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
>> there is no need to reset it; the reset is just a fall-back to stop
>> device operations for back-ends that do not support suspend.
>>
>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
>> when the device is re-started, we still have to do the reset to have it
>> un-suspend.
>>
>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>> ---
>>   include/hw/virtio/vhost-vdpa.h |  2 --
>>   include/hw/virtio/vhost.h      |  8 ++++++++
>>   hw/virtio/vhost-vdpa.c         | 11 +++++++----
>>   hw/virtio/vhost.c              |  8 +++++++-
>>   4 files changed, 22 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
>> index e64bfc7f98..72c3686b7f 100644
>> --- a/include/hw/virtio/vhost-vdpa.h
>> +++ b/include/hw/virtio/vhost-vdpa.h
>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
>>       bool shadow_vqs_enabled;
>>       /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
>>       bool shadow_data;
>> -    /* Device suspended successfully */
>> -    bool suspended;
>>       /* IOVA mapping used by the Shadow Virtqueue */
>>       VhostIOVATree *iova_tree;
>>       GPtrArray *shadow_vqs;
>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>> index 6a173cb9fa..69bf59d630 100644
>> --- a/include/hw/virtio/vhost.h
>> +++ b/include/hw/virtio/vhost.h
>> @@ -120,6 +120,14 @@ struct vhost_dev {
>>       uint64_t backend_cap;
>>       /* @started: is the vhost device started? */
>>       bool started;
>> +    /**
>> +     * @suspended: Whether the vhost device is currently suspended.  Set
>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
>> +     * are supposed to automatically suspend/resume in their
>> +     * vhost_dev_start handlers as required.  Must also be cleared when
>> +     * the device is reset.
>> +     */
>> +    bool suspended;
>>       bool log_enabled;
>>       uint64_t log_size;
>>       Error *migration_blocker;
>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>> index 7b7dee468e..f7fd19a203 100644
>> --- a/hw/virtio/vhost-vdpa.c
>> +++ b/hw/virtio/vhost-vdpa.c
>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
>>
>>   static int vhost_vdpa_reset_device(struct vhost_dev *dev)
>>   {
>> -    struct vhost_vdpa *v = dev->opaque;
>>       int ret;
>>       uint8_t status = 0;
>>
>>       ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
>>       trace_vhost_vdpa_reset_device(dev);
>> -    v->suspended = false;
>> +    dev->suspended = false;
>>       return ret;
>>   }
>>
>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
>>           if (unlikely(r)) {
>>               error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
>>           } else {
>> -            v->suspended = true;
>> +            dev->suspended = true;
>>               return;
>>           }
>>       }
>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>>               return -1;
>>           }
>>           vhost_vdpa_set_vring_ready(dev);
>> +        if (dev->suspended) {
>> +            /* TODO: When RESUME is available, use it instead of resetting */
>> +            vhost_vdpa_reset_status(dev);
> How is that we reset the status at each vhost_vdpa_dev_start? That
> will clean all the vqs configured, features negotiated, etc. in the
> vDPA device. Or am I missing something?

What alternative do you propose?  We don’t have RESUME for vDPA in qemu, 
but we somehow need to lift the previous SUSPEND so the device will 
again respond to guest requests, do we not?

But more generally, is this any different from what is done before this 
patch?  Before this patch, vhost_dev_stop() unconditionally invokes 
vhost_reset_status(), so the device is reset in every stop/start cycle, 
that doesn’t change.  And we still won’t reset it on the first 
vhost_dev_start(), because dev->suspended will be false then, only on 
subsequent stop/start cycles, as before.  So the only difference is that 
now the device is reset on start, not on stop.

Hanna
Eugenio Perez Martin July 24, 2023, 3:48 p.m. UTC | #4
On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>
> On 21.07.23 17:25, Eugenio Perez Martin wrote:
> > On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> >> struct, so vhost_dev_stop() can check whether the back-end has been
> >> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> >> there is no need to reset it; the reset is just a fall-back to stop
> >> device operations for back-ends that do not support suspend.
> >>
> >> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> >> when the device is re-started, we still have to do the reset to have it
> >> un-suspend.
> >>
> >> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> >> ---
> >>   include/hw/virtio/vhost-vdpa.h |  2 --
> >>   include/hw/virtio/vhost.h      |  8 ++++++++
> >>   hw/virtio/vhost-vdpa.c         | 11 +++++++----
> >>   hw/virtio/vhost.c              |  8 +++++++-
> >>   4 files changed, 22 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> >> index e64bfc7f98..72c3686b7f 100644
> >> --- a/include/hw/virtio/vhost-vdpa.h
> >> +++ b/include/hw/virtio/vhost-vdpa.h
> >> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
> >>       bool shadow_vqs_enabled;
> >>       /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
> >>       bool shadow_data;
> >> -    /* Device suspended successfully */
> >> -    bool suspended;
> >>       /* IOVA mapping used by the Shadow Virtqueue */
> >>       VhostIOVATree *iova_tree;
> >>       GPtrArray *shadow_vqs;
> >> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >> index 6a173cb9fa..69bf59d630 100644
> >> --- a/include/hw/virtio/vhost.h
> >> +++ b/include/hw/virtio/vhost.h
> >> @@ -120,6 +120,14 @@ struct vhost_dev {
> >>       uint64_t backend_cap;
> >>       /* @started: is the vhost device started? */
> >>       bool started;
> >> +    /**
> >> +     * @suspended: Whether the vhost device is currently suspended.  Set
> >> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
> >> +     * are supposed to automatically suspend/resume in their
> >> +     * vhost_dev_start handlers as required.  Must also be cleared when
> >> +     * the device is reset.
> >> +     */
> >> +    bool suspended;
> >>       bool log_enabled;
> >>       uint64_t log_size;
> >>       Error *migration_blocker;
> >> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >> index 7b7dee468e..f7fd19a203 100644
> >> --- a/hw/virtio/vhost-vdpa.c
> >> +++ b/hw/virtio/vhost-vdpa.c
> >> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
> >>
> >>   static int vhost_vdpa_reset_device(struct vhost_dev *dev)
> >>   {
> >> -    struct vhost_vdpa *v = dev->opaque;
> >>       int ret;
> >>       uint8_t status = 0;
> >>
> >>       ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
> >>       trace_vhost_vdpa_reset_device(dev);
> >> -    v->suspended = false;
> >> +    dev->suspended = false;
> >>       return ret;
> >>   }
> >>
> >> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
> >>           if (unlikely(r)) {
> >>               error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
> >>           } else {
> >> -            v->suspended = true;
> >> +            dev->suspended = true;
> >>               return;
> >>           }
> >>       }
> >> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> >>               return -1;
> >>           }
> >>           vhost_vdpa_set_vring_ready(dev);
> >> +        if (dev->suspended) {
> >> +            /* TODO: When RESUME is available, use it instead of resetting */
> >> +            vhost_vdpa_reset_status(dev);
> > How is that we reset the status at each vhost_vdpa_dev_start? That
> > will clean all the vqs configured, features negotiated, etc. in the
> > vDPA device. Or am I missing something?
>
> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
> but we somehow need to lift the previous SUSPEND so the device will
> again respond to guest requests, do we not?
>

Reset also clears the suspend state in vDPA, and it should be called
at vhost_dev_stop. So the device should never be in suspended state
here. Does that solve your concerns?

> But more generally, is this any different from what is done before this
> patch?  Before this patch, vhost_dev_stop() unconditionally invokes
> vhost_reset_status(), so the device is reset in every stop/start cycle,
> that doesn’t change.  And we still won’t reset it on the first
> vhost_dev_start(), because dev->suspended will be false then, only on
> subsequent stop/start cycles, as before.  So the only difference is that
> now the device is reset on start, not on stop.
>

The difference is that vhost_vdpa_dev_start is called after features
ack (via vhost_dev_start, through vhost_dev_set_features call) and vq
configuration (using vhost_virtqueue_start). A device reset forces the
device to forget about all of that, and qemu cannot configure them
again until qemu acks the features again.
Hanna Czenczek July 25, 2023, 7:53 a.m. UTC | #5
On 24.07.23 17:48, Eugenio Perez Martin wrote:
> On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>> On 21.07.23 17:25, Eugenio Perez Martin wrote:
>>> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>>>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
>>>> struct, so vhost_dev_stop() can check whether the back-end has been
>>>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
>>>> there is no need to reset it; the reset is just a fall-back to stop
>>>> device operations for back-ends that do not support suspend.
>>>>
>>>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
>>>> when the device is re-started, we still have to do the reset to have it
>>>> un-suspend.
>>>>
>>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>>>> ---
>>>>    include/hw/virtio/vhost-vdpa.h |  2 --
>>>>    include/hw/virtio/vhost.h      |  8 ++++++++
>>>>    hw/virtio/vhost-vdpa.c         | 11 +++++++----
>>>>    hw/virtio/vhost.c              |  8 +++++++-
>>>>    4 files changed, 22 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
>>>> index e64bfc7f98..72c3686b7f 100644
>>>> --- a/include/hw/virtio/vhost-vdpa.h
>>>> +++ b/include/hw/virtio/vhost-vdpa.h
>>>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
>>>>        bool shadow_vqs_enabled;
>>>>        /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
>>>>        bool shadow_data;
>>>> -    /* Device suspended successfully */
>>>> -    bool suspended;
>>>>        /* IOVA mapping used by the Shadow Virtqueue */
>>>>        VhostIOVATree *iova_tree;
>>>>        GPtrArray *shadow_vqs;
>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>> index 6a173cb9fa..69bf59d630 100644
>>>> --- a/include/hw/virtio/vhost.h
>>>> +++ b/include/hw/virtio/vhost.h
>>>> @@ -120,6 +120,14 @@ struct vhost_dev {
>>>>        uint64_t backend_cap;
>>>>        /* @started: is the vhost device started? */
>>>>        bool started;
>>>> +    /**
>>>> +     * @suspended: Whether the vhost device is currently suspended.  Set
>>>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
>>>> +     * are supposed to automatically suspend/resume in their
>>>> +     * vhost_dev_start handlers as required.  Must also be cleared when
>>>> +     * the device is reset.
>>>> +     */
>>>> +    bool suspended;
>>>>        bool log_enabled;
>>>>        uint64_t log_size;
>>>>        Error *migration_blocker;
>>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>>>> index 7b7dee468e..f7fd19a203 100644
>>>> --- a/hw/virtio/vhost-vdpa.c
>>>> +++ b/hw/virtio/vhost-vdpa.c
>>>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
>>>>
>>>>    static int vhost_vdpa_reset_device(struct vhost_dev *dev)
>>>>    {
>>>> -    struct vhost_vdpa *v = dev->opaque;
>>>>        int ret;
>>>>        uint8_t status = 0;
>>>>
>>>>        ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
>>>>        trace_vhost_vdpa_reset_device(dev);
>>>> -    v->suspended = false;
>>>> +    dev->suspended = false;
>>>>        return ret;
>>>>    }
>>>>
>>>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
>>>>            if (unlikely(r)) {
>>>>                error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
>>>>            } else {
>>>> -            v->suspended = true;
>>>> +            dev->suspended = true;
>>>>                return;
>>>>            }
>>>>        }
>>>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>>>>                return -1;
>>>>            }
>>>>            vhost_vdpa_set_vring_ready(dev);
>>>> +        if (dev->suspended) {
>>>> +            /* TODO: When RESUME is available, use it instead of resetting */
>>>> +            vhost_vdpa_reset_status(dev);
>>> How is that we reset the status at each vhost_vdpa_dev_start? That
>>> will clean all the vqs configured, features negotiated, etc. in the
>>> vDPA device. Or am I missing something?
>> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
>> but we somehow need to lift the previous SUSPEND so the device will
>> again respond to guest requests, do we not?
>>
> Reset also clears the suspend state in vDPA, and it should be called
> at vhost_dev_stop. So the device should never be in suspended state
> here. Does that solve your concerns?

My intention with this patch was precisely not to reset in 
vhost_dev_stop when suspending is supported.  So now I’m more confused 
than before.

>> But more generally, is this any different from what is done before this
>> patch?  Before this patch, vhost_dev_stop() unconditionally invokes
>> vhost_reset_status(), so the device is reset in every stop/start cycle,
>> that doesn’t change.  And we still won’t reset it on the first
>> vhost_dev_start(), because dev->suspended will be false then, only on
>> subsequent stop/start cycles, as before.  So the only difference is that
>> now the device is reset on start, not on stop.
>>
> The difference is that vhost_vdpa_dev_start is called after features
> ack (via vhost_dev_start, through vhost_dev_set_features call) and vq
> configuration (using vhost_virtqueue_start). A device reset forces the
> device to forget about all of that, and qemu cannot configure them
> again until qemu acks the features again.

Now I’m completely confused, because I don’t see the point of 
implementing suspend at all if we rely on a reset immediately afterwards 
anyway.  It was my impression this whole time that suspending would 
remove the need to reset.  Well, at least until the device should be 
resumed again, i.e. in vhost_dev_start().

In addition, I also don’t understand the magnitude of the problem with 
ordering.  If the order in vhost_dev_start() is wrong, can we not easily 
fix it?  E.g. add a full vhost_dev_resume callback to invoke right at 
the start of vhost_dev_start(); or check (in the same place) whether the 
back-end supports resuming, and if it doesn’t (and it is currently 
suspended), reset it there.

In any case, if we need to reset in vhost_dev_stop(), i.e. immediately 
after suspend, I don’t see the point of suspending, indicating to me 
that I still fail to understand its purpose.

Hanna
Eugenio Perez Martin July 25, 2023, 10:03 a.m. UTC | #6
On Tue, Jul 25, 2023 at 9:53 AM Hanna Czenczek <hreitz@redhat.com> wrote:
>
> On 24.07.23 17:48, Eugenio Perez Martin wrote:
> > On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >> On 21.07.23 17:25, Eugenio Perez Martin wrote:
> >>> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >>>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> >>>> struct, so vhost_dev_stop() can check whether the back-end has been
> >>>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> >>>> there is no need to reset it; the reset is just a fall-back to stop
> >>>> device operations for back-ends that do not support suspend.
> >>>>
> >>>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> >>>> when the device is re-started, we still have to do the reset to have it
> >>>> un-suspend.
> >>>>
> >>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> >>>> ---
> >>>>    include/hw/virtio/vhost-vdpa.h |  2 --
> >>>>    include/hw/virtio/vhost.h      |  8 ++++++++
> >>>>    hw/virtio/vhost-vdpa.c         | 11 +++++++----
> >>>>    hw/virtio/vhost.c              |  8 +++++++-
> >>>>    4 files changed, 22 insertions(+), 7 deletions(-)
> >>>>
> >>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> >>>> index e64bfc7f98..72c3686b7f 100644
> >>>> --- a/include/hw/virtio/vhost-vdpa.h
> >>>> +++ b/include/hw/virtio/vhost-vdpa.h
> >>>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
> >>>>        bool shadow_vqs_enabled;
> >>>>        /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
> >>>>        bool shadow_data;
> >>>> -    /* Device suspended successfully */
> >>>> -    bool suspended;
> >>>>        /* IOVA mapping used by the Shadow Virtqueue */
> >>>>        VhostIOVATree *iova_tree;
> >>>>        GPtrArray *shadow_vqs;
> >>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >>>> index 6a173cb9fa..69bf59d630 100644
> >>>> --- a/include/hw/virtio/vhost.h
> >>>> +++ b/include/hw/virtio/vhost.h
> >>>> @@ -120,6 +120,14 @@ struct vhost_dev {
> >>>>        uint64_t backend_cap;
> >>>>        /* @started: is the vhost device started? */
> >>>>        bool started;
> >>>> +    /**
> >>>> +     * @suspended: Whether the vhost device is currently suspended.  Set
> >>>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
> >>>> +     * are supposed to automatically suspend/resume in their
> >>>> +     * vhost_dev_start handlers as required.  Must also be cleared when
> >>>> +     * the device is reset.
> >>>> +     */
> >>>> +    bool suspended;
> >>>>        bool log_enabled;
> >>>>        uint64_t log_size;
> >>>>        Error *migration_blocker;
> >>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >>>> index 7b7dee468e..f7fd19a203 100644
> >>>> --- a/hw/virtio/vhost-vdpa.c
> >>>> +++ b/hw/virtio/vhost-vdpa.c
> >>>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
> >>>>
> >>>>    static int vhost_vdpa_reset_device(struct vhost_dev *dev)
> >>>>    {
> >>>> -    struct vhost_vdpa *v = dev->opaque;
> >>>>        int ret;
> >>>>        uint8_t status = 0;
> >>>>
> >>>>        ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
> >>>>        trace_vhost_vdpa_reset_device(dev);
> >>>> -    v->suspended = false;
> >>>> +    dev->suspended = false;
> >>>>        return ret;
> >>>>    }
> >>>>
> >>>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
> >>>>            if (unlikely(r)) {
> >>>>                error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
> >>>>            } else {
> >>>> -            v->suspended = true;
> >>>> +            dev->suspended = true;
> >>>>                return;
> >>>>            }
> >>>>        }
> >>>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> >>>>                return -1;
> >>>>            }
> >>>>            vhost_vdpa_set_vring_ready(dev);
> >>>> +        if (dev->suspended) {
> >>>> +            /* TODO: When RESUME is available, use it instead of resetting */
> >>>> +            vhost_vdpa_reset_status(dev);
> >>> How is that we reset the status at each vhost_vdpa_dev_start? That
> >>> will clean all the vqs configured, features negotiated, etc. in the
> >>> vDPA device. Or am I missing something?
> >> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
> >> but we somehow need to lift the previous SUSPEND so the device will
> >> again respond to guest requests, do we not?
> >>
> > Reset also clears the suspend state in vDPA, and it should be called
> > at vhost_dev_stop. So the device should never be in suspended state
> > here. Does that solve your concerns?
>
> My intention with this patch was precisely not to reset in
> vhost_dev_stop when suspending is supported.  So now I’m more confused
> than before.
>

At this moment, I think that should be focused as an optimization and
not to be included in the main series.

> >> But more generally, is this any different from what is done before this
> >> patch?  Before this patch, vhost_dev_stop() unconditionally invokes
> >> vhost_reset_status(), so the device is reset in every stop/start cycle,
> >> that doesn’t change.  And we still won’t reset it on the first
> >> vhost_dev_start(), because dev->suspended will be false then, only on
> >> subsequent stop/start cycles, as before.  So the only difference is that
> >> now the device is reset on start, not on stop.
> >>
> > The difference is that vhost_vdpa_dev_start is called after features
> > ack (via vhost_dev_start, through vhost_dev_set_features call) and vq
> > configuration (using vhost_virtqueue_start). A device reset forces the
> > device to forget about all of that, and qemu cannot configure them
> > again until qemu acks the features again.
>
> Now I’m completely confused, because I don’t see the point of
> implementing suspend at all if we rely on a reset immediately afterwards
> anyway.

It's not immediate. From vhost_dev_stop, comments added only in this mail:

void vhost_virtqueue_stop(struct vhost_dev *dev,
                          struct VirtIODevice *vdev,
                          struct vhost_virtqueue *vq,
                          unsigned idx)
{
    ...
    // Get each vring indexes, trusting the destination device can
continue safely from there
    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
        /* Connection to the backend is broken, so let's sync internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
    } else {
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    }
    ...
}

void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    ...
    // Suspend the device, so we can trust in vring indexes / vq state
    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }

    // Fetch each vq index / state and store in vdev->vq[i]
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

    // Reset the device, as we don't need it anymore and it can
release the resources
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }
}
---

>  It was my impression this whole time that suspending would
> remove the need to reset.  Well, at least until the device should be
> resumed again, i.e. in vhost_dev_start().
>

It cannot. vhost_dev_stop is also called when the guest reset the
device, so the guest trusts the device to be in a clean state.

Also, the reset is the moment when the device frees the unused
resources. This would mandate the device to

> In addition, I also don’t understand the magnitude of the problem with
> ordering.  If the order in vhost_dev_start() is wrong, can we not easily
> fix it?

The order in vhost_dev_start follows the VirtIO standard. The move of
the reset should be to remove it from vhost_dev_stop to something like
both virtio_reset and the end of virtio_save. I'm not sure if I'm
forgetting some other use cases.

> E.g. add a full vhost_dev_resume callback to invoke right at
> the start of vhost_dev_start(); or check (in the same place) whether the
> back-end supports resuming, and if it doesn’t (and it is currently
> suspended), reset it there.
>
> In any case, if we need to reset in vhost_dev_stop(), i.e. immediately
> after suspend, I don’t see the point of suspending, indicating to me
> that I still fail to understand its purpose.
>
> Hanna
>
Hanna Czenczek July 25, 2023, 1:09 p.m. UTC | #7
On 25.07.23 12:03, Eugenio Perez Martin wrote:
> On Tue, Jul 25, 2023 at 9:53 AM Hanna Czenczek <hreitz@redhat.com> wrote:
>> On 24.07.23 17:48, Eugenio Perez Martin wrote:
>>> On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>>>> On 21.07.23 17:25, Eugenio Perez Martin wrote:
>>>>> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>>>>>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
>>>>>> struct, so vhost_dev_stop() can check whether the back-end has been
>>>>>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
>>>>>> there is no need to reset it; the reset is just a fall-back to stop
>>>>>> device operations for back-ends that do not support suspend.
>>>>>>
>>>>>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
>>>>>> when the device is re-started, we still have to do the reset to have it
>>>>>> un-suspend.
>>>>>>
>>>>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>>>>>> ---
>>>>>>     include/hw/virtio/vhost-vdpa.h |  2 --
>>>>>>     include/hw/virtio/vhost.h      |  8 ++++++++
>>>>>>     hw/virtio/vhost-vdpa.c         | 11 +++++++----
>>>>>>     hw/virtio/vhost.c              |  8 +++++++-
>>>>>>     4 files changed, 22 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
>>>>>> index e64bfc7f98..72c3686b7f 100644
>>>>>> --- a/include/hw/virtio/vhost-vdpa.h
>>>>>> +++ b/include/hw/virtio/vhost-vdpa.h
>>>>>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
>>>>>>         bool shadow_vqs_enabled;
>>>>>>         /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
>>>>>>         bool shadow_data;
>>>>>> -    /* Device suspended successfully */
>>>>>> -    bool suspended;
>>>>>>         /* IOVA mapping used by the Shadow Virtqueue */
>>>>>>         VhostIOVATree *iova_tree;
>>>>>>         GPtrArray *shadow_vqs;
>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>>>> index 6a173cb9fa..69bf59d630 100644
>>>>>> --- a/include/hw/virtio/vhost.h
>>>>>> +++ b/include/hw/virtio/vhost.h
>>>>>> @@ -120,6 +120,14 @@ struct vhost_dev {
>>>>>>         uint64_t backend_cap;
>>>>>>         /* @started: is the vhost device started? */
>>>>>>         bool started;
>>>>>> +    /**
>>>>>> +     * @suspended: Whether the vhost device is currently suspended.  Set
>>>>>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
>>>>>> +     * are supposed to automatically suspend/resume in their
>>>>>> +     * vhost_dev_start handlers as required.  Must also be cleared when
>>>>>> +     * the device is reset.
>>>>>> +     */
>>>>>> +    bool suspended;
>>>>>>         bool log_enabled;
>>>>>>         uint64_t log_size;
>>>>>>         Error *migration_blocker;
>>>>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>>>>>> index 7b7dee468e..f7fd19a203 100644
>>>>>> --- a/hw/virtio/vhost-vdpa.c
>>>>>> +++ b/hw/virtio/vhost-vdpa.c
>>>>>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
>>>>>>
>>>>>>     static int vhost_vdpa_reset_device(struct vhost_dev *dev)
>>>>>>     {
>>>>>> -    struct vhost_vdpa *v = dev->opaque;
>>>>>>         int ret;
>>>>>>         uint8_t status = 0;
>>>>>>
>>>>>>         ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
>>>>>>         trace_vhost_vdpa_reset_device(dev);
>>>>>> -    v->suspended = false;
>>>>>> +    dev->suspended = false;
>>>>>>         return ret;
>>>>>>     }
>>>>>>
>>>>>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
>>>>>>             if (unlikely(r)) {
>>>>>>                 error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
>>>>>>             } else {
>>>>>> -            v->suspended = true;
>>>>>> +            dev->suspended = true;
>>>>>>                 return;
>>>>>>             }
>>>>>>         }
>>>>>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>>>>>>                 return -1;
>>>>>>             }
>>>>>>             vhost_vdpa_set_vring_ready(dev);
>>>>>> +        if (dev->suspended) {
>>>>>> +            /* TODO: When RESUME is available, use it instead of resetting */
>>>>>> +            vhost_vdpa_reset_status(dev);
>>>>> How is that we reset the status at each vhost_vdpa_dev_start? That
>>>>> will clean all the vqs configured, features negotiated, etc. in the
>>>>> vDPA device. Or am I missing something?
>>>> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
>>>> but we somehow need to lift the previous SUSPEND so the device will
>>>> again respond to guest requests, do we not?
>>>>
>>> Reset also clears the suspend state in vDPA, and it should be called
>>> at vhost_dev_stop. So the device should never be in suspended state
>>> here. Does that solve your concerns?
>> My intention with this patch was precisely not to reset in
>> vhost_dev_stop when suspending is supported.  So now I’m more confused
>> than before.
>>
> At this moment, I think that should be focused as an optimization and
> not to be included in the main series.

It is absolutely not an optimization but vital for my use case. 
virtiofsd does not currently implement resetting, but if it did (and we 
want this support in the future), resetting it during stop/cont would be 
catastrophic.  There must be a way to have qemu not issue a reset.  This 
patch is the reason why this series exists.

>>>> But more generally, is this any different from what is done before this
>>>> patch?  Before this patch, vhost_dev_stop() unconditionally invokes
>>>> vhost_reset_status(), so the device is reset in every stop/start cycle,
>>>> that doesn’t change.  And we still won’t reset it on the first
>>>> vhost_dev_start(), because dev->suspended will be false then, only on
>>>> subsequent stop/start cycles, as before.  So the only difference is that
>>>> now the device is reset on start, not on stop.
>>>>
>>> The difference is that vhost_vdpa_dev_start is called after features
>>> ack (via vhost_dev_start, through vhost_dev_set_features call) and vq
>>> configuration (using vhost_virtqueue_start). A device reset forces the
>>> device to forget about all of that, and qemu cannot configure them
>>> again until qemu acks the features again.
>> Now I’m completely confused, because I don’t see the point of
>> implementing suspend at all if we rely on a reset immediately afterwards
>> anyway.
> It's not immediate. From vhost_dev_stop, comments added only in this mail:
>
> void vhost_virtqueue_stop(struct vhost_dev *dev,
>                            struct VirtIODevice *vdev,
>                            struct vhost_virtqueue *vq,
>                            unsigned idx)
> {
>      ...
>      // Get each vring indexes, trusting the destination device can
> continue safely from there
>      r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
>      if (r < 0) {
>          VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
>          /* Connection to the backend is broken, so let's sync internal
>           * last avail idx to the device used idx.
>           */
>          virtio_queue_restore_last_avail_idx(vdev, idx);
>      } else {
>          virtio_queue_set_last_avail_idx(vdev, idx, state.num);
>      }
>      ...
> }
>
> void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
> {
>      ...
>      // Suspend the device, so we can trust in vring indexes / vq state

I don’t understand this purpose.  GET_VRING_BASE stops the vring in 
question, so the vring index returned must be trustworthy, no?

>      if (hdev->vhost_ops->vhost_dev_start) {
>          hdev->vhost_ops->vhost_dev_start(hdev, false);
>      }
>      if (vrings) {
>          vhost_dev_set_vring_enable(hdev, false);
>      }
>
>      // Fetch each vq index / state and store in vdev->vq[i]
>      for (i = 0; i < hdev->nvqs; ++i) {
>          vhost_virtqueue_stop(hdev,
>                               vdev,
>                               hdev->vqs + i,
>                               hdev->vq_index + i);
>      }
>
>      // Reset the device, as we don't need it anymore and it can
> release the resources
>      if (hdev->vhost_ops->vhost_reset_status) {
>          hdev->vhost_ops->vhost_reset_status(hdev);
>      }
> }
> ---
>
>>   It was my impression this whole time that suspending would
>> remove the need to reset.  Well, at least until the device should be
>> resumed again, i.e. in vhost_dev_start().
>>
> It cannot. vhost_dev_stop is also called when the guest reset the
> device, so the guest trusts the device to be in a clean state.
>
> Also, the reset is the moment when the device frees the unused
> resources. This would mandate the device to

What resources are we talking about?  This function is called when the 
VM is paused (stop).  If a stateful device is reset to free “unused 
resources”, this means dropping its internal state, which is absolutely 
wrong in a stop/cont cycle.

>> In addition, I also don’t understand the magnitude of the problem with
>> ordering.  If the order in vhost_dev_start() is wrong, can we not easily
>> fix it?
> The order in vhost_dev_start follows the VirtIO standard.

What does the VirtIO standard say about suspended vhost back-ends?

Hanna

> The move of
> the reset should be to remove it from vhost_dev_stop to something like
> both virtio_reset and the end of virtio_save. I'm not sure if I'm
> forgetting some other use cases.
>
>> E.g. add a full vhost_dev_resume callback to invoke right at
>> the start of vhost_dev_start(); or check (in the same place) whether the
>> back-end supports resuming, and if it doesn’t (and it is currently
>> suspended), reset it there.
>>
>> In any case, if we need to reset in vhost_dev_stop(), i.e. immediately
>> after suspend, I don’t see the point of suspending, indicating to me
>> that I still fail to understand its purpose.
>>
>> Hanna
>>
Eugenio Perez Martin July 25, 2023, 6:53 p.m. UTC | #8
On Tue, Jul 25, 2023 at 3:09 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>
> On 25.07.23 12:03, Eugenio Perez Martin wrote:
> > On Tue, Jul 25, 2023 at 9:53 AM Hanna Czenczek <hreitz@redhat.com> wrote:
> >> On 24.07.23 17:48, Eugenio Perez Martin wrote:
> >>> On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >>>> On 21.07.23 17:25, Eugenio Perez Martin wrote:
> >>>>> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >>>>>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> >>>>>> struct, so vhost_dev_stop() can check whether the back-end has been
> >>>>>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> >>>>>> there is no need to reset it; the reset is just a fall-back to stop
> >>>>>> device operations for back-ends that do not support suspend.
> >>>>>>
> >>>>>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> >>>>>> when the device is re-started, we still have to do the reset to have it
> >>>>>> un-suspend.
> >>>>>>
> >>>>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> >>>>>> ---
> >>>>>>     include/hw/virtio/vhost-vdpa.h |  2 --
> >>>>>>     include/hw/virtio/vhost.h      |  8 ++++++++
> >>>>>>     hw/virtio/vhost-vdpa.c         | 11 +++++++----
> >>>>>>     hw/virtio/vhost.c              |  8 +++++++-
> >>>>>>     4 files changed, 22 insertions(+), 7 deletions(-)
> >>>>>>
> >>>>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> >>>>>> index e64bfc7f98..72c3686b7f 100644
> >>>>>> --- a/include/hw/virtio/vhost-vdpa.h
> >>>>>> +++ b/include/hw/virtio/vhost-vdpa.h
> >>>>>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
> >>>>>>         bool shadow_vqs_enabled;
> >>>>>>         /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
> >>>>>>         bool shadow_data;
> >>>>>> -    /* Device suspended successfully */
> >>>>>> -    bool suspended;
> >>>>>>         /* IOVA mapping used by the Shadow Virtqueue */
> >>>>>>         VhostIOVATree *iova_tree;
> >>>>>>         GPtrArray *shadow_vqs;
> >>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >>>>>> index 6a173cb9fa..69bf59d630 100644
> >>>>>> --- a/include/hw/virtio/vhost.h
> >>>>>> +++ b/include/hw/virtio/vhost.h
> >>>>>> @@ -120,6 +120,14 @@ struct vhost_dev {
> >>>>>>         uint64_t backend_cap;
> >>>>>>         /* @started: is the vhost device started? */
> >>>>>>         bool started;
> >>>>>> +    /**
> >>>>>> +     * @suspended: Whether the vhost device is currently suspended.  Set
> >>>>>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
> >>>>>> +     * are supposed to automatically suspend/resume in their
> >>>>>> +     * vhost_dev_start handlers as required.  Must also be cleared when
> >>>>>> +     * the device is reset.
> >>>>>> +     */
> >>>>>> +    bool suspended;
> >>>>>>         bool log_enabled;
> >>>>>>         uint64_t log_size;
> >>>>>>         Error *migration_blocker;
> >>>>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >>>>>> index 7b7dee468e..f7fd19a203 100644
> >>>>>> --- a/hw/virtio/vhost-vdpa.c
> >>>>>> +++ b/hw/virtio/vhost-vdpa.c
> >>>>>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
> >>>>>>
> >>>>>>     static int vhost_vdpa_reset_device(struct vhost_dev *dev)
> >>>>>>     {
> >>>>>> -    struct vhost_vdpa *v = dev->opaque;
> >>>>>>         int ret;
> >>>>>>         uint8_t status = 0;
> >>>>>>
> >>>>>>         ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
> >>>>>>         trace_vhost_vdpa_reset_device(dev);
> >>>>>> -    v->suspended = false;
> >>>>>> +    dev->suspended = false;
> >>>>>>         return ret;
> >>>>>>     }
> >>>>>>
> >>>>>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
> >>>>>>             if (unlikely(r)) {
> >>>>>>                 error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
> >>>>>>             } else {
> >>>>>> -            v->suspended = true;
> >>>>>> +            dev->suspended = true;
> >>>>>>                 return;
> >>>>>>             }
> >>>>>>         }
> >>>>>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> >>>>>>                 return -1;
> >>>>>>             }
> >>>>>>             vhost_vdpa_set_vring_ready(dev);
> >>>>>> +        if (dev->suspended) {
> >>>>>> +            /* TODO: When RESUME is available, use it instead of resetting */
> >>>>>> +            vhost_vdpa_reset_status(dev);
> >>>>> How is that we reset the status at each vhost_vdpa_dev_start? That
> >>>>> will clean all the vqs configured, features negotiated, etc. in the
> >>>>> vDPA device. Or am I missing something?
> >>>> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
> >>>> but we somehow need to lift the previous SUSPEND so the device will
> >>>> again respond to guest requests, do we not?
> >>>>
> >>> Reset also clears the suspend state in vDPA, and it should be called
> >>> at vhost_dev_stop. So the device should never be in suspended state
> >>> here. Does that solve your concerns?
> >> My intention with this patch was precisely not to reset in
> >> vhost_dev_stop when suspending is supported.  So now I’m more confused
> >> than before.
> >>
> > At this moment, I think that should be focused as an optimization and
> > not to be included in the main series.
>
> It is absolutely not an optimization but vital for my use case.
> virtiofsd does not currently implement resetting, but if it did (and we
> want this support in the future), resetting it during stop/cont would be
> catastrophic.  There must be a way to have qemu not issue a reset.  This
> patch is the reason why this series exists.
>

Sorry, I see I confused things with the first reply. Let me do a recap.

If I understand the problem correctly, your use case requires that
qemu does not reset the device before the device state is fetched with
virtio_save in the case of a migration. This is understandable and I
think we have a solution for that: to move the vhost_ops call to
virtio_reset and the end of virtio_save. To remove the reset call from
vhost_dev_stop is somehow mandatory, as it is called before
virtio_save.

But we cannot move to vhost_vdpa_dev_start, as proposed here. The reasons are:
* All the features, vq parameters, etc are set before any
vhost_vdpa_dev_start call. To reset at any vhost_vdpa_dev_start would
wipe them.
* The device needs to hold all the resources until it is reset. Things
like descriptor status etc.

And, regarding the comment "When RESUME is available, use it instead
of resetting", we cannot use resume to replace reset in all cases.
This is because the semantics are different.

For example, vhost_dev_stop and vhost_dev_start are also called when
the guest reset by itself the device. You can check it rmmoding and
modprobing virtio-net driver, for example. In this case, the driver
expects to find all vqs to start with 0, but the resume must not reset
these indexes.

It can be applied as an optimizations sometimes, but not for the general case.

> >>>> But more generally, is this any different from what is done before this
> >>>> patch?  Before this patch, vhost_dev_stop() unconditionally invokes
> >>>> vhost_reset_status(), so the device is reset in every stop/start cycle,
> >>>> that doesn’t change.  And we still won’t reset it on the first
> >>>> vhost_dev_start(), because dev->suspended will be false then, only on
> >>>> subsequent stop/start cycles, as before.  So the only difference is that
> >>>> now the device is reset on start, not on stop.
> >>>>
> >>> The difference is that vhost_vdpa_dev_start is called after features
> >>> ack (via vhost_dev_start, through vhost_dev_set_features call) and vq
> >>> configuration (using vhost_virtqueue_start). A device reset forces the
> >>> device to forget about all of that, and qemu cannot configure them
> >>> again until qemu acks the features again.
> >> Now I’m completely confused, because I don’t see the point of
> >> implementing suspend at all if we rely on a reset immediately afterwards
> >> anyway.
> > It's not immediate. From vhost_dev_stop, comments added only in this mail:
> >
> > void vhost_virtqueue_stop(struct vhost_dev *dev,
> >                            struct VirtIODevice *vdev,
> >                            struct vhost_virtqueue *vq,
> >                            unsigned idx)
> > {
> >      ...
> >      // Get each vring indexes, trusting the destination device can
> > continue safely from there
> >      r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
> >      if (r < 0) {
> >          VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
> >          /* Connection to the backend is broken, so let's sync internal
> >           * last avail idx to the device used idx.
> >           */
> >          virtio_queue_restore_last_avail_idx(vdev, idx);
> >      } else {
> >          virtio_queue_set_last_avail_idx(vdev, idx, state.num);
> >      }
> >      ...
> > }
> >
> > void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
> > {
> >      ...
> >      // Suspend the device, so we can trust in vring indexes / vq state
>
> I don’t understand this purpose.  GET_VRING_BASE stops the vring in
> question, so the vring index returned must be trustworthy, no?
>

That only happens in vhost-user, not in vhost-vdpa.

> >      if (hdev->vhost_ops->vhost_dev_start) {
> >          hdev->vhost_ops->vhost_dev_start(hdev, false);
> >      }
> >      if (vrings) {
> >          vhost_dev_set_vring_enable(hdev, false);
> >      }
> >
> >      // Fetch each vq index / state and store in vdev->vq[i]
> >      for (i = 0; i < hdev->nvqs; ++i) {
> >          vhost_virtqueue_stop(hdev,
> >                               vdev,
> >                               hdev->vqs + i,
> >                               hdev->vq_index + i);
> >      }
> >
> >      // Reset the device, as we don't need it anymore and it can
> > release the resources
> >      if (hdev->vhost_ops->vhost_reset_status) {
> >          hdev->vhost_ops->vhost_reset_status(hdev);
> >      }
> > }
> > ---
> >
> >>   It was my impression this whole time that suspending would
> >> remove the need to reset.  Well, at least until the device should be
> >> resumed again, i.e. in vhost_dev_start().
> >>
> > It cannot. vhost_dev_stop is also called when the guest reset the
> > device, so the guest trusts the device to be in a clean state.
> >
> > Also, the reset is the moment when the device frees the unused
> > resources. This would mandate the device to
>
> What resources are we talking about?  This function is called when the
> VM is paused (stop).  If a stateful device is reset to free “unused
> resources”, this means dropping its internal state, which is absolutely
> wrong in a stop/cont cycle.
>

But is the expected result in the virtio reset cycle. We need to split
these paths.

> >> In addition, I also don’t understand the magnitude of the problem with
> >> ordering.  If the order in vhost_dev_start() is wrong, can we not easily
> >> fix it?
> > The order in vhost_dev_start follows the VirtIO standard.
>
> What does the VirtIO standard say about suspended vhost back-ends?
>

Suspend does not exist in the VirtIO standard. I meant the device
initialization order in "3.1 Device Initialization":

1. Reset the device.
...
5. Set the FEATURES_OK status bit. [...]
...
7. Perform device-specific setup, including discovery of virtqueues
for the device, optional per-bus setup, reading and possibly writing
the device’s virtio configuration space, and population of virtqueues.
8.Set the DRIVER_OK status bit. At this point the device is “live”.

Steps 4-8 are all done in vhost_dev_start, in that particular order.
To call vhost_vdpa_reset_status from vhost_vdpa_dev_start(true) would
reset the device back to step 1, but there is no more code to set all
configuration from 2-7 before 8 (DRIVER_OK).

> Hanna
>
> > The move of
> > the reset should be to remove it from vhost_dev_stop to something like
> > both virtio_reset and the end of virtio_save. I'm not sure if I'm
> > forgetting some other use cases.
> >
> >> E.g. add a full vhost_dev_resume callback to invoke right at
> >> the start of vhost_dev_start(); or check (in the same place) whether the
> >> back-end supports resuming, and if it doesn’t (and it is currently
> >> suspended), reset it there.
> >>
> >> In any case, if we need to reset in vhost_dev_stop(), i.e. immediately
> >> after suspend, I don’t see the point of suspending, indicating to me
> >> that I still fail to understand its purpose.
> >>
> >> Hanna
> >>
>
Hanna Czenczek July 26, 2023, 6:57 a.m. UTC | #9
On 25.07.23 20:53, Eugenio Perez Martin wrote:
> On Tue, Jul 25, 2023 at 3:09 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>> On 25.07.23 12:03, Eugenio Perez Martin wrote:
>>> On Tue, Jul 25, 2023 at 9:53 AM Hanna Czenczek <hreitz@redhat.com> wrote:
>>>> On 24.07.23 17:48, Eugenio Perez Martin wrote:
>>>>> On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>>>>>> On 21.07.23 17:25, Eugenio Perez Martin wrote:
>>>>>>> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>>>>>>>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
>>>>>>>> struct, so vhost_dev_stop() can check whether the back-end has been
>>>>>>>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
>>>>>>>> there is no need to reset it; the reset is just a fall-back to stop
>>>>>>>> device operations for back-ends that do not support suspend.
>>>>>>>>
>>>>>>>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
>>>>>>>> when the device is re-started, we still have to do the reset to have it
>>>>>>>> un-suspend.
>>>>>>>>
>>>>>>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>>>>>>>> ---
>>>>>>>>      include/hw/virtio/vhost-vdpa.h |  2 --
>>>>>>>>      include/hw/virtio/vhost.h      |  8 ++++++++
>>>>>>>>      hw/virtio/vhost-vdpa.c         | 11 +++++++----
>>>>>>>>      hw/virtio/vhost.c              |  8 +++++++-
>>>>>>>>      4 files changed, 22 insertions(+), 7 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
>>>>>>>> index e64bfc7f98..72c3686b7f 100644
>>>>>>>> --- a/include/hw/virtio/vhost-vdpa.h
>>>>>>>> +++ b/include/hw/virtio/vhost-vdpa.h
>>>>>>>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
>>>>>>>>          bool shadow_vqs_enabled;
>>>>>>>>          /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
>>>>>>>>          bool shadow_data;
>>>>>>>> -    /* Device suspended successfully */
>>>>>>>> -    bool suspended;
>>>>>>>>          /* IOVA mapping used by the Shadow Virtqueue */
>>>>>>>>          VhostIOVATree *iova_tree;
>>>>>>>>          GPtrArray *shadow_vqs;
>>>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>>>>>> index 6a173cb9fa..69bf59d630 100644
>>>>>>>> --- a/include/hw/virtio/vhost.h
>>>>>>>> +++ b/include/hw/virtio/vhost.h
>>>>>>>> @@ -120,6 +120,14 @@ struct vhost_dev {
>>>>>>>>          uint64_t backend_cap;
>>>>>>>>          /* @started: is the vhost device started? */
>>>>>>>>          bool started;
>>>>>>>> +    /**
>>>>>>>> +     * @suspended: Whether the vhost device is currently suspended.  Set
>>>>>>>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
>>>>>>>> +     * are supposed to automatically suspend/resume in their
>>>>>>>> +     * vhost_dev_start handlers as required.  Must also be cleared when
>>>>>>>> +     * the device is reset.
>>>>>>>> +     */
>>>>>>>> +    bool suspended;
>>>>>>>>          bool log_enabled;
>>>>>>>>          uint64_t log_size;
>>>>>>>>          Error *migration_blocker;
>>>>>>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>>>>>>>> index 7b7dee468e..f7fd19a203 100644
>>>>>>>> --- a/hw/virtio/vhost-vdpa.c
>>>>>>>> +++ b/hw/virtio/vhost-vdpa.c
>>>>>>>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
>>>>>>>>
>>>>>>>>      static int vhost_vdpa_reset_device(struct vhost_dev *dev)
>>>>>>>>      {
>>>>>>>> -    struct vhost_vdpa *v = dev->opaque;
>>>>>>>>          int ret;
>>>>>>>>          uint8_t status = 0;
>>>>>>>>
>>>>>>>>          ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
>>>>>>>>          trace_vhost_vdpa_reset_device(dev);
>>>>>>>> -    v->suspended = false;
>>>>>>>> +    dev->suspended = false;
>>>>>>>>          return ret;
>>>>>>>>      }
>>>>>>>>
>>>>>>>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
>>>>>>>>              if (unlikely(r)) {
>>>>>>>>                  error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
>>>>>>>>              } else {
>>>>>>>> -            v->suspended = true;
>>>>>>>> +            dev->suspended = true;
>>>>>>>>                  return;
>>>>>>>>              }
>>>>>>>>          }
>>>>>>>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>>>>>>>>                  return -1;
>>>>>>>>              }
>>>>>>>>              vhost_vdpa_set_vring_ready(dev);
>>>>>>>> +        if (dev->suspended) {
>>>>>>>> +            /* TODO: When RESUME is available, use it instead of resetting */
>>>>>>>> +            vhost_vdpa_reset_status(dev);
>>>>>>> How is that we reset the status at each vhost_vdpa_dev_start? That
>>>>>>> will clean all the vqs configured, features negotiated, etc. in the
>>>>>>> vDPA device. Or am I missing something?
>>>>>> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
>>>>>> but we somehow need to lift the previous SUSPEND so the device will
>>>>>> again respond to guest requests, do we not?
>>>>>>
>>>>> Reset also clears the suspend state in vDPA, and it should be called
>>>>> at vhost_dev_stop. So the device should never be in suspended state
>>>>> here. Does that solve your concerns?
>>>> My intention with this patch was precisely not to reset in
>>>> vhost_dev_stop when suspending is supported.  So now I’m more confused
>>>> than before.
>>>>
>>> At this moment, I think that should be focused as an optimization and
>>> not to be included in the main series.
>> It is absolutely not an optimization but vital for my use case.
>> virtiofsd does not currently implement resetting, but if it did (and we
>> want this support in the future), resetting it during stop/cont would be
>> catastrophic.  There must be a way to have qemu not issue a reset.  This
>> patch is the reason why this series exists.
>>
> Sorry, I see I confused things with the first reply. Let me do a recap.
>
> If I understand the problem correctly, your use case requires that
> qemu does not reset the device before the device state is fetched with
> virtio_save in the case of a migration.

That is only part of the problem, the bigger picture has nothing to do 
with migration at all.  The problem is that when the VM is paused 
(stop), we invoke vhost_dev_stop(), and when it is resumed (cont), we 
invoke vhost_dev_start().  To me, it therefore sounds absolutely wrong 
to reset the back-end in either of these functions.  For stateless 
devices, it was determined to not be an issue (I still find it extremely 
strange), and as far as I’ve understood, we’ve come to the agreement 
that it’s basically a fallback for when there is no other way to stop 
the back-end.  But stateful devices like virtio-fs would be completely 
broken by resetting them there.

Therefore, if virtiofsd did implement reset through SET_STATUS, 
stop/cont would break it today.  Maybe other vhost-user devices, too, 
which just implement RESET_OWNER/RESET_DEVICE, which aren’t even called 
when the device is supposed to be reset in vhost_dev_stop() (patch 6).

So not just because of migration, but in general, there must be a way 
for back-ends to force qemu not to reset them in vhost_dev_start()/stop().

Or we stop using vhost_dev_start()/stop() when the VM is paused/resumed 
(stop/cont).

> This is understandable and I
> think we have a solution for that: to move the vhost_ops call to
> virtio_reset and the end of virtio_save.

Why would we reset the device in virtio_save()?

> To remove the reset call from
> vhost_dev_stop is somehow mandatory, as it is called before
> virtio_save.
>
> But we cannot move to vhost_vdpa_dev_start, as proposed here. The reasons are:
> * All the features, vq parameters, etc are set before any
> vhost_vdpa_dev_start call. To reset at any vhost_vdpa_dev_start would
> wipe them.
> * The device needs to hold all the resources until it is reset. Things
> like descriptor status etc.
>
> And, regarding the comment "When RESUME is available, use it instead
> of resetting", we cannot use resume to replace reset in all cases.
> This is because the semantics are different.
>
> For example, vhost_dev_stop and vhost_dev_start are also called when
> the guest reset by itself the device. You can check it rmmoding and
> modprobing virtio-net driver, for example. In this case, the driver
> expects to find all vqs to start with 0, but the resume must not reset
> these indexes.

This isn’t quite clear to me.  I understand this to mean that there must 
be a reset somewhere in vhost_dev_stop() and/or vhost_dev_start().  But 
above, you proposed moving the reset from vhost_dev_stop() into 
virtio_reset().  Is virtio_reset() called in addition to 
vhost_dev_stop() and vhost_dev_start() when the guest driver is changed?

Because we can’t have an always-present reset in vhost_dev_stop() or 
vhost_dev_start().  It just doesn’t work with stop/cont.  At the same 
time, I understand that you say we must have it because 
vhost_dev_{stop,start}() are also used when the guest driver changes.  
Consequently, it sounds clear to me that using the exact same functions 
in stop/cont and when the guest driver is unloaded/loaded is and has 
always been wrong.  Because in stop/cont, the guest driver never 
changes, so we shouldn’t tell the back-end that it did (by sending 
SET_STATUS(0)).

> It can be applied as an optimizations sometimes, but not for the general case.
>
>>>>>> But more generally, is this any different from what is done before this
>>>>>> patch?  Before this patch, vhost_dev_stop() unconditionally invokes
>>>>>> vhost_reset_status(), so the device is reset in every stop/start cycle,
>>>>>> that doesn’t change.  And we still won’t reset it on the first
>>>>>> vhost_dev_start(), because dev->suspended will be false then, only on
>>>>>> subsequent stop/start cycles, as before.  So the only difference is that
>>>>>> now the device is reset on start, not on stop.
>>>>>>
>>>>> The difference is that vhost_vdpa_dev_start is called after features
>>>>> ack (via vhost_dev_start, through vhost_dev_set_features call) and vq
>>>>> configuration (using vhost_virtqueue_start). A device reset forces the
>>>>> device to forget about all of that, and qemu cannot configure them
>>>>> again until qemu acks the features again.
>>>> Now I’m completely confused, because I don’t see the point of
>>>> implementing suspend at all if we rely on a reset immediately afterwards
>>>> anyway.
>>> It's not immediate. From vhost_dev_stop, comments added only in this mail:
>>>
>>> void vhost_virtqueue_stop(struct vhost_dev *dev,
>>>                             struct VirtIODevice *vdev,
>>>                             struct vhost_virtqueue *vq,
>>>                             unsigned idx)
>>> {
>>>       ...
>>>       // Get each vring indexes, trusting the destination device can
>>> continue safely from there
>>>       r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
>>>       if (r < 0) {
>>>           VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
>>>           /* Connection to the backend is broken, so let's sync internal
>>>            * last avail idx to the device used idx.
>>>            */
>>>           virtio_queue_restore_last_avail_idx(vdev, idx);
>>>       } else {
>>>           virtio_queue_set_last_avail_idx(vdev, idx, state.num);
>>>       }
>>>       ...
>>> }
>>>
>>> void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
>>> {
>>>       ...
>>>       // Suspend the device, so we can trust in vring indexes / vq state
>> I don’t understand this purpose.  GET_VRING_BASE stops the vring in
>> question, so the vring index returned must be trustworthy, no?
>>
> That only happens in vhost-user, not in vhost-vdpa.

OK, so that begs the question: Was SUSPEND ever intended to do anything 
but stop all vrings?  Because this series is about to make its meaning a 
whole lot broader than that in vhost-user.

>>>       if (hdev->vhost_ops->vhost_dev_start) {
>>>           hdev->vhost_ops->vhost_dev_start(hdev, false);
>>>       }
>>>       if (vrings) {
>>>           vhost_dev_set_vring_enable(hdev, false);
>>>       }
>>>
>>>       // Fetch each vq index / state and store in vdev->vq[i]
>>>       for (i = 0; i < hdev->nvqs; ++i) {
>>>           vhost_virtqueue_stop(hdev,
>>>                                vdev,
>>>                                hdev->vqs + i,
>>>                                hdev->vq_index + i);
>>>       }
>>>
>>>       // Reset the device, as we don't need it anymore and it can
>>> release the resources
>>>       if (hdev->vhost_ops->vhost_reset_status) {
>>>           hdev->vhost_ops->vhost_reset_status(hdev);
>>>       }
>>> }
>>> ---
>>>
>>>>    It was my impression this whole time that suspending would
>>>> remove the need to reset.  Well, at least until the device should be
>>>> resumed again, i.e. in vhost_dev_start().
>>>>
>>> It cannot. vhost_dev_stop is also called when the guest reset the
>>> device, so the guest trusts the device to be in a clean state.
>>>
>>> Also, the reset is the moment when the device frees the unused
>>> resources. This would mandate the device to
>> What resources are we talking about?  This function is called when the
>> VM is paused (stop).  If a stateful device is reset to free “unused
>> resources”, this means dropping its internal state, which is absolutely
>> wrong in a stop/cont cycle.
>>
> But is the expected result in the virtio reset cycle. We need to split
> these paths.
>
>>>> In addition, I also don’t understand the magnitude of the problem with
>>>> ordering.  If the order in vhost_dev_start() is wrong, can we not easily
>>>> fix it?
>>> The order in vhost_dev_start follows the VirtIO standard.
>> What does the VirtIO standard say about suspended vhost back-ends?
>>
> Suspend does not exist in the VirtIO standard. I meant the device
> initialization order in "3.1 Device Initialization":
>
> 1. Reset the device.
> ...
> 5. Set the FEATURES_OK status bit. [...]
> ...
> 7. Perform device-specific setup, including discovery of virtqueues
> for the device, optional per-bus setup, reading and possibly writing
> the device’s virtio configuration space, and population of virtqueues.
> 8.Set the DRIVER_OK status bit. At this point the device is “live”.
>
> Steps 4-8 are all done in vhost_dev_start, in that particular order.
> To call vhost_vdpa_reset_status from vhost_vdpa_dev_start(true) would
> reset the device back to step 1, but there is no more code to set all
> configuration from 2-7 before 8 (DRIVER_OK).

That’s why I’ve proposed doing the reset at the start of 
vhost_dev_start() (quoted below still).  To me, that sounds in line with 
the virtio specification.

Still, if you insist there must a reset somewhere in 
vhost_dev_start()/stop() because it may be guest-initiated, then there 
is no solution that can work for both.  We must be able to distinguish 
between a paused VM and a change in the guest driver.

>> Hanna
>>
>>> The move of
>>> the reset should be to remove it from vhost_dev_stop to something like
>>> both virtio_reset and the end of virtio_save. I'm not sure if I'm
>>> forgetting some other use cases.
>>>
>>>> E.g. add a full vhost_dev_resume callback to invoke right at
>>>> the start of vhost_dev_start(); or check (in the same place) whether the
>>>> back-end supports resuming, and if it doesn’t (and it is currently
>>>> suspended), reset it there.
Eugenio Perez Martin July 27, 2023, 12:49 p.m. UTC | #10
On Wed, Jul 26, 2023 at 8:57 AM Hanna Czenczek <hreitz@redhat.com> wrote:
>
> On 25.07.23 20:53, Eugenio Perez Martin wrote:
> > On Tue, Jul 25, 2023 at 3:09 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >> On 25.07.23 12:03, Eugenio Perez Martin wrote:
> >>> On Tue, Jul 25, 2023 at 9:53 AM Hanna Czenczek <hreitz@redhat.com> wrote:
> >>>> On 24.07.23 17:48, Eugenio Perez Martin wrote:
> >>>>> On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >>>>>> On 21.07.23 17:25, Eugenio Perez Martin wrote:
> >>>>>>> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> >>>>>>>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> >>>>>>>> struct, so vhost_dev_stop() can check whether the back-end has been
> >>>>>>>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> >>>>>>>> there is no need to reset it; the reset is just a fall-back to stop
> >>>>>>>> device operations for back-ends that do not support suspend.
> >>>>>>>>
> >>>>>>>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> >>>>>>>> when the device is re-started, we still have to do the reset to have it
> >>>>>>>> un-suspend.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> >>>>>>>> ---
> >>>>>>>>      include/hw/virtio/vhost-vdpa.h |  2 --
> >>>>>>>>      include/hw/virtio/vhost.h      |  8 ++++++++
> >>>>>>>>      hw/virtio/vhost-vdpa.c         | 11 +++++++----
> >>>>>>>>      hw/virtio/vhost.c              |  8 +++++++-
> >>>>>>>>      4 files changed, 22 insertions(+), 7 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> >>>>>>>> index e64bfc7f98..72c3686b7f 100644
> >>>>>>>> --- a/include/hw/virtio/vhost-vdpa.h
> >>>>>>>> +++ b/include/hw/virtio/vhost-vdpa.h
> >>>>>>>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
> >>>>>>>>          bool shadow_vqs_enabled;
> >>>>>>>>          /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
> >>>>>>>>          bool shadow_data;
> >>>>>>>> -    /* Device suspended successfully */
> >>>>>>>> -    bool suspended;
> >>>>>>>>          /* IOVA mapping used by the Shadow Virtqueue */
> >>>>>>>>          VhostIOVATree *iova_tree;
> >>>>>>>>          GPtrArray *shadow_vqs;
> >>>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >>>>>>>> index 6a173cb9fa..69bf59d630 100644
> >>>>>>>> --- a/include/hw/virtio/vhost.h
> >>>>>>>> +++ b/include/hw/virtio/vhost.h
> >>>>>>>> @@ -120,6 +120,14 @@ struct vhost_dev {
> >>>>>>>>          uint64_t backend_cap;
> >>>>>>>>          /* @started: is the vhost device started? */
> >>>>>>>>          bool started;
> >>>>>>>> +    /**
> >>>>>>>> +     * @suspended: Whether the vhost device is currently suspended.  Set
> >>>>>>>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
> >>>>>>>> +     * are supposed to automatically suspend/resume in their
> >>>>>>>> +     * vhost_dev_start handlers as required.  Must also be cleared when
> >>>>>>>> +     * the device is reset.
> >>>>>>>> +     */
> >>>>>>>> +    bool suspended;
> >>>>>>>>          bool log_enabled;
> >>>>>>>>          uint64_t log_size;
> >>>>>>>>          Error *migration_blocker;
> >>>>>>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >>>>>>>> index 7b7dee468e..f7fd19a203 100644
> >>>>>>>> --- a/hw/virtio/vhost-vdpa.c
> >>>>>>>> +++ b/hw/virtio/vhost-vdpa.c
> >>>>>>>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
> >>>>>>>>
> >>>>>>>>      static int vhost_vdpa_reset_device(struct vhost_dev *dev)
> >>>>>>>>      {
> >>>>>>>> -    struct vhost_vdpa *v = dev->opaque;
> >>>>>>>>          int ret;
> >>>>>>>>          uint8_t status = 0;
> >>>>>>>>
> >>>>>>>>          ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
> >>>>>>>>          trace_vhost_vdpa_reset_device(dev);
> >>>>>>>> -    v->suspended = false;
> >>>>>>>> +    dev->suspended = false;
> >>>>>>>>          return ret;
> >>>>>>>>      }
> >>>>>>>>
> >>>>>>>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
> >>>>>>>>              if (unlikely(r)) {
> >>>>>>>>                  error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
> >>>>>>>>              } else {
> >>>>>>>> -            v->suspended = true;
> >>>>>>>> +            dev->suspended = true;
> >>>>>>>>                  return;
> >>>>>>>>              }
> >>>>>>>>          }
> >>>>>>>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> >>>>>>>>                  return -1;
> >>>>>>>>              }
> >>>>>>>>              vhost_vdpa_set_vring_ready(dev);
> >>>>>>>> +        if (dev->suspended) {
> >>>>>>>> +            /* TODO: When RESUME is available, use it instead of resetting */
> >>>>>>>> +            vhost_vdpa_reset_status(dev);
> >>>>>>> How is that we reset the status at each vhost_vdpa_dev_start? That
> >>>>>>> will clean all the vqs configured, features negotiated, etc. in the
> >>>>>>> vDPA device. Or am I missing something?
> >>>>>> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
> >>>>>> but we somehow need to lift the previous SUSPEND so the device will
> >>>>>> again respond to guest requests, do we not?
> >>>>>>
> >>>>> Reset also clears the suspend state in vDPA, and it should be called
> >>>>> at vhost_dev_stop. So the device should never be in suspended state
> >>>>> here. Does that solve your concerns?
> >>>> My intention with this patch was precisely not to reset in
> >>>> vhost_dev_stop when suspending is supported.  So now I’m more confused
> >>>> than before.
> >>>>
> >>> At this moment, I think that should be focused as an optimization and
> >>> not to be included in the main series.
> >> It is absolutely not an optimization but vital for my use case.
> >> virtiofsd does not currently implement resetting, but if it did (and we
> >> want this support in the future), resetting it during stop/cont would be
> >> catastrophic.  There must be a way to have qemu not issue a reset.  This
> >> patch is the reason why this series exists.
> >>
> > Sorry, I see I confused things with the first reply. Let me do a recap.
> >
> > If I understand the problem correctly, your use case requires that
> > qemu does not reset the device before the device state is fetched with
> > virtio_save in the case of a migration.
>
> That is only part of the problem, the bigger picture has nothing to do
> with migration at all.  The problem is that when the VM is paused
> (stop), we invoke vhost_dev_stop(), and when it is resumed (cont), we
> invoke vhost_dev_start().  To me, it therefore sounds absolutely wrong
> to reset the back-end in either of these functions.  For stateless
> devices, it was determined to not be an issue (I still find it extremely
> strange), and as far as I’ve understood, we’ve come to the agreement
> that it’s basically a fallback for when there is no other way to stop
> the back-end.  But stateful devices like virtio-fs would be completely
> broken by resetting them there.
>
> Therefore, if virtiofsd did implement reset through SET_STATUS,
> stop/cont would break it today.  Maybe other vhost-user devices, too,
> which just implement RESET_OWNER/RESET_DEVICE, which aren’t even called
> when the device is supposed to be reset in vhost_dev_stop() (patch 6).
>
> So not just because of migration, but in general, there must be a way
> for back-ends to force qemu not to reset them in vhost_dev_start()/stop().
>
> Or we stop using vhost_dev_start()/stop() when the VM is paused/resumed
> (stop/cont).
>

Yes, that comes back to the thread [1].

As a third alternative, you can keep vhost_dev_start and let the
function check the current state and initialize the device only if
needed. But you can keep symmetrical functions and call one or another
at the device's code, of course. Not sure what is cleaner or requires
less changes.

> > This is understandable and I
> > think we have a solution for that: to move the vhost_ops call to
> > virtio_reset and the end of virtio_save.
>
> Why would we reset the device in virtio_save()?
>

If the VM continues in the source because of whatever reason,
vhost_dev_start would expect the device to be clean. You can test it
with the command "cont" after the LM.

> > To remove the reset call from
> > vhost_dev_stop is somehow mandatory, as it is called before
> > virtio_save.
> >
> > But we cannot move to vhost_vdpa_dev_start, as proposed here. The reasons are:
> > * All the features, vq parameters, etc are set before any
> > vhost_vdpa_dev_start call. To reset at any vhost_vdpa_dev_start would
> > wipe them.
> > * The device needs to hold all the resources until it is reset. Things
> > like descriptor status etc.
> >
> > And, regarding the comment "When RESUME is available, use it instead
> > of resetting", we cannot use resume to replace reset in all cases.
> > This is because the semantics are different.
> >
> > For example, vhost_dev_stop and vhost_dev_start are also called when
> > the guest reset by itself the device. You can check it rmmoding and
> > modprobing virtio-net driver, for example. In this case, the driver
> > expects to find all vqs to start with 0, but the resume must not reset
> > these indexes.
>
> This isn’t quite clear to me.  I understand this to mean that there must
> be a reset somewhere in vhost_dev_stop() and/or vhost_dev_start().  But
> above, you proposed moving the reset from vhost_dev_stop() into
> virtio_reset().  Is virtio_reset() called in addition to
> vhost_dev_stop() and vhost_dev_start() when the guest driver is changed?
>

Right. Maybe another option is virtio_set_status?

The point is that other parts of qemu / guest trust the device to be
reset after (current version of) vhost_dev_stop, so if we are going to
move the reset it must be added to the callers at least. To trace
these callers is needed, so maybe it is easy to add another function
(vhost_dev_suspend?), your first alternative.

> Because we can’t have an always-present reset in vhost_dev_stop() or
> vhost_dev_start().  It just doesn’t work with stop/cont.  At the same
> time, I understand that you say we must have it because
> vhost_dev_{stop,start}() are also used when the guest driver changes.
> Consequently, it sounds clear to me that using the exact same functions
> in stop/cont and when the guest driver is unloaded/loaded is and has
> always been wrong.  Because in stop/cont, the guest driver never
> changes, so we shouldn’t tell the back-end that it did (by sending
> SET_STATUS(0)).
>

Talking just about vhost_net here, the device dumps all the state
(like vqs) to qemu in vhost_dev_stop. That is what allows to have, for
example, an unified code in migrating: qemu only needs to know how to
migrate its emulated device, and vhost code just writes or read at
suspend / continue or live migrating. To suspend and continue is the
same operation actually for vhost_net.

> > It can be applied as an optimizations sometimes, but not for the general case.
> >
> >>>>>> But more generally, is this any different from what is done before this
> >>>>>> patch?  Before this patch, vhost_dev_stop() unconditionally invokes
> >>>>>> vhost_reset_status(), so the device is reset in every stop/start cycle,
> >>>>>> that doesn’t change.  And we still won’t reset it on the first
> >>>>>> vhost_dev_start(), because dev->suspended will be false then, only on
> >>>>>> subsequent stop/start cycles, as before.  So the only difference is that
> >>>>>> now the device is reset on start, not on stop.
> >>>>>>
> >>>>> The difference is that vhost_vdpa_dev_start is called after features
> >>>>> ack (via vhost_dev_start, through vhost_dev_set_features call) and vq
> >>>>> configuration (using vhost_virtqueue_start). A device reset forces the
> >>>>> device to forget about all of that, and qemu cannot configure them
> >>>>> again until qemu acks the features again.
> >>>> Now I’m completely confused, because I don’t see the point of
> >>>> implementing suspend at all if we rely on a reset immediately afterwards
> >>>> anyway.
> >>> It's not immediate. From vhost_dev_stop, comments added only in this mail:
> >>>
> >>> void vhost_virtqueue_stop(struct vhost_dev *dev,
> >>>                             struct VirtIODevice *vdev,
> >>>                             struct vhost_virtqueue *vq,
> >>>                             unsigned idx)
> >>> {
> >>>       ...
> >>>       // Get each vring indexes, trusting the destination device can
> >>> continue safely from there
> >>>       r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
> >>>       if (r < 0) {
> >>>           VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
> >>>           /* Connection to the backend is broken, so let's sync internal
> >>>            * last avail idx to the device used idx.
> >>>            */
> >>>           virtio_queue_restore_last_avail_idx(vdev, idx);
> >>>       } else {
> >>>           virtio_queue_set_last_avail_idx(vdev, idx, state.num);
> >>>       }
> >>>       ...
> >>> }
> >>>
> >>> void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
> >>> {
> >>>       ...
> >>>       // Suspend the device, so we can trust in vring indexes / vq state
> >> I don’t understand this purpose.  GET_VRING_BASE stops the vring in
> >> question, so the vring index returned must be trustworthy, no?
> >>
> > That only happens in vhost-user, not in vhost-vdpa.
>
> OK, so that begs the question: Was SUSPEND ever intended to do anything
> but stop all vrings?  Because this series is about to make its meaning a
> whole lot broader than that in vhost-user.
>

That was the big part, yes. The device must also stop modifying
config, like in the case of link status changes. Other actions like
fetch vq state are performed later (at vhost_virtqueue_stop).

VHOST_VDPA_SUSPEND requires the device to preserve all the needed
state to be recovered or continued:

Suspend a device so it does not process virtqueue requests anymore

After the return of ioctl the device must preserve all the necessary state
(the virtqueue vring base plus the possible device specific states) that is
required for restoring in the future. The device must not change its
configuration after that point.
---

> >>>       if (hdev->vhost_ops->vhost_dev_start) {
> >>>           hdev->vhost_ops->vhost_dev_start(hdev, false);
> >>>       }
> >>>       if (vrings) {
> >>>           vhost_dev_set_vring_enable(hdev, false);
> >>>       }
> >>>
> >>>       // Fetch each vq index / state and store in vdev->vq[i]
> >>>       for (i = 0; i < hdev->nvqs; ++i) {
> >>>           vhost_virtqueue_stop(hdev,
> >>>                                vdev,
> >>>                                hdev->vqs + i,
> >>>                                hdev->vq_index + i);
> >>>       }
> >>>
> >>>       // Reset the device, as we don't need it anymore and it can
> >>> release the resources
> >>>       if (hdev->vhost_ops->vhost_reset_status) {
> >>>           hdev->vhost_ops->vhost_reset_status(hdev);
> >>>       }
> >>> }
> >>> ---
> >>>
> >>>>    It was my impression this whole time that suspending would
> >>>> remove the need to reset.  Well, at least until the device should be
> >>>> resumed again, i.e. in vhost_dev_start().
> >>>>
> >>> It cannot. vhost_dev_stop is also called when the guest reset the
> >>> device, so the guest trusts the device to be in a clean state.
> >>>
> >>> Also, the reset is the moment when the device frees the unused
> >>> resources. This would mandate the device to
> >> What resources are we talking about?  This function is called when the
> >> VM is paused (stop).  If a stateful device is reset to free “unused
> >> resources”, this means dropping its internal state, which is absolutely
> >> wrong in a stop/cont cycle.
> >>
> > But is the expected result in the virtio reset cycle. We need to split
> > these paths.
> >
> >>>> In addition, I also don’t understand the magnitude of the problem with
> >>>> ordering.  If the order in vhost_dev_start() is wrong, can we not easily
> >>>> fix it?
> >>> The order in vhost_dev_start follows the VirtIO standard.
> >> What does the VirtIO standard say about suspended vhost back-ends?
> >>
> > Suspend does not exist in the VirtIO standard. I meant the device
> > initialization order in "3.1 Device Initialization":
> >
> > 1. Reset the device.
> > ...
> > 5. Set the FEATURES_OK status bit. [...]
> > ...
> > 7. Perform device-specific setup, including discovery of virtqueues
> > for the device, optional per-bus setup, reading and possibly writing
> > the device’s virtio configuration space, and population of virtqueues.
> > 8.Set the DRIVER_OK status bit. At this point the device is “live”.
> >
> > Steps 4-8 are all done in vhost_dev_start, in that particular order.
> > To call vhost_vdpa_reset_status from vhost_vdpa_dev_start(true) would
> > reset the device back to step 1, but there is no more code to set all
> > configuration from 2-7 before 8 (DRIVER_OK).
>
> That’s why I’ve proposed doing the reset at the start of
> vhost_dev_start() (quoted below still).  To me, that sounds in line with
> the virtio specification.
>
> Still, if you insist there must a reset somewhere in
> vhost_dev_start()/stop() because it may be guest-initiated, then there
> is no solution that can work for both.  We must be able to distinguish
> between a paused VM and a change in the guest driver.
>

Right.

[1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg954916.html

> >> Hanna
> >>
> >>> The move of
> >>> the reset should be to remove it from vhost_dev_stop to something like
> >>> both virtio_reset and the end of virtio_save. I'm not sure if I'm
> >>> forgetting some other use cases.
> >>>
> >>>> E.g. add a full vhost_dev_resume callback to invoke right at
> >>>> the start of vhost_dev_start(); or check (in the same place) whether the
> >>>> back-end supports resuming, and if it doesn’t (and it is currently
> >>>> suspended), reset it there.
>
>
Stefan Hajnoczi July 27, 2023, 8:26 p.m. UTC | #11
On Thu, Jul 27, 2023 at 02:49:04PM +0200, Eugenio Perez Martin wrote:
> On Wed, Jul 26, 2023 at 8:57 AM Hanna Czenczek <hreitz@redhat.com> wrote:
> >
> > On 25.07.23 20:53, Eugenio Perez Martin wrote:
> > > On Tue, Jul 25, 2023 at 3:09 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> > >> On 25.07.23 12:03, Eugenio Perez Martin wrote:
> > >>> On Tue, Jul 25, 2023 at 9:53 AM Hanna Czenczek <hreitz@redhat.com> wrote:
> > >>>> On 24.07.23 17:48, Eugenio Perez Martin wrote:
> > >>>>> On Fri, Jul 21, 2023 at 6:07 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> > >>>>>> On 21.07.23 17:25, Eugenio Perez Martin wrote:
> > >>>>>>> On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek <hreitz@redhat.com> wrote:
> > >>>>>>>> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> > >>>>>>>> struct, so vhost_dev_stop() can check whether the back-end has been
> > >>>>>>>> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> > >>>>>>>> there is no need to reset it; the reset is just a fall-back to stop
> > >>>>>>>> device operations for back-ends that do not support suspend.
> > >>>>>>>>
> > >>>>>>>> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> > >>>>>>>> when the device is re-started, we still have to do the reset to have it
> > >>>>>>>> un-suspend.
> > >>>>>>>>
> > >>>>>>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> > >>>>>>>> ---
> > >>>>>>>>      include/hw/virtio/vhost-vdpa.h |  2 --
> > >>>>>>>>      include/hw/virtio/vhost.h      |  8 ++++++++
> > >>>>>>>>      hw/virtio/vhost-vdpa.c         | 11 +++++++----
> > >>>>>>>>      hw/virtio/vhost.c              |  8 +++++++-
> > >>>>>>>>      4 files changed, 22 insertions(+), 7 deletions(-)
> > >>>>>>>>
> > >>>>>>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> > >>>>>>>> index e64bfc7f98..72c3686b7f 100644
> > >>>>>>>> --- a/include/hw/virtio/vhost-vdpa.h
> > >>>>>>>> +++ b/include/hw/virtio/vhost-vdpa.h
> > >>>>>>>> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
> > >>>>>>>>          bool shadow_vqs_enabled;
> > >>>>>>>>          /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
> > >>>>>>>>          bool shadow_data;
> > >>>>>>>> -    /* Device suspended successfully */
> > >>>>>>>> -    bool suspended;
> > >>>>>>>>          /* IOVA mapping used by the Shadow Virtqueue */
> > >>>>>>>>          VhostIOVATree *iova_tree;
> > >>>>>>>>          GPtrArray *shadow_vqs;
> > >>>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> > >>>>>>>> index 6a173cb9fa..69bf59d630 100644
> > >>>>>>>> --- a/include/hw/virtio/vhost.h
> > >>>>>>>> +++ b/include/hw/virtio/vhost.h
> > >>>>>>>> @@ -120,6 +120,14 @@ struct vhost_dev {
> > >>>>>>>>          uint64_t backend_cap;
> > >>>>>>>>          /* @started: is the vhost device started? */
> > >>>>>>>>          bool started;
> > >>>>>>>> +    /**
> > >>>>>>>> +     * @suspended: Whether the vhost device is currently suspended.  Set
> > >>>>>>>> +     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
> > >>>>>>>> +     * are supposed to automatically suspend/resume in their
> > >>>>>>>> +     * vhost_dev_start handlers as required.  Must also be cleared when
> > >>>>>>>> +     * the device is reset.
> > >>>>>>>> +     */
> > >>>>>>>> +    bool suspended;
> > >>>>>>>>          bool log_enabled;
> > >>>>>>>>          uint64_t log_size;
> > >>>>>>>>          Error *migration_blocker;
> > >>>>>>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > >>>>>>>> index 7b7dee468e..f7fd19a203 100644
> > >>>>>>>> --- a/hw/virtio/vhost-vdpa.c
> > >>>>>>>> +++ b/hw/virtio/vhost-vdpa.c
> > >>>>>>>> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
> > >>>>>>>>
> > >>>>>>>>      static int vhost_vdpa_reset_device(struct vhost_dev *dev)
> > >>>>>>>>      {
> > >>>>>>>> -    struct vhost_vdpa *v = dev->opaque;
> > >>>>>>>>          int ret;
> > >>>>>>>>          uint8_t status = 0;
> > >>>>>>>>
> > >>>>>>>>          ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
> > >>>>>>>>          trace_vhost_vdpa_reset_device(dev);
> > >>>>>>>> -    v->suspended = false;
> > >>>>>>>> +    dev->suspended = false;
> > >>>>>>>>          return ret;
> > >>>>>>>>      }
> > >>>>>>>>
> > >>>>>>>> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
> > >>>>>>>>              if (unlikely(r)) {
> > >>>>>>>>                  error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
> > >>>>>>>>              } else {
> > >>>>>>>> -            v->suspended = true;
> > >>>>>>>> +            dev->suspended = true;
> > >>>>>>>>                  return;
> > >>>>>>>>              }
> > >>>>>>>>          }
> > >>>>>>>> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> > >>>>>>>>                  return -1;
> > >>>>>>>>              }
> > >>>>>>>>              vhost_vdpa_set_vring_ready(dev);
> > >>>>>>>> +        if (dev->suspended) {
> > >>>>>>>> +            /* TODO: When RESUME is available, use it instead of resetting */
> > >>>>>>>> +            vhost_vdpa_reset_status(dev);
> > >>>>>>> How is that we reset the status at each vhost_vdpa_dev_start? That
> > >>>>>>> will clean all the vqs configured, features negotiated, etc. in the
> > >>>>>>> vDPA device. Or am I missing something?
> > >>>>>> What alternative do you propose?  We don’t have RESUME for vDPA in qemu,
> > >>>>>> but we somehow need to lift the previous SUSPEND so the device will
> > >>>>>> again respond to guest requests, do we not?
> > >>>>>>
> > >>>>> Reset also clears the suspend state in vDPA, and it should be called
> > >>>>> at vhost_dev_stop. So the device should never be in suspended state
> > >>>>> here. Does that solve your concerns?
> > >>>> My intention with this patch was precisely not to reset in
> > >>>> vhost_dev_stop when suspending is supported.  So now I’m more confused
> > >>>> than before.
> > >>>>
> > >>> At this moment, I think that should be focused as an optimization and
> > >>> not to be included in the main series.
> > >> It is absolutely not an optimization but vital for my use case.
> > >> virtiofsd does not currently implement resetting, but if it did (and we
> > >> want this support in the future), resetting it during stop/cont would be
> > >> catastrophic.  There must be a way to have qemu not issue a reset.  This
> > >> patch is the reason why this series exists.
> > >>
> > > Sorry, I see I confused things with the first reply. Let me do a recap.
> > >
> > > If I understand the problem correctly, your use case requires that
> > > qemu does not reset the device before the device state is fetched with
> > > virtio_save in the case of a migration.
> >
> > That is only part of the problem, the bigger picture has nothing to do
> > with migration at all.  The problem is that when the VM is paused
> > (stop), we invoke vhost_dev_stop(), and when it is resumed (cont), we
> > invoke vhost_dev_start().  To me, it therefore sounds absolutely wrong
> > to reset the back-end in either of these functions.  For stateless
> > devices, it was determined to not be an issue (I still find it extremely
> > strange), and as far as I’ve understood, we’ve come to the agreement
> > that it’s basically a fallback for when there is no other way to stop
> > the back-end.  But stateful devices like virtio-fs would be completely
> > broken by resetting them there.
> >
> > Therefore, if virtiofsd did implement reset through SET_STATUS,
> > stop/cont would break it today.  Maybe other vhost-user devices, too,
> > which just implement RESET_OWNER/RESET_DEVICE, which aren’t even called
> > when the device is supposed to be reset in vhost_dev_stop() (patch 6).
> >
> > So not just because of migration, but in general, there must be a way
> > for back-ends to force qemu not to reset them in vhost_dev_start()/stop().
> >
> > Or we stop using vhost_dev_start()/stop() when the VM is paused/resumed
> > (stop/cont).
> >
> 
> Yes, that comes back to the thread [1].
> 
> As a third alternative, you can keep vhost_dev_start and let the
> function check the current state and initialize the device only if
> needed. But you can keep symmetrical functions and call one or another
> at the device's code, of course. Not sure what is cleaner or requires
> less changes.
> 
> > > This is understandable and I
> > > think we have a solution for that: to move the vhost_ops call to
> > > virtio_reset and the end of virtio_save.
> >
> > Why would we reset the device in virtio_save()?
> >
> 
> If the VM continues in the source because of whatever reason,
> vhost_dev_start would expect the device to be clean. You can test it
> with the command "cont" after the LM.
> 
> > > To remove the reset call from
> > > vhost_dev_stop is somehow mandatory, as it is called before
> > > virtio_save.
> > >
> > > But we cannot move to vhost_vdpa_dev_start, as proposed here. The reasons are:
> > > * All the features, vq parameters, etc are set before any
> > > vhost_vdpa_dev_start call. To reset at any vhost_vdpa_dev_start would
> > > wipe them.
> > > * The device needs to hold all the resources until it is reset. Things
> > > like descriptor status etc.
> > >
> > > And, regarding the comment "When RESUME is available, use it instead
> > > of resetting", we cannot use resume to replace reset in all cases.
> > > This is because the semantics are different.
> > >
> > > For example, vhost_dev_stop and vhost_dev_start are also called when
> > > the guest reset by itself the device. You can check it rmmoding and
> > > modprobing virtio-net driver, for example. In this case, the driver
> > > expects to find all vqs to start with 0, but the resume must not reset
> > > these indexes.
> >
> > This isn’t quite clear to me.  I understand this to mean that there must
> > be a reset somewhere in vhost_dev_stop() and/or vhost_dev_start().  But
> > above, you proposed moving the reset from vhost_dev_stop() into
> > virtio_reset().  Is virtio_reset() called in addition to
> > vhost_dev_stop() and vhost_dev_start() when the guest driver is changed?
> >
> 
> Right. Maybe another option is virtio_set_status?
> 
> The point is that other parts of qemu / guest trust the device to be
> reset after (current version of) vhost_dev_stop, so if we are going to
> move the reset it must be added to the callers at least. To trace
> these callers is needed, so maybe it is easy to add another function
> (vhost_dev_suspend?), your first alternative.
> 
> > Because we can’t have an always-present reset in vhost_dev_stop() or
> > vhost_dev_start().  It just doesn’t work with stop/cont.  At the same
> > time, I understand that you say we must have it because
> > vhost_dev_{stop,start}() are also used when the guest driver changes.
> > Consequently, it sounds clear to me that using the exact same functions
> > in stop/cont and when the guest driver is unloaded/loaded is and has
> > always been wrong.  Because in stop/cont, the guest driver never
> > changes, so we shouldn’t tell the back-end that it did (by sending
> > SET_STATUS(0)).
> >
> 
> Talking just about vhost_net here, the device dumps all the state
> (like vqs) to qemu in vhost_dev_stop. That is what allows to have, for
> example, an unified code in migrating: qemu only needs to know how to
> migrate its emulated device, and vhost code just writes or read at
> suspend / continue or live migrating. To suspend and continue is the
> same operation actually for vhost_net.

Hi Longpeng,
This discussion made me realize that --device vhost-vdpa-device-pci does
not support stateful vDPA devices.

For example, a vdpa-net device that implements the controlq will lose
state (e.g. packet receive filtering) when the guest is stopped with the
QEMU 'stop' command.

QEMU needs to use the VHOST_VDPA_RESUME ioctl so it can resume stateful
devices instead of resetting them across 'stop'/'cont'.

Whatever solution we agree on in this thread should also work for
vhost-vdpa and solve the issue for --device vhost-vdpa-device-pci.

Stefan
diff mbox series

Patch

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index e64bfc7f98..72c3686b7f 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -42,8 +42,6 @@  typedef struct vhost_vdpa {
     bool shadow_vqs_enabled;
     /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
     bool shadow_data;
-    /* Device suspended successfully */
-    bool suspended;
     /* IOVA mapping used by the Shadow Virtqueue */
     VhostIOVATree *iova_tree;
     GPtrArray *shadow_vqs;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 6a173cb9fa..69bf59d630 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -120,6 +120,14 @@  struct vhost_dev {
     uint64_t backend_cap;
     /* @started: is the vhost device started? */
     bool started;
+    /**
+     * @suspended: Whether the vhost device is currently suspended.  Set
+     * and reset by implementations (vhost-user, vhost-vdpa, ...), which
+     * are supposed to automatically suspend/resume in their
+     * vhost_dev_start handlers as required.  Must also be cleared when
+     * the device is reset.
+     */
+    bool suspended;
     bool log_enabled;
     uint64_t log_size;
     Error *migration_blocker;
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 7b7dee468e..f7fd19a203 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -858,13 +858,12 @@  static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
 
 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
 {
-    struct vhost_vdpa *v = dev->opaque;
     int ret;
     uint8_t status = 0;
 
     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
     trace_vhost_vdpa_reset_device(dev);
-    v->suspended = false;
+    dev->suspended = false;
     return ret;
 }
 
@@ -1278,7 +1277,7 @@  static void vhost_vdpa_suspend(struct vhost_dev *dev)
         if (unlikely(r)) {
             error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
         } else {
-            v->suspended = true;
+            dev->suspended = true;
             return;
         }
     }
@@ -1313,6 +1312,10 @@  static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
             return -1;
         }
         vhost_vdpa_set_vring_ready(dev);
+        if (dev->suspended) {
+            /* TODO: When RESUME is available, use it instead of resetting */
+            vhost_vdpa_reset_status(dev);
+        }
     } else {
         vhost_vdpa_suspend(dev);
         vhost_vdpa_svqs_stop(dev);
@@ -1400,7 +1403,7 @@  static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
         return 0;
     }
 
-    if (!v->suspended) {
+    if (!dev->suspended) {
         /*
          * Cannot trust in value returned by device, let vhost recover used
          * idx from guest.
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index abf0d03c8d..2e28e58da7 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -2059,7 +2059,13 @@  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
                              hdev->vqs + i,
                              hdev->vq_index + i);
     }
-    if (hdev->vhost_ops->vhost_reset_status) {
+
+    /*
+     * If we failed to successfully stop the device via SUSPEND (should have
+     * been attempted by `vhost_dev_start(hdev, false)`), reset it to stop it.
+     * Stateful devices where this would be problematic must implement SUSPEND.
+     */
+    if (!hdev->suspended && hdev->vhost_ops->vhost_reset_status) {
         hdev->vhost_ops->vhost_reset_status(hdev);
     }