
[v3,2/2] vhost-user: fix lost reconnect again

Message ID 20240514061239.86461-3-fengli@smartx.com (mailing list archive)
State New
Series: [v3,1/2] Revert "vhost-user: fix lost reconnect"

Commit Message

Li Feng May 14, 2024, 6:12 a.m. UTC
When the vhost-user is reconnecting to the backend and fails at get_features
in vhost_dev_init(), the reconnect fails and is never retriggered.

The reason is:
When the vhost-user fails at get_features, vhost_dev_cleanup is called
immediately.

vhost_dev_cleanup calls 'memset(hdev, 0, sizeof(struct vhost_dev))'.

The reconnect path is:
vhost_user_blk_event
   vhost_user_async_close(.. vhost_user_blk_disconnect ..)
     qemu_chr_fe_set_handlers <----- clear the notifier callback
       schedule vhost_user_async_close_bh

Since vhost->vdev is NULL, vhost_user_blk_disconnect is not called, and the
event fd callback is never reinstalled.
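
For reference, a simplified sketch of the scheduling side is shown below. This
is an approximation of vhost_user_async_close() in hw/virtio/vhost-user.c, not
a verbatim copy: runstate handling and unrelated fields are omitted, so treat
the details as illustrative only.

    void vhost_user_async_close(DeviceState *d, CharBackend *chardev,
                                struct vhost_dev *vhost, vu_async_close_fn cb)
    {
        VhostAsyncCallback *data = g_new0(VhostAsyncCallback, 1);

        data->cb = cb;
        data->dev = d;
        data->vhost = vhost;

        /*
         * Stop delivering chardev events.  Nothing re-installs an event
         * handler until the bottom half below has run the disconnect
         * callback, which is exactly where the reconnect can get lost.
         */
        qemu_chr_fe_set_handlers(chardev, NULL, NULL, NULL,
                                 NULL, NULL, NULL, false);

        aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
                                vhost_user_async_close_bh, data);
    }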

With this patch, vhost_user_blk_disconnect will call vhost_dev_cleanup()
again; this is safe.

In addition, the CLOSE event may occur while connected is still false. In that
case the event handler has already been cleared, so we need to ensure that it
is re-installed.
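
With the patch applied, the disconnect helper takes the following shape. This
is reconstructed from the vhost-user-blk hunks below (the code between the two
hunks is unchanged and omitted here); the vhost-user-scsi and vhost-user-base
variants are analogous.

    static void vhost_user_blk_disconnect(DeviceState *dev)
    {
        VirtIODevice *vdev = VIRTIO_DEVICE(dev);
        VHostUserBlk *s = VHOST_USER_BLK(vdev);

        if (!s->connected) {
            goto done;
        }
        s->connected = false;

        /* ... stop path between the two hunks, unchanged and omitted ... */

        vhost_dev_cleanup(&s->dev);

    done:
        /* Re-instate the event handler for new connections */
        qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event,
                                 NULL, dev, NULL, true);
    }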

All vhost-user devices have this issue, including vhost-user-blk/scsi.

Fixes: 71e076a07d ("hw/virtio: generalise CHR_EVENT_CLOSED handling")

Signed-off-by: Li Feng <fengli@smartx.com>
---
 hw/block/vhost-user-blk.c   |  3 ++-
 hw/scsi/vhost-user-scsi.c   |  3 ++-
 hw/virtio/vhost-user-base.c |  3 ++-
 hw/virtio/vhost-user.c      | 10 +---------
 4 files changed, 7 insertions(+), 12 deletions(-)

Comments

Raphael Norwitz May 14, 2024, 1:58 p.m. UTC | #1
Code looks good. Just a question on the error case you're trying to fix.

On Tue, May 14, 2024 at 2:12 AM Li Feng <fengli@smartx.com> wrote:
>
> When the vhost-user is reconnecting to the backend, and if the vhost-user fails
> at the get_features in vhost_dev_init(), then the reconnect will fail
> and it will not be retriggered forever.
>
> The reason is:
> When the vhost-user fail at get_features, the vhost_dev_cleanup will be called
> immediately.
>
> vhost_dev_cleanup calls 'memset(hdev, 0, sizeof(struct vhost_dev))'.
>
> The reconnect path is:
> vhost_user_blk_event
>    vhost_user_async_close(.. vhost_user_blk_disconnect ..)
>      qemu_chr_fe_set_handlers <----- clear the notifier callback
>        schedule vhost_user_async_close_bh
>
> The vhost->vdev is null, so the vhost_user_blk_disconnect will not be
> called, then the event fd callback will not be reinstalled.
>
> With this patch, the vhost_user_blk_disconnect will call the
> vhost_dev_cleanup() again, it's safe.
>
> In addition, the CLOSE event may occur in a scenario where connected is false.
> At this time, the event handler will be cleared. We need to ensure that the
> event handler can remain installed.

Following on from the prior patch, why would "connected" be false when
a CLOSE event happens?

> [...]
Li Feng May 15, 2024, 5:47 a.m. UTC | #2
> On May 14, 2024, at 21:58, Raphael Norwitz <raphael@enfabrica.net> wrote:
> 
> Code looks good. Just a question on the error case you're trying to fix.
> 
> On Tue, May 14, 2024 at 2:12 AM Li Feng <fengli@smartx.com> wrote:
>> 
>> When the vhost-user is reconnecting to the backend, and if the vhost-user fails
>> at the get_features in vhost_dev_init(), then the reconnect will fail
>> and it will not be retriggered forever.
>> 
>> The reason is:
>> When the vhost-user fail at get_features, the vhost_dev_cleanup will be called
>> immediately.
>> 
>> vhost_dev_cleanup calls 'memset(hdev, 0, sizeof(struct vhost_dev))'.
>> 
>> The reconnect path is:
>> vhost_user_blk_event
>>   vhost_user_async_close(.. vhost_user_blk_disconnect ..)
>>     qemu_chr_fe_set_handlers <----- clear the notifier callback
>>       schedule vhost_user_async_close_bh
>> 
>> The vhost->vdev is null, so the vhost_user_blk_disconnect will not be
>> called, then the event fd callback will not be reinstalled.
>> 
>> With this patch, the vhost_user_blk_disconnect will call the
>> vhost_dev_cleanup() again, it's safe.
>> 
>> In addition, the CLOSE event may occur in a scenario where connected is false.
>> At this time, the event handler will be cleared. We need to ensure that the
>> event handler can remain installed.
> 
> Following on from the prior patch, why would "connected" be false when
> a CLOSE event happens?

In the OPEN event handling, vhost_user_blk_connect calls vhost_dev_init and
hits an error, so s->connected remains false.
When the CLOSE event then arrives, s->connected is found to be false and
nothing is done, but the event handler has already been cleared in
`vhost_user_async_close` before the CLOSE callback runs.

Thanks,
Li

> [...]
Raphael Norwitz May 15, 2024, 3:47 p.m. UTC | #3
The case you're describing makes sense, but now I have some concerns about
the vhost_dev_cleanup bit.

On Wed, May 15, 2024 at 1:47 AM Li Feng <fengli@smartx.com> wrote:
>
>
>
> > On May 14, 2024, at 21:58, Raphael Norwitz <raphael@enfabrica.net> wrote:
> >
> > Code looks good. Just a question on the error case you're trying to fix.
> >
> > On Tue, May 14, 2024 at 2:12 AM Li Feng <fengli@smartx.com> wrote:
> >>
> >> When the vhost-user is reconnecting to the backend, and if the vhost-user fails
> >> at the get_features in vhost_dev_init(), then the reconnect will fail
> >> and it will not be retriggered forever.
> >>
> >> The reason is:
> >> When the vhost-user fail at get_features, the vhost_dev_cleanup will be called
> >> immediately.
> >>
> >> vhost_dev_cleanup calls 'memset(hdev, 0, sizeof(struct vhost_dev))'.
> >>
> >> The reconnect path is:
> >> vhost_user_blk_event
> >>   vhost_user_async_close(.. vhost_user_blk_disconnect ..)
> >>     qemu_chr_fe_set_handlers <----- clear the notifier callback
> >>       schedule vhost_user_async_close_bh
> >>
> >> The vhost->vdev is null, so the vhost_user_blk_disconnect will not be
> >> called, then the event fd callback will not be reinstalled.
> >>
> >> With this patch, the vhost_user_blk_disconnect will call the
> >> vhost_dev_cleanup() again, it's safe.
> >>
> >> In addition, the CLOSE event may occur in a scenario where connected is false.
> >> At this time, the event handler will be cleared. We need to ensure that the
> >> event handler can remain installed.
> >
> > Following on from the prior patch, why would "connected" be false when
> > a CLOSE event happens?
>
> In OPEN event handling, vhost_user_blk_connect calls vhost_dev_init and encounters
> an error such that s->connected remains false.
> Next, after the CLOSE event arrives, it is found that s->connected is false, so nothing
> is done, but the event handler will be cleaned up in `vhost_user_async_close`
> before the CLOSE event is executed.
>

Got it - I see why, in the code as it was before, the event handler is never
re-installed if we fail at get_features. That said, how do you explain your
comment:

> >> With this patch, the vhost_user_blk_disconnect will call the
> >> vhost_dev_cleanup() again, it's safe.

I see vhost_dev_cleanup() accessing hdev without even a NULL check. In the
case we're talking about here I don't think it's a problem, because if
vhost_dev_init() fails, connected will be false and we hit the goto, but I am
concerned that there could be double-frees or use-after-frees in other cases.

> [...]
Li Feng May 16, 2024, 2:48 a.m. UTC | #4
> On May 15, 2024, at 23:47, Raphael Norwitz <raphael@enfabrica.net> wrote:
> 
> The case your describing makes sense but now I have some concerns on
> the vhost_dev_cleanup bit.
> 
> On Wed, May 15, 2024 at 1:47 AM Li Feng <fengli@smartx.com> wrote:
>> 
>> 
>> 
>>> On May 14, 2024, at 21:58, Raphael Norwitz <raphael@enfabrica.net> wrote:
>>> 
>>> Code looks good. Just a question on the error case you're trying to fix.
>>> 
>>> On Tue, May 14, 2024 at 2:12 AM Li Feng <fengli@smartx.com> wrote:
>>>> 
>>>> When the vhost-user is reconnecting to the backend, and if the vhost-user fails
>>>> at the get_features in vhost_dev_init(), then the reconnect will fail
>>>> and it will not be retriggered forever.
>>>> 
>>>> The reason is:
>>>> When the vhost-user fail at get_features, the vhost_dev_cleanup will be called
>>>> immediately.
>>>> 
>>>> vhost_dev_cleanup calls 'memset(hdev, 0, sizeof(struct vhost_dev))'.
>>>> 
>>>> The reconnect path is:
>>>> vhost_user_blk_event
>>>>  vhost_user_async_close(.. vhost_user_blk_disconnect ..)
>>>>    qemu_chr_fe_set_handlers <----- clear the notifier callback
>>>>      schedule vhost_user_async_close_bh
>>>> 
>>>> The vhost->vdev is null, so the vhost_user_blk_disconnect will not be
>>>> called, then the event fd callback will not be reinstalled.
>>>> 
>>>> With this patch, the vhost_user_blk_disconnect will call the
>>>> vhost_dev_cleanup() again, it's safe.
>>>> 
>>>> In addition, the CLOSE event may occur in a scenario where connected is false.
>>>> At this time, the event handler will be cleared. We need to ensure that the
>>>> event handler can remain installed.
>>> 
>>> Following on from the prior patch, why would "connected" be false when
>>> a CLOSE event happens?
>> 
>> In OPEN event handling, vhost_user_blk_connect calls vhost_dev_init and encounters
>> an error such that s->connected remains false.
>> Next, after the CLOSE event arrives, it is found that s->connected is false, so nothing
>> is done, but the event handler will be cleaned up in `vhost_user_async_close`
>> before the CLOSE event is executed.
>> 
> 
> Got it - I see why the event handler is never re-installed in the code
> as it was before if we fail at get_features. That said, how do you
> explain your comment:

OK, I will update the commit message, since this code changed some months ago.

> 
>>>> With this patch, the vhost_user_blk_disconnect will call the
>>>> vhost_dev_cleanup() again, it's safe.
> 
> I see vhost_dev_cleanup() accessing hdev without even a NULL check. In
> the case we're talking about here I don't think it's a problem because
> if vhost_dev_init() fails, connected will be false and hit the goto
> but I am concerned that there could be double-frees or use-after-frees
> in other cases.

OK, you are right: with this patch, vhost_dev_cleanup will no longer be called
multiple times.

In any case, there is no need to worry about calling vhost_dev_cleanup
multiple times: historically it has been allowed to be called several times,
and looking at the code confirms that repeated calls are indeed safe.

Look this patch:

commit e0547b59dc0ead4c605d3f02d1c8829630a1311b
Author: Marc-André Lureau <marcandre.lureau@redhat.com>
Date:   Wed Jul 27 01:15:02 2016 +0400

    vhost: make vhost_dev_cleanup() idempotent

    It is called on multiple code path, so make it safe to call several
    times (note: I don't remember a reproducer here, but a function called
    'cleanup' should probably be idempotent in my book)

    Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
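
To make the idempotency argument concrete, here is a minimal, self-contained
sketch of the pattern that commit relies on (a hypothetical struct, not QEMU's
actual vhost_dev / vhost_dev_cleanup): because the cleanup ends by zeroing the
whole struct, a second call only sees NULL/0 fields and does nothing harmful.

    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical device state, standing in for struct vhost_dev. */
    struct dev_state {
        void *mem;
        int *vqs;
    };

    static void dev_cleanup(struct dev_state *d)
    {
        free(d->mem);              /* free(NULL) is a no-op */
        free(d->vqs);
        memset(d, 0, sizeof(*d));  /* later calls see only zeroed fields */
    }

    int main(void)
    {
        struct dev_state d = { malloc(16), malloc(4 * sizeof(int)) };

        dev_cleanup(&d);
        dev_cleanup(&d);           /* safe: no double free */
        return 0;
    }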

Thanks,
Li

> [...]
Raphael Norwitz May 16, 2024, 4:19 a.m. UTC | #5
OK - I'm happy with this approach then.

On Wed, May 15, 2024 at 10:48 PM Li Feng <fengli@smartx.com> wrote:
> [...]

Patch

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 41d1ac3a5a..c6842ced48 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -353,7 +353,7 @@  static void vhost_user_blk_disconnect(DeviceState *dev)
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
 
     if (!s->connected) {
-        return;
+        goto done;
     }
     s->connected = false;
 
@@ -361,6 +361,7 @@  static void vhost_user_blk_disconnect(DeviceState *dev)
 
     vhost_dev_cleanup(&s->dev);
 
+done:
     /* Re-instate the event handler for new connections */
     qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event,
                              NULL, dev, NULL, true);
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index 48a59e020e..b49a11d23b 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -181,7 +181,7 @@  static void vhost_user_scsi_disconnect(DeviceState *dev)
     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
 
     if (!s->connected) {
-        return;
+        goto done;
     }
     s->connected = false;
 
@@ -189,6 +189,7 @@  static void vhost_user_scsi_disconnect(DeviceState *dev)
 
     vhost_dev_cleanup(&vsc->dev);
 
+done:
     /* Re-instate the event handler for new connections */
     qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
                              vhost_user_scsi_event, NULL, dev, NULL, true);
diff --git a/hw/virtio/vhost-user-base.c b/hw/virtio/vhost-user-base.c
index 4b54255682..11e72b1e3b 100644
--- a/hw/virtio/vhost-user-base.c
+++ b/hw/virtio/vhost-user-base.c
@@ -225,13 +225,14 @@  static void vub_disconnect(DeviceState *dev)
     VHostUserBase *vub = VHOST_USER_BASE(vdev);
 
     if (!vub->connected) {
-        return;
+        goto done;
     }
     vub->connected = false;
 
     vub_stop(vdev);
     vhost_dev_cleanup(&vub->vhost_dev);
 
+done:
     /* Re-instate the event handler for new connections */
     qemu_chr_fe_set_handlers(&vub->chardev,
                              NULL, NULL, vub_event,
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index c929097e87..c407ea8939 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -2781,16 +2781,8 @@  typedef struct {
 static void vhost_user_async_close_bh(void *opaque)
 {
     VhostAsyncCallback *data = opaque;
-    struct vhost_dev *vhost = data->vhost;
 
-    /*
-     * If the vhost_dev has been cleared in the meantime there is
-     * nothing left to do as some other path has completed the
-     * cleanup.
-     */
-    if (vhost->vdev) {
-        data->cb(data->dev);
-    }
+    data->cb(data->dev);
 
     g_free(data);
 }