diff mbox series

ui/gtk: Wait until the current guest frame is rendered before switching to RUN_STATE_SAVE_VM

Message ID 20240529224252.80395-1-dongwon.kim@intel.com (mailing list archive)
State New
Headers show
Series ui/gtk: Wait until the current guest frame is rendered before switching to RUN_STATE_SAVE_VM | expand

Commit Message

Kim, Dongwon May 29, 2024, 10:42 p.m. UTC
From: Dongwon <dongwon.kim@intel.com>

Make sure rendering of the current frame is finished before switching
the run state to RUN_STATE_SAVE_VM by waiting for egl-sync object to be
signaled.

Cc: Marc-André Lureau <marcandre.lureau@redhat.com>
Cc: Vivek Kasireddy <vivek.kasireddy@intel.com>
Signed-off-by: Dongwon Kim <dongwon.kim@intel.com>
---
 ui/egl-helpers.c |  2 --
 ui/gtk.c         | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

Comments

Marc-André Lureau June 4, 2024, 11:12 a.m. UTC | #1
Hi

On Thu, May 30, 2024 at 2:44 AM <dongwon.kim@intel.com> wrote:

> From: Dongwon <dongwon.kim@intel.com>
>
> Make sure rendering of the current frame is finished before switching
> the run state to RUN_STATE_SAVE_VM by waiting for egl-sync object to be
> signaled.
>

Can you expand on what this solves?


>
> Cc: Marc-André Lureau <marcandre.lureau@redhat.com>
> Cc: Vivek Kasireddy <vivek.kasireddy@intel.com>
> Signed-off-by: Dongwon Kim <dongwon.kim@intel.com>
> ---
>  ui/egl-helpers.c |  2 --
>  ui/gtk.c         | 19 +++++++++++++++++++
>  2 files changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
> index 99b2ebbe23..dafeb36074 100644
> --- a/ui/egl-helpers.c
> +++ b/ui/egl-helpers.c
> @@ -396,8 +396,6 @@ void egl_dmabuf_create_fence(QemuDmaBuf *dmabuf)
>          fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
>                                                sync);
>          qemu_dmabuf_set_fence_fd(dmabuf, fence_fd);
> -        eglDestroySyncKHR(qemu_egl_display, sync);
> -        qemu_dmabuf_set_sync(dmabuf, NULL);
>

If this function is called multiple times, it will now set a new fence_fd
each time, and potentially leak older fd. Maybe it could first check if a
fence_fd exists instead.

     }
>  }
>
> diff --git a/ui/gtk.c b/ui/gtk.c
> index 93b13b7a30..cf0dd6abed 100644
> --- a/ui/gtk.c
> +++ b/ui/gtk.c
> @@ -600,9 +600,12 @@ void gd_hw_gl_flushed(void *vcon)
>
>      fence_fd = qemu_dmabuf_get_fence_fd(dmabuf);
>      if (fence_fd >= 0) {
> +        void *sync = qemu_dmabuf_get_sync(dmabuf);
>          qemu_set_fd_handler(fence_fd, NULL, NULL, NULL);
>          close(fence_fd);
>          qemu_dmabuf_set_fence_fd(dmabuf, -1);
> +        eglDestroySyncKHR(qemu_egl_display, sync);
> +        qemu_dmabuf_set_sync(dmabuf, NULL);
>          graphic_hw_gl_block(vc->gfx.dcl.con, false);
>      }
>  }
> @@ -682,6 +685,22 @@ static const DisplayGLCtxOps egl_ctx_ops = {
>  static void gd_change_runstate(void *opaque, bool running, RunState state)
>  {
>      GtkDisplayState *s = opaque;
> +    QemuDmaBuf *dmabuf;
> +    int i;
> +
> +    if (state == RUN_STATE_SAVE_VM) {
> +        for (i = 0; i < s->nb_vcs; i++) {
> +            VirtualConsole *vc = &s->vc[i];
> +            dmabuf = vc->gfx.guest_fb.dmabuf;
> +            if (dmabuf && qemu_dmabuf_get_fence_fd(dmabuf) >= 0) {
> +                /* wait for the rendering to be completed */
> +                eglClientWaitSync(qemu_egl_display,
> +                                  qemu_dmabuf_get_sync(dmabuf),
> +                                  EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
> +                                  1000000000);
>

 I don't think adding waiting points in the migration path is appropriate.
Perhaps once you explain the actual issue, it will be easier to help.

+            }
> +        }
> +    }
>
>      gd_update_caption(s);
>  }
> --
> 2.34.1
>
>
>
Kim, Dongwon June 4, 2024, 5:49 p.m. UTC | #2
On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
> Hi
> 
> On Thu, May 30, 2024 at 2:44 AM <dongwon.kim@intel.com 
> <mailto:dongwon.kim@intel.com>> wrote:
> 
>     From: Dongwon <dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>
> 
>     Make sure rendering of the current frame is finished before switching
>     the run state to RUN_STATE_SAVE_VM by waiting for egl-sync object to be
>     signaled.
> 
> 
> Can you expand on what this solves?

In current scheme, guest waits for the fence to be signaled for each 
frame it submits before moving to the next frame. If the guest’s state 
is saved while it is still waiting for the fence, The guest will 
continue to  wait for the fence that was signaled while ago when it is 
restored to the point. One way to prevent it is to get it finish the 
current frame before changing the state.

> 
> 
>     Cc: Marc-André Lureau <marcandre.lureau@redhat.com
>     <mailto:marcandre.lureau@redhat.com>>
>     Cc: Vivek Kasireddy <vivek.kasireddy@intel.com
>     <mailto:vivek.kasireddy@intel.com>>
>     Signed-off-by: Dongwon Kim <dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com>>
>     ---
>       ui/egl-helpers.c |  2 --
>       ui/gtk.c         | 19 +++++++++++++++++++
>       2 files changed, 19 insertions(+), 2 deletions(-)
> 
>     diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
>     index 99b2ebbe23..dafeb36074 100644
>     --- a/ui/egl-helpers.c
>     +++ b/ui/egl-helpers.c
>     @@ -396,8 +396,6 @@ void egl_dmabuf_create_fence(QemuDmaBuf *dmabuf)
>               fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
>                                                     sync);
>               qemu_dmabuf_set_fence_fd(dmabuf, fence_fd);
>     -        eglDestroySyncKHR(qemu_egl_display, sync);
>     -        qemu_dmabuf_set_sync(dmabuf, NULL);
> 
> 
> If this function is called multiple times, it will now set a new 
> fence_fd each time, and potentially leak older fd. Maybe it could first 
> check if a fence_fd exists instead.

We can make that change.

> 
>           }
>       }
> 
>     diff --git a/ui/gtk.c b/ui/gtk.c
>     index 93b13b7a30..cf0dd6abed 100644
>     --- a/ui/gtk.c
>     +++ b/ui/gtk.c
>     @@ -600,9 +600,12 @@ void gd_hw_gl_flushed(void *vcon)
> 
>           fence_fd = qemu_dmabuf_get_fence_fd(dmabuf);
>           if (fence_fd >= 0) {
>     +        void *sync = qemu_dmabuf_get_sync(dmabuf);
>               qemu_set_fd_handler(fence_fd, NULL, NULL, NULL);
>               close(fence_fd);
>               qemu_dmabuf_set_fence_fd(dmabuf, -1);
>     +        eglDestroySyncKHR(qemu_egl_display, sync);
>     +        qemu_dmabuf_set_sync(dmabuf, NULL);
>               graphic_hw_gl_block(vc->gfx.dcl.con, false);
>           }
>       }
>     @@ -682,6 +685,22 @@ static const DisplayGLCtxOps egl_ctx_ops = {
>       static void gd_change_runstate(void *opaque, bool running,
>     RunState state)
>       {
>           GtkDisplayState *s = opaque;
>     +    QemuDmaBuf *dmabuf;
>     +    int i;
>     +
>     +    if (state == RUN_STATE_SAVE_VM) {
>     +        for (i = 0; i < s->nb_vcs; i++) {
>     +            VirtualConsole *vc = &s->vc[i];
>     +            dmabuf = vc->gfx.guest_fb.dmabuf;
>     +            if (dmabuf && qemu_dmabuf_get_fence_fd(dmabuf) >= 0) {
>     +                /* wait for the rendering to be completed */
>     +                eglClientWaitSync(qemu_egl_display,
>     +                                  qemu_dmabuf_get_sync(dmabuf),
>     +                                  EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
>     +                                  1000000000);
> 
> 
>   I don't think adding waiting points in the migration path is 
> appropriate. Perhaps once you explain the actual issue, it will be 
> easier to help.
> 
>     +            }
>     +        }
>     +    }
> 
>           gd_update_caption(s);
>       }
>     -- 
>     2.34.1
> 
> 
> 
> 
> -- 
> Marc-André Lureau
Marc-André Lureau June 5, 2024, 7:55 a.m. UTC | #3
Hi

On Tue, Jun 4, 2024 at 9:49 PM Kim, Dongwon <dongwon.kim@intel.com> wrote:

> On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
> > Hi
> >
> > On Thu, May 30, 2024 at 2:44 AM <dongwon.kim@intel.com
> > <mailto:dongwon.kim@intel.com>> wrote:
> >
> >     From: Dongwon <dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>
> >
> >     Make sure rendering of the current frame is finished before switching
> >     the run state to RUN_STATE_SAVE_VM by waiting for egl-sync object to
> be
> >     signaled.
> >
> >
> > Can you expand on what this solves?
>
> In current scheme, guest waits for the fence to be signaled for each
> frame it submits before moving to the next frame. If the guest’s state
> is saved while it is still waiting for the fence, The guest will
> continue to  wait for the fence that was signaled while ago when it is
> restored to the point. One way to prevent it is to get it finish the
> current frame before changing the state.
>

After the UI sets a fence, hw_ops->gl_block(true) gets called, which will
block virtio-gpu/virgl from processing commands (until the fence is
signaled and gl_block/false called again).

But this "blocking" state is not saved. So how does this affect
save/restore? Please give more details, thanks


> >
> >
> >     Cc: Marc-André Lureau <marcandre.lureau@redhat.com
> >     <mailto:marcandre.lureau@redhat.com>>
> >     Cc: Vivek Kasireddy <vivek.kasireddy@intel.com
> >     <mailto:vivek.kasireddy@intel.com>>
> >     Signed-off-by: Dongwon Kim <dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com>>
> >     ---
> >       ui/egl-helpers.c |  2 --
> >       ui/gtk.c         | 19 +++++++++++++++++++
> >       2 files changed, 19 insertions(+), 2 deletions(-)
> >
> >     diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
> >     index 99b2ebbe23..dafeb36074 100644
> >     --- a/ui/egl-helpers.c
> >     +++ b/ui/egl-helpers.c
> >     @@ -396,8 +396,6 @@ void egl_dmabuf_create_fence(QemuDmaBuf *dmabuf)
> >               fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
> >                                                     sync);
> >               qemu_dmabuf_set_fence_fd(dmabuf, fence_fd);
> >     -        eglDestroySyncKHR(qemu_egl_display, sync);
> >     -        qemu_dmabuf_set_sync(dmabuf, NULL);
> >
> >
> > If this function is called multiple times, it will now set a new
> > fence_fd each time, and potentially leak older fd. Maybe it could first
> > check if a fence_fd exists instead.
>
> We can make that change.
>
> >
> >           }
> >       }
> >
> >     diff --git a/ui/gtk.c b/ui/gtk.c
> >     index 93b13b7a30..cf0dd6abed 100644
> >     --- a/ui/gtk.c
> >     +++ b/ui/gtk.c
> >     @@ -600,9 +600,12 @@ void gd_hw_gl_flushed(void *vcon)
> >
> >           fence_fd = qemu_dmabuf_get_fence_fd(dmabuf);
> >           if (fence_fd >= 0) {
> >     +        void *sync = qemu_dmabuf_get_sync(dmabuf);
> >               qemu_set_fd_handler(fence_fd, NULL, NULL, NULL);
> >               close(fence_fd);
> >               qemu_dmabuf_set_fence_fd(dmabuf, -1);
> >     +        eglDestroySyncKHR(qemu_egl_display, sync);
> >     +        qemu_dmabuf_set_sync(dmabuf, NULL);
> >               graphic_hw_gl_block(vc->gfx.dcl.con, false);
> >           }
> >       }
> >     @@ -682,6 +685,22 @@ static const DisplayGLCtxOps egl_ctx_ops = {
> >       static void gd_change_runstate(void *opaque, bool running,
> >     RunState state)
> >       {
> >           GtkDisplayState *s = opaque;
> >     +    QemuDmaBuf *dmabuf;
> >     +    int i;
> >     +
> >     +    if (state == RUN_STATE_SAVE_VM) {
> >     +        for (i = 0; i < s->nb_vcs; i++) {
> >     +            VirtualConsole *vc = &s->vc[i];
> >     +            dmabuf = vc->gfx.guest_fb.dmabuf;
> >     +            if (dmabuf && qemu_dmabuf_get_fence_fd(dmabuf) >= 0) {
> >     +                /* wait for the rendering to be completed */
> >     +                eglClientWaitSync(qemu_egl_display,
> >     +                                  qemu_dmabuf_get_sync(dmabuf),
> >     +                                  EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
> >     +                                  1000000000);
> >
> >
> >   I don't think adding waiting points in the migration path is
> > appropriate. Perhaps once you explain the actual issue, it will be
> > easier to help.
> >
> >     +            }
> >     +        }
> >     +    }
> >
> >           gd_update_caption(s);
> >       }
> >     --
> >     2.34.1
> >
> >
> >
> >
> > --
> > Marc-André Lureau
>
>
Kim, Dongwon June 12, 2024, 1:29 a.m. UTC | #4
Hi, 

From: Marc-André Lureau <marcandre.lureau@gmail.com> 
Sent: Wednesday, June 5, 2024 12:56 AM
To: Kim, Dongwon <dongwon.kim@intel.com>
Cc: qemu-devel@nongnu.org; Peter Xu <peterx@redhat.com>
Subject: Re: [PATCH] ui/gtk: Wait until the current guest frame is rendered before switching to RUN_STATE_SAVE_VM

Hi

On Tue, Jun 4, 2024 at 9:49 PM Kim, Dongwon <mailto:dongwon.kim@intel.com> wrote:
On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
> Hi
> 
> On Thu, May 30, 2024 at 2:44 AM <mailto:dongwon.kim@intel.com 
> <mailto:mailto:dongwon.kim@intel.com>> wrote:
> 
>     From: Dongwon <mailto:dongwon.kim@intel.com <mailto:mailto:dongwon.kim@intel.com>>
> 
>     Make sure rendering of the current frame is finished before switching
>     the run state to RUN_STATE_SAVE_VM by waiting for egl-sync object to be
>     signaled.
> 
> 
> Can you expand on what this solves?

In current scheme, guest waits for the fence to be signaled for each 
frame it submits before moving to the next frame. If the guest’s state 
is saved while it is still waiting for the fence, The guest will 
continue to  wait for the fence that was signaled while ago when it is 
restored to the point. One way to prevent it is to get it finish the 
current frame before changing the state.

After the UI sets a fence, hw_ops->gl_block(true) gets called, which will block virtio-gpu/virgl from processing commands (until the fence is signaled and gl_block/false called again).

But this "blocking" state is not saved. So how does this affect save/restore? Please give more details, thanks

Yeah sure. "Blocking" state is not saved but guest's state is saved while it was still waiting for the response for its last resource-flush virtio msg. This virtio response, by the way is set to be sent to the guest when the pipeline is unblocked (and when the fence is signaled.). Once the guest's state is saved, current instance of guest will be continued and receives the response as usual. The problem is happening when we restore the saved guest's state again because what guest does will be waiting for the response that was sent a while ago to the original instance.

> 
> 
>     Cc: Marc-André Lureau <mailto:marcandre.lureau@redhat.com
>     <mailto:mailto:marcandre.lureau@redhat.com>>
>     Cc: Vivek Kasireddy <mailto:vivek.kasireddy@intel.com
>     <mailto:mailto:vivek.kasireddy@intel.com>>
>     Signed-off-by: Dongwon Kim <mailto:dongwon.kim@intel.com
>     <mailto:mailto:dongwon.kim@intel.com>>
>     ---
>       ui/egl-helpers.c |  2 --
>       ui/gtk.c         | 19 +++++++++++++++++++
>       2 files changed, 19 insertions(+), 2 deletions(-)
> 
>     diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
>     index 99b2ebbe23..dafeb36074 100644
>     --- a/ui/egl-helpers.c
>     +++ b/ui/egl-helpers.c
>     @@ -396,8 +396,6 @@ void egl_dmabuf_create_fence(QemuDmaBuf *dmabuf)
>               fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
>                                                     sync);
>               qemu_dmabuf_set_fence_fd(dmabuf, fence_fd);
>     -        eglDestroySyncKHR(qemu_egl_display, sync);
>     -        qemu_dmabuf_set_sync(dmabuf, NULL);
> 
> 
> If this function is called multiple times, it will now set a new 
> fence_fd each time, and potentially leak older fd. Maybe it could first 
> check if a fence_fd exists instead.

We can make that change.

> 
>           }
>       }
> 
>     diff --git a/ui/gtk.c b/ui/gtk.c
>     index 93b13b7a30..cf0dd6abed 100644
>     --- a/ui/gtk.c
>     +++ b/ui/gtk.c
>     @@ -600,9 +600,12 @@ void gd_hw_gl_flushed(void *vcon)
> 
>           fence_fd = qemu_dmabuf_get_fence_fd(dmabuf);
>           if (fence_fd >= 0) {
>     +        void *sync = qemu_dmabuf_get_sync(dmabuf);
>               qemu_set_fd_handler(fence_fd, NULL, NULL, NULL);
>               close(fence_fd);
>               qemu_dmabuf_set_fence_fd(dmabuf, -1);
>     +        eglDestroySyncKHR(qemu_egl_display, sync);
>     +        qemu_dmabuf_set_sync(dmabuf, NULL);
>               graphic_hw_gl_block(vc->gfx.dcl.con, false);
>           }
>       }
>     @@ -682,6 +685,22 @@ static const DisplayGLCtxOps egl_ctx_ops = {
>       static void gd_change_runstate(void *opaque, bool running,
>     RunState state)
>       {
>           GtkDisplayState *s = opaque;
>     +    QemuDmaBuf *dmabuf;
>     +    int i;
>     +
>     +    if (state == RUN_STATE_SAVE_VM) {
>     +        for (i = 0; i < s->nb_vcs; i++) {
>     +            VirtualConsole *vc = &s->vc[i];
>     +            dmabuf = vc->gfx.guest_fb.dmabuf;
>     +            if (dmabuf && qemu_dmabuf_get_fence_fd(dmabuf) >= 0) {
>     +                /* wait for the rendering to be completed */
>     +                eglClientWaitSync(qemu_egl_display,
>     +                                  qemu_dmabuf_get_sync(dmabuf),
>     +                                  EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
>     +                                  1000000000);
> 
> 
>   I don't think adding waiting points in the migration path is 
> appropriate. Perhaps once you explain the actual issue, it will be 
> easier to help.
> 
>     +            }
>     +        }
>     +    }
> 
>           gd_update_caption(s);
>       }
>     -- 
>     2.34.1
> 
> 
> 
> 
> -- 
> Marc-André Lureau
Marc-André Lureau June 12, 2024, 5:44 a.m. UTC | #5
Hi

On Wed, Jun 12, 2024 at 5:29 AM Kim, Dongwon <dongwon.kim@intel.com> wrote:

> Hi,
>
> From: Marc-André Lureau <marcandre.lureau@gmail.com>
> Sent: Wednesday, June 5, 2024 12:56 AM
> To: Kim, Dongwon <dongwon.kim@intel.com>
> Cc: qemu-devel@nongnu.org; Peter Xu <peterx@redhat.com>
> Subject: Re: [PATCH] ui/gtk: Wait until the current guest frame is
> rendered before switching to RUN_STATE_SAVE_VM
>
> Hi
>
> On Tue, Jun 4, 2024 at 9:49 PM Kim, Dongwon <mailto:dongwon.kim@intel.com>
> wrote:
> On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
> > Hi
> >
> > On Thu, May 30, 2024 at 2:44 AM <mailto:dongwon.kim@intel.com
> > <mailto:mailto:dongwon.kim@intel.com>> wrote:
> >
> >     From: Dongwon <mailto:dongwon.kim@intel.com <mailto:mailto:
> dongwon.kim@intel.com>>
> >
> >     Make sure rendering of the current frame is finished before switching
> >     the run state to RUN_STATE_SAVE_VM by waiting for egl-sync object to
> be
> >     signaled.
> >
> >
> > Can you expand on what this solves?
>
> In current scheme, guest waits for the fence to be signaled for each
> frame it submits before moving to the next frame. If the guest’s state
> is saved while it is still waiting for the fence, The guest will
> continue to  wait for the fence that was signaled while ago when it is
> restored to the point. One way to prevent it is to get it finish the
> current frame before changing the state.
>
> After the UI sets a fence, hw_ops->gl_block(true) gets called, which will
> block virtio-gpu/virgl from processing commands (until the fence is
> signaled and gl_block/false called again).
>
> But this "blocking" state is not saved. So how does this affect
> save/restore? Please give more details, thanks
>
> Yeah sure. "Blocking" state is not saved but guest's state is saved while
> it was still waiting for the response for its last resource-flush virtio
> msg. This virtio response, by the way is set to be sent to the guest when
> the pipeline is unblocked (and when the fence is signaled.). Once the
> guest's state is saved, current instance of guest will be continued and
> receives the response as usual. The problem is happening when we restore
> the saved guest's state again because what guest does will be waiting for
> the response that was sent a while ago to the original instance.
>

Where is the pending response saved? Can you detail how you test this?

thanks
Kim, Dongwon June 12, 2024, 6:50 p.m. UTC | #6
On 6/11/2024 10:44 PM, Marc-André Lureau wrote:
> Hi
> 
> On Wed, Jun 12, 2024 at 5:29 AM Kim, Dongwon <dongwon.kim@intel.com 
> <mailto:dongwon.kim@intel.com>> wrote:
> 
>     Hi,
> 
>     From: Marc-André Lureau <marcandre.lureau@gmail.com
>     <mailto:marcandre.lureau@gmail.com>>
>     Sent: Wednesday, June 5, 2024 12:56 AM
>     To: Kim, Dongwon <dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>
>     Cc: qemu-devel@nongnu.org <mailto:qemu-devel@nongnu.org>; Peter Xu
>     <peterx@redhat.com <mailto:peterx@redhat.com>>
>     Subject: Re: [PATCH] ui/gtk: Wait until the current guest frame is
>     rendered before switching to RUN_STATE_SAVE_VM
> 
>     Hi
> 
>     On Tue, Jun 4, 2024 at 9:49 PM Kim, Dongwon
>     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>> wrote:
>     On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
>      > Hi
>      >
>      > On Thu, May 30, 2024 at 2:44 AM <mailto:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com>
>      > <mailto:mailto <mailto:mailto>:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com>>> wrote:
>      >
>      >     From: Dongwon <mailto:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com> <mailto:mailto
>     <mailto:mailto>:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>>
>      >
>      >     Make sure rendering of the current frame is finished before
>     switching
>      >     the run state to RUN_STATE_SAVE_VM by waiting for egl-sync
>     object to be
>      >     signaled.
>      >
>      >
>      > Can you expand on what this solves?
> 
>     In current scheme, guest waits for the fence to be signaled for each
>     frame it submits before moving to the next frame. If the guest’s state
>     is saved while it is still waiting for the fence, The guest will
>     continue to  wait for the fence that was signaled while ago when it is
>     restored to the point. One way to prevent it is to get it finish the
>     current frame before changing the state.
> 
>     After the UI sets a fence, hw_ops->gl_block(true) gets called, which
>     will block virtio-gpu/virgl from processing commands (until the
>     fence is signaled and gl_block/false called again).
> 
>     But this "blocking" state is not saved. So how does this affect
>     save/restore? Please give more details, thanks
> 
>     Yeah sure. "Blocking" state is not saved but guest's state is saved
>     while it was still waiting for the response for its last
>     resource-flush virtio msg. This virtio response, by the way is set
>     to be sent to the guest when the pipeline is unblocked (and when the
>     fence is signaled.). Once the guest's state is saved, current
>     instance of guest will be continued and receives the response as
>     usual. The problem is happening when we restore the saved guest's
>     state again because what guest does will be waiting for the response
>     that was sent a while ago to the original instance.
> 
> 
> Where is the pending response saved? Can you detail how you test this?
> 

There is no pending response for the guest's restored point, which is a 
problem. The response is sent out after saving is done.

Normal cycle :

resource-flush (scanout flush) -> gl block -> render -> gl unblock 
(after fence is signaled) -> pending response sent out to the guest -> 
guest (virtio-gpu drv) processes the next scanout frame -> (next cycle) 
resource-flush -> gl block ......

When vm state is saved in the middle :

resource-flush (scanout-flush) -> gl block -> saving vm-state -> render 
-> gl unblock -> pending response (resp #1) sent out to the guest -> 
guest (virtio-gpu drv) processes the next scanout frame -> (next cycle) 
resource-flush -> gl block ......

Now, we restore the vm-state we saved

vm-state is restored -> guest (virtio-gpu drv) can't move on as this 
state is still waiting for the response (resp #1)

So we need to make sure vm-state is saved after the cycle is completed.

This situation would be only happening if you use blob=true with 
virtio-gpu drv as KMS on the linux guest. Do you have any similar setup?

> thanks
> 
> -- 
> Marc-André Lureau
Marc-André Lureau June 13, 2024, 1:16 p.m. UTC | #7
Hi

On Wed, Jun 12, 2024 at 10:50 PM Kim, Dongwon <dongwon.kim@intel.com> wrote:

> On 6/11/2024 10:44 PM, Marc-André Lureau wrote:
> > Hi
> >
> > On Wed, Jun 12, 2024 at 5:29 AM Kim, Dongwon <dongwon.kim@intel.com
> > <mailto:dongwon.kim@intel.com>> wrote:
> >
> >     Hi,
> >
> >     From: Marc-André Lureau <marcandre.lureau@gmail.com
> >     <mailto:marcandre.lureau@gmail.com>>
> >     Sent: Wednesday, June 5, 2024 12:56 AM
> >     To: Kim, Dongwon <dongwon.kim@intel.com <mailto:
> dongwon.kim@intel.com>>
> >     Cc: qemu-devel@nongnu.org <mailto:qemu-devel@nongnu.org>; Peter Xu
> >     <peterx@redhat.com <mailto:peterx@redhat.com>>
> >     Subject: Re: [PATCH] ui/gtk: Wait until the current guest frame is
> >     rendered before switching to RUN_STATE_SAVE_VM
> >
> >     Hi
> >
> >     On Tue, Jun 4, 2024 at 9:49 PM Kim, Dongwon
> >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>> wrote:
> >     On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
> >      > Hi
> >      >
> >      > On Thu, May 30, 2024 at 2:44 AM <mailto:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com>
> >      > <mailto:mailto <mailto:mailto>:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com>>> wrote:
> >      >
> >      >     From: Dongwon <mailto:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com> <mailto:mailto
> >     <mailto:mailto>:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com
> >>>
> >      >
> >      >     Make sure rendering of the current frame is finished before
> >     switching
> >      >     the run state to RUN_STATE_SAVE_VM by waiting for egl-sync
> >     object to be
> >      >     signaled.
> >      >
> >      >
> >      > Can you expand on what this solves?
> >
> >     In current scheme, guest waits for the fence to be signaled for each
> >     frame it submits before moving to the next frame. If the guest’s
> state
> >     is saved while it is still waiting for the fence, The guest will
> >     continue to  wait for the fence that was signaled while ago when it
> is
> >     restored to the point. One way to prevent it is to get it finish the
> >     current frame before changing the state.
> >
> >     After the UI sets a fence, hw_ops->gl_block(true) gets called, which
> >     will block virtio-gpu/virgl from processing commands (until the
> >     fence is signaled and gl_block/false called again).
> >
> >     But this "blocking" state is not saved. So how does this affect
> >     save/restore? Please give more details, thanks
> >
> >     Yeah sure. "Blocking" state is not saved but guest's state is saved
> >     while it was still waiting for the response for its last
> >     resource-flush virtio msg. This virtio response, by the way is set
> >     to be sent to the guest when the pipeline is unblocked (and when the
> >     fence is signaled.). Once the guest's state is saved, current
> >     instance of guest will be continued and receives the response as
> >     usual. The problem is happening when we restore the saved guest's
> >     state again because what guest does will be waiting for the response
> >     that was sent a while ago to the original instance.
> >
> >
> > Where is the pending response saved? Can you detail how you test this?
> >
>
> There is no pending response for the guest's restored point, which is a
> problem. The response is sent out after saving is done.
>
> Normal cycle :
>
> resource-flush (scanout flush) -> gl block -> render -> gl unblock
> (after fence is signaled) -> pending response sent out to the guest ->
> guest (virtio-gpu drv) processes the next scanout frame -> (next cycle)
> resource-flush -> gl block ......
>
> When vm state is saved in the middle :
>
> resource-flush (scanout-flush) -> gl block -> saving vm-state -> render
> -> gl unblock -> pending response (resp #1) sent out to the guest ->
> guest (virtio-gpu drv) processes the next scanout frame -> (next cycle)
> resource-flush -> gl block ......
>
> Now, we restore the vm-state we saved
>
> vm-state is restored -> guest (virtio-gpu drv) can't move on as this
> state is still waiting for the response (resp #1)
>

Ok, so actually it's more of a device state issue than a UI/GTK. We end up
not saving a state that reflects the guest state. My understanding is that
the guest is waiting for a fence reply, and we don't save that. Imho, a
better fix would be to either save the fenceq (but then, what else is
missing to complete the operation on resume?), or have a wait to delay the
migration until the fences are flushed.


> So we need to make sure vm-state is saved after the cycle is completed.
>
> This situation would be only happening if you use blob=true with
> virtio-gpu drv as KMS on the linux guest. Do you have any similar setup?
>
>
No, further details to reproduce would help. Even better would be having
some automated test.
Kim, Dongwon June 13, 2024, 5:27 p.m. UTC | #8
Hi Marc-André,

On 6/13/2024 6:16 AM, Marc-André Lureau wrote:
> Hi
> 
> On Wed, Jun 12, 2024 at 10:50 PM Kim, Dongwon <dongwon.kim@intel.com 
> <mailto:dongwon.kim@intel.com>> wrote:
> 
>     On 6/11/2024 10:44 PM, Marc-André Lureau wrote:
>      > Hi
>      >
>      > On Wed, Jun 12, 2024 at 5:29 AM Kim, Dongwon
>     <dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>
>      > <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>> wrote:
>      >
>      >     Hi,
>      >
>      >     From: Marc-André Lureau <marcandre.lureau@gmail.com
>     <mailto:marcandre.lureau@gmail.com>
>      >     <mailto:marcandre.lureau@gmail.com
>     <mailto:marcandre.lureau@gmail.com>>>
>      >     Sent: Wednesday, June 5, 2024 12:56 AM
>      >     To: Kim, Dongwon <dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com> <mailto:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com>>>
>      >     Cc: qemu-devel@nongnu.org <mailto:qemu-devel@nongnu.org>
>     <mailto:qemu-devel@nongnu.org <mailto:qemu-devel@nongnu.org>>; Peter Xu
>      >     <peterx@redhat.com <mailto:peterx@redhat.com>
>     <mailto:peterx@redhat.com <mailto:peterx@redhat.com>>>
>      >     Subject: Re: [PATCH] ui/gtk: Wait until the current guest
>     frame is
>      >     rendered before switching to RUN_STATE_SAVE_VM
>      >
>      >     Hi
>      >
>      >     On Tue, Jun 4, 2024 at 9:49 PM Kim, Dongwon
>      >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>
>     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>> wrote:
>      >     On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
>      >      > Hi
>      >      >
>      >      > On Thu, May 30, 2024 at 2:44 AM
>     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>
>      >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>
>      >      > <mailto:mailto <mailto:mailto> <mailto:mailto
>     <mailto:mailto>>:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>
>      >     <mailto:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com>>>> wrote:
>      >      >
>      >      >     From: Dongwon <mailto:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com>
>      >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>
>     <mailto:mailto <mailto:mailto>
>      >     <mailto:mailto <mailto:mailto>>:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com> <mailto:dongwon.kim@intel.com
>     <mailto:dongwon.kim@intel.com>>>>
>      >      >
>      >      >     Make sure rendering of the current frame is finished
>     before
>      >     switching
>      >      >     the run state to RUN_STATE_SAVE_VM by waiting for egl-sync
>      >     object to be
>      >      >     signaled.
>      >      >
>      >      >
>      >      > Can you expand on what this solves?
>      >
>      >     In current scheme, guest waits for the fence to be signaled
>     for each
>      >     frame it submits before moving to the next frame. If the
>     guest’s state
>      >     is saved while it is still waiting for the fence, The guest will
>      >     continue to  wait for the fence that was signaled while ago
>     when it is
>      >     restored to the point. One way to prevent it is to get it
>     finish the
>      >     current frame before changing the state.
>      >
>      >     After the UI sets a fence, hw_ops->gl_block(true) gets
>     called, which
>      >     will block virtio-gpu/virgl from processing commands (until the
>      >     fence is signaled and gl_block/false called again).
>      >
>      >     But this "blocking" state is not saved. So how does this affect
>      >     save/restore? Please give more details, thanks
>      >
>      >     Yeah sure. "Blocking" state is not saved but guest's state is
>     saved
>      >     while it was still waiting for the response for its last
>      >     resource-flush virtio msg. This virtio response, by the way
>     is set
>      >     to be sent to the guest when the pipeline is unblocked (and
>     when the
>      >     fence is signaled.). Once the guest's state is saved, current
>      >     instance of guest will be continued and receives the response as
>      >     usual. The problem is happening when we restore the saved guest's
>      >     state again because what guest does will be waiting for the
>     response
>      >     that was sent a while ago to the original instance.
>      >
>      >
>      > Where is the pending response saved? Can you detail how you test
>     this?
>      >
> 
>     There is no pending response for the guest's restored point, which is a
>     problem. The response is sent out after saving is done.
> 
>     Normal cycle :
> 
>     resource-flush (scanout flush) -> gl block -> render -> gl unblock
>     (after fence is signaled) -> pending response sent out to the guest ->
>     guest (virtio-gpu drv) processes the next scanout frame -> (next cycle)
>     resource-flush -> gl block ......
> 
>     When vm state is saved in the middle :
> 
>     resource-flush (scanout-flush) -> gl block -> saving vm-state -> render
>     -> gl unblock -> pending response (resp #1) sent out to the guest ->
>     guest (virtio-gpu drv) processes the next scanout frame -> (next cycle)
>     resource-flush -> gl block ......
> 
>     Now, we restore the vm-state we saved
> 
>     vm-state is restored -> guest (virtio-gpu drv) can't move on as this
>     state is still waiting for the response (resp #1)
> 
> 
> Ok, so actually it's more of a device state issue than a UI/GTK. We end 
> up not saving a state that reflects the guest state. My understanding is 
> that the guest is waiting for a fence reply, and we don't save that. 
> Imho, a better fix would be to either save the fenceq (but then, what 
> else is missing to complete the operation on resume?), or have a wait to 
> delay the migration until the fences are flushed.

The second method you are proposing here - 'have a wait'. I understand 
you mean delaying the start point of migration but don't you think the 
current patch is basically doing the similar thing? Assuming egl wait 
sync is what we need to use for a wait, do you have any suggestion where 
that should be called other than 'gd_change_runstate'?

> 
> 
>     So we need to make sure vm-state is saved after the cycle is completed.
> 
>     This situation would be only happening if you use blob=true with
>     virtio-gpu drv as KMS on the linux guest. Do you have any similar setup?
> 
> 
> No, further details to reproduce would help. Even better would be having 
> some automated test.

I will think about this. We use GPU shared via SRIOV as a GPU deviceand 
virtio-gpu as a display device on the guest. I think I need to find and 
test more general cases.

> 
> 
> -- 
> Marc-André Lureau
Marc-André Lureau June 14, 2024, 9:25 a.m. UTC | #9
Hi

On Thu, Jun 13, 2024 at 9:27 PM Kim, Dongwon <dongwon.kim@intel.com> wrote:

> Hi Marc-André,
>
> On 6/13/2024 6:16 AM, Marc-André Lureau wrote:
> > Hi
> >
> > On Wed, Jun 12, 2024 at 10:50 PM Kim, Dongwon <dongwon.kim@intel.com
> > <mailto:dongwon.kim@intel.com>> wrote:
> >
> >     On 6/11/2024 10:44 PM, Marc-André Lureau wrote:
> >      > Hi
> >      >
> >      > On Wed, Jun 12, 2024 at 5:29 AM Kim, Dongwon
> >     <dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>
> >      > <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>>
> wrote:
> >      >
> >      >     Hi,
> >      >
> >      >     From: Marc-André Lureau <marcandre.lureau@gmail.com
> >     <mailto:marcandre.lureau@gmail.com>
> >      >     <mailto:marcandre.lureau@gmail.com
> >     <mailto:marcandre.lureau@gmail.com>>>
> >      >     Sent: Wednesday, June 5, 2024 12:56 AM
> >      >     To: Kim, Dongwon <dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com> <mailto:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com>>>
> >      >     Cc: qemu-devel@nongnu.org <mailto:qemu-devel@nongnu.org>
> >     <mailto:qemu-devel@nongnu.org <mailto:qemu-devel@nongnu.org>>;
> Peter Xu
> >      >     <peterx@redhat.com <mailto:peterx@redhat.com>
> >     <mailto:peterx@redhat.com <mailto:peterx@redhat.com>>>
> >      >     Subject: Re: [PATCH] ui/gtk: Wait until the current guest
> >     frame is
> >      >     rendered before switching to RUN_STATE_SAVE_VM
> >      >
> >      >     Hi
> >      >
> >      >     On Tue, Jun 4, 2024 at 9:49 PM Kim, Dongwon
> >      >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>
> >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>>
> wrote:
> >      >     On 6/4/2024 4:12 AM, Marc-André Lureau wrote:
> >      >      > Hi
> >      >      >
> >      >      > On Thu, May 30, 2024 at 2:44 AM
> >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>
> >      >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>
> >      >      > <mailto:mailto <mailto:mailto> <mailto:mailto
> >     <mailto:mailto>>:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com
> >
> >      >     <mailto:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com>>>> wrote:
> >      >      >
> >      >      >     From: Dongwon <mailto:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com>
> >      >     <mailto:dongwon.kim@intel.com <mailto:dongwon.kim@intel.com>>
> >     <mailto:mailto <mailto:mailto>
> >      >     <mailto:mailto <mailto:mailto>>:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com> <mailto:dongwon.kim@intel.com
> >     <mailto:dongwon.kim@intel.com>>>>
> >      >      >
> >      >      >     Make sure rendering of the current frame is finished
> >     before
> >      >     switching
> >      >      >     the run state to RUN_STATE_SAVE_VM by waiting for
> egl-sync
> >      >     object to be
> >      >      >     signaled.
> >      >      >
> >      >      >
> >      >      > Can you expand on what this solves?
> >      >
> >      >     In current scheme, guest waits for the fence to be signaled
> >     for each
> >      >     frame it submits before moving to the next frame. If the
> >     guest’s state
> >      >     is saved while it is still waiting for the fence, The guest
> will
> >      >     continue to  wait for the fence that was signaled while ago
> >     when it is
> >      >     restored to the point. One way to prevent it is to get it
> >     finish the
> >      >     current frame before changing the state.
> >      >
> >      >     After the UI sets a fence, hw_ops->gl_block(true) gets
> >     called, which
> >      >     will block virtio-gpu/virgl from processing commands (until
> the
> >      >     fence is signaled and gl_block/false called again).
> >      >
> >      >     But this "blocking" state is not saved. So how does this
> affect
> >      >     save/restore? Please give more details, thanks
> >      >
> >      >     Yeah sure. "Blocking" state is not saved but guest's state is
> >     saved
> >      >     while it was still waiting for the response for its last
> >      >     resource-flush virtio msg. This virtio response, by the way
> >     is set
> >      >     to be sent to the guest when the pipeline is unblocked (and
> >     when the
> >      >     fence is signaled.). Once the guest's state is saved, current
> >      >     instance of guest will be continued and receives the response
> as
> >      >     usual. The problem is happening when we restore the saved
> guest's
> >      >     state again because what guest does will be waiting for the
> >     response
> >      >     that was sent a while ago to the original instance.
> >      >
> >      >
> >      > Where is the pending response saved? Can you detail how you test
> >     this?
> >      >
> >
> >     There is no pending response for the guest's restored point, which
> is a
> >     problem. The response is sent out after saving is done.
> >
> >     Normal cycle :
> >
> >     resource-flush (scanout flush) -> gl block -> render -> gl unblock
> >     (after fence is signaled) -> pending response sent out to the guest
> ->
> >     guest (virtio-gpu drv) processes the next scanout frame -> (next
> cycle)
> >     resource-flush -> gl block ......
> >
> >     When vm state is saved in the middle :
> >
> >     resource-flush (scanout-flush) -> gl block -> saving vm-state ->
> render
> >     -> gl unblock -> pending response (resp #1) sent out to the guest ->
> >     guest (virtio-gpu drv) processes the next scanout frame -> (next
> cycle)
> >     resource-flush -> gl block ......
> >
> >     Now, we restore the vm-state we saved
> >
> >     vm-state is restored -> guest (virtio-gpu drv) can't move on as this
> >     state is still waiting for the response (resp #1)
> >
> >
> > Ok, so actually it's more of a device state issue than a UI/GTK. We end
> > up not saving a state that reflects the guest state. My understanding is
> > that the guest is waiting for a fence reply, and we don't save that.
> > Imho, a better fix would be to either save the fenceq (but then, what
> > else is missing to complete the operation on resume?), or have a wait to
> > delay the migration until the fences are flushed.
>
> The second method you are proposing here - 'have a wait'. I understand
> you mean delaying the start point of migration but don't you think the
> current patch is basically doing the similar thing? Assuming egl wait
> sync is what we need to use for a wait, do you have any suggestion where
> that should be called other than 'gd_change_runstate'?
>

It should be handled at virtio-gpu side. I am not sure if runstate handler
or pre_save are the right place.

Peter, what is the correct way to delay migration until the host finishes
some work? (in this case we would need to wait for the rendering/UI, to
signal pending fences)

thanks
diff mbox series

Patch

diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
index 99b2ebbe23..dafeb36074 100644
--- a/ui/egl-helpers.c
+++ b/ui/egl-helpers.c
@@ -396,8 +396,6 @@  void egl_dmabuf_create_fence(QemuDmaBuf *dmabuf)
         fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
                                               sync);
         qemu_dmabuf_set_fence_fd(dmabuf, fence_fd);
-        eglDestroySyncKHR(qemu_egl_display, sync);
-        qemu_dmabuf_set_sync(dmabuf, NULL);
     }
 }
 
diff --git a/ui/gtk.c b/ui/gtk.c
index 93b13b7a30..cf0dd6abed 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -600,9 +600,12 @@  void gd_hw_gl_flushed(void *vcon)
 
     fence_fd = qemu_dmabuf_get_fence_fd(dmabuf);
     if (fence_fd >= 0) {
+        void *sync = qemu_dmabuf_get_sync(dmabuf);
         qemu_set_fd_handler(fence_fd, NULL, NULL, NULL);
         close(fence_fd);
         qemu_dmabuf_set_fence_fd(dmabuf, -1);
+        eglDestroySyncKHR(qemu_egl_display, sync);
+        qemu_dmabuf_set_sync(dmabuf, NULL);
         graphic_hw_gl_block(vc->gfx.dcl.con, false);
     }
 }
@@ -682,6 +685,22 @@  static const DisplayGLCtxOps egl_ctx_ops = {
 static void gd_change_runstate(void *opaque, bool running, RunState state)
 {
     GtkDisplayState *s = opaque;
+    QemuDmaBuf *dmabuf;
+    int i;
+
+    if (state == RUN_STATE_SAVE_VM) {
+        for (i = 0; i < s->nb_vcs; i++) {
+            VirtualConsole *vc = &s->vc[i];
+            dmabuf = vc->gfx.guest_fb.dmabuf;
+            if (dmabuf && qemu_dmabuf_get_fence_fd(dmabuf) >= 0) {
+                /* wait for the rendering to be completed */
+                eglClientWaitSync(qemu_egl_display,
+                                  qemu_dmabuf_get_sync(dmabuf),
+                                  EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
+                                  1000000000);
+            }
+        }
+    }
 
     gd_update_caption(s);
 }