diff mbox series

[3/3] net/colo-compare.c: Fix deadlock

Message ID 87c2f42b46f93fb89867f82e45aa2689eff98432.1586370737.git.lukasstraub2@web.de (mailing list archive)
State New, archived
Headers show
Series colo-compare bugfixes | expand

Commit Message

Lukas Straub April 8, 2020, 6:33 p.m. UTC
The chr_out chardev is connected to a filter-redirector
running in the main loop. qemu_chr_fe_write_all might block
here in compare_chr_send if the (socket-)buffer is full.
If another filter-redirector in the main loop want's to
send data to chr_pri_in it might also block if the buffer
is full. This leads to a deadlock because both event loops
get blocked.

Fix this by converting compare_chr_send to a coroutine
and return error if it is in use.

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
 net/colo-compare.c | 82 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 11 deletions(-)

Comments

Zhang, Chen April 22, 2020, 8:40 a.m. UTC | #1
> -----Original Message-----
> From: Lukas Straub <lukasstraub2@web.de>
> Sent: Thursday, April 9, 2020 2:34 AM
> To: qemu-devel <qemu-devel@nongnu.org>
> Cc: Zhang, Chen <chen.zhang@intel.com>; Li Zhijian
> <lizhijian@cn.fujitsu.com>; Jason Wang <jasowang@redhat.com>; Marc-
> André Lureau <marcandre.lureau@redhat.com>; Paolo Bonzini
> <pbonzini@redhat.com>
> Subject: [PATCH 3/3] net/colo-compare.c: Fix deadlock
> 
> The chr_out chardev is connected to a filter-redirector running in the main
> loop. qemu_chr_fe_write_all might block here in compare_chr_send if the
> (socket-)buffer is full.
> If another filter-redirector in the main loop want's to send data to chr_pri_in
> it might also block if the buffer is full. This leads to a deadlock because both
> event loops get blocked.
> 
> Fix this by converting compare_chr_send to a coroutine and return error if it
> is in use.
> 

I have tested this series, running fine currently.
Can you share performance data after this patch?

Thanks
Zhang Chen

> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
>  net/colo-compare.c | 82
> +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 71 insertions(+), 11 deletions(-)
> 
> diff --git a/net/colo-compare.c b/net/colo-compare.c index
> 1de4220fe2..82787d3055 100644
> --- a/net/colo-compare.c
> +++ b/net/colo-compare.c
> @@ -32,6 +32,9 @@
>  #include "migration/migration.h"
>  #include "util.h"
> 
> +#include "block/aio-wait.h"
> +#include "qemu/coroutine.h"
> +
>  #define TYPE_COLO_COMPARE "colo-compare"
>  #define COLO_COMPARE(obj) \
>      OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE) @@ -77,6
> +80,17 @@ static int event_unhandled_count;
>   *                    |packet  |  |packet  +    |packet  | |packet  +
>   *                    +--------+  +--------+    +--------+ +--------+
>   */
> +
> +typedef struct SendCo {
> +    Coroutine *co;
> +    uint8_t *buf;
> +    uint32_t size;
> +    uint32_t vnet_hdr_len;
> +    bool notify_remote_frame;
> +    bool done;
> +    int ret;
> +} SendCo;
> +
>  typedef struct CompareState {
>      Object parent;
> 
> @@ -91,6 +105,7 @@ typedef struct CompareState {
>      SocketReadState pri_rs;
>      SocketReadState sec_rs;
>      SocketReadState notify_rs;
> +    SendCo sendco;
>      bool vnet_hdr;
>      uint32_t compare_timeout;
>      uint32_t expired_scan_cycle;
> @@ -699,19 +714,17 @@ static void colo_compare_connection(void
> *opaque, void *user_data)
>      }
>  }
> 
> -static int compare_chr_send(CompareState *s,
> -                            const uint8_t *buf,
> -                            uint32_t size,
> -                            uint32_t vnet_hdr_len,
> -                            bool notify_remote_frame)
> +static void coroutine_fn _compare_chr_send(void *opaque)
>  {
> +    CompareState *s = opaque;
> +    SendCo *sendco = &s->sendco;
> +    const uint8_t *buf = sendco->buf;
> +    uint32_t size = sendco->size;
> +    uint32_t vnet_hdr_len = sendco->vnet_hdr_len;
> +    bool notify_remote_frame = sendco->notify_remote_frame;
>      int ret = 0;
>      uint32_t len = htonl(size);
> 
> -    if (!size) {
> -        return 0;
> -    }
> -
>      if (notify_remote_frame) {
>          ret = qemu_chr_fe_write_all(&s->chr_notify_dev,
>                                      (uint8_t *)&len, @@ -754,10 +767,50 @@ static int
> compare_chr_send(CompareState *s,
>          goto err;
>      }
> 
> -    return 0;
> +    sendco->ret = 0;
> +    goto out;
> 
>  err:
> -    return ret < 0 ? ret : -EIO;
> +    sendco->ret = ret < 0 ? ret : -EIO;
> +out:
> +    sendco->co = NULL;
> +    g_free(sendco->buf);
> +    sendco->buf = NULL;
> +    sendco->done = true;
> +    aio_wait_kick();
> +}
> +
> +static int compare_chr_send(CompareState *s,
> +                            const uint8_t *buf,
> +                            uint32_t size,
> +                            uint32_t vnet_hdr_len,
> +                            bool notify_remote_frame) {
> +    SendCo *sendco = &s->sendco;
> +
> +    if (!size) {
> +        return 0;
> +    }
> +
> +    if (sendco->done) {
> +        sendco->co = qemu_coroutine_create(_compare_chr_send, s);
> +        sendco->buf = g_malloc(size);
> +        sendco->size = size;
> +        sendco->vnet_hdr_len = vnet_hdr_len;
> +        sendco->notify_remote_frame = notify_remote_frame;
> +        sendco->done = false;
> +        memcpy(sendco->buf, buf, size);
> +        qemu_coroutine_enter(sendco->co);
> +        if (sendco->done) {
> +            /* report early errors */
> +            return sendco->ret;
> +        } else {
> +            /* else assume success */
> +            return 0;
> +        }
> +    }
> +
> +    return -ENOBUFS;
>  }
> 
>  static int compare_chr_can_read(void *opaque) @@ -1146,6 +1199,8 @@
> static void colo_compare_complete(UserCreatable *uc, Error **errp)
>      CompareState *s = COLO_COMPARE(uc);
>      Chardev *chr;
> 
> +    s->sendco.done = true;
> +
>      if (!s->pri_indev || !s->sec_indev || !s->outdev || !s->iothread) {
>          error_setg(errp, "colo compare needs 'primary_in' ,"
>                     "'secondary_in','outdev','iothread' property set"); @@ -1281,6
> +1336,11 @@ static void colo_compare_finalize(Object *obj)
>      CompareState *s = COLO_COMPARE(obj);
>      CompareState *tmp = NULL;
> 
> +    AioContext *ctx = iothread_get_aio_context(s->iothread);
> +    aio_context_acquire(ctx);
> +    AIO_WAIT_WHILE(ctx, !s->sendco.done);
> +    aio_context_release(ctx);
> +
>      qemu_chr_fe_deinit(&s->chr_pri_in, false);
>      qemu_chr_fe_deinit(&s->chr_sec_in, false);
>      qemu_chr_fe_deinit(&s->chr_out, false);
> --
> 2.20.1
Lukas Straub April 23, 2020, 2:03 p.m. UTC | #2
On Wed, 22 Apr 2020 08:40:40 +0000
"Zhang, Chen" <chen.zhang@intel.com> wrote:

> > -----Original Message-----
> > From: Lukas Straub <lukasstraub2@web.de>
> > Sent: Thursday, April 9, 2020 2:34 AM
> > To: qemu-devel <qemu-devel@nongnu.org>
> > Cc: Zhang, Chen <chen.zhang@intel.com>; Li Zhijian
> > <lizhijian@cn.fujitsu.com>; Jason Wang <jasowang@redhat.com>; Marc-
> > André Lureau <marcandre.lureau@redhat.com>; Paolo Bonzini
> > <pbonzini@redhat.com>
> > Subject: [PATCH 3/3] net/colo-compare.c: Fix deadlock
> > 
> > The chr_out chardev is connected to a filter-redirector running in the main
> > loop. qemu_chr_fe_write_all might block here in compare_chr_send if the
> > (socket-)buffer is full.
> > If another filter-redirector in the main loop want's to send data to chr_pri_in
> > it might also block if the buffer is full. This leads to a deadlock because both
> > event loops get blocked.
> > 
> > Fix this by converting compare_chr_send to a coroutine and return error if it
> > is in use.
> >   
> 
> I have tested this series, running fine currently.
> Can you share performance data after this patch?
> 
> Thanks
> Zhang Chen

Hello,
Here are the results (using iperf3):
Client-to-server tcp:
without patch: ~64.2 Mbit/s
with patch: ~28.9 Mbit/s
Server-to-client tcp:
without patch: 360 Kbit/s (when it doesn't deadlock :)
with patch: 220 Kbit/s

Yeah, it hurts performance somewhat, but the deadlock happens often with lots
of server-to-client traffic. (It deadlocked in 2 of 4 runs)

Do you have a better idea to solve this issue?

Regards,
Lukas Straub

> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > ---
> >  net/colo-compare.c | 82
> > +++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 71 insertions(+), 11 deletions(-)
> > 
> > diff --git a/net/colo-compare.c b/net/colo-compare.c index
> > 1de4220fe2..82787d3055 100644
> > --- a/net/colo-compare.c
> > +++ b/net/colo-compare.c
> > @@ -32,6 +32,9 @@
> >  #include "migration/migration.h"
> >  #include "util.h"
> > 
> > +#include "block/aio-wait.h"
> > +#include "qemu/coroutine.h"
> > +
> >  #define TYPE_COLO_COMPARE "colo-compare"
> >  #define COLO_COMPARE(obj) \
> >      OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE) @@ -77,6
> > +80,17 @@ static int event_unhandled_count;
> >   *                    |packet  |  |packet  +    |packet  | |packet  +
> >   *                    +--------+  +--------+    +--------+ +--------+
> >   */
> > +
> > +typedef struct SendCo {
> > +    Coroutine *co;
> > +    uint8_t *buf;
> > +    uint32_t size;
> > +    uint32_t vnet_hdr_len;
> > +    bool notify_remote_frame;
> > +    bool done;
> > +    int ret;
> > +} SendCo;
> > +
> >  typedef struct CompareState {
> >      Object parent;
> > 
> > @@ -91,6 +105,7 @@ typedef struct CompareState {
> >      SocketReadState pri_rs;
> >      SocketReadState sec_rs;
> >      SocketReadState notify_rs;
> > +    SendCo sendco;
> >      bool vnet_hdr;
> >      uint32_t compare_timeout;
> >      uint32_t expired_scan_cycle;
> > @@ -699,19 +714,17 @@ static void colo_compare_connection(void
> > *opaque, void *user_data)
> >      }
> >  }
> > 
> > -static int compare_chr_send(CompareState *s,
> > -                            const uint8_t *buf,
> > -                            uint32_t size,
> > -                            uint32_t vnet_hdr_len,
> > -                            bool notify_remote_frame)
> > +static void coroutine_fn _compare_chr_send(void *opaque)
> >  {
> > +    CompareState *s = opaque;
> > +    SendCo *sendco = &s->sendco;
> > +    const uint8_t *buf = sendco->buf;
> > +    uint32_t size = sendco->size;
> > +    uint32_t vnet_hdr_len = sendco->vnet_hdr_len;
> > +    bool notify_remote_frame = sendco->notify_remote_frame;
> >      int ret = 0;
> >      uint32_t len = htonl(size);
> > 
> > -    if (!size) {
> > -        return 0;
> > -    }
> > -
> >      if (notify_remote_frame) {
> >          ret = qemu_chr_fe_write_all(&s->chr_notify_dev,
> >                                      (uint8_t *)&len, @@ -754,10 +767,50 @@ static int
> > compare_chr_send(CompareState *s,
> >          goto err;
> >      }
> > 
> > -    return 0;
> > +    sendco->ret = 0;
> > +    goto out;
> > 
> >  err:
> > -    return ret < 0 ? ret : -EIO;
> > +    sendco->ret = ret < 0 ? ret : -EIO;
> > +out:
> > +    sendco->co = NULL;
> > +    g_free(sendco->buf);
> > +    sendco->buf = NULL;
> > +    sendco->done = true;
> > +    aio_wait_kick();
> > +}
> > +
> > +static int compare_chr_send(CompareState *s,
> > +                            const uint8_t *buf,
> > +                            uint32_t size,
> > +                            uint32_t vnet_hdr_len,
> > +                            bool notify_remote_frame) {
> > +    SendCo *sendco = &s->sendco;
> > +
> > +    if (!size) {
> > +        return 0;
> > +    }
> > +
> > +    if (sendco->done) {
> > +        sendco->co = qemu_coroutine_create(_compare_chr_send, s);
> > +        sendco->buf = g_malloc(size);
> > +        sendco->size = size;
> > +        sendco->vnet_hdr_len = vnet_hdr_len;
> > +        sendco->notify_remote_frame = notify_remote_frame;
> > +        sendco->done = false;
> > +        memcpy(sendco->buf, buf, size);
> > +        qemu_coroutine_enter(sendco->co);
> > +        if (sendco->done) {
> > +            /* report early errors */
> > +            return sendco->ret;
> > +        } else {
> > +            /* else assume success */
> > +            return 0;
> > +        }
> > +    }
> > +
> > +    return -ENOBUFS;
> >  }
> > 
> >  static int compare_chr_can_read(void *opaque) @@ -1146,6 +1199,8 @@
> > static void colo_compare_complete(UserCreatable *uc, Error **errp)
> >      CompareState *s = COLO_COMPARE(uc);
> >      Chardev *chr;
> > 
> > +    s->sendco.done = true;
> > +
> >      if (!s->pri_indev || !s->sec_indev || !s->outdev || !s->iothread) {
> >          error_setg(errp, "colo compare needs 'primary_in' ,"
> >                     "'secondary_in','outdev','iothread' property set"); @@ -1281,6
> > +1336,11 @@ static void colo_compare_finalize(Object *obj)
> >      CompareState *s = COLO_COMPARE(obj);
> >      CompareState *tmp = NULL;
> > 
> > +    AioContext *ctx = iothread_get_aio_context(s->iothread);
> > +    aio_context_acquire(ctx);
> > +    AIO_WAIT_WHILE(ctx, !s->sendco.done);
> > +    aio_context_release(ctx);
> > +
> >      qemu_chr_fe_deinit(&s->chr_pri_in, false);
> >      qemu_chr_fe_deinit(&s->chr_sec_in, false);
> >      qemu_chr_fe_deinit(&s->chr_out, false);
> > --
> > 2.20.1
diff mbox series

Patch

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 1de4220fe2..82787d3055 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -32,6 +32,9 @@ 
 #include "migration/migration.h"
 #include "util.h"
 
+#include "block/aio-wait.h"
+#include "qemu/coroutine.h"
+
 #define TYPE_COLO_COMPARE "colo-compare"
 #define COLO_COMPARE(obj) \
     OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
@@ -77,6 +80,17 @@  static int event_unhandled_count;
  *                    |packet  |  |packet  +    |packet  | |packet  +
  *                    +--------+  +--------+    +--------+ +--------+
  */
+
+typedef struct SendCo {
+    Coroutine *co;
+    uint8_t *buf;
+    uint32_t size;
+    uint32_t vnet_hdr_len;
+    bool notify_remote_frame;
+    bool done;
+    int ret;
+} SendCo;
+
 typedef struct CompareState {
     Object parent;
 
@@ -91,6 +105,7 @@  typedef struct CompareState {
     SocketReadState pri_rs;
     SocketReadState sec_rs;
     SocketReadState notify_rs;
+    SendCo sendco;
     bool vnet_hdr;
     uint32_t compare_timeout;
     uint32_t expired_scan_cycle;
@@ -699,19 +714,17 @@  static void colo_compare_connection(void *opaque, void *user_data)
     }
 }
 
-static int compare_chr_send(CompareState *s,
-                            const uint8_t *buf,
-                            uint32_t size,
-                            uint32_t vnet_hdr_len,
-                            bool notify_remote_frame)
+static void coroutine_fn _compare_chr_send(void *opaque)
 {
+    CompareState *s = opaque;
+    SendCo *sendco = &s->sendco;
+    const uint8_t *buf = sendco->buf;
+    uint32_t size = sendco->size;
+    uint32_t vnet_hdr_len = sendco->vnet_hdr_len;
+    bool notify_remote_frame = sendco->notify_remote_frame;
     int ret = 0;
     uint32_t len = htonl(size);
 
-    if (!size) {
-        return 0;
-    }
-
     if (notify_remote_frame) {
         ret = qemu_chr_fe_write_all(&s->chr_notify_dev,
                                     (uint8_t *)&len,
@@ -754,10 +767,50 @@  static int compare_chr_send(CompareState *s,
         goto err;
     }
 
-    return 0;
+    sendco->ret = 0;
+    goto out;
 
 err:
-    return ret < 0 ? ret : -EIO;
+    sendco->ret = ret < 0 ? ret : -EIO;
+out:
+    sendco->co = NULL;
+    g_free(sendco->buf);
+    sendco->buf = NULL;
+    sendco->done = true;
+    aio_wait_kick();
+}
+
+static int compare_chr_send(CompareState *s,
+                            const uint8_t *buf,
+                            uint32_t size,
+                            uint32_t vnet_hdr_len,
+                            bool notify_remote_frame)
+{
+    SendCo *sendco = &s->sendco;
+
+    if (!size) {
+        return 0;
+    }
+
+    if (sendco->done) {
+        sendco->co = qemu_coroutine_create(_compare_chr_send, s);
+        sendco->buf = g_malloc(size);
+        sendco->size = size;
+        sendco->vnet_hdr_len = vnet_hdr_len;
+        sendco->notify_remote_frame = notify_remote_frame;
+        sendco->done = false;
+        memcpy(sendco->buf, buf, size);
+        qemu_coroutine_enter(sendco->co);
+        if (sendco->done) {
+            /* report early errors */
+            return sendco->ret;
+        } else {
+            /* else assume success */
+            return 0;
+        }
+    }
+
+    return -ENOBUFS;
 }
 
 static int compare_chr_can_read(void *opaque)
@@ -1146,6 +1199,8 @@  static void colo_compare_complete(UserCreatable *uc, Error **errp)
     CompareState *s = COLO_COMPARE(uc);
     Chardev *chr;
 
+    s->sendco.done = true;
+
     if (!s->pri_indev || !s->sec_indev || !s->outdev || !s->iothread) {
         error_setg(errp, "colo compare needs 'primary_in' ,"
                    "'secondary_in','outdev','iothread' property set");
@@ -1281,6 +1336,11 @@  static void colo_compare_finalize(Object *obj)
     CompareState *s = COLO_COMPARE(obj);
     CompareState *tmp = NULL;
 
+    AioContext *ctx = iothread_get_aio_context(s->iothread);
+    aio_context_acquire(ctx);
+    AIO_WAIT_WHILE(ctx, !s->sendco.done);
+    aio_context_release(ctx);
+
     qemu_chr_fe_deinit(&s->chr_pri_in, false);
     qemu_chr_fe_deinit(&s->chr_sec_in, false);
     qemu_chr_fe_deinit(&s->chr_out, false);