diff mbox

[v2] mirror: follow AioContext change gracefully

Message ID 20160612065104.13856-1-famz@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Fam Zheng June 12, 2016, 6:51 a.m. UTC
From: Stefan Hajnoczi <stefanha@redhat.com>

When dataplane is enabled or disabled the drive switches to a new
AioContext.  The mirror block job must also move to the new AioContext
so that drive accesses are always made within its AioContext.

This patch partially achieves that by draining target and source
requests to reach a quiescent point.  The job is resumed in the new
AioContext after moving s->target into the new AioContext.

The quiesce_requested flag is added to deal with yield points in
block_job_sleep_ns(), bdrv_is_allocated_above(), and
bdrv_get_block_status_above().  Previously they continue executing in
the old AioContext. The nested aio_poll in mirror_detach_aio_context
will drive the mirror coroutine upto fixed yield points, where
mirror_check_for_quiesce is called.

Cc: Fam Zheng <famz@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Jeff Cody <jcody@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
[Drain source as well, and add s->quiesce_requested flag. -- Fam]
Signed-off-by: Fam Zheng <famz@redhat.com>

---

v2: Picked up Stefan's RFC patch and move on towards a more complete
fix.  Please review!

Jason: it would be nice if you could test this version again. It differs
from the previous version.
---
 block/mirror.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

Comments

Paolo Bonzini June 13, 2016, 9:55 a.m. UTC | #1
On 12/06/2016 08:51, Fam Zheng wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
> 
> When dataplane is enabled or disabled the drive switches to a new
> AioContext.  The mirror block job must also move to the new AioContext
> so that drive accesses are always made within its AioContext.
> 
> This patch partially achieves that by draining target and source
> requests to reach a quiescent point.  The job is resumed in the new
> AioContext after moving s->target into the new AioContext.
> 
> The quiesce_requested flag is added to deal with yield points in
> block_job_sleep_ns(), bdrv_is_allocated_above(), and
> bdrv_get_block_status_above().  Previously they continue executing in
> the old AioContext. The nested aio_poll in mirror_detach_aio_context
> will drive the mirror coroutine upto fixed yield points, where
> mirror_check_for_quiesce is called.
> 
> Cc: Fam Zheng <famz@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Jeff Cody <jcody@redhat.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> [Drain source as well, and add s->quiesce_requested flag. -- Fam]
> Signed-off-by: Fam Zheng <famz@redhat.com>

As discussed on IRC, perhaps we can reuse the pausing/job->busy
mechanism to detect quiescence.

There's no synchronous pause function, but it can be realized with
block_job_pause and aio_poll.

Also while discussing this patch on IRC Fam noticed that resume
currently clears the job's iostatus.  I think that functionality can be
moved to the QMP command.

Thanks,

Paolo

> ---
> 
> v2: Picked up Stefan's RFC patch and move on towards a more complete
> fix.  Please review!
> 
> Jason: it would be nice if you could test this version again. It differs
> from the previous version.
> ---
>  block/mirror.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 80fd3c7..142199a 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -63,6 +63,8 @@ typedef struct MirrorBlockJob {
>      int ret;
>      bool unmap;
>      bool waiting_for_io;
> +    bool quiesce_requested; /* temporarily detached to move AioContext,
> +                               don't do more I/O */
>      int target_cluster_sectors;
>      int max_iov;
>  } MirrorBlockJob;
> @@ -119,7 +121,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
>      qemu_iovec_destroy(&op->qiov);
>      g_free(op);
>  
> -    if (s->waiting_for_io) {
> +    if (s->waiting_for_io && !s->quiesce_requested) {
>          qemu_coroutine_enter(s->common.co, NULL);
>      }
>  }
> @@ -307,6 +309,14 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
>      }
>  }
>  
> +static void coroutine_fn mirror_check_for_quiesce(MirrorBlockJob *s)
> +{
> +    if (s->quiesce_requested) {
> +        s->quiesce_requested = false;
> +        qemu_coroutine_yield();
> +    }
> +}
> +
>  static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>  {
>      BlockDriverState *source = blk_bs(s->common.blk);
> @@ -331,6 +341,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>          mirror_wait_for_io(s);
>      }
>  
> +    mirror_check_for_quiesce(s);
>      /* Find the number of consective dirty chunks following the first dirty
>       * one, and wait for in flight requests in them. */
>      while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
> @@ -442,6 +453,31 @@ static void mirror_drain(MirrorBlockJob *s)
>      }
>  }
>  
> +static void mirror_attached_aio_context(AioContext *new_context, void *opaque)
> +{
> +    MirrorBlockJob *s = opaque;
> +
> +    blk_set_aio_context(s->target, new_context);
> +
> +    /* Resume execution */
> +    assert(!s->quiesce_requested);
> +    if (s->waiting_for_io) {
> +        qemu_coroutine_enter(s->common.co, NULL);
> +    }
> +}
> +
> +static void mirror_detach_aio_context(void *opaque)
> +{
> +    MirrorBlockJob *s = opaque;
> +
> +    /* Complete pending write requests */
> +    assert(!s->quiesce_requested);
> +    s->quiesce_requested = true;
> +    while (s->quiesce_requested || s->in_flight) {
> +        aio_poll(blk_get_aio_context(s->common.blk), true);
> +    }
> +}
> +
>  typedef struct {
>      int ret;
>  } MirrorExitData;
> @@ -491,6 +527,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
>      if (replace_aio_context) {
>          aio_context_release(replace_aio_context);
>      }
> +    blk_remove_aio_context_notifier(s->common.blk, mirror_attached_aio_context,
> +                                    mirror_detach_aio_context, s);
>      g_free(s->replaces);
>      bdrv_op_unblock_all(target_bs, s->common.blocker);
>      blk_unref(s->target);
> @@ -583,6 +621,7 @@ static void coroutine_fn mirror_run(void *opaque)
>                  block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
>              }
>  
> +            mirror_check_for_quiesce(s);
>              if (block_job_is_cancelled(&s->common)) {
>                  goto immediate_exit;
>              }
> @@ -612,6 +651,7 @@ static void coroutine_fn mirror_run(void *opaque)
>              goto immediate_exit;
>          }
>  
> +        mirror_check_for_quiesce(s);
>          cnt = bdrv_get_dirty_count(s->dirty_bitmap);
>          /* s->common.offset contains the number of bytes already processed so
>           * far, cnt is the number of dirty sectors remaining and
> @@ -851,6 +891,9 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
>  
>      bdrv_op_block_all(target, s->common.blocker);
>  
> +    blk_add_aio_context_notifier(s->common.blk, mirror_attached_aio_context,
> +                                 mirror_detach_aio_context, s);
> +
>      s->common.co = qemu_coroutine_create(mirror_run);
>      trace_mirror_start(bs, s, s->common.co, opaque);
>      qemu_coroutine_enter(s->common.co, s);
>
Stefan Hajnoczi June 13, 2016, 10:49 a.m. UTC | #2
On Sun, Jun 12, 2016 at 02:51:04PM +0800, Fam Zheng wrote:
> @@ -119,7 +121,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
>      qemu_iovec_destroy(&op->qiov);
>      g_free(op);
>  
> -    if (s->waiting_for_io) {
> +    if (s->waiting_for_io && !s->quiesce_requested) {
>          qemu_coroutine_enter(s->common.co, NULL);
>      }

Is it necessary to interact with s->waiting_for_io?  The coroutine
should reach a quiescent point later on anyway so it would be simpler to
leave this unchanged.

> +static void mirror_detach_aio_context(void *opaque)
> +{
> +    MirrorBlockJob *s = opaque;
> +
> +    /* Complete pending write requests */
> +    assert(!s->quiesce_requested);
> +    s->quiesce_requested = true;
> +    while (s->quiesce_requested || s->in_flight) {
> +        aio_poll(blk_get_aio_context(s->common.blk), true);
> +    }
> +}

Adding synchronous aio_poll() loops will bite us in the future.  For
example, if a guest resets the virtio device the vcpu will be hung until
I/O completes and sleeps finish.  This flaw in QEMU already exists today
and won't be fixed any time soon, so I guess this approach is okay...

Stefan
Stefan Hajnoczi June 13, 2016, 1:29 p.m. UTC | #3
On Sun, Jun 12, 2016 at 02:51:04PM +0800, Fam Zheng wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
> 
> When dataplane is enabled or disabled the drive switches to a new
> AioContext.  The mirror block job must also move to the new AioContext
> so that drive accesses are always made within its AioContext.
> 
> This patch partially achieves that by draining target and source
> requests to reach a quiescent point.  The job is resumed in the new
> AioContext after moving s->target into the new AioContext.
> 
> The quiesce_requested flag is added to deal with yield points in
> block_job_sleep_ns(), bdrv_is_allocated_above(), and
> bdrv_get_block_status_above().  Previously they continue executing in
> the old AioContext. The nested aio_poll in mirror_detach_aio_context
> will drive the mirror coroutine upto fixed yield points, where
> mirror_check_for_quiesce is called.
> 
> Cc: Fam Zheng <famz@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Jeff Cody <jcody@redhat.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> [Drain source as well, and add s->quiesce_requested flag. -- Fam]
> Signed-off-by: Fam Zheng <famz@redhat.com>
> 
> ---
> 
> v2: Picked up Stefan's RFC patch and move on towards a more complete
> fix.  Please review!

Thanks, I'll send a v3 with my own comment addressed and fixes for other
block jobs.
Jason J. Herne June 13, 2016, 3:12 p.m. UTC | #4
On 06/12/2016 02:51 AM, Fam Zheng wrote:
...
> ---
>
> v2: Picked up Stefan's RFC patch and move on towards a more complete
> fix.  Please review!
>
> Jason: it would be nice if you could test this version again. It differs
> from the previous version.

No problem. I'll test v3 when it is available.
diff mbox

Patch

diff --git a/block/mirror.c b/block/mirror.c
index 80fd3c7..142199a 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -63,6 +63,8 @@  typedef struct MirrorBlockJob {
     int ret;
     bool unmap;
     bool waiting_for_io;
+    bool quiesce_requested; /* temporarily detached to move AioContext,
+                               don't do more I/O */
     int target_cluster_sectors;
     int max_iov;
 } MirrorBlockJob;
@@ -119,7 +121,7 @@  static void mirror_iteration_done(MirrorOp *op, int ret)
     qemu_iovec_destroy(&op->qiov);
     g_free(op);
 
-    if (s->waiting_for_io) {
+    if (s->waiting_for_io && !s->quiesce_requested) {
         qemu_coroutine_enter(s->common.co, NULL);
     }
 }
@@ -307,6 +309,14 @@  static void mirror_do_zero_or_discard(MirrorBlockJob *s,
     }
 }
 
+static void coroutine_fn mirror_check_for_quiesce(MirrorBlockJob *s)
+{
+    if (s->quiesce_requested) {
+        s->quiesce_requested = false;
+        qemu_coroutine_yield();
+    }
+}
+
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
     BlockDriverState *source = blk_bs(s->common.blk);
@@ -331,6 +341,7 @@  static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         mirror_wait_for_io(s);
     }
 
+    mirror_check_for_quiesce(s);
     /* Find the number of consective dirty chunks following the first dirty
      * one, and wait for in flight requests in them. */
     while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
@@ -442,6 +453,31 @@  static void mirror_drain(MirrorBlockJob *s)
     }
 }
 
+static void mirror_attached_aio_context(AioContext *new_context, void *opaque)
+{
+    MirrorBlockJob *s = opaque;
+
+    blk_set_aio_context(s->target, new_context);
+
+    /* Resume execution */
+    assert(!s->quiesce_requested);
+    if (s->waiting_for_io) {
+        qemu_coroutine_enter(s->common.co, NULL);
+    }
+}
+
+static void mirror_detach_aio_context(void *opaque)
+{
+    MirrorBlockJob *s = opaque;
+
+    /* Complete pending write requests */
+    assert(!s->quiesce_requested);
+    s->quiesce_requested = true;
+    while (s->quiesce_requested || s->in_flight) {
+        aio_poll(blk_get_aio_context(s->common.blk), true);
+    }
+}
+
 typedef struct {
     int ret;
 } MirrorExitData;
@@ -491,6 +527,8 @@  static void mirror_exit(BlockJob *job, void *opaque)
     if (replace_aio_context) {
         aio_context_release(replace_aio_context);
     }
+    blk_remove_aio_context_notifier(s->common.blk, mirror_attached_aio_context,
+                                    mirror_detach_aio_context, s);
     g_free(s->replaces);
     bdrv_op_unblock_all(target_bs, s->common.blocker);
     blk_unref(s->target);
@@ -583,6 +621,7 @@  static void coroutine_fn mirror_run(void *opaque)
                 block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
             }
 
+            mirror_check_for_quiesce(s);
             if (block_job_is_cancelled(&s->common)) {
                 goto immediate_exit;
             }
@@ -612,6 +651,7 @@  static void coroutine_fn mirror_run(void *opaque)
             goto immediate_exit;
         }
 
+        mirror_check_for_quiesce(s);
         cnt = bdrv_get_dirty_count(s->dirty_bitmap);
         /* s->common.offset contains the number of bytes already processed so
          * far, cnt is the number of dirty sectors remaining and
@@ -851,6 +891,9 @@  static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
 
     bdrv_op_block_all(target, s->common.blocker);
 
+    blk_add_aio_context_notifier(s->common.blk, mirror_attached_aio_context,
+                                 mirror_detach_aio_context, s);
+
     s->common.co = qemu_coroutine_create(mirror_run);
     trace_mirror_start(bs, s, s->common.co, opaque);
     qemu_coroutine_enter(s->common.co, s);