diff mbox series

[12/13] block: Don't poll in bdrv_replace_child_noperm()

Message ID 20221108123738.530873-13-kwolf@redhat.com (mailing list archive)
State New, archived
Headers show
Series block: Simplify drain | expand

Commit Message

Kevin Wolf Nov. 8, 2022, 12:37 p.m. UTC
In order to make sure that bdrv_replace_child_noperm() doesn't have to
poll any more, get rid of the bdrv_parent_drained_begin_single() call.

This is possible now because we can require that the child is already
drained when the function is called (it better be, having in-flight
requests while modifying the graph isn't going to end well!) and we
don't call the parent drain callbacks more than once.

The additional drain calls needed in callers cause the test case to run
its code in the drain handler too early (bdrv_attach_child() drains
now), so modify it to only enable the code after the test setup has
completed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block-io.h     |  8 ++++
 block.c                      | 72 +++++++++++++++++++++++++-----------
 block/io.c                   |  2 +-
 tests/unit/test-bdrv-drain.c | 10 +++++
 4 files changed, 70 insertions(+), 22 deletions(-)

Comments

Emanuele Giuseppe Esposito Nov. 11, 2022, 11:21 a.m. UTC | #1
Am 08/11/2022 um 13:37 schrieb Kevin Wolf:
> In order to make sure that bdrv_replace_child_noperm() doesn't have to
> poll any more, get rid of the bdrv_parent_drained_begin_single() call.
> 
> This is possible now because we can require that the child is already
> drained when the function is called (it better be, having in-flight
> requests while modifying the graph isn't going to end well!) and we
> don't call the parent drain callbacks more than once.
> 
> The additional drain calls needed in callers cause the test case to run
> its code in the drain handler too early (bdrv_attach_child() drains
> now), so modify it to only enable the code after the test setup has
> completed.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  include/block/block-io.h     |  8 ++++
>  block.c                      | 72 +++++++++++++++++++++++++-----------
>  block/io.c                   |  2 +-
>  tests/unit/test-bdrv-drain.c | 10 +++++
>  4 files changed, 70 insertions(+), 22 deletions(-)
> 
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index 5b54ed4672..ddce8550a9 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -290,6 +290,14 @@ bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>   */
>  void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);
>  
> +/**
> + * bdrv_parent_drained_poll_single:
> + *
> + * Returns true if there is any pending activity to cease before @c can be
> + * called quiesced, false otherwise.
> + */
> +bool bdrv_parent_drained_poll_single(BdrvChild *c);
> +
>  /**
>   * bdrv_parent_drained_end_single:
>   *
> diff --git a/block.c b/block.c
> index 5f5f79cd16..12039e9b8a 100644
> --- a/block.c
> +++ b/block.c
> @@ -2399,6 +2399,20 @@ static void bdrv_replace_child_abort(void *opaque)
>  
>      GLOBAL_STATE_CODE();
>      /* old_bs reference is transparently moved from @s to @s->child */
> +    if (!s->child->bs) {
> +        /*
> +         * The parents were undrained when removing old_bs from the child. New
> +         * requests can't have been made, though, because the child was empty.
> +         *
> +         * TODO Make bdrv_replace_child_noperm() transactionable to avoid
> +         * undraining the parent in the first place. Once this is done, having
> +         * new_bs drained when calling bdrv_replace_child_tran() is not a
> +         * requirement any more.
> +         */
> +        bdrv_parent_drained_begin_single(s->child, false);
> +        assert(!bdrv_parent_drained_poll_single(s->child));
> +    }
> +    assert(s->child->parent_quiesce_counter);
>      bdrv_replace_child_noperm(s->child, s->old_bs);
>      bdrv_unref(new_bs);
>  }
> @@ -2414,12 +2428,20 @@ static TransactionActionDrv bdrv_replace_child_drv = {
>   *
>   * Note: real unref of old_bs is done only on commit.
>   *
> + * Both child and new_bs (if non-NULL) must be drained. new_bs must be kept
> + * drained until the transaction is completed (this automatically implies that
> + * child remains drained, too).
> + *
>   * The function doesn't update permissions, caller is responsible for this.
>   */
>  static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
>                                      Transaction *tran)
>  {
>      BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
> +
> +    assert(child->parent_quiesce_counter);
> +    assert(!new_bs || new_bs->quiesce_counter);
> +
>      *s = (BdrvReplaceChildState) {
>          .child = child,
>          .old_bs = child->bs,
> @@ -2818,6 +2840,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
>      int new_bs_quiesce_counter;
>  
>      assert(!child->frozen);
> +    /*
> +     * When removing the child, it's the callers responsibility to make sure
> +     * that no requests are in flight any more. Usually the parent is drained,
> +     * but not through child->parent_quiesce_counter.
> +     */
> +    assert(!new_bs || child->parent_quiesce_counter);
>      assert(old_bs != new_bs);
>      GLOBAL_STATE_CODE();
>  
> @@ -2825,16 +2853,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
>          assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
>      }
>  
> -    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
> -
> -    /*
> -     * If the new child node is drained but the old one was not, flush
> -     * all outstanding requests to the old child node.
> -     */
> -    if (new_bs_quiesce_counter && !child->parent_quiesce_counter) {
> -        bdrv_parent_drained_begin_single(child, true);
> -    }
> -
>      if (old_bs) {
>          if (child->klass->detach) {
>              child->klass->detach(child);
> @@ -2849,14 +2867,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
>          assert_bdrv_graph_writable(new_bs);
>          QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
>  
> -        /*
> -         * Polling in bdrv_parent_drained_begin_single() may have led to the new
> -         * node's quiesce_counter having been decreased.  Not a problem, we just
> -         * need to recognize this here and then invoke drained_end appropriately
> -         * more often.
> -         */
> -        assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
> -
>          if (child->klass->attach) {
>              child->klass->attach(child);
>          }
> @@ -2865,9 +2875,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
>      /*
>       * If the old child node was drained but the new one is not, allow
>       * requests to come in only after the new node has been attached.
> -     *
> -     * Update new_bs_quiesce_counter because bdrv_parent_drained_begin_single()
> -     * polls, which could have changed the value.
>       */
>      new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
>      if (!new_bs_quiesce_counter && child->parent_quiesce_counter) {
> @@ -3004,6 +3011,12 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
>      }
>  
>      bdrv_ref(child_bs);
> +    /*
> +     * Let every new BdrvChild start drained, inserting it in the graph with
> +     * bdrv_replace_child_noperm() will undrain it if the child node is not
> +     * drained. The child was only just created, so polling is not necessary.
> +     */

I think there's a better way to write this, I find it complicated to read.

Also I don't really understand how you cover the case where we are
replacing a child with another one (so both old and new are not-null and
not newly created), and `old` for example could (?) have a drained
counter greater than `new`.
Before we had all the draining saldo stuff, but now it's gone.

Thank you,
Emanuele

> +    bdrv_parent_drained_begin_single(new_child, false);
>      bdrv_replace_child_noperm(new_child, child_bs);
>  
>      BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
> @@ -5053,7 +5066,10 @@ static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
>      }
>  
>      if (child->bs) {
> +        BlockDriverState *bs = child->bs;
> +        bdrv_drained_begin(bs);
>          bdrv_replace_child_tran(child, NULL, tran);
> +        bdrv_drained_end(bs);
>      }
>  
>      tran_add(tran, &bdrv_remove_child_drv, child);
> @@ -5070,6 +5086,15 @@ static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
>      bdrv_remove_child(bdrv_filter_or_cow_child(bs), tran);
>  }
>  
> +static void undrain_on_clean_cb(void *opaque)
> +{
> +    bdrv_drained_end(opaque);
> +}
> +
> +static TransactionActionDrv undrain_on_clean = {
> +    .clean = undrain_on_clean_cb,
> +};
> +
>  static int bdrv_replace_node_noperm(BlockDriverState *from,
>                                      BlockDriverState *to,
>                                      bool auto_skip, Transaction *tran,
> @@ -5079,6 +5104,11 @@ static int bdrv_replace_node_noperm(BlockDriverState *from,
>  
>      GLOBAL_STATE_CODE();
>  
> +    bdrv_drained_begin(from);
> +    bdrv_drained_begin(to);
> +    tran_add(tran, &undrain_on_clean, from);
> +    tran_add(tran, &undrain_on_clean, to);
> +
>      QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
>          assert(c->bs == from);
>          if (!should_update_child(c, to)) {
> diff --git a/block/io.c b/block/io.c
> index 4a83359a8f..d0f641926f 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -80,7 +80,7 @@ static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
>      }
>  }
>  
> -static bool bdrv_parent_drained_poll_single(BdrvChild *c)
> +bool bdrv_parent_drained_poll_single(BdrvChild *c)
>  {
>      if (c->klass->drained_poll) {
>          return c->klass->drained_poll(c);
> diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
> index 172bc6debc..2686a8acee 100644
> --- a/tests/unit/test-bdrv-drain.c
> +++ b/tests/unit/test-bdrv-drain.c
> @@ -1654,6 +1654,7 @@ static void test_drop_intermediate_poll(void)
>  
>  
>  typedef struct BDRVReplaceTestState {
> +    bool setup_completed;
>      bool was_drained;
>      bool was_undrained;
>      bool has_read;
> @@ -1738,6 +1739,10 @@ static void bdrv_replace_test_drain_begin(BlockDriverState *bs)
>  {
>      BDRVReplaceTestState *s = bs->opaque;
>  
> +    if (!s->setup_completed) {
> +        return;
> +    }
> +
>      if (!s->drain_count) {
>          s->drain_co = qemu_coroutine_create(bdrv_replace_test_drain_co, bs);
>          bdrv_inc_in_flight(bs);
> @@ -1769,6 +1774,10 @@ static void bdrv_replace_test_drain_end(BlockDriverState *bs)
>  {
>      BDRVReplaceTestState *s = bs->opaque;
>  
> +    if (!s->setup_completed) {
> +        return;
> +    }
> +
>      g_assert(s->drain_count > 0);
>      if (!--s->drain_count) {
>          s->was_undrained = true;
> @@ -1867,6 +1876,7 @@ static void do_test_replace_child_mid_drain(int old_drain_count,
>      bdrv_ref(old_child_bs);
>      bdrv_attach_child(parent_bs, old_child_bs, "child", &child_of_bds,
>                        BDRV_CHILD_COW, &error_abort);
> +    parent_s->setup_completed = true;
>  
>      for (i = 0; i < old_drain_count; i++) {
>          bdrv_drained_begin(old_child_bs);
>
Hanna Czenczek Nov. 14, 2022, 8:22 p.m. UTC | #2
On 08.11.22 13:37, Kevin Wolf wrote:
> In order to make sure that bdrv_replace_child_noperm() doesn't have to
> poll any more, get rid of the bdrv_parent_drained_begin_single() call.
>
> This is possible now because we can require that the child is already
> drained when the function is called (it better be, having in-flight
> requests while modifying the graph isn't going to end well!) and we
> don't call the parent drain callbacks more than once.
>
> The additional drain calls needed in callers cause the test case to run
> its code in the drain handler too early (bdrv_attach_child() drains
> now), so modify it to only enable the code after the test setup has
> completed.
>
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>   include/block/block-io.h     |  8 ++++
>   block.c                      | 72 +++++++++++++++++++++++++-----------
>   block/io.c                   |  2 +-
>   tests/unit/test-bdrv-drain.c | 10 +++++
>   4 files changed, 70 insertions(+), 22 deletions(-)

I find this change complicated.  I understand it’s the point of the 
series, but I find it difficult to grasp.  But I guess there can be no 
drain series without such a patch.

As usual, I was very skeptical of the code at first, and over time 
slowly realized that I’m mostly confused by the comments, and the code 
seems fine.  Ah, well.

[...]

> diff --git a/block.c b/block.c
> index 5f5f79cd16..12039e9b8a 100644
> --- a/block.c
> +++ b/block.c

[...]

> @@ -2414,12 +2428,20 @@ static TransactionActionDrv bdrv_replace_child_drv = {
>    *
>    * Note: real unref of old_bs is done only on commit.
>    *
> + * Both child and new_bs (if non-NULL) must be drained. new_bs must be kept
> + * drained until the transaction is completed (this automatically implies that
> + * child remains drained, too).

I find “child” extremely ambiguous.  The problem is that there generally 
is no way to drain a BdrvChild object, is there?  You can only drain the 
BDS in it, which then drains the parent through the BdrvChild object.  
Historically, I don’t think there was ever a place where we cared about 
the BdrvChild object between the two to be drained, was there?  I mean, 
now there apparently is, in bdrv_child_attach_common(), but that’s a 
different story.

So the problem is that “draining a BdrvChild object” generally appears 
in the context of bdrv_parent_drained_*() functions, i.e. actually 
functions draining the parent.  Which makes it a bit confusing to refer 
to a BdrvChild object just as “child”.

I know that “child” here refers to the variable (or does it not?), but 
that is why I really prefer marking variables that are just plain 
English words, e.g. as @child or `child`, so it’s clear they are a name 
and not a noun.

In any case, because the concept is generally to drain the `child->bs` 
instead of the BdrvChild object directly, I understand the comment to 
mean: “Both the old child (`child->bs`) and `new_bs` (if non-NULL) must 
be drained.  `new_bs` must be kept drained until the transaction is 
completed.  This implies that the parent too will be kept drained until 
the transaction is completed by the BdrvChild object `child`.”

Or am I misunderstanding something, and the distinction between `child` 
and `child->bs` and the parent node is important here? (Would be good to 
know. :))

> + *
>    * The function doesn't update permissions, caller is responsible for this.
>    */
>   static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
>                                       Transaction *tran)
>   {
>       BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
> +
> +    assert(child->parent_quiesce_counter);
> +    assert(!new_bs || new_bs->quiesce_counter);
> +
>       *s = (BdrvReplaceChildState) {
>           .child = child,
>           .old_bs = child->bs,
> @@ -2818,6 +2840,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,

This function now has its callers fulfill kind of a complicated 
contract.  I would prefer that to be written out in a doc comment, 
especially because it sounds like the assertions can’t cover everything 
(i.e. callers that remove a child are required to have stopped issuing 
requests to that child, but they are free to do that in any way they 
want, so no assertion will check for it here).

>       int new_bs_quiesce_counter;
>   
>       assert(!child->frozen);
> +    /*
> +     * When removing the child, it's the callers responsibility to make sure
> +     * that no requests are in flight any more. Usually the parent is drained,
> +     * but not through child->parent_quiesce_counter.
> +     */

When I see a comment above an assertion, I immediately assume it is 
going to describe what the assertion checks.  Unless I’m 
misunderstanding something (again?), this comment actually describes 
what the assertion *does not* check.  I find that confusing, especially 
because the comment leads with “it’s the caller’s responsibility”, which 
to me implies “and that’s why we check it here in this assertion”, 
because assertions are there to verify that contracts are met.

The assertion verifies that the parent must be drained (through @child), 
unless the child is removed, which case isn’t covered by the assertion.  
That “isn’t covered” is then described by the comment, right?

I’d prefer the comment to lead with describing what the assertion does 
check, and then transitioning to “But in case the child is removed, we 
ignore that, and just note that it’s the caller’s responsibility to...”.

Also, the comment doesn’t explicitly say why we don’t check it in the 
assertion.  It says “usually” and “child->parent_quiesce_counter”, which 
implies “can’t get any information from child->parent_quiesce_counter, 
and regardless, callers can do what they want do achieve quiescing in 
regards to this child, so there’s nothing we can check”.  It feels like 
we can just say outright that there’s an informal contract that we can’t 
formally verify here, but callers naturally still must adhere to it.  It 
would be interesting to know (and note) why that is, though, i.e. why we 
can’t have parents be drained through the BdrvChild object for the child 
that is being removed.

I understand the intention behind the assertion to be: “We require the 
parent not to have in-flight requests to the BdrvChild object 
manipulated here.  In most cases, we verify that by requiring the parent 
be drained through this BdrvChild object.  However, when a child is 
being removed, we skip formal verification, because we leave callers 
free in deciding how to ensure that no requests are in flight.  Usually, 
they will still have the parent be drained (even if not through this 
BdrvChild object), but we don’t require that.”

I may well be wrong, but then it would be good for a comment to correct 
me. :)

(Interestingly, because bdrv_replace_child_noperm() no longer polls 
itself, it can’t know for sure that `child->parent_quiesce_counter > 0` 
means that there are no requests in flight.)

> +    assert(!new_bs || child->parent_quiesce_counter);
>       assert(old_bs != new_bs);
>       GLOBAL_STATE_CODE();

[...]

> @@ -2865,9 +2875,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
>       /*
>        * If the old child node was drained but the new one is not, allow

This now also covers the case where there was no old child node, but the 
parent was simply drained via an empty BdrvChild by the caller.

>        * requests to come in only after the new node has been attached.
> -     *
> -     * Update new_bs_quiesce_counter because bdrv_parent_drained_begin_single()
> -     * polls, which could have changed the value.
>        */
>       new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
>       if (!new_bs_quiesce_counter && child->parent_quiesce_counter) {
> @@ -3004,6 +3011,12 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
>       }
>   
>       bdrv_ref(child_bs);
> +    /*
> +     * Let every new BdrvChild start drained, inserting it in the graph with
> +     * bdrv_replace_child_noperm() will undrain it if the child node is not
> +     * drained. The child was only just created, so polling is not necessary.

I feel like this is hiding some complexity.  Unless I missed something, 
draining a BdrvChild always meant draining the parent. But here, it 
absolutely does not mean that, and maybe that deserves a big warning sign?

Beginning a drain without poll means quiescing.  You assert that there 
can be no requests to the new child, which I agree on[1].  The 
combination of no new requests coming in, and no requests being there at 
this point is what being drained means.  So @new_child is indeed “drained”.

But the parent isn’t drained, because it isn’t polled.  There may still 
be requests in flight to its other children.  That’s really interesting, 
and I found it extremely confusing until I wrote ten paragraphs in reply 
here and scrapped most of them again.  Whenever I find this to be my 
reaction to something, I really wish for a detailed comment that 
explains the situation.

I would like the comment to:
- Expand on what “only just created” means.  As it’s written, that could 
mean relying on a race condition.  At which point would the parent be 
able to send requests?  (I assume either the .attach() in 
bdrv_replace_child_noperm(), or when this function returns, whichever 
comes first.  (The former always comes first.))
- Say in more detail that calling bdrv_parent_drained_begin_single() 
without polling will quiesce the parent, preventing new requests from 
appearing.
- Note that because there are no requests in flight, and because no new 
requests can then appear, the BdrvChild is drained.
- Note that the parent is only quiesced, not drained, and may still have 
requests in flight to other children, but naturally we don’t care about 
them.

I feel like the comment tries to hide all that complexity simply by 
avoiding the word “parent”.

[1] As far as I can piece together, no requests to the new child can 
have started yet, because this function creates the BdrvChild object, so 
before it is returned to the caller (or BdrvChildClass.attach() is 
called in bdrv_replace_child_noperm()), the block driver won’t generate 
requests to it.

Hanna

> +     */
> +    bdrv_parent_drained_begin_single(new_child, false);
>       bdrv_replace_child_noperm(new_child, child_bs);
>   
>       BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
Kevin Wolf Nov. 17, 2022, 1:27 p.m. UTC | #3
Am 14.11.2022 um 21:22 hat Hanna Reitz geschrieben:
> On 08.11.22 13:37, Kevin Wolf wrote:
> > In order to make sure that bdrv_replace_child_noperm() doesn't have to
> > poll any more, get rid of the bdrv_parent_drained_begin_single() call.
> > 
> > This is possible now because we can require that the child is already
> > drained when the function is called (it better be, having in-flight
> > requests while modifying the graph isn't going to end well!) and we
> > don't call the parent drain callbacks more than once.
> > 
> > The additional drain calls needed in callers cause the test case to run
> > its code in the drain handler too early (bdrv_attach_child() drains
> > now), so modify it to only enable the code after the test setup has
> > completed.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > ---
> >   include/block/block-io.h     |  8 ++++
> >   block.c                      | 72 +++++++++++++++++++++++++-----------
> >   block/io.c                   |  2 +-
> >   tests/unit/test-bdrv-drain.c | 10 +++++
> >   4 files changed, 70 insertions(+), 22 deletions(-)
> 
> I find this change complicated.  I understand it’s the point of the series,
> but I find it difficult to grasp.  But I guess there can be no drain series
> without such a patch.
> 
> As usual, I was very skeptical of the code at first, and over time slowly
> realized that I’m mostly confused by the comments, and the code seems fine. 
> Ah, well.

I spent a while thinking about how to do things differently, but I think
my conclusion is that only improving the comments is probably the best.

The real condition in bdrv_replace_child_noperm() is: If you want to
change the BdrvChild to point to a drained node with child->bs, either
child->parent_quiesce_counter must already be non-zero or you must be
able to increase it and keep the parent's quiesce_counter consistent
with that, but without polling or starting new requests.

This patch generalised the condition to non-drained child nodes as well
just because it's easier to verify when it's a condition that applies
always. It also picked the first option, child->parent_quiesce_counter
already being non-zero.

If we wanted to implement the second option, the potential problem is in
the "without starting new requests" condition, because .drained_begin
can start new requests on the child (e.g. restarting throttled requests,
so that we can actually drain the request queue).

Note that we're not even interested in draining the request queue here,
we already assume that the parent doesn't have active requests on the
child (otherwise we would always have gotten crashes). We just want to
get the counters to agree with each other.

Maybe the most reasonable approach for this would be formally requiring
that .drained_begin both in BdrvChildClass and BlockDriver not do
anything if the thing in question is already quiesced (I think this is
true in practice; for BlockDriver probably only after the earlier
patches in this series) and then assert(bdrv_parent_is_drained(child))
in bdrv_replace_child_noperm(), which would require a new BdrvChildClass
callback. Then the bdrv_parent_drained_begin_single() call could stay
around, but wouldn't have to poll any more.

Of course, all of the drains in this patch would have to stay anyway to
make sure that the parent is already drained. So I'm not sure if it's
any simpler or better in any way than requiring that the parent was
already drained through _this_ BdrvChild.

What do you think?

> [...]
> 
> > diff --git a/block.c b/block.c
> > index 5f5f79cd16..12039e9b8a 100644
> > --- a/block.c
> > +++ b/block.c
> 
> [...]
> 
> > @@ -2414,12 +2428,20 @@ static TransactionActionDrv bdrv_replace_child_drv = {
> >    *
> >    * Note: real unref of old_bs is done only on commit.
> >    *
> > + * Both child and new_bs (if non-NULL) must be drained. new_bs must be kept
> > + * drained until the transaction is completed (this automatically implies that
> > + * child remains drained, too).
> 
> I find “child” extremely ambiguous.  The problem is that there generally is
> no way to drain a BdrvChild object, is there?  You can only drain the BDS in
> it, which then drains the parent through the BdrvChild object. 
> Historically, I don’t think there was ever a place where we cared about the
> BdrvChild object between the two to be drained, was there?  I mean, now
> there apparently is, in bdrv_child_attach_common(), but that’s a different
> story.

I think we've always cared about the parent drain happening through the
BdrvChild, though, at least since your commit 804db8ea which introduced
BdrvChild.parent_quiesce_counter.

Whether or not calling the BdrvChild drained in this case is probably
more a question of terminology.

If we want to avoid calling a BdrvChild drained, I guess I could require
child->bs to be drained instead, which implies the condition we're
really interested in.

> So the problem is that “draining a BdrvChild object” generally appears in
> the context of bdrv_parent_drained_*() functions, i.e. actually functions
> draining the parent.  Which makes it a bit confusing to refer to a BdrvChild
> object just as “child”.
> 
> I know that “child” here refers to the variable (or does it not?), but that
> is why I really prefer marking variables that are just plain English words,
> e.g. as @child or `child`, so it’s clear they are a name and not a noun.

That's fair, I should add that @. (Yes, it does.)

> In any case, because the concept is generally to drain the `child->bs`
> instead of the BdrvChild object directly, I understand the comment to mean:
> “Both the old child (`child->bs`) and `new_bs` (if non-NULL) must be
> drained.  `new_bs` must be kept drained until the transaction is completed. 
> This implies that the parent too will be kept drained until the transaction
> is completed by the BdrvChild object `child`.”
> 
> Or am I misunderstanding something, and the distinction between `child` and
> `child->bs` and the parent node is important here? (Would be good to know.
> :))

I'm not sure how a transaction "is completed by the BdrvChild object"
(isn't it the caller that finalises the transaction?), but I think
otherwise that's equivalent to what I was trying to express.

Oh, is it just the word order that confused me, and you really mean
"will be kept drained by the BdrvChild object until the transation is
completed (by the caller)"?

> > + *
> >    * The function doesn't update permissions, caller is responsible for this.
> >    */
> >   static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
> >                                       Transaction *tran)
> >   {
> >       BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
> > +
> > +    assert(child->parent_quiesce_counter);
> > +    assert(!new_bs || new_bs->quiesce_counter);
> > +
> >       *s = (BdrvReplaceChildState) {
> >           .child = child,
> >           .old_bs = child->bs,
> > @@ -2818,6 +2840,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
> 
> This function now has its callers fulfill kind of a complicated contract.  I
> would prefer that to be written out in a doc comment, especially because it
> sounds like the assertions can’t cover everything (i.e. callers that remove
> a child are required to have stopped issuing requests to that child, but
> they are free to do that in any way they want, so no assertion will check
> for it here).

Ok, I can add a comment.

I don't think the contract is complicated: The parent has to be drained
through this BdrvChild (because new_bs could already be drained), except
for new_bs == NULL which is obviously not attaching the child to a
drained node.

> >       int new_bs_quiesce_counter;
> >       assert(!child->frozen);
> > +    /*
> > +     * When removing the child, it's the callers responsibility to make sure
> > +     * that no requests are in flight any more. Usually the parent is drained,
> > +     * but not through child->parent_quiesce_counter.
> > +     */
> 
> When I see a comment above an assertion, I immediately assume it is going to
> describe what the assertion checks.  Unless I’m misunderstanding something
> (again?), this comment actually describes what the assertion *does not*
> check.  I find that confusing, especially because the comment leads with
> “it’s the caller’s responsibility”, which to me implies “and that’s why we
> check it here in this assertion”, because assertions are there to verify
> that contracts are met.

The comment is bad, I must have been confused myself while writing it.

The logic here isn't even about request in flights. It's true that there
must be none, but that's a separate requirement. What it is about is
maintaining consistency between child->parent_quiesce_counter and the
parent's own quiesce_counter in case of switching to a drained node,
without having to poll - which is only possible if the parent is already
drained.

That we require the parent to be drained through this specific BdrvChild
is a choice with the intention to keep things simpler, as explained
above.

> The assertion verifies that the parent must be drained (through @child),
> unless the child is removed, which case isn’t covered by the assertion. 
> That “isn’t covered” is then described by the comment, right?
> 
> I’d prefer the comment to lead with describing what the assertion does
> check, and then transitioning to “But in case the child is removed, we
> ignore that, and just note that it’s the caller’s responsibility to...”.
> 
> Also, the comment doesn’t explicitly say why we don’t check it in the
> assertion.  It says “usually” and “child->parent_quiesce_counter”, which
> implies “can’t get any information from child->parent_quiesce_counter, and
> regardless, callers can do what they want do achieve quiescing in regards to
> this child, so there’s nothing we can check”.  It feels like we can just say
> outright that there’s an informal contract that we can’t formally verify
> here, but callers naturally still must adhere to it.  It would be
> interesting to know (and note) why that is, though, i.e. why we can’t have
> parents be drained through the BdrvChild object for the child that is being
> removed.

We could require that, it would just be more complicated for the callers
that pass a constant NULL, for no real benefit.

> I understand the intention behind the assertion to be: “We require the
> parent not to have in-flight requests to the BdrvChild object manipulated
> here.  In most cases, we verify that by requiring the parent be drained
> through this BdrvChild object.  However, when a child is being removed, we
> skip formal verification, because we leave callers free in deciding how to
> ensure that no requests are in flight.  Usually, they will still have the
> parent be drained (even if not through this BdrvChild object), but we don’t
> require that.”
> 
> I may well be wrong, but then it would be good for a comment to correct me.
> :)
> 
> (Interestingly, because bdrv_replace_child_noperm() no longer polls itself,
> it can’t know for sure that `child->parent_quiesce_counter > 0` means that
> there are no requests in flight.)
> 
> > +    assert(!new_bs || child->parent_quiesce_counter);
> >       assert(old_bs != new_bs);
> >       GLOBAL_STATE_CODE();
> 
> [...]
> 
> > @@ -2865,9 +2875,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
> >       /*
> >        * If the old child node was drained but the new one is not, allow
> 
> This now also covers the case where there was no old child node, but the
> parent was simply drained via an empty BdrvChild by the caller.

I'm not sure how to express this concisely if we want to avoid calling
the BdrvChild itself drained.

> >        * requests to come in only after the new node has been attached.
> > -     *
> > -     * Update new_bs_quiesce_counter because bdrv_parent_drained_begin_single()
> > -     * polls, which could have changed the value.
> >        */
> >       new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
> >       if (!new_bs_quiesce_counter && child->parent_quiesce_counter) {
> > @@ -3004,6 +3011,12 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
> >       }
> >       bdrv_ref(child_bs);
> > +    /*
> > +     * Let every new BdrvChild start drained, inserting it in the graph with
> > +     * bdrv_replace_child_noperm() will undrain it if the child node is not
> > +     * drained. The child was only just created, so polling is not necessary.
> 
> I feel like this is hiding some complexity.  Unless I missed something,
> draining a BdrvChild always meant draining the parent. But here, it
> absolutely does not mean that, and maybe that deserves a big warning sign?
> 
> Beginning a drain without poll means quiescing.  You assert that there can
> be no requests to the new child, which I agree on[1].  The combination of no
> new requests coming in, and no requests being there at this point is what
> being drained means.  So @new_child is indeed “drained”.
> 
> But the parent isn’t drained, because it isn’t polled.  There may still be
> requests in flight to its other children.  That’s really interesting, and I
> found it extremely confusing until I wrote ten paragraphs in reply here and
> scrapped most of them again.  Whenever I find this to be my reaction to
> something, I really wish for a detailed comment that explains the situation.
> 
> I would like the comment to:
> - Expand on what “only just created” means.  As it’s written, that could
> mean relying on a race condition.  At which point would the parent be able
> to send requests?  (I assume either the .attach() in
> bdrv_replace_child_noperm(), or when this function returns, whichever comes
> first.  (The former always comes first.))

I don't think .attach is supposed to create requests - even less so
while the BdrvChild is drained. It may schedule a BH, but that won't be
executed until this function returns.

This is not documented explicitly, maybe we should document it.

I suppose .drained_end can create requests in general, but it wouldn't
make sense to me if it did that for a new child. It generally just
resumes operations that were stopped because of the drain, but there was
no operation on a new child yet.

> - Say in more detail that calling bdrv_parent_drained_begin_single() without
> polling will quiesce the parent, preventing new requests from appearing.
> - Note that because there are no requests in flight, and because no new
> requests can then appear, the BdrvChild is drained.
> - Note that the parent is only quiesced, not drained, and may still have
> requests in flight to other children, but naturally we don’t care about
> them.

All of this is true, but at the same time not related to what the
bdrv_parent_drained_begin_single() call is meant for - increasing the
parent's quiesce_count from the BdrvChild before calling
bdrv_replace_child_noperm(), so that we don't have to do it inside of
that function where we don't know that we don't have to poll.

That there are no requests in flight when you change child->bs is a
requirement that we already had before this patch.

If it feels better to you, we could even just poll here (and drop patch
13 because it would still be used).

The part that is important in the context of Emanuele's patches that
will follow is that we poll outside of a bdrv_graph_wrlock/wrunlock()
section. This might mean that we'd have to pull the polling further
down into the callers in the long run. Emanuele's current patches only
put the lock in bdrv_replace_child_noperm(), but generally speaking you
wouldn't want the graph to change between two related changes, so I'm
almost sure that the lock will be taken in callers in the future.

> I feel like the comment tries to hide all that complexity simply by avoiding
> the word “parent”.
> 
> [1] As far as I can piece together, no requests to the new child can have
> started yet, because this function creates the BdrvChild object, so before
> it is returned to the caller (or BdrvChildClass.attach() is called in
> bdrv_replace_child_noperm()), the block driver won’t generate requests to
> it.

Kevin
diff mbox series

Patch

diff --git a/include/block/block-io.h b/include/block/block-io.h
index 5b54ed4672..ddce8550a9 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -290,6 +290,14 @@  bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
  */
 void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);
 
+/**
+ * bdrv_parent_drained_poll_single:
+ *
+ * Returns true if there is any pending activity to cease before @c can be
+ * called quiesced, false otherwise.
+ */
+bool bdrv_parent_drained_poll_single(BdrvChild *c);
+
 /**
  * bdrv_parent_drained_end_single:
  *
diff --git a/block.c b/block.c
index 5f5f79cd16..12039e9b8a 100644
--- a/block.c
+++ b/block.c
@@ -2399,6 +2399,20 @@  static void bdrv_replace_child_abort(void *opaque)
 
     GLOBAL_STATE_CODE();
     /* old_bs reference is transparently moved from @s to @s->child */
+    if (!s->child->bs) {
+        /*
+         * The parents were undrained when removing old_bs from the child. New
+         * requests can't have been made, though, because the child was empty.
+         *
+         * TODO Make bdrv_replace_child_noperm() transactionable to avoid
+         * undraining the parent in the first place. Once this is done, having
+         * new_bs drained when calling bdrv_replace_child_tran() is not a
+         * requirement any more.
+         */
+        bdrv_parent_drained_begin_single(s->child, false);
+        assert(!bdrv_parent_drained_poll_single(s->child));
+    }
+    assert(s->child->parent_quiesce_counter);
     bdrv_replace_child_noperm(s->child, s->old_bs);
     bdrv_unref(new_bs);
 }
@@ -2414,12 +2428,20 @@  static TransactionActionDrv bdrv_replace_child_drv = {
  *
  * Note: real unref of old_bs is done only on commit.
  *
+ * Both child and new_bs (if non-NULL) must be drained. new_bs must be kept
+ * drained until the transaction is completed (this automatically implies that
+ * child remains drained, too).
+ *
  * The function doesn't update permissions, caller is responsible for this.
  */
 static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
                                     Transaction *tran)
 {
     BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
+
+    assert(child->parent_quiesce_counter);
+    assert(!new_bs || new_bs->quiesce_counter);
+
     *s = (BdrvReplaceChildState) {
         .child = child,
         .old_bs = child->bs,
@@ -2818,6 +2840,12 @@  static void bdrv_replace_child_noperm(BdrvChild *child,
     int new_bs_quiesce_counter;
 
     assert(!child->frozen);
+    /*
+     * When removing the child, it's the callers responsibility to make sure
+     * that no requests are in flight any more. Usually the parent is drained,
+     * but not through child->parent_quiesce_counter.
+     */
+    assert(!new_bs || child->parent_quiesce_counter);
     assert(old_bs != new_bs);
     GLOBAL_STATE_CODE();
 
@@ -2825,16 +2853,6 @@  static void bdrv_replace_child_noperm(BdrvChild *child,
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
 
-    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
-
-    /*
-     * If the new child node is drained but the old one was not, flush
-     * all outstanding requests to the old child node.
-     */
-    if (new_bs_quiesce_counter && !child->parent_quiesce_counter) {
-        bdrv_parent_drained_begin_single(child, true);
-    }
-
     if (old_bs) {
         if (child->klass->detach) {
             child->klass->detach(child);
@@ -2849,14 +2867,6 @@  static void bdrv_replace_child_noperm(BdrvChild *child,
         assert_bdrv_graph_writable(new_bs);
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
 
-        /*
-         * Polling in bdrv_parent_drained_begin_single() may have led to the new
-         * node's quiesce_counter having been decreased.  Not a problem, we just
-         * need to recognize this here and then invoke drained_end appropriately
-         * more often.
-         */
-        assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
-
         if (child->klass->attach) {
             child->klass->attach(child);
         }
@@ -2865,9 +2875,6 @@  static void bdrv_replace_child_noperm(BdrvChild *child,
     /*
      * If the old child node was drained but the new one is not, allow
      * requests to come in only after the new node has been attached.
-     *
-     * Update new_bs_quiesce_counter because bdrv_parent_drained_begin_single()
-     * polls, which could have changed the value.
      */
     new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
     if (!new_bs_quiesce_counter && child->parent_quiesce_counter) {
@@ -3004,6 +3011,12 @@  static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
     }
 
     bdrv_ref(child_bs);
+    /*
+     * Let every new BdrvChild start drained, inserting it in the graph with
+     * bdrv_replace_child_noperm() will undrain it if the child node is not
+     * drained. The child was only just created, so polling is not necessary.
+     */
+    bdrv_parent_drained_begin_single(new_child, false);
     bdrv_replace_child_noperm(new_child, child_bs);
 
     BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
@@ -5053,7 +5066,10 @@  static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
     }
 
     if (child->bs) {
+        BlockDriverState *bs = child->bs;
+        bdrv_drained_begin(bs);
         bdrv_replace_child_tran(child, NULL, tran);
+        bdrv_drained_end(bs);
     }
 
     tran_add(tran, &bdrv_remove_child_drv, child);
@@ -5070,6 +5086,15 @@  static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
     bdrv_remove_child(bdrv_filter_or_cow_child(bs), tran);
 }
 
+static void undrain_on_clean_cb(void *opaque)
+{
+    bdrv_drained_end(opaque);
+}
+
+static TransactionActionDrv undrain_on_clean = {
+    .clean = undrain_on_clean_cb,
+};
+
 static int bdrv_replace_node_noperm(BlockDriverState *from,
                                     BlockDriverState *to,
                                     bool auto_skip, Transaction *tran,
@@ -5079,6 +5104,11 @@  static int bdrv_replace_node_noperm(BlockDriverState *from,
 
     GLOBAL_STATE_CODE();
 
+    bdrv_drained_begin(from);
+    bdrv_drained_begin(to);
+    tran_add(tran, &undrain_on_clean, from);
+    tran_add(tran, &undrain_on_clean, to);
+
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
         assert(c->bs == from);
         if (!should_update_child(c, to)) {
diff --git a/block/io.c b/block/io.c
index 4a83359a8f..d0f641926f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -80,7 +80,7 @@  static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
     }
 }
 
-static bool bdrv_parent_drained_poll_single(BdrvChild *c)
+bool bdrv_parent_drained_poll_single(BdrvChild *c)
 {
     if (c->klass->drained_poll) {
         return c->klass->drained_poll(c);
diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index 172bc6debc..2686a8acee 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -1654,6 +1654,7 @@  static void test_drop_intermediate_poll(void)
 
 
 typedef struct BDRVReplaceTestState {
+    bool setup_completed;
     bool was_drained;
     bool was_undrained;
     bool has_read;
@@ -1738,6 +1739,10 @@  static void bdrv_replace_test_drain_begin(BlockDriverState *bs)
 {
     BDRVReplaceTestState *s = bs->opaque;
 
+    if (!s->setup_completed) {
+        return;
+    }
+
     if (!s->drain_count) {
         s->drain_co = qemu_coroutine_create(bdrv_replace_test_drain_co, bs);
         bdrv_inc_in_flight(bs);
@@ -1769,6 +1774,10 @@  static void bdrv_replace_test_drain_end(BlockDriverState *bs)
 {
     BDRVReplaceTestState *s = bs->opaque;
 
+    if (!s->setup_completed) {
+        return;
+    }
+
     g_assert(s->drain_count > 0);
     if (!--s->drain_count) {
         s->was_undrained = true;
@@ -1867,6 +1876,7 @@  static void do_test_replace_child_mid_drain(int old_drain_count,
     bdrv_ref(old_child_bs);
     bdrv_attach_child(parent_bs, old_child_bs, "child", &child_of_bds,
                       BDRV_CHILD_COW, &error_abort);
+    parent_s->setup_completed = true;
 
     for (i = 0; i < old_drain_count; i++) {
         bdrv_drained_begin(old_child_bs);