@@ -620,18 +620,10 @@ void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);
* bdrv_parent_drained_end_single:
*
* End a quiesced section for the parent of @c.
+ * c->parent_quiesced must be true.
*/
void bdrv_parent_drained_end_single(BdrvChild *c);
-/**
- * bdrv_parent_drained_end:
- *
- * End a quiesced section of all users of @bs. This is part of
- * bdrv_drained_end.
- */
-void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
- bool ignore_bds_parents);
-
/**
* bdrv_drain_poll:
*
@@ -730,13 +730,10 @@ struct BdrvChild {
bool frozen;
/*
- * How many times the parent of this child has been drained
- * (through role->drained_*).
- * Usually, this is equal to bs->quiesce_counter (potentially
- * reduced by bdrv_drain_all_count). It may differ while the
- * child is entering or leaving a drained section.
+ * Whether the parent of this child has been drained through
+ * role->drained_*.
*/
- int parent_quiesce_counter;
+ bool parent_quiesced;
QLIST_ENTRY(BdrvChild) next;
QLIST_ENTRY(BdrvChild) next_parent;
@@ -2159,7 +2159,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
BlockDriverState *new_bs)
{
BlockDriverState *old_bs = child->bs;
- int i;
assert(!child->frozen);
@@ -2173,12 +2172,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
if (child->role->detach) {
child->role->detach(child);
}
- while (child->parent_quiesce_counter) {
+ if (child->parent_quiesced) {
bdrv_parent_drained_end_single(child);
}
QLIST_REMOVE(child, next_parent);
} else {
- assert(child->parent_quiesce_counter == 0);
+ assert(!child->parent_quiesced);
}
child->bs = new_bs;
@@ -2191,7 +2190,7 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
num -= bdrv_drain_all_count;
}
assert(num >= 0);
- for (i = 0; i < num; i++) {
+ if (num) {
bdrv_parent_drained_begin_single(child, true);
}
}
@@ -57,24 +57,47 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
void bdrv_parent_drained_end_single(BdrvChild *c)
{
- assert(c->parent_quiesce_counter > 0);
- c->parent_quiesce_counter--;
+ assert(c->parent_quiesced);
+ c->parent_quiesced = false;
if (c->role->drained_end) {
c->role->drained_end(c);
}
}
-void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
- bool ignore_bds_parents)
+static void bdrv_parent_drained_end(BlockDriverState *bs)
{
- BdrvChild *c, *next;
+ BdrvChild *c;
- QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
- if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
- continue;
- }
- bdrv_parent_drained_end_single(c);
+ if (bs->quiesce_counter) {
+ return;
}
+
+ /*
+ * The list of parents can change, so repeat until all are
+ * unquiesced (or until bs->quiesce_counter is no longer zero).
+ */
+ do {
+ QLIST_FOREACH(c, &bs->parents, next_parent) {
+ if (!c->parent_quiesced) {
+ continue;
+ }
+
+ if (bs->quiesce_counter) {
+ /*
+ * We can just leave here. The first thing
+ * bdrv_parent_drained_end_single() does is to set
+ * c->parent_quiesced to false. If something decides
+ * to drain @bs while we are unquiescing some parent,
+ * it will thus redrain that parent (and everything
+ * else that we have already unquiesced).
+ */
+ return;
+ }
+
+ bdrv_parent_drained_end_single(c);
+ break;
+ }
+ } while (c);
}
static bool bdrv_parent_drained_poll_single(BdrvChild *c)
@@ -103,9 +126,11 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
- c->parent_quiesce_counter++;
- if (c->role->drained_begin) {
- c->role->drained_begin(c);
+ if (!c->parent_quiesced) {
+ c->parent_quiesced = true;
+ if (c->role->drained_begin) {
+ c->role->drained_begin(c);
+ }
}
if (poll) {
BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
@@ -433,9 +458,9 @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
/* Re-enable things in child-to-parent order */
bdrv_drain_invoke(bs, false);
- bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
-
old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
+ bdrv_parent_drained_end(bs);
+
if (old_quiesce_counter == 1) {
aio_enable_external(bdrv_get_aio_context(bs));
}
@@ -446,10 +446,14 @@ static void test_multiparent(void)
do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
- g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+ /*
+ * @backing is still drained by @bs_a, so it has not unquiesced
+ * its parents yet (and @bs_a retains its qc of 2).
+ */
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
g_assert_cmpint(backing->quiesce_counter, ==, 1);
- g_assert_cmpint(a_s->drain_count, ==, 1);
+ g_assert_cmpint(a_s->drain_count, ==, 2);
g_assert_cmpint(b_s->drain_count, ==, 1);
g_assert_cmpint(backing_s->drain_count, ==, 1);
@@ -505,27 +509,28 @@ static void test_graph_change_drain_subtree(void)
do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
bdrv_set_backing_hd(bs_b, backing, &error_abort);
- g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
- g_assert_cmpint(backing->quiesce_counter, ==, 5);
- g_assert_cmpint(a_s->drain_count, ==, 5);
- g_assert_cmpint(b_s->drain_count, ==, 5);
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 4); /* 3 + !!2 */
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 3); /* !!3 + 2 */
+ g_assert_cmpint(backing->quiesce_counter, ==, 5); /* 3 + 2 */
+ g_assert_cmpint(a_s->drain_count, ==, 4);
+ g_assert_cmpint(b_s->drain_count, ==, 3);
g_assert_cmpint(backing_s->drain_count, ==, 5);
bdrv_set_backing_hd(bs_b, NULL, &error_abort);
- g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
+ /* @backing remains quiesced, so it does not unquiesce @bs_a */
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 4);
g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
g_assert_cmpint(backing->quiesce_counter, ==, 3);
- g_assert_cmpint(a_s->drain_count, ==, 3);
+ g_assert_cmpint(a_s->drain_count, ==, 4);
g_assert_cmpint(b_s->drain_count, ==, 2);
g_assert_cmpint(backing_s->drain_count, ==, 3);
bdrv_set_backing_hd(bs_b, backing, &error_abort);
- g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
- g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 4);
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 3);
g_assert_cmpint(backing->quiesce_counter, ==, 5);
- g_assert_cmpint(a_s->drain_count, ==, 5);
- g_assert_cmpint(b_s->drain_count, ==, 5);
+ g_assert_cmpint(a_s->drain_count, ==, 4);
+ g_assert_cmpint(b_s->drain_count, ==, 3);
g_assert_cmpint(backing_s->drain_count, ==, 5);
do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
Reliably ending the drain on a BDS's parents is quite difficult. What we have to achieve is to undrain exactly those parents that have been added to the BDS while its quiesce_counter was elevated. If we move decrementing the quiesce_counter before the invocation of bdrv_parent_drained_end(), that leaves us with the parents that existed at the start of this function. That seems simple enough, because new parents are always added at the beginning of the list, so just iterating through it from start to finish should give us exactly the parents we are looking for. Unfortunately, there is a catch. Unquiescing one parent can lead to another deciding to detach the child. In the process, the BdrvChild object may be destroyed. That means using QLIST_FOREACH_SAFE() to iterate the list is actually wrong: The @next pointer may be stale by the time we get to the next iteration. Using QLIST_FOREACH() would be just as wrong, though, because the current BdrvChild object may be destroyed just as well. It follows that we cannot just iterate over the list from start to finish. Some ideas how to solve this problem: - Store a list of the parent pointers at the beginning of the function, then try to unquiesce them all (as long as they are in bs->parents). Does not work because pointers may be reused by new parents. - Add refcounts to BdrvChild objects. Kind of silly because only the parent really has a valid reference. Once that goes away, all fields must be cleared. Therefore, all we could guarantee is that the list pointers (next/next_parent) stay valid. Also, parents can simply change the child a BdrvChild refers to. If they do that, the BdrvChild object remains accessible, but (1) it refers to the wrong child, and (2) next_parent suddenly refers to a different list. So maybe this can be made to work, but it would always be kind of silly. - Check the difference between the parent_quiesce_counters and the actual bs->quiesce_counter. In theory, both should be equal after bdrv_parent_drained_end() (reduced by bdrv_drain_all_count for BDS parents). So we could reiterate the parent list (unquiescing one parent at a time) until all parents have the desired parent_quiesce_counter. In practice, this is difficult -- I have tried. bdrv_drain_all_count may be one too high because somewhere down the stack there is a bdrv_drain_all_end() currently running. Also, I suppose something can concurrently modify bs->quiesce_counter, and I am not sure how to handle such cases. In the best case, this would lead to rather complicated code that I could not trust but only pray that it works. In the worst case, my prayers are not heard. We can get around the whole issue by observing that it really does not matter whether a parent is quiesced one time or ten. We just have to quiesce it once the moment the child tries to change from being unquiesced to being quiesced. We have to unquiesce it when the child is unquiesced again. Therefore, we can just make the parent_quiesce_counter a boolean and call it parent_quiesced. When bdrv_parent_drained_end() sees bs->quiesce_counter going to 0, we unquiesce all parents. bdrv_parent_drained_begin() in turn quiesces all unquiesced parents. (This means we have to decrement bs->quiesce_counter before bdrv_parent_drained_end().) Signed-off-by: Max Reitz <mreitz@redhat.com> --- include/block/block.h | 10 +------ include/block/block_int.h | 9 +++---- block.c | 7 +++-- block/io.c | 55 ++++++++++++++++++++++++++++----------- tests/test-bdrv-drain.c | 31 +++++++++++++--------- 5 files changed, 65 insertions(+), 47 deletions(-)