Message ID | e6fa1fe465be9051316b29668c0a255341b6b1f1.1459776815.git.berto@igalia.com (mailing list archive)
---|---
State | New, archived
On 04.04.2016 15:43, Alberto Garcia wrote:
> This patch makes the 'device' parameter of the 'block-stream' command
> accept a node name as well as a device name.
>
> In addition to that, operation blockers will be checked in all
> intermediate nodes between the top and the base node.
>
> Since qmp_block_stream() now uses the error from bdrv_lookup_bs() and
> no longer returns DeviceNotFound, iotest 030 is updated to expect
> GenericError instead.
>
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> ---
>  blockdev.c             | 31 +++++++++++++++++++++++--------
>  qapi/block-core.json   | 10 +++++++---
>  tests/qemu-iotests/030 |  2 +-
>  3 files changed, 31 insertions(+), 12 deletions(-)
>
> diff --git a/blockdev.c b/blockdev.c
> index 2e7712e..bfdc0e3 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -2989,6 +2989,7 @@ void qmp_block_stream(const char *device,
>      BlockBackend *blk;
>      BlockDriverState *bs;
>      BlockDriverState *base_bs = NULL;
> +    BlockDriverState *active;
>      AioContext *aio_context;
>      Error *local_err = NULL;
>      const char *base_name = NULL;
> @@ -2997,21 +2998,19 @@ void qmp_block_stream(const char *device,
>          on_error = BLOCKDEV_ON_ERROR_REPORT;
>      }
>
> -    blk = blk_by_name(device);
> -    if (!blk) {
> -        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
> -                  "Device '%s' not found", device);
> +    bs = bdrv_lookup_bs(device, device, errp);
> +    if (!bs) {
>          return;
>      }
>
> -    aio_context = blk_get_aio_context(blk);
> +    aio_context = bdrv_get_aio_context(bs);
>      aio_context_acquire(aio_context);
>
> -    if (!blk_is_available(blk)) {
> +    blk = blk_by_name(device);
> +    if (blk && !blk_is_available(blk)) {
>          error_setg(errp, "Device '%s' has no medium", device);
>          goto out;
>      }
> -    bs = blk_bs(blk);
>
>      if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_STREAM, errp)) {
>          goto out;
> @@ -3027,6 +3026,22 @@ void qmp_block_stream(const char *device,
>          base_name = base;
>      }
>
> +    /* Look for the top-level node that contains 'bs' in its chain */
> +    active = NULL;
> +    do {
> +        active = bdrv_next(active);
> +    } while (active && !bdrv_chain_contains(active, bs));

Alternatively, you could iterate up directly from @bs. Just look for the
BdrvChild in bs->parents with .role == &child_backing.

More interesting question: What happens if you have e.g. a qcow2 file as
a quorum child, and then want to stream inside of the qcow2 backing
chain?

So maybe you should first walk up along &child_backing and then along
&child_file/&child_format. I think a generic bdrv_get_root_bs() wouldn't
hurt.

> +
> +    if (active == NULL) {
> +        error_setg(errp, "Cannot find top level node for '%s'", device);
> +        goto out;
> +    }
> +
> +    /* Check for op blockers in the top-level node too */
> +    if (bdrv_op_is_blocked(active, BLOCK_OP_TYPE_STREAM, errp)) {
> +        goto out;
> +    }
> +
>      /* if we are streaming the entire chain, the result will have no backing
>       * file, and specifying one is therefore an error */
>      if (base_bs == NULL && has_backing_file) {
> @@ -3038,7 +3053,7 @@ void qmp_block_stream(const char *device,
>      /* backing_file string overrides base bs filename */
>      base_name = has_backing_file ? backing_file : base_name;
>
> -    stream_start(bs, NULL, base_bs, base_name, has_speed ? speed : 0,
> +    stream_start(bs, active, base_bs, base_name, has_speed ? speed : 0,

stream_start() behaves differently for active == NULL and active == bs.
I think both kinds of behavior work, but it looks weird for active == bs.
Should we pass NULL in that case instead?

Max

>                   on_error, block_job_cb, bs, &local_err);
>      if (local_err) {
>          error_propagate(errp, local_err);
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 59107ed..e20193a 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -1405,6 +1405,10 @@
>  # with query-block-jobs. The operation can be stopped before it has completed
>  # using the block-job-cancel command.
>  #
> +# The node that receives the data is called the top image, can be located
> +# in any part of the whole chain and can be specified using its device
> +# or node name.
> +#
>  # If a base file is specified then sectors are not copied from that base file and
>  # its backing chain. When streaming completes the image file will have the base
>  # file as its backing file. This can be used to stream a subset of the backing
> @@ -1413,12 +1417,12 @@
>  # On successful completion the image file is updated to drop the backing file
>  # and the BLOCK_JOB_COMPLETED event is emitted.
>  #
> -# @device: the device name
> +# @device: the device or node name of the top image
>  #
>  # @base: #optional the common backing file name
>  #
> -# @backing-file: #optional The backing file string to write into the active
> -#                layer. This filename is not validated.
> +# @backing-file: #optional The backing file string to write into the top
> +#                image. This filename is not validated.
>  #
>  #                If a pathname string is such that it cannot be
>  #                resolved by QEMU, that means that subsequent QMP or
> diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
> index 3ac2443..107049b 100755
> --- a/tests/qemu-iotests/030
> +++ b/tests/qemu-iotests/030
> @@ -126,7 +126,7 @@ class TestSingleDrive(iotests.QMPTestCase):
>
>      def test_device_not_found(self):
>          result = self.vm.qmp('block-stream', device='nonexistent')
> -        self.assert_qmp(result, 'error/class', 'DeviceNotFound')
> +        self.assert_qmp(result, 'error/class', 'GenericError')
>
>
>  class TestSmallerBackingFile(iotests.QMPTestCase):
>
On Wed 27 Apr 2016 03:34:19 PM CEST, Max Reitz <mreitz@redhat.com> wrote:
>> +    /* Look for the top-level node that contains 'bs' in its chain */
>> +    active = NULL;
>> +    do {
>> +        active = bdrv_next(active);
>> +    } while (active && !bdrv_chain_contains(active, bs));
>
> Alternatively, you could iterate up directly from @bs. Just look for
> the BdrvChild in bs->parents with .role == &child_backing.

Yes, but the BdrvChild in bs->parents does not contain any pointer to
the actual parent node, so unless we want to change that structure I
wouldn't go for this solution.

> More interesting question: What happens if you have e.g. a qcow2 file
> as a quorum child, and then want to stream inside of the qcow2 backing
> chain?
>
> So maybe you should first walk up along &child_backing and then along
> &child_file/&child_format. I think a generic bdrv_get_root_bs()
> wouldn't hurt.

You're right. I'm not sure if that would have other consequences that we
should consider, so maybe we can add that later, with its own set of
tests.

Also, this is all necessary because of the problem with bdrv_reopen().
If that is ever fixed there's no need to block the active layer at all.

>> -    stream_start(bs, NULL, base_bs, base_name, has_speed ? speed : 0,
>> +    stream_start(bs, active, base_bs, base_name, has_speed ? speed : 0,
>
> stream_start() behaves differently for active == NULL and active == bs.
> I think both kinds of behavior work, but it looks weird for active ==
> bs. Should we pass NULL in that case instead?

You're right, I'll change it to pass NULL and assert(active != bs).

Berto
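A generic helper along the lines discussed above could be little more
than the patch's loop factored out. The sketch below is illustrative
only: it reuses bdrv_next() and bdrv_chain_contains() exactly as the
patch does, so, like the patch, it only follows backing chains and would
not cover the quorum case Max raises; the name bdrv_get_root_bs() is
simply the one suggested in the review.

    /* Sketch: return the first top-level node whose backing chain
     * contains @bs, or NULL if @bs is not reachable from any root. */
    static BlockDriverState *bdrv_get_root_bs(BlockDriverState *bs)
    {
        BlockDriverState *active = NULL;

        do {
            active = bdrv_next(active);
        } while (active && !bdrv_chain_contains(active, bs));

        return active;
    }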
Am 04.04.2016 um 15:43 hat Alberto Garcia geschrieben:
> This patch makes the 'device' parameter of the 'block-stream' command
> accept a node name as well as a device name.
>
> In addition to that, operation blockers will be checked in all
> intermediate nodes between the top and the base node.
>
> Since qmp_block_stream() now uses the error from bdrv_lookup_bs() and
> no longer returns DeviceNotFound, iotest 030 is updated to expect
> GenericError instead.
>
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> ---
>  blockdev.c             | 31 +++++++++++++++++++++++--------
>  qapi/block-core.json   | 10 +++++++---
>  tests/qemu-iotests/030 |  2 +-
>  3 files changed, 31 insertions(+), 12 deletions(-)
>
> diff --git a/blockdev.c b/blockdev.c
> index 2e7712e..bfdc0e3 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -2989,6 +2989,7 @@ void qmp_block_stream(const char *device,
>      BlockBackend *blk;
>      BlockDriverState *bs;
>      BlockDriverState *base_bs = NULL;
> +    BlockDriverState *active;
>      AioContext *aio_context;
>      Error *local_err = NULL;
>      const char *base_name = NULL;
> @@ -2997,21 +2998,19 @@ void qmp_block_stream(const char *device,
>          on_error = BLOCKDEV_ON_ERROR_REPORT;
>      }
>
> -    blk = blk_by_name(device);
> -    if (!blk) {
> -        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
> -                  "Device '%s' not found", device);
> +    bs = bdrv_lookup_bs(device, device, errp);
> +    if (!bs) {
>          return;
>      }
>
> -    aio_context = blk_get_aio_context(blk);
> +    aio_context = bdrv_get_aio_context(bs);
>      aio_context_acquire(aio_context);
>
> -    if (!blk_is_available(blk)) {
> +    blk = blk_by_name(device);
> +    if (blk && !blk_is_available(blk)) {
>          error_setg(errp, "Device '%s' has no medium", device);
>          goto out;
>      }
> -    bs = blk_bs(blk);
>
>      if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_STREAM, errp)) {
>          goto out;
> @@ -3027,6 +3026,22 @@ void qmp_block_stream(const char *device,
>          base_name = base;
>      }
>
> +    /* Look for the top-level node that contains 'bs' in its chain */
> +    active = NULL;
> +    do {
> +        active = bdrv_next(active);
> +    } while (active && !bdrv_chain_contains(active, bs));
> +
> +    if (active == NULL) {
> +        error_setg(errp, "Cannot find top level node for '%s'", device);
> +        goto out;
> +    }

Hm... On the one hand, I really like that you don't expect the user to
provide the active layer in QMP. This allows us to remove this wart once
we have the new op blockers.

On the other hand, this code assumes that there is only a single
top-level node. This isn't necessarily true any more these days.

Do we need to set blockers on _all_ root nodes that have the node in
their backing chain?

Kevin
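As a rough sketch of the "all root nodes" variant Kevin asks about, the
patch's loop could check every top-level node that reaches 'bs' instead
of stopping at the first match. This is only an illustration built from
the helpers the patch already uses; it covers the op-blocker check but
does not answer which of those roots stream_start() should then block.

    /* Sketch: check the STREAM op blocker on every top-level node
     * whose backing chain contains bs. */
    BlockDriverState *it;

    for (it = bdrv_next(NULL); it; it = bdrv_next(it)) {
        if (bdrv_chain_contains(it, bs) &&
            bdrv_op_is_blocked(it, BLOCK_OP_TYPE_STREAM, errp)) {
            goto out;
        }
    }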
Am 28.04.2016 um 14:20 hat Alberto Garcia geschrieben:
> On Wed 27 Apr 2016 03:34:19 PM CEST, Max Reitz <mreitz@redhat.com> wrote:
> >> +    /* Look for the top-level node that contains 'bs' in its chain */
> >> +    active = NULL;
> >> +    do {
> >> +        active = bdrv_next(active);
> >> +    } while (active && !bdrv_chain_contains(active, bs));
> >
> > Alternatively, you could iterate up directly from @bs. Just look for
> > the BdrvChild in bs->parents with .role == &child_backing.
>
> Yes, but the BdrvChild in bs->parents does not contain any pointer to
> the actual parent node, so unless we want to change that structure I
> wouldn't go for this solution.

Yes, and that's intentional. If we do things right, there shouldn't be
any need to go upwards in the tree. Our current op blockers are not done
right as they are only effective when set on the top layer. Some
temporary ugliness to deal with this is okay; breaking the assumption
that children don't know their parents would require much better
justification.

> > More interesting question: What happens if you have e.g. a qcow2 file
> > as a quorum child, and then want to stream inside of the qcow2 backing
> > chain?
> >
> > So maybe you should first walk up along &child_backing and then along
> > &child_file/&child_format. I think a generic bdrv_get_root_bs()
> > wouldn't hurt.
>
> You're right. I'm not sure if that would have other consequences that we
> should consider, so maybe we can add that later, with its own set of
> tests.
>
> Also, this is all necessary because of the problem with bdrv_reopen().
> If that is ever fixed there's no need to block the active layer at all.

This patch errors out if we can't find the active layer. Sounds safe and
appropriate for an initial version. The real solution isn't to improve
the magic to find the root node, but to remove the need to find it (by
getting the new op blockers).

Kevin
On 04/04/2016 07:43 AM, Alberto Garcia wrote:
> This patch makes the 'device' parameter of the 'block-stream' command
> accept a node name as well as a device name.
>
> In addition to that, operation blockers will be checked in all
> intermediate nodes between the top and the base node.
>
> Since qmp_block_stream() now uses the error from bdrv_lookup_bs() and
> no longer returns DeviceNotFound, iotest 030 is updated to expect
> GenericError instead.
>
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> ---
> +++ b/qapi/block-core.json
> @@ -1405,6 +1405,10 @@
>  # with query-block-jobs. The operation can be stopped before it has completed
>  # using the block-job-cancel command.
>  #
> +# The node that receives the data is called the top image, can be located
> +# in any part of the whole chain and can be specified using its device
> +# or node name.
> +#

"any part of the whole chain" feels fishy - it has to be ABOVE the base
file. That is, in a chain A <- B <- C <- D <- E, if I want to set B as
the base, then the top has to be C, D, or E (but not A).
On Fri 29 Apr 2016 05:18:26 PM CEST, Kevin Wolf wrote:
> This patch errors out if we can't find the active layer. Sounds safe
> and appropriate for an initial version. The real solution isn't to
> improve the magic to find the root node, but to remove the need to
> find it (by getting the new op blockers).

Well, I agree with the fact that what we really want is not to block the
active layer at all, but I don't see how any new op blockers are going
to solve that.

The situation is that we can't allow two block jobs in the same chain at
the moment, and I only see three solutions:

a) each job blocks the whole chain (what this series does).

b) each job checks that no other job is running on the same chain.
   Maybe cleaner? But it would require modifying all other jobs.

c) we fix bdrv_reopen() so we can actually run both jobs at the same
   time. I'm wondering if pausing all block jobs between
   bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
   trick. Opinions?

Berto
On Fri 29 Apr 2016 05:11:07 PM CEST, Kevin Wolf wrote:
>> +    if (active == NULL) {
>> +        error_setg(errp, "Cannot find top level node for '%s'", device);
>> +        goto out;
>> +    }
>
> Hm... On the one hand, I really like that you don't expect the user to
> provide the active layer in QMP. This allows us to remove this wart
> once we have the new op blockers.

Exactly, I still plan to stick to the API we discussed last year.

> On the other hand, this code assumes that there is only a single
> top-level node. This isn't necessarily true any more these days.

Hmm... if you give me an example I can test that scenario.

Berto
Am 03.05.2016 um 14:53 hat Alberto Garcia geschrieben:
> On Fri 29 Apr 2016 05:11:07 PM CEST, Kevin Wolf wrote:
> >> +    if (active == NULL) {
> >> +        error_setg(errp, "Cannot find top level node for '%s'", device);
> >> +        goto out;
> >> +    }
> >
> > Hm... On the one hand, I really like that you don't expect the user to
> > provide the active layer in QMP. This allows us to remove this wart
> > once we have the new op blockers.
>
> Exactly, I still plan to stick to the API we discussed last year.
>
> > On the other hand, this code assumes that there is only a single
> > top-level node. This isn't necessarily true any more these days.
>
> Hmm... if you give me an example I can test that scenario.

Simply reference the same node twice:

    $ x86_64-softmmu/qemu-system-x86_64 \
        -drive if=none,file=/tmp/backing.qcow2,id=backing \
        -drive file=/tmp/test.qcow2,backing=backing,id=hda \
        -drive file=/tmp/test2.qcow2,backing=backing,id=hdb

If backing.qcow2 has another backing file, you can do the intermediate
streaming to it and both hda and hdb are active layers on top of it.

Kevin
Am 03.05.2016 um 14:50 hat Alberto Garcia geschrieben:
> On Fri 29 Apr 2016 05:18:26 PM CEST, Kevin Wolf wrote:
> > This patch errors out if we can't find the active layer. Sounds safe
> > and appropriate for an initial version. The real solution isn't to
> > improve the magic to find the root node, but to remove the need to
> > find it (by getting the new op blockers).
>
> Well, I agree with the fact that what we really want is not to block the
> active layer at all, but I don't see how any new op blockers are going
> to solve that.
>
> The situation is that we can't allow two block jobs in the same chain at
> the moment, and I only see three solutions:
>
> a) each job blocks the whole chain (what this series does).
>
> b) each job checks that no other job is running on the same chain.
>    Maybe cleaner? But it would require modifying all other jobs.
>
> c) we fix bdrv_reopen() so we can actually run both jobs at the same
>    time. I'm wondering if pausing all block jobs between
>    bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
>    trick. Opinions?

I would have to read up the details of the problem again, but I think
with bdrv_drained_begin/end() we actually have the right tool now to fix
it properly. We may need to pull up the drain (bdrv_drain_all() today)
from bdrv_reopen_multiple() to its caller and just assert it in the
function itself, but there shouldn't be much more to it than that.

Kevin
On Tue 03 May 2016 03:18:39 PM CEST, Kevin Wolf wrote:
>> > On the other hand, this code assumes that there is only a single
>> > top-level node. This isn't necessarily true any more these days.
>>
>> Hmm... if you give me an example I can test that scenario.
>
> Simply reference the same node twice:
>
>     $ x86_64-softmmu/qemu-system-x86_64 \
>         -drive if=none,file=/tmp/backing.qcow2,id=backing \
>         -drive file=/tmp/test.qcow2,backing=backing,id=hda \
>         -drive file=/tmp/test2.qcow2,backing=backing,id=hdb

You're right, but maybe we need to add additional checks to some of the
existing commands then, because if you run e.g block-commit on hda
you're gonna corrupt hdb.

Berto
On Tue 03 May 2016 03:23:24 PM CEST, Kevin Wolf wrote:
>> c) we fix bdrv_reopen() so we can actually run both jobs at the same
>>    time. I'm wondering if pausing all block jobs between
>>    bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
>>    trick. Opinions?
>
> I would have to read up the details of the problem again, but I think
> with bdrv_drained_begin/end() we actually have the right tool now to fix
> it properly. We may need to pull up the drain (bdrv_drain_all() today)
> from bdrv_reopen_multiple() to its caller and just assert it in the
> function itself, but there shouldn't be much more to it than that.

I think that's not enough, see point 2) here:

https://lists.gnu.org/archive/html/qemu-block/2015-12/msg00180.html

"I've been taking a look at the bdrv_drained_begin/end() API, but as I
understand it it prevents requests from a different AioContext.
Since all BDS in the same chain share the same context it does not
really help here."

Berto
Am 03.05.2016 um 15:33 hat Alberto Garcia geschrieben:
> On Tue 03 May 2016 03:23:24 PM CEST, Kevin Wolf wrote:
> >> c) we fix bdrv_reopen() so we can actually run both jobs at the same
> >>    time. I'm wondering if pausing all block jobs between
> >>    bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
> >>    trick. Opinions?
> >
> > I would have to read up the details of the problem again, but I think
> > with bdrv_drained_begin/end() we actually have the right tool now to fix
> > it properly. We may need to pull up the drain (bdrv_drain_all() today)
> > from bdrv_reopen_multiple() to its caller and just assert it in the
> > function itself, but there shouldn't be much more to it than that.
>
> I think that's not enough, see point 2) here:
>
> https://lists.gnu.org/archive/html/qemu-block/2015-12/msg00180.html
>
> "I've been taking a look at the bdrv_drained_begin/end() API, but as I
> understand it it prevents requests from a different AioContext.
> Since all BDS in the same chain share the same context it does not
> really help here."

Yes, that's the part I meant with pulling up the calls.

If I understand correctly, the problem is that first bdrv_reopen_queue()
queues a few BDSes for reopen, then bdrv_drain_all() completes all
running requests and can indirectly trigger a graph modification, and
then bdrv_reopen_multiple() uses the queue which doesn't match reality
any more.

The solution to that should be simply changing the order of things:

1. bdrv_drained_begin()
2. bdrv_reopen_queue()
3. bdrv_reopen_multiple()
   * Instead of bdrv_drain_all(), assert that no requests are pending
   * We don't run requests, so we can't complete a block job and
     manipulate the graph any more
4. then bdrv_drained_end()

But you're right that this changed order is really the key and not
bdrv_drained_begin/end(). The latter are just cleaner than the existing
bdrv_drain_all(), but don't actually contribute much to the fix without
dataplane. I must have misremembered its importance.

Kevin
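A minimal caller-side sketch of the ordering Kevin describes, assuming
the bdrv_reopen_queue()/bdrv_reopen_multiple() signatures of that time
and that bdrv_reopen_multiple() no longer drains internally but merely
asserts that nothing is pending; the queue, options, bdrv_flags and
local_err variables are placeholders. As the following messages show,
this ordering alone turns out not to be sufficient when a job is
sleeping on a timer.

    /* Sketch: enter the drained section before building the reopen
     * queue, so no request completion can modify the graph between
     * queueing and the actual reopen. */
    bdrv_drained_begin(bs);

    queue = bdrv_reopen_queue(NULL, bs, options, bdrv_flags);
    ret = bdrv_reopen_multiple(queue, &local_err);  /* would assert, not drain */

    bdrv_drained_end(bs);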
On Tue 03 May 2016 03:48:47 PM CEST, Kevin Wolf wrote:
> The solution to that should be simply changing the order of things:
>
> 1. bdrv_drained_begin()
> 2. bdrv_reopen_queue()
> 3. bdrv_reopen_multiple()
>    * Instead of bdrv_drain_all(), assert that no requests are pending

No requests are pending in any member of the bs_queue? I don't have the
time to debug it today, but this part doesn't seem to be working.

>    * We don't run requests, so we can't complete a block job and
>      manipulate the graph any more
> 4. then bdrv_drained_end()

Berto
Am 12.05.2016 um 15:47 hat Alberto Garcia geschrieben:
> On Tue 03 May 2016 03:48:47 PM CEST, Kevin Wolf wrote:
> > Am 03.05.2016 um 15:33 hat Alberto Garcia geschrieben:
> >> On Tue 03 May 2016 03:23:24 PM CEST, Kevin Wolf wrote:
> >> >> c) we fix bdrv_reopen() so we can actually run both jobs at the same
> >> >>    time. I'm wondering if pausing all block jobs between
> >> >>    bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
> >> >>    trick. Opinions?
> >> >
> >> > I would have to read up the details of the problem again, but I think
> >> > with bdrv_drained_begin/end() we actually have the right tool now to fix
> >> > it properly. We may need to pull up the drain (bdrv_drain_all() today)
> >> > from bdrv_reopen_multiple() to its caller and just assert it in the
> >> > function itself, but there shouldn't be much more to it than that.
> >>
> >> I think that's not enough, see point 2) here:
> >>
> >> https://lists.gnu.org/archive/html/qemu-block/2015-12/msg00180.html
> >>
> >> "I've been taking a look at the bdrv_drained_begin/end() API, but as I
> >> understand it it prevents requests from a different AioContext.
> >> Since all BDS in the same chain share the same context it does not
> >> really help here."
> >
> > Yes, that's the part I meant with pulling up the calls.
> >
> > If I understand correctly, the problem is that first bdrv_reopen_queue()
> > queues a few BDSes for reopen, then bdrv_drain_all() completes all
> > running requests and can indirectly trigger a graph modification, and
> > then bdrv_reopen_multiple() uses the queue which doesn't match reality
> > any more.
> >
> > The solution to that should be simply changing the order of things:
> >
> > 1. bdrv_drained_begin()
> > 2. bdrv_reopen_queue()
> > 3. bdrv_reopen_multiple()
> >    * Instead of bdrv_drain_all(), assert that no requests are pending
> >    * We don't run requests, so we can't complete a block job and
> >      manipulate the graph any more
> > 4. then bdrv_drained_end()
>
> This doesn't work. Here's what happens:
>
> 1) Block job (a) starts (block-stream).
>
> 2) Block job (b) starts (block-stream, or block-commit).
>
> 3) job (b) calls bdrv_reopen() and does the drain call.
>
> 4) job (b) creates reopen_queue and calls bdrv_reopen_multiple().
>    There are no pending requests at this point, but job (a) is sleeping.
>
> 5) bdrv_reopen_multiple() iterates over reopen_queue and calls
>    bdrv_reopen_prepare() -> bdrv_flush() -> bdrv_co_flush() ->
>    qemu_coroutine_yield().

I think between here and the next step is what I don't understand.

bdrv_reopen_multiple() is not called in coroutine context, right? All
block jobs use block_job_defer_to_main_loop() before they call
bdrv_reopen(), as far as I can see. So bdrv_flush() shouldn't take the
shortcut, but use a nested event loop. What is it that calls into job
(a) from that event loop? It can't be a request completion because we
already drained all requests. Is it a timer?

> 6) job (a) resumes, finishes the job and removes nodes from the graph.
>
> 7) job (b) continues with bdrv_reopen_multiple() but now reopen_queue
>    contains invalid pointers.

I don't fully understand the problem yet, but as a shot in the dark,
would pausing block jobs in bdrv_drained_begin() help?

Kevin
Am 12.05.2016 um 17:13 hat Alberto Garcia geschrieben:
> On Thu 12 May 2016 05:04:51 PM CEST, Kevin Wolf wrote:
> > Am 12.05.2016 um 15:47 hat Alberto Garcia geschrieben:
> >> On Tue 03 May 2016 03:48:47 PM CEST, Kevin Wolf wrote:
> >> > Am 03.05.2016 um 15:33 hat Alberto Garcia geschrieben:
> >> >> On Tue 03 May 2016 03:23:24 PM CEST, Kevin Wolf wrote:
> >> >> >> c) we fix bdrv_reopen() so we can actually run both jobs at the same
> >> >> >>    time. I'm wondering if pausing all block jobs between
> >> >> >>    bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
> >> >> >>    trick. Opinions?
> >> >> >
> >> >> > I would have to read up the details of the problem again, but I think
> >> >> > with bdrv_drained_begin/end() we actually have the right tool now to fix
> >> >> > it properly. We may need to pull up the drain (bdrv_drain_all() today)
> >> >> > from bdrv_reopen_multiple() to its caller and just assert it in the
> >> >> > function itself, but there shouldn't be much more to it than that.
> >> >>
> >> >> I think that's not enough, see point 2) here:
> >> >>
> >> >> https://lists.gnu.org/archive/html/qemu-block/2015-12/msg00180.html
> >> >>
> >> >> "I've been taking a look at the bdrv_drained_begin/end() API, but as I
> >> >> understand it it prevents requests from a different AioContext.
> >> >> Since all BDS in the same chain share the same context it does not
> >> >> really help here."
> >> >
> >> > Yes, that's the part I meant with pulling up the calls.
> >> >
> >> > If I understand correctly, the problem is that first bdrv_reopen_queue()
> >> > queues a few BDSes for reopen, then bdrv_drain_all() completes all
> >> > running requests and can indirectly trigger a graph modification, and
> >> > then bdrv_reopen_multiple() uses the queue which doesn't match reality
> >> > any more.
> >> >
> >> > The solution to that should be simply changing the order of things:
> >> >
> >> > 1. bdrv_drained_begin()
> >> > 2. bdrv_reopen_queue()
> >> > 3. bdrv_reopen_multiple()
> >> >    * Instead of bdrv_drain_all(), assert that no requests are pending
> >> >    * We don't run requests, so we can't complete a block job and
> >> >      manipulate the graph any more
> >> > 4. then bdrv_drained_end()
> >>
> >> This doesn't work. Here's what happens:
> >>
> >> 1) Block job (a) starts (block-stream).
> >>
> >> 2) Block job (b) starts (block-stream, or block-commit).
> >>
> >> 3) job (b) calls bdrv_reopen() and does the drain call.
> >>
> >> 4) job (b) creates reopen_queue and calls bdrv_reopen_multiple().
> >>    There are no pending requests at this point, but job (a) is sleeping.
> >>
> >> 5) bdrv_reopen_multiple() iterates over reopen_queue and calls
> >>    bdrv_reopen_prepare() -> bdrv_flush() -> bdrv_co_flush() ->
> >>    qemu_coroutine_yield().
> >
> > I think between here and the next step is what I don't understand.
> >
> > bdrv_reopen_multiple() is not called in coroutine context, right? All
> > block jobs use block_job_defer_to_main_loop() before they call
> > bdrv_reopen(), as far as I can see. So bdrv_flush() shouldn't take the
> > shortcut, but use a nested event loop.
>
> When bdrv_flush() is not called in coroutine context it does
> qemu_coroutine_create() + qemu_coroutine_enter().

Right, but if the coroutine yields, we jump back to the caller, which
looks like this:

    co = qemu_coroutine_create(bdrv_flush_co_entry);
    qemu_coroutine_enter(co, &rwco);
    while (rwco.ret == NOT_DONE) {
        aio_poll(aio_context, true);
    }

So this loops until the flush has completed. The only way I can see how
something else (job (a)) can run is if aio_poll() calls it.

> > What is it that calls into job (a) from that event loop? It can't be a
> > request completion because we already drained all requests. Is it a
> > timer?
>
> If I didn't get it wrong it's this bit in bdrv_co_flush()
> [...]

That's the place that yields from (b), but it's not the place that calls
into (a).

> >> 6) job (a) resumes, finishes the job and removes nodes from the graph.
> >>
> >> 7) job (b) continues with bdrv_reopen_multiple() but now reopen_queue
> >>    contains invalid pointers.
> >
> > I don't fully understand the problem yet, but as a shot in the dark,
> > would pausing block jobs in bdrv_drained_begin() help?
>
> Yeah, my impression is that pausing all jobs during bdrv_reopen() should
> be enough.

If you base your patches on top of my queue (which I think you already
do for the greatest part), the nicest way to implement this would
probably be that BlockBackends give their users a callback for
.drained_begin/end and the jobs implement that as pausing themselves.

We could, of course, directly pause block jobs in bdrv_drained_begin(),
but that would feel a bit hackish. So maybe do that for a quick attempt
whether it helps, and if it does, we can write the real thing then.

Kevin
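One possible shape of the callback Kevin sketches is shown below. The
structure and function names are hypothetical (this is not the interface
that was eventually merged); only the idea of a BlockBackend user being
notified of drained sections and reacting with block_job_pause()/
block_job_resume() is taken from the discussion.

    /* Hypothetical callback table registered by a BlockBackend user so
     * it is notified when its root node enters/leaves a drained section. */
    typedef struct BlockBackendDrainOps {
        void (*drained_begin)(void *opaque);
        void (*drained_end)(void *opaque);
    } BlockBackendDrainOps;

    /* A block job would implement the callbacks by pausing itself. */
    static void block_job_cb_drained_begin(void *opaque)
    {
        BlockJob *job = opaque;
        block_job_pause(job);
    }

    static void block_job_cb_drained_end(void *opaque)
    {
        BlockJob *job = opaque;
        block_job_resume(job);
    }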
On Thu 12 May 2016 05:28:34 PM CEST, Kevin Wolf wrote:
> Am 12.05.2016 um 17:13 hat Alberto Garcia geschrieben:
>> On Thu 12 May 2016 05:04:51 PM CEST, Kevin Wolf wrote:
>> > Am 12.05.2016 um 15:47 hat Alberto Garcia geschrieben:
>> >> On Tue 03 May 2016 03:48:47 PM CEST, Kevin Wolf wrote:
>> >> > Am 03.05.2016 um 15:33 hat Alberto Garcia geschrieben:
>> >> >> On Tue 03 May 2016 03:23:24 PM CEST, Kevin Wolf wrote:
>> >> >> >> c) we fix bdrv_reopen() so we can actually run both jobs at the same
>> >> >> >>    time. I'm wondering if pausing all block jobs between
>> >> >> >>    bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
>> >> >> >>    trick. Opinions?
>> >> >> >
>> >> >> > I would have to read up the details of the problem again, but I think
>> >> >> > with bdrv_drained_begin/end() we actually have the right tool now to fix
>> >> >> > it properly. We may need to pull up the drain (bdrv_drain_all() today)
>> >> >> > from bdrv_reopen_multiple() to its caller and just assert it in the
>> >> >> > function itself, but there shouldn't be much more to it than that.
>> >> >>
>> >> >> I think that's not enough, see point 2) here:
>> >> >>
>> >> >> https://lists.gnu.org/archive/html/qemu-block/2015-12/msg00180.html
>> >> >>
>> >> >> "I've been taking a look at the bdrv_drained_begin/end() API, but as I
>> >> >> understand it it prevents requests from a different AioContext.
>> >> >> Since all BDS in the same chain share the same context it does not
>> >> >> really help here."
>> >> >
>> >> > Yes, that's the part I meant with pulling up the calls.
>> >> >
>> >> > If I understand correctly, the problem is that first bdrv_reopen_queue()
>> >> > queues a few BDSes for reopen, then bdrv_drain_all() completes all
>> >> > running requests and can indirectly trigger a graph modification, and
>> >> > then bdrv_reopen_multiple() uses the queue which doesn't match reality
>> >> > any more.
>> >> >
>> >> > The solution to that should be simply changing the order of things:
>> >> >
>> >> > 1. bdrv_drained_begin()
>> >> > 2. bdrv_reopen_queue()
>> >> > 3. bdrv_reopen_multiple()
>> >> >    * Instead of bdrv_drain_all(), assert that no requests are pending
>> >> >    * We don't run requests, so we can't complete a block job and
>> >> >      manipulate the graph any more
>> >> > 4. then bdrv_drained_end()
>> >>
>> >> This doesn't work. Here's what happens:
>> >>
>> >> 1) Block job (a) starts (block-stream).
>> >>
>> >> 2) Block job (b) starts (block-stream, or block-commit).
>> >>
>> >> 3) job (b) calls bdrv_reopen() and does the drain call.
>> >>
>> >> 4) job (b) creates reopen_queue and calls bdrv_reopen_multiple().
>> >>    There are no pending requests at this point, but job (a) is sleeping.
>> >>
>> >> 5) bdrv_reopen_multiple() iterates over reopen_queue and calls
>> >>    bdrv_reopen_prepare() -> bdrv_flush() -> bdrv_co_flush() ->
>> >>    qemu_coroutine_yield().
>> >
>> > I think between here and the next step is what I don't understand.
>> >
>> > bdrv_reopen_multiple() is not called in coroutine context, right? All
>> > block jobs use block_job_defer_to_main_loop() before they call
>> > bdrv_reopen(), as far as I can see. So bdrv_flush() shouldn't take the
>> > shortcut, but use a nested event loop.
>>
>> When bdrv_flush() is not called in coroutine context it does
>> qemu_coroutine_create() + qemu_coroutine_enter().
>
> Right, but if the coroutine yields, we jump back to the caller, which
> looks like this:
>
>     co = qemu_coroutine_create(bdrv_flush_co_entry);
>     qemu_coroutine_enter(co, &rwco);
>     while (rwco.ret == NOT_DONE) {
>         aio_poll(aio_context, true);
>     }
>
> So this loops until the flush has completed. The only way I can see how
> something else (job (a)) can run is if aio_poll() calls it.

If I'm not wrong that can happen if job (a) is sleeping.

If job (a) is launched with a rate limit, then the bdrv_drain() call
will not drain it completely but return when the block job hits the I/O
limit -> block_job_sleep_ns() -> co_aio_sleep_ns().

>> > I don't fully understand the problem yet, but as a shot in the
>> > dark, would pausing block jobs in bdrv_drained_begin() help?
>>
>> Yeah, my impression is that pausing all jobs during bdrv_reopen()
>> should be enough.
>
> If you base your patches on top of my queue (which I think you already
> do for the greatest part), the nicest way to implement this would
> probably be that BlockBackends give their users a callback for
> .drained_begin/end and the jobs implement that as pausing themselves.
>
> We could, of course, directly pause block jobs in
> bdrv_drained_begin(), but that would feel a bit hackish. So maybe do
> that for a quick attempt whether it helps, and if it does, we can
> write the real thing then.

I'll try that.

On a related note, bdrv_drain_all() already pauses all block jobs. The
problem is that when it ends it resumes them, so it can happen that
there's again pending I/O requests before bdrv_drain_all() returns.

That sounds like a problem to me, or am I overlooking anything?

Berto
Am 17.05.2016 um 16:26 hat Alberto Garcia geschrieben:
> On Thu 12 May 2016 05:28:34 PM CEST, Kevin Wolf wrote:
> > Am 12.05.2016 um 17:13 hat Alberto Garcia geschrieben:
> >> On Thu 12 May 2016 05:04:51 PM CEST, Kevin Wolf wrote:
> >> > Am 12.05.2016 um 15:47 hat Alberto Garcia geschrieben:
> >> >> On Tue 03 May 2016 03:48:47 PM CEST, Kevin Wolf wrote:
> >> >> > Am 03.05.2016 um 15:33 hat Alberto Garcia geschrieben:
> >> >> >> On Tue 03 May 2016 03:23:24 PM CEST, Kevin Wolf wrote:
> >> >> >> >> c) we fix bdrv_reopen() so we can actually run both jobs at the same
> >> >> >> >>    time. I'm wondering if pausing all block jobs between
> >> >> >> >>    bdrv_reopen_prepare() and bdrv_reopen_commit() would do the
> >> >> >> >>    trick. Opinions?
> >> >> >> >
> >> >> >> > I would have to read up the details of the problem again, but I think
> >> >> >> > with bdrv_drained_begin/end() we actually have the right tool now to fix
> >> >> >> > it properly. We may need to pull up the drain (bdrv_drain_all() today)
> >> >> >> > from bdrv_reopen_multiple() to its caller and just assert it in the
> >> >> >> > function itself, but there shouldn't be much more to it than that.
> >> >> >>
> >> >> >> I think that's not enough, see point 2) here:
> >> >> >>
> >> >> >> https://lists.gnu.org/archive/html/qemu-block/2015-12/msg00180.html
> >> >> >>
> >> >> >> "I've been taking a look at the bdrv_drained_begin/end() API, but as I
> >> >> >> understand it it prevents requests from a different AioContext.
> >> >> >> Since all BDS in the same chain share the same context it does not
> >> >> >> really help here."
> >> >> >
> >> >> > Yes, that's the part I meant with pulling up the calls.
> >> >> >
> >> >> > If I understand correctly, the problem is that first bdrv_reopen_queue()
> >> >> > queues a few BDSes for reopen, then bdrv_drain_all() completes all
> >> >> > running requests and can indirectly trigger a graph modification, and
> >> >> > then bdrv_reopen_multiple() uses the queue which doesn't match reality
> >> >> > any more.
> >> >> >
> >> >> > The solution to that should be simply changing the order of things:
> >> >> >
> >> >> > 1. bdrv_drained_begin()
> >> >> > 2. bdrv_reopen_queue()
> >> >> > 3. bdrv_reopen_multiple()
> >> >> >    * Instead of bdrv_drain_all(), assert that no requests are pending
> >> >> >    * We don't run requests, so we can't complete a block job and
> >> >> >      manipulate the graph any more
> >> >> > 4. then bdrv_drained_end()
> >> >>
> >> >> This doesn't work. Here's what happens:
> >> >>
> >> >> 1) Block job (a) starts (block-stream).
> >> >>
> >> >> 2) Block job (b) starts (block-stream, or block-commit).
> >> >>
> >> >> 3) job (b) calls bdrv_reopen() and does the drain call.
> >> >>
> >> >> 4) job (b) creates reopen_queue and calls bdrv_reopen_multiple().
> >> >>    There are no pending requests at this point, but job (a) is sleeping.
> >> >>
> >> >> 5) bdrv_reopen_multiple() iterates over reopen_queue and calls
> >> >>    bdrv_reopen_prepare() -> bdrv_flush() -> bdrv_co_flush() ->
> >> >>    qemu_coroutine_yield().
> >> >
> >> > I think between here and the next step is what I don't understand.
> >> >
> >> > bdrv_reopen_multiple() is not called in coroutine context, right? All
> >> > block jobs use block_job_defer_to_main_loop() before they call
> >> > bdrv_reopen(), as far as I can see. So bdrv_flush() shouldn't take the
> >> > shortcut, but use a nested event loop.
> >>
> >> When bdrv_flush() is not called in coroutine context it does
> >> qemu_coroutine_create() + qemu_coroutine_enter().
> >
> > Right, but if the coroutine yields, we jump back to the caller, which
> > looks like this:
> >
> >     co = qemu_coroutine_create(bdrv_flush_co_entry);
> >     qemu_coroutine_enter(co, &rwco);
> >     while (rwco.ret == NOT_DONE) {
> >         aio_poll(aio_context, true);
> >     }
> >
> > So this loops until the flush has completed. The only way I can see how
> > something else (job (a)) can run is if aio_poll() calls it.
>
> If I'm not wrong that can happen if job (a) is sleeping.
>
> If job (a) is launched with a rate limit, then the bdrv_drain() call
> will not drain it completely but return when the block job hits the I/O
> limit -> block_job_sleep_ns() -> co_aio_sleep_ns().

So timers. Currently, bdrv_drained_begin/end disables the FD handlers,
but not timers. Not sure if that's easily fixable in a generic way,
though, so maybe we need to disable timers one by one.

Pausing a block job does not actually cause the timer to be cancelled.
Maybe we need to change block_job_sleep_ns() so that the function
returns only when the job isn't paused any more. Maybe something like:

    job->busy = false;
    if (!block_job_is_paused(job)) {
        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
        /* The job could be paused now */
    }
    if (block_job_is_paused(job)) {
        qemu_coroutine_yield();
    }
    job->busy = true;

Then it should be safe to assume that if block jobs are paused (even if
they were sleeping), they won't issue new requests any more until they
are resumed.

> >> > I don't fully understand the problem yet, but as a shot in the
> >> > dark, would pausing block jobs in bdrv_drained_begin() help?
> >>
> >> Yeah, my impression is that pausing all jobs during bdrv_reopen()
> >> should be enough.
> >
> > If you base your patches on top of my queue (which I think you already
> > do for the greatest part), the nicest way to implement this would
> > probably be that BlockBackends give their users a callback for
> > .drained_begin/end and the jobs implement that as pausing themselves.
> >
> > We could, of course, directly pause block jobs in
> > bdrv_drained_begin(), but that would feel a bit hackish. So maybe do
> > that for a quick attempt whether it helps, and if it does, we can
> > write the real thing then.
>
> I'll try that.
>
> On a related note, bdrv_drain_all() already pauses all block jobs. The
> problem is that when it ends it resumes them, so it can happen that
> there's again pending I/O requests before bdrv_drain_all() returns.
>
> That sounds like a problem to me, or am I overlooking anything?

Essentially every bdrv_drain(_all) call is a problem, it should always
be a bdrv_drained_begin/end section. The only exception is where you
want to drain only requests from a single source that you know doesn't
send new requests (e.g. draining guest requests in vm_stop).

Kevin
On Tue 17 May 2016 04:47:57 PM CEST, Kevin Wolf wrote:
>> > Right, but if the coroutine yields, we jump back to the caller, which
>> > looks like this:
>> >
>> >     co = qemu_coroutine_create(bdrv_flush_co_entry);
>> >     qemu_coroutine_enter(co, &rwco);
>> >     while (rwco.ret == NOT_DONE) {
>> >         aio_poll(aio_context, true);
>> >     }
>> >
>> > So this loops until the flush has completed. The only way I can see
>> > how something else (job (a)) can run is if aio_poll() calls it.
>>
>> If I'm not wrong that can happen if job (a) is sleeping.
>>
>> If job (a) is launched with a rate limit, then the bdrv_drain() call
>> will not drain it completely but return when the block job hits the
>> I/O limit -> block_job_sleep_ns() -> co_aio_sleep_ns().
>
> So timers. Currently, bdrv_drained_begin/end disables the FD handlers,
> but not timers. Not sure if that's easily fixable in a generic way,
> though, so maybe we need to disable timers one by one.
>
> Pausing a block job does not actually cause the timer to be cancelled.
> Maybe we need to change block_job_sleep_ns() so that the function
> returns only when the job isn't paused any more. Maybe something like:
>
>     job->busy = false;
>     if (!block_job_is_paused(job)) {
>         co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
>         /* The job could be paused now */
>     }
>     if (block_job_is_paused(job)) {
>         qemu_coroutine_yield();
>     }
>     job->busy = true;
>
> Then it should be safe to assume that if block jobs are paused (even
> if they were sleeping), they won't issue new requests any more until
> they are resumed.

That looks good, I'll give it a try.

Berto
diff --git a/blockdev.c b/blockdev.c
index 2e7712e..bfdc0e3 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2989,6 +2989,7 @@ void qmp_block_stream(const char *device,
     BlockBackend *blk;
     BlockDriverState *bs;
     BlockDriverState *base_bs = NULL;
+    BlockDriverState *active;
     AioContext *aio_context;
     Error *local_err = NULL;
     const char *base_name = NULL;
@@ -2997,21 +2998,19 @@ void qmp_block_stream(const char *device,
         on_error = BLOCKDEV_ON_ERROR_REPORT;
     }
 
-    blk = blk_by_name(device);
-    if (!blk) {
-        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
-                  "Device '%s' not found", device);
+    bs = bdrv_lookup_bs(device, device, errp);
+    if (!bs) {
         return;
     }
 
-    aio_context = blk_get_aio_context(blk);
+    aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    if (!blk_is_available(blk)) {
+    blk = blk_by_name(device);
+    if (blk && !blk_is_available(blk)) {
         error_setg(errp, "Device '%s' has no medium", device);
         goto out;
     }
-    bs = blk_bs(blk);
 
     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_STREAM, errp)) {
         goto out;
@@ -3027,6 +3026,22 @@ void qmp_block_stream(const char *device,
         base_name = base;
     }
 
+    /* Look for the top-level node that contains 'bs' in its chain */
+    active = NULL;
+    do {
+        active = bdrv_next(active);
+    } while (active && !bdrv_chain_contains(active, bs));
+
+    if (active == NULL) {
+        error_setg(errp, "Cannot find top level node for '%s'", device);
+        goto out;
+    }
+
+    /* Check for op blockers in the top-level node too */
+    if (bdrv_op_is_blocked(active, BLOCK_OP_TYPE_STREAM, errp)) {
+        goto out;
+    }
+
     /* if we are streaming the entire chain, the result will have no backing
      * file, and specifying one is therefore an error */
     if (base_bs == NULL && has_backing_file) {
@@ -3038,7 +3053,7 @@ void qmp_block_stream(const char *device,
     /* backing_file string overrides base bs filename */
     base_name = has_backing_file ? backing_file : base_name;
 
-    stream_start(bs, NULL, base_bs, base_name, has_speed ? speed : 0,
+    stream_start(bs, active, base_bs, base_name, has_speed ? speed : 0,
                  on_error, block_job_cb, bs, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 59107ed..e20193a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1405,6 +1405,10 @@
 # with query-block-jobs. The operation can be stopped before it has completed
 # using the block-job-cancel command.
 #
+# The node that receives the data is called the top image, can be located
+# in any part of the whole chain and can be specified using its device
+# or node name.
+#
 # If a base file is specified then sectors are not copied from that base file and
 # its backing chain. When streaming completes the image file will have the base
 # file as its backing file. This can be used to stream a subset of the backing
@@ -1413,12 +1417,12 @@
 # On successful completion the image file is updated to drop the backing file
 # and the BLOCK_JOB_COMPLETED event is emitted.
 #
-# @device: the device name
+# @device: the device or node name of the top image
 #
 # @base: #optional the common backing file name
 #
-# @backing-file: #optional The backing file string to write into the active
-#                layer. This filename is not validated.
+# @backing-file: #optional The backing file string to write into the top
+#                image. This filename is not validated.
 #
 #                If a pathname string is such that it cannot be
 #                resolved by QEMU, that means that subsequent QMP or
diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 3ac2443..107049b 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -126,7 +126,7 @@ class TestSingleDrive(iotests.QMPTestCase):
 
     def test_device_not_found(self):
         result = self.vm.qmp('block-stream', device='nonexistent')
-        self.assert_qmp(result, 'error/class', 'DeviceNotFound')
+        self.assert_qmp(result, 'error/class', 'GenericError')
 
 
 class TestSmallerBackingFile(iotests.QMPTestCase):
This patch makes the 'device' parameter of the 'block-stream' command
accept a node name as well as a device name.

In addition to that, operation blockers will be checked in all
intermediate nodes between the top and the base node.

Since qmp_block_stream() now uses the error from bdrv_lookup_bs() and
no longer returns DeviceNotFound, iotest 030 is updated to expect
GenericError instead.

Signed-off-by: Alberto Garcia <berto@igalia.com>
---
 blockdev.c             | 31 +++++++++++++++++++++++--------
 qapi/block-core.json   | 10 +++++++---
 tests/qemu-iotests/030 |  2 +-
 3 files changed, 31 insertions(+), 12 deletions(-)