Message ID: afe0bc649007593edaa77f5f3923acb733a24c6a.1724701542.git.maciej.szmigiero@oracle.com (mailing list archive)
State: New, archived
Series: Multifd
"Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes: > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > These SaveVMHandlers help device provide its own asynchronous > transmission of the remaining data at the end of a precopy phase. > > In this use case the save_live_complete_precopy_begin handler might > be used to mark the stream boundary before proceeding with asynchronous > transmission of the remaining data while the > save_live_complete_precopy_end handler might be used to mark the > stream boundary after performing the asynchronous transmission. > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> Reviewed-by: Fabiano Rosas <farosas@suse.de>
On 27/08/2024 20:54, Maciej S. Szmigiero wrote: > External email: Use caution opening links or attachments > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > These SaveVMHandlers help device provide its own asynchronous > transmission of the remaining data at the end of a precopy phase. > > In this use case the save_live_complete_precopy_begin handler might > be used to mark the stream boundary before proceeding with asynchronous > transmission of the remaining data while the > save_live_complete_precopy_end handler might be used to mark the > stream boundary after performing the asynchronous transmission. > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > --- > include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ > migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ > 2 files changed, 71 insertions(+) > > diff --git a/include/migration/register.h b/include/migration/register.h > index f60e797894e5..9de123252edf 100644 > --- a/include/migration/register.h > +++ b/include/migration/register.h > @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { > */ > int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); > > + /** > + * @save_live_complete_precopy_begin > + * > + * Called at the end of a precopy phase, before all > + * @save_live_complete_precopy handlers and before launching > + * all @save_live_complete_precopy_thread threads. > + * The handler might, for example, mark the stream boundary before > + * proceeding with asynchronous transmission of the remaining data via > + * @save_live_complete_precopy_thread. > + * When postcopy is enabled, devices that support postcopy will skip this step. > + * > + * @f: QEMUFile where the handler can synchronously send data before returning > + * @idstr: this device section idstr > + * @instance_id: this device section instance_id > + * @opaque: data pointer passed to register_savevm_live() > + * > + * Returns zero to indicate success and negative for error > + */ > + int (*save_live_complete_precopy_begin)(QEMUFile *f, > + char *idstr, uint32_t instance_id, > + void *opaque); > + /** > + * @save_live_complete_precopy_end > + * > + * Called at the end of a precopy phase, after @save_live_complete_precopy > + * handlers and after all @save_live_complete_precopy_thread threads have > + * finished. When postcopy is enabled, devices that support postcopy will > + * skip this step. > + * > + * @f: QEMUFile where the handler can synchronously send data before returning > + * @opaque: data pointer passed to register_savevm_live() > + * > + * Returns zero to indicate success and negative for error > + */ > + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); Is this handler necessary now that migration core is responsible for the threads and joins them? I don't see VFIO implementing it later on. Thanks. > + > /* This runs both outside and inside the BQL. 
*/ > > /** > diff --git a/migration/savevm.c b/migration/savevm.c > index 6bb404b9c86f..d43acbbf20cf 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1496,6 +1496,27 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) > SaveStateEntry *se; > int ret; > > + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { > + if (!se->ops || (in_postcopy && se->ops->has_postcopy && > + se->ops->has_postcopy(se->opaque)) || > + !se->ops->save_live_complete_precopy_begin) { > + continue; > + } > + > + save_section_header(f, se, QEMU_VM_SECTION_END); > + > + ret = se->ops->save_live_complete_precopy_begin(f, > + se->idstr, se->instance_id, > + se->opaque); > + > + save_section_footer(f, se); > + > + if (ret < 0) { > + qemu_file_set_error(f, ret); > + return -1; > + } > + } > + > QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { > if (!se->ops || > (in_postcopy && se->ops->has_postcopy && > @@ -1527,6 +1548,20 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) > end_ts_each - start_ts_each); > } > > + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { > + if (!se->ops || (in_postcopy && se->ops->has_postcopy && > + se->ops->has_postcopy(se->opaque)) || > + !se->ops->save_live_complete_precopy_end) { > + continue; > + } > + > + ret = se->ops->save_live_complete_precopy_end(f, se->opaque); > + if (ret < 0) { > + qemu_file_set_error(f, ret); > + return -1; > + } > + } > + > trace_vmstate_downtime_checkpoint("src-iterable-saved"); > > return 0;
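For illustration, a device-side implementation of the proposed begin handler might look roughly like this (a minimal sketch, not part of the series; the MY_DEV_MIG_FLAG_DATA_STATE_COMPLETE marker constant is hypothetical):

/*
 * Sketch of a save_live_complete_precopy_begin implementation: write a
 * device-specific end-of-stream marker so the destination knows that all
 * device state sent via the main migration stream has been delivered.
 */
static int my_dev_complete_precopy_begin(QEMUFile *f,
                                         char *idstr, uint32_t instance_id,
                                         void *opaque)
{
    /* hypothetical marker constant; VFIO would use its own flag here */
    qemu_put_be64(f, MY_DEV_MIG_FLAG_DATA_STATE_COMPLETE);

    /* zero on success, negative on stream error */
    return qemu_file_get_error(f);
}

The destination-side loader would treat this marker as the point after which device state arriving via multifd channels may safely be applied.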
On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: > > On 27/08/2024 20:54, Maciej S. Szmigiero wrote: > > External email: Use caution opening links or attachments > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > > > These SaveVMHandlers help device provide its own asynchronous > > transmission of the remaining data at the end of a precopy phase. > > > > In this use case the save_live_complete_precopy_begin handler might > > be used to mark the stream boundary before proceeding with asynchronous > > transmission of the remaining data while the > > save_live_complete_precopy_end handler might be used to mark the > > stream boundary after performing the asynchronous transmission. > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > > --- > > include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ > > migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ > > 2 files changed, 71 insertions(+) > > > > diff --git a/include/migration/register.h b/include/migration/register.h > > index f60e797894e5..9de123252edf 100644 > > --- a/include/migration/register.h > > +++ b/include/migration/register.h > > @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { > > */ > > int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); > > > > + /** > > + * @save_live_complete_precopy_begin > > + * > > + * Called at the end of a precopy phase, before all > > + * @save_live_complete_precopy handlers and before launching > > + * all @save_live_complete_precopy_thread threads. > > + * The handler might, for example, mark the stream boundary before > > + * proceeding with asynchronous transmission of the remaining data via > > + * @save_live_complete_precopy_thread. > > + * When postcopy is enabled, devices that support postcopy will skip this step. > > + * > > + * @f: QEMUFile where the handler can synchronously send data before returning > > + * @idstr: this device section idstr > > + * @instance_id: this device section instance_id > > + * @opaque: data pointer passed to register_savevm_live() > > + * > > + * Returns zero to indicate success and negative for error > > + */ > > + int (*save_live_complete_precopy_begin)(QEMUFile *f, > > + char *idstr, uint32_t instance_id, > > + void *opaque); > > + /** > > + * @save_live_complete_precopy_end > > + * > > + * Called at the end of a precopy phase, after @save_live_complete_precopy > > + * handlers and after all @save_live_complete_precopy_thread threads have > > + * finished. When postcopy is enabled, devices that support postcopy will > > + * skip this step. > > + * > > + * @f: QEMUFile where the handler can synchronously send data before returning > > + * @opaque: data pointer passed to register_savevm_live() > > + * > > + * Returns zero to indicate success and negative for error > > + */ > > + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); > > Is this handler necessary now that migration core is responsible for the > threads and joins them? I don't see VFIO implementing it later on. Right, I spot the same thing. This series added three hooks: begin, end, precopy_thread. What I think is it only needs one, which is precopy_async. My vague memory was that was what we used to discuss too, so that when migration precopy flushes the final round of iterable data, it does: (1) loop over all complete_precopy_async() and enqueue the tasks if existed into the migration worker pool. Then, (2) loop over all complete_precopy() like before. 
Optionally, we can enforce that one vmstate handler only provides either complete_precopy_async() or complete_precopy(). In this case VFIO can update the two hooks during setup() by detecting multifd && !mapped_ram && nocomp.
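The flow proposed above could be sketched as follows (illustrative only; complete_precopy_async and migration_worker_pool_submit are hypothetical names for the suggested hook and worker-pool API, neither of which exists in this series):

/* Sketch: dispatch asynchronous completion tasks first, then run the
 * synchronous completion handlers as before. */
static int savevm_complete_precopy_sketch(QEMUFile *f)
{
    SaveStateEntry *se;
    int ret;

    /* (1) enqueue all asynchronous completion tasks into the pool */
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->ops && se->ops->complete_precopy_async) {
            migration_worker_pool_submit(se->ops->complete_precopy_async,
                                         se->opaque);
        }
    }

    /* (2) loop over all complete_precopy() like before */
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->ops && se->ops->save_live_complete_precopy) {
            ret = se->ops->save_live_complete_precopy(f, se->opaque);
            if (ret < 0) {
                qemu_file_set_error(f, ret);
                return -1;
            }
        }
    }

    return 0;
}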
On 5.09.2024 15:45, Avihai Horon wrote: > > On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >> External email: Use caution opening links or attachments >> >> >> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >> >> These SaveVMHandlers help device provide its own asynchronous >> transmission of the remaining data at the end of a precopy phase. >> >> In this use case the save_live_complete_precopy_begin handler might >> be used to mark the stream boundary before proceeding with asynchronous >> transmission of the remaining data while the >> save_live_complete_precopy_end handler might be used to mark the >> stream boundary after performing the asynchronous transmission. >> >> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >> --- >> include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ >> migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ >> 2 files changed, 71 insertions(+) >> >> diff --git a/include/migration/register.h b/include/migration/register.h >> index f60e797894e5..9de123252edf 100644 >> --- a/include/migration/register.h >> +++ b/include/migration/register.h >> @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { >> */ >> int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); >> >> + /** >> + * @save_live_complete_precopy_begin >> + * >> + * Called at the end of a precopy phase, before all >> + * @save_live_complete_precopy handlers and before launching >> + * all @save_live_complete_precopy_thread threads. >> + * The handler might, for example, mark the stream boundary before >> + * proceeding with asynchronous transmission of the remaining data via >> + * @save_live_complete_precopy_thread. >> + * When postcopy is enabled, devices that support postcopy will skip this step. >> + * >> + * @f: QEMUFile where the handler can synchronously send data before returning >> + * @idstr: this device section idstr >> + * @instance_id: this device section instance_id >> + * @opaque: data pointer passed to register_savevm_live() >> + * >> + * Returns zero to indicate success and negative for error >> + */ >> + int (*save_live_complete_precopy_begin)(QEMUFile *f, >> + char *idstr, uint32_t instance_id, >> + void *opaque); >> + /** >> + * @save_live_complete_precopy_end >> + * >> + * Called at the end of a precopy phase, after @save_live_complete_precopy >> + * handlers and after all @save_live_complete_precopy_thread threads have >> + * finished. When postcopy is enabled, devices that support postcopy will >> + * skip this step. >> + * >> + * @f: QEMUFile where the handler can synchronously send data before returning >> + * @opaque: data pointer passed to register_savevm_live() >> + * >> + * Returns zero to indicate success and negative for error >> + */ >> + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); > > Is this handler necessary now that migration core is responsible for the threads and joins them? I don't see VFIO implementing it later on. It's not 100% necessary for the current implementation but preserved for future usage and code consistency with the "_begin" handler (which IS necessary). > Thanks. Thanks, Maciej
On 9.09.2024 19:59, Peter Xu wrote: > On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: >> >> On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >>> External email: Use caution opening links or attachments >>> >>> >>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >>> >>> These SaveVMHandlers help device provide its own asynchronous >>> transmission of the remaining data at the end of a precopy phase. >>> >>> In this use case the save_live_complete_precopy_begin handler might >>> be used to mark the stream boundary before proceeding with asynchronous >>> transmission of the remaining data while the >>> save_live_complete_precopy_end handler might be used to mark the >>> stream boundary after performing the asynchronous transmission. >>> >>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >>> --- >>> include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ >>> migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ >>> 2 files changed, 71 insertions(+) >>> >>> diff --git a/include/migration/register.h b/include/migration/register.h >>> index f60e797894e5..9de123252edf 100644 >>> --- a/include/migration/register.h >>> +++ b/include/migration/register.h >>> @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { >>> */ >>> int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); >>> >>> + /** >>> + * @save_live_complete_precopy_begin >>> + * >>> + * Called at the end of a precopy phase, before all >>> + * @save_live_complete_precopy handlers and before launching >>> + * all @save_live_complete_precopy_thread threads. >>> + * The handler might, for example, mark the stream boundary before >>> + * proceeding with asynchronous transmission of the remaining data via >>> + * @save_live_complete_precopy_thread. >>> + * When postcopy is enabled, devices that support postcopy will skip this step. >>> + * >>> + * @f: QEMUFile where the handler can synchronously send data before returning >>> + * @idstr: this device section idstr >>> + * @instance_id: this device section instance_id >>> + * @opaque: data pointer passed to register_savevm_live() >>> + * >>> + * Returns zero to indicate success and negative for error >>> + */ >>> + int (*save_live_complete_precopy_begin)(QEMUFile *f, >>> + char *idstr, uint32_t instance_id, >>> + void *opaque); >>> + /** >>> + * @save_live_complete_precopy_end >>> + * >>> + * Called at the end of a precopy phase, after @save_live_complete_precopy >>> + * handlers and after all @save_live_complete_precopy_thread threads have >>> + * finished. When postcopy is enabled, devices that support postcopy will >>> + * skip this step. >>> + * >>> + * @f: QEMUFile where the handler can synchronously send data before returning >>> + * @opaque: data pointer passed to register_savevm_live() >>> + * >>> + * Returns zero to indicate success and negative for error >>> + */ >>> + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); >> >> Is this handler necessary now that migration core is responsible for the >> threads and joins them? I don't see VFIO implementing it later on. > > Right, I spot the same thing. > > This series added three hooks: begin, end, precopy_thread. > > What I think is it only needs one, which is precopy_async. My vague memory > was that was what we used to discuss too, so that when migration precopy > flushes the final round of iterable data, it does: > > (1) loop over all complete_precopy_async() and enqueue the tasks if > existed into the migration worker pool. 
Then, > > (2) loop over all complete_precopy() like before. > > Optionally, we can enforce one vmstate handler only provides either > complete_precopy_async() or complete_precopy(). In this case VFIO can > update the two hooks during setup() by detecting multifd && !mapped_ram && > nocomp. > The "_begin" hook is still necessary to mark the end of the device state sent via the main migration stream (during the phase the VM is still running), since we can't start loading the multifd-sent device state until all of that earlier data finishes loading first. We shouldn't send that boundary marker in .save_live_complete_precopy either, since it would mean unnecessary waiting for other devices' (not necessarily VFIO ones) .save_live_complete_precopy bulk data. And VFIO SaveVMHandlers are shared for all VFIO devices (and const), so we can't really change them at runtime. Thanks, Maciej
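The load-side ordering constraint described here could be pictured like this (a sketch with hypothetical names; shown only to illustrate why the boundary marker matters):

/*
 * Illustrative only: device state received over multifd channels must
 * not be applied until the main-stream loader has consumed everything
 * up to the boundary written by save_live_complete_precopy_begin().
 */
typedef struct MyDevLoadState {          /* hypothetical */
    QemuEvent main_stream_loaded;        /* set when the marker is read */
} MyDevLoadState;

static void my_dev_apply_multifd_state(MyDevLoadState *s)
{
    /* block until the main migration stream has reached the boundary */
    qemu_event_wait(&s->main_stream_loaded);

    /* ...now it is safe to apply the multifd-received device state... */
}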
On Mon, Sep 09, 2024 at 08:32:45PM +0200, Maciej S. Szmigiero wrote: > On 9.09.2024 19:59, Peter Xu wrote: > > On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: > > > > > > On 27/08/2024 20:54, Maciej S. Szmigiero wrote: > > > > External email: Use caution opening links or attachments > > > > > > > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > > > > > > > These SaveVMHandlers help device provide its own asynchronous > > > > transmission of the remaining data at the end of a precopy phase. > > > > > > > > In this use case the save_live_complete_precopy_begin handler might > > > > be used to mark the stream boundary before proceeding with asynchronous > > > > transmission of the remaining data while the > > > > save_live_complete_precopy_end handler might be used to mark the > > > > stream boundary after performing the asynchronous transmission. > > > > > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > > > > --- > > > > include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ > > > > migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ > > > > 2 files changed, 71 insertions(+) > > > > > > > > diff --git a/include/migration/register.h b/include/migration/register.h > > > > index f60e797894e5..9de123252edf 100644 > > > > --- a/include/migration/register.h > > > > +++ b/include/migration/register.h > > > > @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { > > > > */ > > > > int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); > > > > > > > > + /** > > > > + * @save_live_complete_precopy_begin > > > > + * > > > > + * Called at the end of a precopy phase, before all > > > > + * @save_live_complete_precopy handlers and before launching > > > > + * all @save_live_complete_precopy_thread threads. > > > > + * The handler might, for example, mark the stream boundary before > > > > + * proceeding with asynchronous transmission of the remaining data via > > > > + * @save_live_complete_precopy_thread. > > > > + * When postcopy is enabled, devices that support postcopy will skip this step. > > > > + * > > > > + * @f: QEMUFile where the handler can synchronously send data before returning > > > > + * @idstr: this device section idstr > > > > + * @instance_id: this device section instance_id > > > > + * @opaque: data pointer passed to register_savevm_live() > > > > + * > > > > + * Returns zero to indicate success and negative for error > > > > + */ > > > > + int (*save_live_complete_precopy_begin)(QEMUFile *f, > > > > + char *idstr, uint32_t instance_id, > > > > + void *opaque); > > > > + /** > > > > + * @save_live_complete_precopy_end > > > > + * > > > > + * Called at the end of a precopy phase, after @save_live_complete_precopy > > > > + * handlers and after all @save_live_complete_precopy_thread threads have > > > > + * finished. When postcopy is enabled, devices that support postcopy will > > > > + * skip this step. > > > > + * > > > > + * @f: QEMUFile where the handler can synchronously send data before returning > > > > + * @opaque: data pointer passed to register_savevm_live() > > > > + * > > > > + * Returns zero to indicate success and negative for error > > > > + */ > > > > + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); > > > > > > Is this handler necessary now that migration core is responsible for the > > > threads and joins them? I don't see VFIO implementing it later on. > > > > Right, I spot the same thing. > > > > This series added three hooks: begin, end, precopy_thread. 
> > > > What I think is it only needs one, which is precopy_async. My vague memory > > was that was what we used to discuss too, so that when migration precopy > > flushes the final round of iterable data, it does: > > > > (1) loop over all complete_precopy_async() and enqueue the tasks if > > existed into the migration worker pool. Then, > > > > (2) loop over all complete_precopy() like before. > > > > Optionally, we can enforce one vmstate handler only provides either > > complete_precopy_async() or complete_precopy(). In this case VFIO can > > update the two hooks during setup() by detecting multifd && !mapped_ram && > > nocomp. > > > > The "_begin" hook is still necessary to mark the end of the device state > sent via the main migration stream (during the phase VM is still running) > since we can't start loading the multifd sent device state until all of > that earlier data finishes loading first. Ah I remembered some more now, thanks. If vfio can send data during iterations this new hook will also not be needed, right? I remember you mentioned you'd have a look and see the challenges there, is there any conclusion yet on whether we can use multifd even during that? It's also a pity that we introduce this hook only because we want a boundary between "iterable stage" and "final stage". IIUC if we have any kind of message telling dest before hand that "we're going to the last stage" then this hook can be avoided. Now it's at least inefficient because we need to trigger begin() per-device, even if I think it's more of a global request saying that "we need to load all main stream data first before moving on". > > We shouldn't send that boundary marker in .save_live_complete_precopy > either since it would meant unnecessary waiting for other devices > (not necessary VFIO ones) .save_live_complete_precopy bulk data. > > And VFIO SaveVMHandlers are shared for all VFIO devices (and const) so > we can't really change them at runtime. In all cases, please consider dropping end() if it's never used; IMO it's fine if there is only begin(), and we shouldn't keep hooks that are never used. Thanks,
On Mon, Sep 09, 2024 at 03:08:40PM -0400, Peter Xu wrote: > On Mon, Sep 09, 2024 at 08:32:45PM +0200, Maciej S. Szmigiero wrote: > > On 9.09.2024 19:59, Peter Xu wrote: > > > On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: > > > > > > > > On 27/08/2024 20:54, Maciej S. Szmigiero wrote: > > > > > External email: Use caution opening links or attachments > > > > > > > > > > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > > > > > > > > > These SaveVMHandlers help device provide its own asynchronous > > > > > transmission of the remaining data at the end of a precopy phase. > > > > > > > > > > In this use case the save_live_complete_precopy_begin handler might > > > > > be used to mark the stream boundary before proceeding with asynchronous > > > > > transmission of the remaining data while the > > > > > save_live_complete_precopy_end handler might be used to mark the > > > > > stream boundary after performing the asynchronous transmission. > > > > > > > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > > > > > --- > > > > > include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ > > > > > migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ > > > > > 2 files changed, 71 insertions(+) > > > > > > > > > > diff --git a/include/migration/register.h b/include/migration/register.h > > > > > index f60e797894e5..9de123252edf 100644 > > > > > --- a/include/migration/register.h > > > > > +++ b/include/migration/register.h > > > > > @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { > > > > > */ > > > > > int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); > > > > > > > > > > + /** > > > > > + * @save_live_complete_precopy_begin > > > > > + * > > > > > + * Called at the end of a precopy phase, before all > > > > > + * @save_live_complete_precopy handlers and before launching > > > > > + * all @save_live_complete_precopy_thread threads. > > > > > + * The handler might, for example, mark the stream boundary before > > > > > + * proceeding with asynchronous transmission of the remaining data via > > > > > + * @save_live_complete_precopy_thread. > > > > > + * When postcopy is enabled, devices that support postcopy will skip this step. > > > > > + * > > > > > + * @f: QEMUFile where the handler can synchronously send data before returning > > > > > + * @idstr: this device section idstr > > > > > + * @instance_id: this device section instance_id > > > > > + * @opaque: data pointer passed to register_savevm_live() > > > > > + * > > > > > + * Returns zero to indicate success and negative for error > > > > > + */ > > > > > + int (*save_live_complete_precopy_begin)(QEMUFile *f, > > > > > + char *idstr, uint32_t instance_id, > > > > > + void *opaque); > > > > > + /** > > > > > + * @save_live_complete_precopy_end > > > > > + * > > > > > + * Called at the end of a precopy phase, after @save_live_complete_precopy > > > > > + * handlers and after all @save_live_complete_precopy_thread threads have > > > > > + * finished. When postcopy is enabled, devices that support postcopy will > > > > > + * skip this step. 
> > > > > + * > > > > > + * @f: QEMUFile where the handler can synchronously send data before returning > > > > > + * @opaque: data pointer passed to register_savevm_live() > > > > > + * > > > > > + * Returns zero to indicate success and negative for error > > > > > + */ > > > > > + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); > > > > > > > > Is this handler necessary now that migration core is responsible for the > > > > threads and joins them? I don't see VFIO implementing it later on. > > > > > > Right, I spot the same thing. > > > > > > This series added three hooks: begin, end, precopy_thread. > > > > > > What I think is it only needs one, which is precopy_async. My vague memory > > > was that was what we used to discuss too, so that when migration precopy > > > flushes the final round of iterable data, it does: > > > > > > (1) loop over all complete_precopy_async() and enqueue the tasks if > > > existed into the migration worker pool. Then, > > > > > > (2) loop over all complete_precopy() like before. > > > > > > Optionally, we can enforce one vmstate handler only provides either > > > complete_precopy_async() or complete_precopy(). In this case VFIO can > > > update the two hooks during setup() by detecting multifd && !mapped_ram && > > > nocomp. > > > > > > > The "_begin" hook is still necessary to mark the end of the device state > > sent via the main migration stream (during the phase VM is still running) > > since we can't start loading the multifd sent device state until all of > > that earlier data finishes loading first. > > Ah I remembered some more now, thanks. > > If vfio can send data during iterations this new hook will also not be > needed, right? > > I remember you mentioned you'd have a look and see the challenges there, is > there any conclusion yet on whether we can use multifd even during that? > > It's also a pity that we introduce this hook only because we want a > boundary between "iterable stage" and "final stage". IIUC if we have any > kind of message telling dest before hand that "we're going to the last > stage" then this hook can be avoided. Now it's at least inefficient > because we need to trigger begin() per-device, even if I think it's more of > a global request saying that "we need to load all main stream data first > before moving on". Or, we could add one MIG_CMD_SWITCHOVER under QEMU_VM_COMMAND, then send it at the beginning of the switchover phase. Then we can have a generic marker on destination to be the boundary of "iterations" v.s. "switchover". Then I think we can also drop the begin() here, just to avoid one such sync per-device (also in case if others may have such need, like vdpa, then vdpa doesn't need that flag too). Fundamentally, that makes the VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE to be a migration flag.. But for sure the best is still if VFIO can enable multifd even during iterations. Then the boundary guard may not be needed. > > > > > We shouldn't send that boundary marker in .save_live_complete_precopy > > either since it would meant unnecessary waiting for other devices > > (not necessary VFIO ones) .save_live_complete_precopy bulk data. > > > > And VFIO SaveVMHandlers are shared for all VFIO devices (and const) so > > we can't really change them at runtime. > > In all cases, please consider dropping end() if it's never used; IMO it's > fine if there is only begin(), and we shouldn't keep hooks that are never > used. > > Thanks, > > -- > Peter Xu
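On the send side, the MIG_CMD_SWITCHOVER idea would amount to something like this (a sketch; MIG_CMD_SWITCHOVER does not exist at this point and the helper name is made up, but the pattern follows existing senders such as qemu_savevm_send_open_return_path() in migration/savevm.c):

/* Sketch: emit a QEMU_VM_COMMAND section carrying a new, hypothetical
 * MIG_CMD_SWITCHOVER command with no payload - sent once at the start
 * of the switchover phase, instead of a per-device begin() marker. */
static void qemu_savevm_send_switchover(QEMUFile *f)
{
    qemu_savevm_command_send(f, MIG_CMD_SWITCHOVER, 0, NULL);
}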
On 9.09.2024 21:08, Peter Xu wrote: > On Mon, Sep 09, 2024 at 08:32:45PM +0200, Maciej S. Szmigiero wrote: >> On 9.09.2024 19:59, Peter Xu wrote: >>> On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: >>>> >>>> On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >>>>> External email: Use caution opening links or attachments >>>>> >>>>> >>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >>>>> >>>>> These SaveVMHandlers help device provide its own asynchronous >>>>> transmission of the remaining data at the end of a precopy phase. >>>>> >>>>> In this use case the save_live_complete_precopy_begin handler might >>>>> be used to mark the stream boundary before proceeding with asynchronous >>>>> transmission of the remaining data while the >>>>> save_live_complete_precopy_end handler might be used to mark the >>>>> stream boundary after performing the asynchronous transmission. >>>>> >>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >>>>> --- >>>>> include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ >>>>> migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ >>>>> 2 files changed, 71 insertions(+) >>>>> >>>>> diff --git a/include/migration/register.h b/include/migration/register.h >>>>> index f60e797894e5..9de123252edf 100644 >>>>> --- a/include/migration/register.h >>>>> +++ b/include/migration/register.h >>>>> @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { >>>>> */ >>>>> int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); >>>>> >>>>> + /** >>>>> + * @save_live_complete_precopy_begin >>>>> + * >>>>> + * Called at the end of a precopy phase, before all >>>>> + * @save_live_complete_precopy handlers and before launching >>>>> + * all @save_live_complete_precopy_thread threads. >>>>> + * The handler might, for example, mark the stream boundary before >>>>> + * proceeding with asynchronous transmission of the remaining data via >>>>> + * @save_live_complete_precopy_thread. >>>>> + * When postcopy is enabled, devices that support postcopy will skip this step. >>>>> + * >>>>> + * @f: QEMUFile where the handler can synchronously send data before returning >>>>> + * @idstr: this device section idstr >>>>> + * @instance_id: this device section instance_id >>>>> + * @opaque: data pointer passed to register_savevm_live() >>>>> + * >>>>> + * Returns zero to indicate success and negative for error >>>>> + */ >>>>> + int (*save_live_complete_precopy_begin)(QEMUFile *f, >>>>> + char *idstr, uint32_t instance_id, >>>>> + void *opaque); >>>>> + /** >>>>> + * @save_live_complete_precopy_end >>>>> + * >>>>> + * Called at the end of a precopy phase, after @save_live_complete_precopy >>>>> + * handlers and after all @save_live_complete_precopy_thread threads have >>>>> + * finished. When postcopy is enabled, devices that support postcopy will >>>>> + * skip this step. >>>>> + * >>>>> + * @f: QEMUFile where the handler can synchronously send data before returning >>>>> + * @opaque: data pointer passed to register_savevm_live() >>>>> + * >>>>> + * Returns zero to indicate success and negative for error >>>>> + */ >>>>> + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); >>>> >>>> Is this handler necessary now that migration core is responsible for the >>>> threads and joins them? I don't see VFIO implementing it later on. >>> >>> Right, I spot the same thing. >>> >>> This series added three hooks: begin, end, precopy_thread. >>> >>> What I think is it only needs one, which is precopy_async. 
My vague memory >>> was that was what we used to discuss too, so that when migration precopy >>> flushes the final round of iterable data, it does: >>> >>> (1) loop over all complete_precopy_async() and enqueue the tasks if >>> existed into the migration worker pool. Then, >>> >>> (2) loop over all complete_precopy() like before. >>> >>> Optionally, we can enforce one vmstate handler only provides either >>> complete_precopy_async() or complete_precopy(). In this case VFIO can >>> update the two hooks during setup() by detecting multifd && !mapped_ram && >>> nocomp. >>> >> >> The "_begin" hook is still necessary to mark the end of the device state >> sent via the main migration stream (during the phase VM is still running) >> since we can't start loading the multifd sent device state until all of >> that earlier data finishes loading first. > > Ah I remembered some more now, thanks. > > If vfio can send data during iterations this new hook will also not be > needed, right? > > I remember you mentioned you'd have a look and see the challenges there, is > there any conclusion yet on whether we can use multifd even during that? Yeah, I looked at that and it wasn't a straightforward thing to introduce. I am worried that with all the things that already piled up (including the new thread pool implementation) we risk missing QEMU 9.2 too if this is included. > It's also a pity that we introduce this hook only because we want a > boundary between "iterable stage" and "final stage". IIUC if we have any > kind of message telling dest before hand that "we're going to the last > stage" then this hook can be avoided. Now it's at least inefficient > because we need to trigger begin() per-device, even if I think it's more of > a global request saying that "we need to load all main stream data first > before moving on". It should be pretty easy to remove that begin() hook once it is no longer needed - after all, it's only necessary for the sender. >> >> We shouldn't send that boundary marker in .save_live_complete_precopy >> either since it would meant unnecessary waiting for other devices >> (not necessary VFIO ones) .save_live_complete_precopy bulk data. >> >> And VFIO SaveVMHandlers are shared for all VFIO devices (and const) so >> we can't really change them at runtime. > > In all cases, please consider dropping end() if it's never used; IMO it's > fine if there is only begin(), and we shouldn't keep hooks that are never > used. Okay, will remove the end() hook then. > Thanks, > Thanks, Maciej
On 9.09.2024 21:32, Peter Xu wrote: > On Mon, Sep 09, 2024 at 03:08:40PM -0400, Peter Xu wrote: >> On Mon, Sep 09, 2024 at 08:32:45PM +0200, Maciej S. Szmigiero wrote: >>> On 9.09.2024 19:59, Peter Xu wrote: >>>> On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: >>>>> >>>>> On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >>>>>> External email: Use caution opening links or attachments >>>>>> >>>>>> >>>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >>>>>> >>>>>> These SaveVMHandlers help device provide its own asynchronous >>>>>> transmission of the remaining data at the end of a precopy phase. >>>>>> >>>>>> In this use case the save_live_complete_precopy_begin handler might >>>>>> be used to mark the stream boundary before proceeding with asynchronous >>>>>> transmission of the remaining data while the >>>>>> save_live_complete_precopy_end handler might be used to mark the >>>>>> stream boundary after performing the asynchronous transmission. >>>>>> >>>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >>>>>> --- >>>>>> include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ >>>>>> migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ >>>>>> 2 files changed, 71 insertions(+) >>>>>> >>>>>> diff --git a/include/migration/register.h b/include/migration/register.h >>>>>> index f60e797894e5..9de123252edf 100644 >>>>>> --- a/include/migration/register.h >>>>>> +++ b/include/migration/register.h >>>>>> @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { >>>>>> */ >>>>>> int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); >>>>>> >>>>>> + /** >>>>>> + * @save_live_complete_precopy_begin >>>>>> + * >>>>>> + * Called at the end of a precopy phase, before all >>>>>> + * @save_live_complete_precopy handlers and before launching >>>>>> + * all @save_live_complete_precopy_thread threads. >>>>>> + * The handler might, for example, mark the stream boundary before >>>>>> + * proceeding with asynchronous transmission of the remaining data via >>>>>> + * @save_live_complete_precopy_thread. >>>>>> + * When postcopy is enabled, devices that support postcopy will skip this step. >>>>>> + * >>>>>> + * @f: QEMUFile where the handler can synchronously send data before returning >>>>>> + * @idstr: this device section idstr >>>>>> + * @instance_id: this device section instance_id >>>>>> + * @opaque: data pointer passed to register_savevm_live() >>>>>> + * >>>>>> + * Returns zero to indicate success and negative for error >>>>>> + */ >>>>>> + int (*save_live_complete_precopy_begin)(QEMUFile *f, >>>>>> + char *idstr, uint32_t instance_id, >>>>>> + void *opaque); >>>>>> + /** >>>>>> + * @save_live_complete_precopy_end >>>>>> + * >>>>>> + * Called at the end of a precopy phase, after @save_live_complete_precopy >>>>>> + * handlers and after all @save_live_complete_precopy_thread threads have >>>>>> + * finished. When postcopy is enabled, devices that support postcopy will >>>>>> + * skip this step. >>>>>> + * >>>>>> + * @f: QEMUFile where the handler can synchronously send data before returning >>>>>> + * @opaque: data pointer passed to register_savevm_live() >>>>>> + * >>>>>> + * Returns zero to indicate success and negative for error >>>>>> + */ >>>>>> + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); >>>>> >>>>> Is this handler necessary now that migration core is responsible for the >>>>> threads and joins them? I don't see VFIO implementing it later on. >>>> >>>> Right, I spot the same thing. 
>>>> >>>> This series added three hooks: begin, end, precopy_thread. >>>> >>>> What I think is it only needs one, which is precopy_async. My vague memory >>>> was that was what we used to discuss too, so that when migration precopy >>>> flushes the final round of iterable data, it does: >>>> >>>> (1) loop over all complete_precopy_async() and enqueue the tasks if >>>> existed into the migration worker pool. Then, >>>> >>>> (2) loop over all complete_precopy() like before. >>>> >>>> Optionally, we can enforce one vmstate handler only provides either >>>> complete_precopy_async() or complete_precopy(). In this case VFIO can >>>> update the two hooks during setup() by detecting multifd && !mapped_ram && >>>> nocomp. >>>> >>> >>> The "_begin" hook is still necessary to mark the end of the device state >>> sent via the main migration stream (during the phase VM is still running) >>> since we can't start loading the multifd sent device state until all of >>> that earlier data finishes loading first. >> >> Ah I remembered some more now, thanks. >> >> If vfio can send data during iterations this new hook will also not be >> needed, right? >> >> I remember you mentioned you'd have a look and see the challenges there, is >> there any conclusion yet on whether we can use multifd even during that? >> >> It's also a pity that we introduce this hook only because we want a >> boundary between "iterable stage" and "final stage". IIUC if we have any >> kind of message telling dest before hand that "we're going to the last >> stage" then this hook can be avoided. Now it's at least inefficient >> because we need to trigger begin() per-device, even if I think it's more of >> a global request saying that "we need to load all main stream data first >> before moving on". > > Or, we could add one MIG_CMD_SWITCHOVER under QEMU_VM_COMMAND, then send it > at the beginning of the switchover phase. Then we can have a generic > marker on destination to be the boundary of "iterations" v.s. "switchover". > Then I think we can also drop the begin() here, just to avoid one such sync > per-device (also in case if others may have such need, like vdpa, then vdpa > doesn't need that flag too). > > Fundamentally, that makes the VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE to be a > migration flag.. > > But for sure the best is still if VFIO can enable multifd even during > iterations. Then the boundary guard may not be needed. That begin() handler was supposed to be generic for multiple device types, that's why it was paired with the end() one that has no current use. But you are right that using a single "barrier" or "sync" command for all device types makes sense, so I will change it to MIG_CMD_SWITCHOVER. Thanks, Maciej
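On the destination, such a command would presumably get a matching arm in loadvm_process_command(); roughly (hypothetical names again, including the switchover_event field):

/*
 * Sketch of the receive side: when MIG_CMD_SWITCHOVER arrives on the
 * main stream, all pre-switchover data has been consumed, so any loader
 * waiting to apply multifd-received device state can be released.
 */
static int loadvm_handle_switchover(MigrationIncomingState *mis)
{
    qemu_event_set(&mis->switchover_event);   /* hypothetical field */
    return 0;
}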
On Thu, Sep 19, 2024 at 09:47:53PM +0200, Maciej S. Szmigiero wrote: > On 9.09.2024 21:08, Peter Xu wrote: > > On Mon, Sep 09, 2024 at 08:32:45PM +0200, Maciej S. Szmigiero wrote: > > > On 9.09.2024 19:59, Peter Xu wrote: > > > > On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: > > > > > > > > > > On 27/08/2024 20:54, Maciej S. Szmigiero wrote: > > > > > > External email: Use caution opening links or attachments > > > > > > > > > > > > > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > > > > > > > > > > > These SaveVMHandlers help device provide its own asynchronous > > > > > > transmission of the remaining data at the end of a precopy phase. > > > > > > > > > > > > In this use case the save_live_complete_precopy_begin handler might > > > > > > be used to mark the stream boundary before proceeding with asynchronous > > > > > > transmission of the remaining data while the > > > > > > save_live_complete_precopy_end handler might be used to mark the > > > > > > stream boundary after performing the asynchronous transmission. > > > > > > > > > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > > > > > > --- > > > > > > include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ > > > > > > migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ > > > > > > 2 files changed, 71 insertions(+) > > > > > > > > > > > > diff --git a/include/migration/register.h b/include/migration/register.h > > > > > > index f60e797894e5..9de123252edf 100644 > > > > > > --- a/include/migration/register.h > > > > > > +++ b/include/migration/register.h > > > > > > @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { > > > > > > */ > > > > > > int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); > > > > > > > > > > > > + /** > > > > > > + * @save_live_complete_precopy_begin > > > > > > + * > > > > > > + * Called at the end of a precopy phase, before all > > > > > > + * @save_live_complete_precopy handlers and before launching > > > > > > + * all @save_live_complete_precopy_thread threads. > > > > > > + * The handler might, for example, mark the stream boundary before > > > > > > + * proceeding with asynchronous transmission of the remaining data via > > > > > > + * @save_live_complete_precopy_thread. > > > > > > + * When postcopy is enabled, devices that support postcopy will skip this step. > > > > > > + * > > > > > > + * @f: QEMUFile where the handler can synchronously send data before returning > > > > > > + * @idstr: this device section idstr > > > > > > + * @instance_id: this device section instance_id > > > > > > + * @opaque: data pointer passed to register_savevm_live() > > > > > > + * > > > > > > + * Returns zero to indicate success and negative for error > > > > > > + */ > > > > > > + int (*save_live_complete_precopy_begin)(QEMUFile *f, > > > > > > + char *idstr, uint32_t instance_id, > > > > > > + void *opaque); > > > > > > + /** > > > > > > + * @save_live_complete_precopy_end > > > > > > + * > > > > > > + * Called at the end of a precopy phase, after @save_live_complete_precopy > > > > > > + * handlers and after all @save_live_complete_precopy_thread threads have > > > > > > + * finished. When postcopy is enabled, devices that support postcopy will > > > > > > + * skip this step. 
> > > > > > + * > > > > > > + * @f: QEMUFile where the handler can synchronously send data before returning > > > > > > + * @opaque: data pointer passed to register_savevm_live() > > > > > > + * > > > > > > + * Returns zero to indicate success and negative for error > > > > > > + */ > > > > > > + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); > > > > > > > > > > Is this handler necessary now that migration core is responsible for the > > > > > threads and joins them? I don't see VFIO implementing it later on. > > > > > > > > Right, I spot the same thing. > > > > > > > > This series added three hooks: begin, end, precopy_thread. > > > > > > > > What I think is it only needs one, which is precopy_async. My vague memory > > > > was that was what we used to discuss too, so that when migration precopy > > > > flushes the final round of iterable data, it does: > > > > > > > > (1) loop over all complete_precopy_async() and enqueue the tasks if > > > > existed into the migration worker pool. Then, > > > > > > > > (2) loop over all complete_precopy() like before. > > > > > > > > Optionally, we can enforce one vmstate handler only provides either > > > > complete_precopy_async() or complete_precopy(). In this case VFIO can > > > > update the two hooks during setup() by detecting multifd && !mapped_ram && > > > > nocomp. > > > > > > > > > > The "_begin" hook is still necessary to mark the end of the device state > > > sent via the main migration stream (during the phase VM is still running) > > > since we can't start loading the multifd sent device state until all of > > > that earlier data finishes loading first. > > > > Ah I remembered some more now, thanks. > > > > If vfio can send data during iterations this new hook will also not be > > needed, right? > > > > I remember you mentioned you'd have a look and see the challenges there, is > > there any conclusion yet on whether we can use multifd even during that? > > Yeah, I looked at that and it wasn't a straightforward thing to introduce. > > I am worried that with all the things that already piled up (including the > new thread pool implementation) we risk missing QEMU 9.2 too if this is > included. Not explicitly required, but IMHO it'll be nice to provide a paragraph in the new version when repost explaining the challenges of using it during iterations. It'll be not only for me but for whoever may want to extend your solution to iterations. I asked this question again mostly because I found that when with iteration support the design looks simpler in begin(), so that the extra sync is not needed. But I confess you know better than me, so whatever you think best is ok here. > > > It's also a pity that we introduce this hook only because we want a > > boundary between "iterable stage" and "final stage". IIUC if we have any > > kind of message telling dest before hand that "we're going to the last > > stage" then this hook can be avoided. Now it's at least inefficient > > because we need to trigger begin() per-device, even if I think it's more of > > a global request saying that "we need to load all main stream data first > > before moving on". > > It should be pretty easy to remove that begin() hook once it is no longer > needed - after all, it's only necessary for the sender. Do you mean you have plan to remove the begin() hook even without making interate() work too? That's definitely nice if so. 
> > > > > > > We shouldn't send that boundary marker in .save_live_complete_precopy > > > either since it would meant unnecessary waiting for other devices > > > (not necessary VFIO ones) .save_live_complete_precopy bulk data. > > > > > > And VFIO SaveVMHandlers are shared for all VFIO devices (and const) so > > > we can't really change them at runtime. > > > > In all cases, please consider dropping end() if it's never used; IMO it's > > fine if there is only begin(), and we shouldn't keep hooks that are never > > used. > > Okay, will remove the end() hook then. > > > Thanks, > > > > Thanks, > Maciej >
On 19.09.2024 22:54, Peter Xu wrote: > On Thu, Sep 19, 2024 at 09:47:53PM +0200, Maciej S. Szmigiero wrote: >> On 9.09.2024 21:08, Peter Xu wrote: >>> On Mon, Sep 09, 2024 at 08:32:45PM +0200, Maciej S. Szmigiero wrote: >>>> On 9.09.2024 19:59, Peter Xu wrote: >>>>> On Thu, Sep 05, 2024 at 04:45:48PM +0300, Avihai Horon wrote: >>>>>> >>>>>> On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >>>>>>> External email: Use caution opening links or attachments >>>>>>> >>>>>>> >>>>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >>>>>>> >>>>>>> These SaveVMHandlers help device provide its own asynchronous >>>>>>> transmission of the remaining data at the end of a precopy phase. >>>>>>> >>>>>>> In this use case the save_live_complete_precopy_begin handler might >>>>>>> be used to mark the stream boundary before proceeding with asynchronous >>>>>>> transmission of the remaining data while the >>>>>>> save_live_complete_precopy_end handler might be used to mark the >>>>>>> stream boundary after performing the asynchronous transmission. >>>>>>> >>>>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >>>>>>> --- >>>>>>> include/migration/register.h | 36 ++++++++++++++++++++++++++++++++++++ >>>>>>> migration/savevm.c | 35 +++++++++++++++++++++++++++++++++++ >>>>>>> 2 files changed, 71 insertions(+) >>>>>>> >>>>>>> diff --git a/include/migration/register.h b/include/migration/register.h >>>>>>> index f60e797894e5..9de123252edf 100644 >>>>>>> --- a/include/migration/register.h >>>>>>> +++ b/include/migration/register.h >>>>>>> @@ -103,6 +103,42 @@ typedef struct SaveVMHandlers { >>>>>>> */ >>>>>>> int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); >>>>>>> >>>>>>> + /** >>>>>>> + * @save_live_complete_precopy_begin >>>>>>> + * >>>>>>> + * Called at the end of a precopy phase, before all >>>>>>> + * @save_live_complete_precopy handlers and before launching >>>>>>> + * all @save_live_complete_precopy_thread threads. >>>>>>> + * The handler might, for example, mark the stream boundary before >>>>>>> + * proceeding with asynchronous transmission of the remaining data via >>>>>>> + * @save_live_complete_precopy_thread. >>>>>>> + * When postcopy is enabled, devices that support postcopy will skip this step. >>>>>>> + * >>>>>>> + * @f: QEMUFile where the handler can synchronously send data before returning >>>>>>> + * @idstr: this device section idstr >>>>>>> + * @instance_id: this device section instance_id >>>>>>> + * @opaque: data pointer passed to register_savevm_live() >>>>>>> + * >>>>>>> + * Returns zero to indicate success and negative for error >>>>>>> + */ >>>>>>> + int (*save_live_complete_precopy_begin)(QEMUFile *f, >>>>>>> + char *idstr, uint32_t instance_id, >>>>>>> + void *opaque); >>>>>>> + /** >>>>>>> + * @save_live_complete_precopy_end >>>>>>> + * >>>>>>> + * Called at the end of a precopy phase, after @save_live_complete_precopy >>>>>>> + * handlers and after all @save_live_complete_precopy_thread threads have >>>>>>> + * finished. When postcopy is enabled, devices that support postcopy will >>>>>>> + * skip this step. 
>>>>>>> + * >>>>>>> + * @f: QEMUFile where the handler can synchronously send data before returning >>>>>>> + * @opaque: data pointer passed to register_savevm_live() >>>>>>> + * >>>>>>> + * Returns zero to indicate success and negative for error >>>>>>> + */ >>>>>>> + int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque); >>>>>> >>>>>> Is this handler necessary now that migration core is responsible for the >>>>>> threads and joins them? I don't see VFIO implementing it later on. >>>>> >>>>> Right, I spot the same thing. >>>>> >>>>> This series added three hooks: begin, end, precopy_thread. >>>>> >>>>> What I think is it only needs one, which is precopy_async. My vague memory >>>>> was that was what we used to discuss too, so that when migration precopy >>>>> flushes the final round of iterable data, it does: >>>>> >>>>> (1) loop over all complete_precopy_async() and enqueue the tasks if >>>>> existed into the migration worker pool. Then, >>>>> >>>>> (2) loop over all complete_precopy() like before. >>>>> >>>>> Optionally, we can enforce one vmstate handler only provides either >>>>> complete_precopy_async() or complete_precopy(). In this case VFIO can >>>>> update the two hooks during setup() by detecting multifd && !mapped_ram && >>>>> nocomp. >>>>> >>>> >>>> The "_begin" hook is still necessary to mark the end of the device state >>>> sent via the main migration stream (during the phase VM is still running) >>>> since we can't start loading the multifd sent device state until all of >>>> that earlier data finishes loading first. >>> >>> Ah I remembered some more now, thanks. >>> >>> If vfio can send data during iterations this new hook will also not be >>> needed, right? >>> >>> I remember you mentioned you'd have a look and see the challenges there, is >>> there any conclusion yet on whether we can use multifd even during that? >> >> Yeah, I looked at that and it wasn't a straightforward thing to introduce. >> >> I am worried that with all the things that already piled up (including the >> new thread pool implementation) we risk missing QEMU 9.2 too if this is >> included. > > Not explicitly required, but IMHO it'll be nice to provide a paragraph in > the new version when repost explaining the challenges of using it during > iterations. It'll be not only for me but for whoever may want to extend > your solution to iterations. Will do. > I asked this question again mostly because I found that when with iteration > support the design looks simpler in begin(), so that the extra sync is not > needed. But I confess you know better than me, so whatever you think best > is ok here. If we do the MIG_CMD_SWITCHOVER / QEMU_VM_COMMAND thing common for all devices then we don't need begin() even without live-phase multifd device state transfer. >> >>> It's also a pity that we introduce this hook only because we want a >>> boundary between "iterable stage" and "final stage". IIUC if we have any >>> kind of message telling dest before hand that "we're going to the last >>> stage" then this hook can be avoided. Now it's at least inefficient >>> because we need to trigger begin() per-device, even if I think it's more of >>> a global request saying that "we need to load all main stream data first >>> before moving on". >> >> It should be pretty easy to remove that begin() hook once it is no longer >> needed - after all, it's only necessary for the sender. > > Do you mean you have plan to remove the begin() hook even without making > interate() work too? That's definitely nice if so. 
As I wrote above, I think with MIG_CMD_SWITCHOVER it shouldn't be needed? Thanks, Maciej
On Fri, Sep 20, 2024 at 05:22:54PM +0200, Maciej S. Szmigiero wrote:
> On 19.09.2024 22:54, Peter Xu wrote:
> > On Thu, Sep 19, 2024 at 09:47:53PM +0200, Maciej S. Szmigiero wrote:
> > > On 9.09.2024 21:08, Peter Xu wrote:
> > > > On Mon, Sep 09, 2024 at 08:32:45PM +0200, Maciej S. Szmigiero wrote:
[...]
> > I asked this question again mostly because I found that with iteration
> > support the design looks simpler in begin(), so that the extra sync is
> > not needed. But I confess you know better than me, so whatever you think
> > best is ok here.
>
> If we do the MIG_CMD_SWITCHOVER / QEMU_VM_COMMAND thing common for all
> devices then we don't need begin() even without live-phase multifd
> device state transfer.
>
[...]
> > Do you mean you have a plan to remove the begin() hook even without
> > making iterate() work too? That's definitely nice if so.
>
> As I wrote above, I think with MIG_CMD_SWITCHOVER it shouldn't be needed?

Ah, I see - yes, with that it's OK.

Just a heads-up - please remember to add one migration_properties[] entry
and a compat property for pre-9.1 so that we don't generate that message
when migrating to old binaries.

Meanwhile, if we're going to add it, let's also make sure postcopy has it,
as it shares the same SWITCHOVER idea.

Thanks,
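For readers unfamiliar with the compat-property mechanism being requested
here: the new command has to be gated on a migration property that new
machine types default to on and older machine types force off, so a stream
aimed at an old destination binary never carries an unknown command. A
rough sketch under those assumptions; the property name, MigrationState
field, accessor, and compat-array placement are all illustrative, not the
merged implementation:

    /* migration/options.c - sketch of a migration_properties[] entry;
     * "send-switchover-start" and the field name are assumed. */
    DEFINE_PROP_BOOL("send-switchover-start", MigrationState,
                     send_switchover_start, true),

    /* hw/core/machine.c - sketch of a compat entry so pre-9.1 machine
     * types keep the old stream format (exact array depends on release). */
    { "migration", "send-switchover-start", "off" },

    /* At the send site, emit the command only when the property allows: */
    if (migrate_send_switchover_start()) {  /* hypothetical accessor */
        qemu_savevm_send_switchover_start(f);
    }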
diff --git a/include/migration/register.h b/include/migration/register.h
index f60e797894e5..9de123252edf 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -103,6 +103,42 @@ typedef struct SaveVMHandlers {
      */
     int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);
 
+    /**
+     * @save_live_complete_precopy_begin
+     *
+     * Called at the end of a precopy phase, before all
+     * @save_live_complete_precopy handlers and before launching
+     * all @save_live_complete_precopy_thread threads.
+     * The handler might, for example, mark the stream boundary before
+     * proceeding with asynchronous transmission of the remaining data via
+     * @save_live_complete_precopy_thread.
+     * When postcopy is enabled, devices that support postcopy will skip this step.
+     *
+     * @f: QEMUFile where the handler can synchronously send data before returning
+     * @idstr: this device section idstr
+     * @instance_id: this device section instance_id
+     * @opaque: data pointer passed to register_savevm_live()
+     *
+     * Returns zero to indicate success and negative for error
+     */
+    int (*save_live_complete_precopy_begin)(QEMUFile *f,
+                                            char *idstr, uint32_t instance_id,
+                                            void *opaque);
+    /**
+     * @save_live_complete_precopy_end
+     *
+     * Called at the end of a precopy phase, after @save_live_complete_precopy
+     * handlers and after all @save_live_complete_precopy_thread threads have
+     * finished. When postcopy is enabled, devices that support postcopy will
+     * skip this step.
+     *
+     * @f: QEMUFile where the handler can synchronously send data before returning
+     * @opaque: data pointer passed to register_savevm_live()
+     *
+     * Returns zero to indicate success and negative for error
+     */
+    int (*save_live_complete_precopy_end)(QEMUFile *f, void *opaque);
+
     /* This runs both outside and inside the BQL. */
 
     /**
diff --git a/migration/savevm.c b/migration/savevm.c
index 6bb404b9c86f..d43acbbf20cf 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1496,6 +1496,27 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
     SaveStateEntry *se;
     int ret;
 
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || (in_postcopy && se->ops->has_postcopy &&
+                         se->ops->has_postcopy(se->opaque)) ||
+            !se->ops->save_live_complete_precopy_begin) {
+            continue;
+        }
+
+        save_section_header(f, se, QEMU_VM_SECTION_END);
+
+        ret = se->ops->save_live_complete_precopy_begin(f,
+                                                        se->idstr, se->instance_id,
+                                                        se->opaque);
+
+        save_section_footer(f, se);
+
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+            return -1;
+        }
+    }
+
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops ||
             (in_postcopy && se->ops->has_postcopy &&
@@ -1527,6 +1548,20 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
                                     end_ts_each - start_ts_each);
     }
 
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || (in_postcopy && se->ops->has_postcopy &&
+                         se->ops->has_postcopy(se->opaque)) ||
+            !se->ops->save_live_complete_precopy_end) {
+            continue;
+        }
+
+        ret = se->ops->save_live_complete_precopy_end(f, se->opaque);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+            return -1;
+        }
+    }
+
     trace_vmstate_downtime_checkpoint("src-iterable-saved");
 
     return 0;
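To make the intended use of the new begin() hook concrete, below is a
minimal device-side sketch that marks the main-stream boundary before the
device's remaining state goes out asynchronously. The device name, flag
value, and handler body are hypothetical; VFIO's real implementation
appears later in the series:

    /* Hypothetical end-of-main-stream marker; not VFIO's actual wire format. */
    #define MYDEV_MIG_FLAG_END_OF_MAIN_STREAM 0xfeedcafef00dULL

    static int mydev_save_complete_precopy_begin(QEMUFile *f,
                                                 char *idstr,
                                                 uint32_t instance_id,
                                                 void *opaque)
    {
        /*
         * Everything this device sent on the main channel is now complete;
         * the destination must finish loading it before applying any device
         * state that arrives over the multifd channels.
         */
        qemu_put_be64(f, MYDEV_MIG_FLAG_END_OF_MAIN_STREAM);

        return qemu_file_get_error(f);
    }

    static const SaveVMHandlers mydev_savevm_handlers = {
        /* ...the usual iterate/complete handlers would go here... */
        .save_live_complete_precopy_begin = mydev_save_complete_precopy_begin,
    };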