Message ID | 20210106152120.31279-4-andrey.gruzdev@virtuozzo.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | UFFD write-tracking migration/snapshots | expand |
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > In this particular implementation the same single migration > thread is responsible for both normal linear dirty page > migration and procesing UFFD page fault events. > > Processing write faults includes reading UFFD file descriptor, > finding respective RAM block and saving faulting page to > the migration stream. After page has been saved, write protection > can be removed. Since asynchronous version of qemu_put_buffer() > is expected to be used to save pages, we also have to flush > migraion stream prior to un-protecting saved memory range. > > Write protection is being removed for any previously protected > memory chunk that has hit the migration stream. That's valid > for pages from linear page scan along with write fault pages. > > Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> > Acked-by: Peter Xu <peterx@redhat.com> I think this is OK, so: Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> I think I'd have preferred to have kept the #ifdef LINUX's out of there, and I suspect using the opaque's for hte pre/post hook is overly complex; but other wise OK. Dave > --- > include/exec/memory.h | 7 ++ > migration/ram.c | 269 +++++++++++++++++++++++++++++++++++++++-- > migration/ram.h | 2 + > migration/trace-events | 2 + > 4 files changed, 272 insertions(+), 8 deletions(-) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index b76b1256bf..1aa1c6a3f4 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -150,6 +150,13 @@ typedef struct IOMMUTLBEvent { > #define RAM_PMEM (1 << 5) > > > +/* > + * UFFDIO_WRITEPROTECT is used on this RAMBlock to > + * support 'write-tracking' migration type. > + * Implies ram_state->ram_wt_enabled. > + */ > +#define RAM_UF_WRITEPROTECT (1 << 6) > + > static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, > IOMMUNotifierFlag flags, > hwaddr start, hwaddr end, > diff --git a/migration/ram.c b/migration/ram.c > index ae8de17153..5707382db1 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -56,6 +56,11 @@ > #include "savevm.h" > #include "qemu/iov.h" > #include "multifd.h" > +#include "sysemu/runstate.h" > + > +#ifdef CONFIG_LINUX > +#include "qemu/userfaultfd.h" > +#endif > > /***********************************************************/ > /* ram save/restore */ > @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { > struct RAMState { > /* QEMUFile used for this migration */ > QEMUFile *f; > + /* UFFD file descriptor, used in 'write-tracking' migration */ > + int uffdio_fd; > /* Last block that we have visited searching for dirty pages */ > RAMBlock *last_seen_block; > /* Last block from where we have sent data */ > @@ -1434,6 +1441,40 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) > return block; > } > > +#ifdef CONFIG_LINUX > +/** > + * poll_fault_page: try to get next UFFD write fault page and, if pending fault > + * is found, return RAM block pointer and page offset > + * > + * Returns pointer to the RAMBlock containing faulting page, > + * NULL if no write faults are pending > + * > + * @rs: current RAM state > + * @offset: page offset from the beginning of the block > + */ > +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) > +{ > + struct uffd_msg uffd_msg; > + void *page_address; > + RAMBlock *bs; > + int res; > + > + if (!migrate_background_snapshot()) { > + return NULL; > + } > + > + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); > + if (res <= 0) { > + return NULL; > + } > + > + page_address = (void *) uffd_msg.arg.pagefault.address; > + bs = qemu_ram_block_from_host(page_address, false, offset); > + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); > + return bs; > +} > +#endif /* CONFIG_LINUX */ > + > /** > * get_queued_page: unqueue a page from the postcopy requests > * > @@ -1473,6 +1514,16 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) > > } while (block && !dirty); > > +#ifdef CONFIG_LINUX > + if (!block) { > + /* > + * Poll write faults too if background snapshot is enabled; that's > + * when we have vcpus got blocked by the write protected pages. > + */ > + block = poll_fault_page(rs, &offset); > + } > +#endif /* CONFIG_LINUX */ > + > if (block) { > /* > * As soon as we start servicing pages out of order, then we have > @@ -1746,6 +1797,53 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, > return pages; > } > > +/** > + * ram_save_host_page_pre: ram_save_host_page() pre-notifier > + * > + * @rs: current RAM state > + * @pss: page-search-status structure > + * @opaque: pointer to receive opaque context value > + */ > +static inline > +void ram_save_host_page_pre(RAMState *rs, PageSearchStatus *pss, void **opaque) > +{ > + *(unsigned long *) opaque = pss->page; > +} > + > +/** > + * ram_save_host_page_post: ram_save_host_page() post-notifier > + * > + * @rs: current RAM state > + * @pss: page-search-status structure > + * @opaque: opaque context value > + * @res_override: pointer to the return value of ram_save_host_page(), > + * overwritten in case of an error > + */ > +static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, > + void *opaque, int *res_override) > +{ > +#ifdef CONFIG_LINUX > + /* Check if page is from UFFD-managed region. */ > + if (pss->block->flags & RAM_UF_WRITEPROTECT) { > + unsigned long page_from = (unsigned long) opaque; > + > + void *page_address = pss->block->host + (page_from << TARGET_PAGE_BITS); > + uint64_t run_length = (pss->page - page_from + 1) << TARGET_PAGE_BITS; > + int res; > + > + /* Flush async buffers before un-protect. */ > + qemu_fflush(rs->f); > + /* Un-protect memory range. */ > + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, > + false, false); > + /* We don't want to override existing error from ram_save_host_page(). */ > + if (res < 0 && *res_override >= 0) { > + *res_override = res; > + } > + } > +#endif /* CONFIG_LINUX */ > +} > + > /** > * ram_find_and_save_block: finds a dirty page and sends it to f > * > @@ -1790,7 +1888,11 @@ static int ram_find_and_save_block(RAMState *rs, bool last_stage) > } > > if (found) { > + void *opaque; > + > + ram_save_host_page_pre(rs, &pss, &opaque); > pages = ram_save_host_page(rs, &pss, last_stage); > + ram_save_host_page_post(rs, &pss, opaque, &pages); > } > } while (!pages && again); > > @@ -1880,10 +1982,13 @@ static void ram_save_cleanup(void *opaque) > RAMState **rsp = opaque; > RAMBlock *block; > > - /* caller have hold iothread lock or is in a bh, so there is > - * no writing race against the migration bitmap > - */ > - memory_global_dirty_log_stop(); > + /* We don't use dirty log with background snapshots */ > + if (!migrate_background_snapshot()) { > + /* caller have hold iothread lock or is in a bh, so there is > + * no writing race against the migration bitmap > + */ > + memory_global_dirty_log_stop(); > + } > > RAMBLOCK_FOREACH_NOT_IGNORED(block) { > g_free(block->clear_bmap); > @@ -2343,8 +2448,11 @@ static void ram_init_bitmaps(RAMState *rs) > > WITH_RCU_READ_LOCK_GUARD() { > ram_list_init_bitmaps(); > - memory_global_dirty_log_start(); > - migration_bitmap_sync_precopy(rs); > + /* We don't use dirty log with background snapshots */ > + if (!migrate_background_snapshot()) { > + memory_global_dirty_log_start(); > + migration_bitmap_sync_precopy(rs); > + } > } > qemu_mutex_unlock_ramlist(); > qemu_mutex_unlock_iothread(); > @@ -3794,7 +3902,14 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) > */ > bool ram_write_tracking_available(void) > { > - /* TODO: implement */ > +#ifdef CONFIG_LINUX > + uint64_t uffd_features; > + int res; > + > + res = uffd_query_features(&uffd_features); > + return (res == 0 && > + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); > +#endif > return false; > } > > @@ -3805,10 +3920,148 @@ bool ram_write_tracking_available(void) > */ > bool ram_write_tracking_compatible(void) > { > - /* TODO: implement */ > +#ifdef CONFIG_LINUX > + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); > + int uffd_fd; > + RAMBlock *bs; > + bool ret = false; > + > + /* Open UFFD file descriptor */ > + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); > + if (uffd_fd < 0) { > + return false; > + } > + > + RCU_READ_LOCK_GUARD(); > + > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > + uint64_t uffd_ioctls; > + > + /* Nothing to do with read-only and MMIO-writable regions */ > + if (bs->mr->readonly || bs->mr->rom_device) { > + continue; > + } > + /* Try to register block memory via UFFD-IO to track writes */ > + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, > + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { > + goto out; > + } > + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { > + goto out; > + } > + } > + ret = true; > + > +out: > + uffd_close_fd(uffd_fd); > + return ret; > +#endif > return false; > } > > +/* > + * ram_write_tracking_start: start UFFD-WP memory tracking > + * > + * Returns 0 for success or negative value in case of error > + */ > +int ram_write_tracking_start(void) > +{ > +#ifdef CONFIG_LINUX > + int uffd_fd; > + RAMState *rs = ram_state; > + RAMBlock *bs; > + > + /* Open UFFD file descriptor */ > + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); > + if (uffd_fd < 0) { > + return uffd_fd; > + } > + rs->uffdio_fd = uffd_fd; > + > + RCU_READ_LOCK_GUARD(); > + > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > + /* Nothing to do with read-only and MMIO-writable regions */ > + if (bs->mr->readonly || bs->mr->rom_device) { > + continue; > + } > + > + /* Register block memory with UFFD to track writes */ > + if (uffd_register_memory(rs->uffdio_fd, bs->host, > + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { > + goto fail; > + } > + /* Apply UFFD write protection to the block memory range */ > + if (uffd_change_protection(rs->uffdio_fd, bs->host, > + bs->max_length, true, false)) { > + goto fail; > + } > + bs->flags |= RAM_UF_WRITEPROTECT; > + memory_region_ref(bs->mr); > + > + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, > + bs->host, bs->max_length); > + } > + > + return 0; > + > +fail: > + error_report("ram_write_tracking_start() failed: restoring initial memory state"); > + > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { > + continue; > + } > + /* > + * In case some memory block failed to be write-protected > + * remove protection and unregister all succeeded RAM blocks > + */ > + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); > + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); > + /* Cleanup flags and remove reference */ > + bs->flags &= ~RAM_UF_WRITEPROTECT; > + memory_region_unref(bs->mr); > + } > + > + uffd_close_fd(uffd_fd); > +#endif /* CONFIG_LINUX */ > + rs->uffdio_fd = -1; > + return -1; > +} > + > +/** > + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection > + */ > +void ram_write_tracking_stop(void) > +{ > +#ifdef CONFIG_LINUX > + RAMState *rs = ram_state; > + RAMBlock *bs; > + > + RCU_READ_LOCK_GUARD(); > + > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { > + continue; > + } > + /* Remove protection and unregister all affected RAM blocks */ > + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); > + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); > + > + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, > + bs->host, bs->max_length); > + > + /* Cleanup flags and remove reference */ > + bs->flags &= ~RAM_UF_WRITEPROTECT; > + memory_region_unref(bs->mr); > + } > + > + /* Finally close UFFD file descriptor */ > + uffd_close_fd(rs->uffdio_fd); > + rs->uffdio_fd = -1; > +#endif /* CONFIG_LINUX */ > +} > + > static SaveVMHandlers savevm_ram_handlers = { > .save_setup = ram_save_setup, > .save_live_iterate = ram_save_iterate, > diff --git a/migration/ram.h b/migration/ram.h > index 1a9ff90304..c25540cb93 100644 > --- a/migration/ram.h > +++ b/migration/ram.h > @@ -82,5 +82,7 @@ void colo_incoming_start_dirty_log(void); > /* Background snapshot */ > bool ram_write_tracking_available(void); > bool ram_write_tracking_compatible(void); > +int ram_write_tracking_start(void); > +void ram_write_tracking_stop(void); > > #endif > diff --git a/migration/trace-events b/migration/trace-events > index 75de5004ac..668c562fed 100644 > --- a/migration/trace-events > +++ b/migration/trace-events > @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" > save_xbzrle_page_overflow(void) "" > ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" > ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 > +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" > +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" > > # multifd.c > multifd_new_send_channel_async(uint8_t id) "channel %d" > -- > 2.25.1 >
On 19.01.2021 20:49, Dr. David Alan Gilbert wrote: > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >> In this particular implementation the same single migration >> thread is responsible for both normal linear dirty page >> migration and procesing UFFD page fault events. >> >> Processing write faults includes reading UFFD file descriptor, >> finding respective RAM block and saving faulting page to >> the migration stream. After page has been saved, write protection >> can be removed. Since asynchronous version of qemu_put_buffer() >> is expected to be used to save pages, we also have to flush >> migraion stream prior to un-protecting saved memory range. >> >> Write protection is being removed for any previously protected >> memory chunk that has hit the migration stream. That's valid >> for pages from linear page scan along with write fault pages. >> >> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> >> Acked-by: Peter Xu <peterx@redhat.com> > I think this is OK, so: > > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > > I think I'd have preferred to have kept the #ifdef LINUX's out of there, > and I suspect using the opaque's for hte pre/post hook is overly > complex; but other wise OK. > > Dave Mm, I think it's impossible to completely move #ifdef LINUX out of there, but I suspect all #ifdef LINUX code can be moved to a single place with stubs for the #else case.. Suppose it would be better then now. For pre/post hooks - not too complex, but for single #ifdef section it really looks better to move this stuff to ram_save_host_page(). Andrey >> --- >> include/exec/memory.h | 7 ++ >> migration/ram.c | 269 +++++++++++++++++++++++++++++++++++++++-- >> migration/ram.h | 2 + >> migration/trace-events | 2 + >> 4 files changed, 272 insertions(+), 8 deletions(-) >> >> diff --git a/include/exec/memory.h b/include/exec/memory.h >> index b76b1256bf..1aa1c6a3f4 100644 >> --- a/include/exec/memory.h >> +++ b/include/exec/memory.h >> @@ -150,6 +150,13 @@ typedef struct IOMMUTLBEvent { >> #define RAM_PMEM (1 << 5) >> >> >> +/* >> + * UFFDIO_WRITEPROTECT is used on this RAMBlock to >> + * support 'write-tracking' migration type. >> + * Implies ram_state->ram_wt_enabled. >> + */ >> +#define RAM_UF_WRITEPROTECT (1 << 6) >> + >> static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, >> IOMMUNotifierFlag flags, >> hwaddr start, hwaddr end, >> diff --git a/migration/ram.c b/migration/ram.c >> index ae8de17153..5707382db1 100644 >> --- a/migration/ram.c >> +++ b/migration/ram.c >> @@ -56,6 +56,11 @@ >> #include "savevm.h" >> #include "qemu/iov.h" >> #include "multifd.h" >> +#include "sysemu/runstate.h" >> + >> +#ifdef CONFIG_LINUX >> +#include "qemu/userfaultfd.h" >> +#endif >> >> /***********************************************************/ >> /* ram save/restore */ >> @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { >> struct RAMState { >> /* QEMUFile used for this migration */ >> QEMUFile *f; >> + /* UFFD file descriptor, used in 'write-tracking' migration */ >> + int uffdio_fd; >> /* Last block that we have visited searching for dirty pages */ >> RAMBlock *last_seen_block; >> /* Last block from where we have sent data */ >> @@ -1434,6 +1441,40 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) >> return block; >> } >> >> +#ifdef CONFIG_LINUX >> +/** >> + * poll_fault_page: try to get next UFFD write fault page and, if pending fault >> + * is found, return RAM block pointer and page offset >> + * >> + * Returns pointer to the RAMBlock containing faulting page, >> + * NULL if no write faults are pending >> + * >> + * @rs: current RAM state >> + * @offset: page offset from the beginning of the block >> + */ >> +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) >> +{ >> + struct uffd_msg uffd_msg; >> + void *page_address; >> + RAMBlock *bs; >> + int res; >> + >> + if (!migrate_background_snapshot()) { >> + return NULL; >> + } >> + >> + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); >> + if (res <= 0) { >> + return NULL; >> + } >> + >> + page_address = (void *) uffd_msg.arg.pagefault.address; >> + bs = qemu_ram_block_from_host(page_address, false, offset); >> + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); >> + return bs; >> +} >> +#endif /* CONFIG_LINUX */ >> + >> /** >> * get_queued_page: unqueue a page from the postcopy requests >> * >> @@ -1473,6 +1514,16 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) >> >> } while (block && !dirty); >> >> +#ifdef CONFIG_LINUX >> + if (!block) { >> + /* >> + * Poll write faults too if background snapshot is enabled; that's >> + * when we have vcpus got blocked by the write protected pages. >> + */ >> + block = poll_fault_page(rs, &offset); >> + } >> +#endif /* CONFIG_LINUX */ >> + >> if (block) { >> /* >> * As soon as we start servicing pages out of order, then we have >> @@ -1746,6 +1797,53 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, >> return pages; >> } >> >> +/** >> + * ram_save_host_page_pre: ram_save_host_page() pre-notifier >> + * >> + * @rs: current RAM state >> + * @pss: page-search-status structure >> + * @opaque: pointer to receive opaque context value >> + */ >> +static inline >> +void ram_save_host_page_pre(RAMState *rs, PageSearchStatus *pss, void **opaque) >> +{ >> + *(unsigned long *) opaque = pss->page; >> +} >> + >> +/** >> + * ram_save_host_page_post: ram_save_host_page() post-notifier >> + * >> + * @rs: current RAM state >> + * @pss: page-search-status structure >> + * @opaque: opaque context value >> + * @res_override: pointer to the return value of ram_save_host_page(), >> + * overwritten in case of an error >> + */ >> +static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, >> + void *opaque, int *res_override) >> +{ >> +#ifdef CONFIG_LINUX >> + /* Check if page is from UFFD-managed region. */ >> + if (pss->block->flags & RAM_UF_WRITEPROTECT) { >> + unsigned long page_from = (unsigned long) opaque; >> + >> + void *page_address = pss->block->host + (page_from << TARGET_PAGE_BITS); >> + uint64_t run_length = (pss->page - page_from + 1) << TARGET_PAGE_BITS; >> + int res; >> + >> + /* Flush async buffers before un-protect. */ >> + qemu_fflush(rs->f); >> + /* Un-protect memory range. */ >> + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, >> + false, false); >> + /* We don't want to override existing error from ram_save_host_page(). */ >> + if (res < 0 && *res_override >= 0) { >> + *res_override = res; >> + } >> + } >> +#endif /* CONFIG_LINUX */ >> +} >> + >> /** >> * ram_find_and_save_block: finds a dirty page and sends it to f >> * >> @@ -1790,7 +1888,11 @@ static int ram_find_and_save_block(RAMState *rs, bool last_stage) >> } >> >> if (found) { >> + void *opaque; >> + >> + ram_save_host_page_pre(rs, &pss, &opaque); >> pages = ram_save_host_page(rs, &pss, last_stage); >> + ram_save_host_page_post(rs, &pss, opaque, &pages); >> } >> } while (!pages && again); >> >> @@ -1880,10 +1982,13 @@ static void ram_save_cleanup(void *opaque) >> RAMState **rsp = opaque; >> RAMBlock *block; >> >> - /* caller have hold iothread lock or is in a bh, so there is >> - * no writing race against the migration bitmap >> - */ >> - memory_global_dirty_log_stop(); >> + /* We don't use dirty log with background snapshots */ >> + if (!migrate_background_snapshot()) { >> + /* caller have hold iothread lock or is in a bh, so there is >> + * no writing race against the migration bitmap >> + */ >> + memory_global_dirty_log_stop(); >> + } >> >> RAMBLOCK_FOREACH_NOT_IGNORED(block) { >> g_free(block->clear_bmap); >> @@ -2343,8 +2448,11 @@ static void ram_init_bitmaps(RAMState *rs) >> >> WITH_RCU_READ_LOCK_GUARD() { >> ram_list_init_bitmaps(); >> - memory_global_dirty_log_start(); >> - migration_bitmap_sync_precopy(rs); >> + /* We don't use dirty log with background snapshots */ >> + if (!migrate_background_snapshot()) { >> + memory_global_dirty_log_start(); >> + migration_bitmap_sync_precopy(rs); >> + } >> } >> qemu_mutex_unlock_ramlist(); >> qemu_mutex_unlock_iothread(); >> @@ -3794,7 +3902,14 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) >> */ >> bool ram_write_tracking_available(void) >> { >> - /* TODO: implement */ >> +#ifdef CONFIG_LINUX >> + uint64_t uffd_features; >> + int res; >> + >> + res = uffd_query_features(&uffd_features); >> + return (res == 0 && >> + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); >> +#endif >> return false; >> } >> >> @@ -3805,10 +3920,148 @@ bool ram_write_tracking_available(void) >> */ >> bool ram_write_tracking_compatible(void) >> { >> - /* TODO: implement */ >> +#ifdef CONFIG_LINUX >> + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); >> + int uffd_fd; >> + RAMBlock *bs; >> + bool ret = false; >> + >> + /* Open UFFD file descriptor */ >> + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); >> + if (uffd_fd < 0) { >> + return false; >> + } >> + >> + RCU_READ_LOCK_GUARD(); >> + >> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >> + uint64_t uffd_ioctls; >> + >> + /* Nothing to do with read-only and MMIO-writable regions */ >> + if (bs->mr->readonly || bs->mr->rom_device) { >> + continue; >> + } >> + /* Try to register block memory via UFFD-IO to track writes */ >> + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, >> + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { >> + goto out; >> + } >> + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { >> + goto out; >> + } >> + } >> + ret = true; >> + >> +out: >> + uffd_close_fd(uffd_fd); >> + return ret; >> +#endif >> return false; >> } >> >> +/* >> + * ram_write_tracking_start: start UFFD-WP memory tracking >> + * >> + * Returns 0 for success or negative value in case of error >> + */ >> +int ram_write_tracking_start(void) >> +{ >> +#ifdef CONFIG_LINUX >> + int uffd_fd; >> + RAMState *rs = ram_state; >> + RAMBlock *bs; >> + >> + /* Open UFFD file descriptor */ >> + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); >> + if (uffd_fd < 0) { >> + return uffd_fd; >> + } >> + rs->uffdio_fd = uffd_fd; >> + >> + RCU_READ_LOCK_GUARD(); >> + >> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >> + /* Nothing to do with read-only and MMIO-writable regions */ >> + if (bs->mr->readonly || bs->mr->rom_device) { >> + continue; >> + } >> + >> + /* Register block memory with UFFD to track writes */ >> + if (uffd_register_memory(rs->uffdio_fd, bs->host, >> + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { >> + goto fail; >> + } >> + /* Apply UFFD write protection to the block memory range */ >> + if (uffd_change_protection(rs->uffdio_fd, bs->host, >> + bs->max_length, true, false)) { >> + goto fail; >> + } >> + bs->flags |= RAM_UF_WRITEPROTECT; >> + memory_region_ref(bs->mr); >> + >> + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, >> + bs->host, bs->max_length); >> + } >> + >> + return 0; >> + >> +fail: >> + error_report("ram_write_tracking_start() failed: restoring initial memory state"); >> + >> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >> + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { >> + continue; >> + } >> + /* >> + * In case some memory block failed to be write-protected >> + * remove protection and unregister all succeeded RAM blocks >> + */ >> + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); >> + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); >> + /* Cleanup flags and remove reference */ >> + bs->flags &= ~RAM_UF_WRITEPROTECT; >> + memory_region_unref(bs->mr); >> + } >> + >> + uffd_close_fd(uffd_fd); >> +#endif /* CONFIG_LINUX */ >> + rs->uffdio_fd = -1; >> + return -1; >> +} >> + >> +/** >> + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection >> + */ >> +void ram_write_tracking_stop(void) >> +{ >> +#ifdef CONFIG_LINUX >> + RAMState *rs = ram_state; >> + RAMBlock *bs; >> + >> + RCU_READ_LOCK_GUARD(); >> + >> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >> + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { >> + continue; >> + } >> + /* Remove protection and unregister all affected RAM blocks */ >> + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); >> + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); >> + >> + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, >> + bs->host, bs->max_length); >> + >> + /* Cleanup flags and remove reference */ >> + bs->flags &= ~RAM_UF_WRITEPROTECT; >> + memory_region_unref(bs->mr); >> + } >> + >> + /* Finally close UFFD file descriptor */ >> + uffd_close_fd(rs->uffdio_fd); >> + rs->uffdio_fd = -1; >> +#endif /* CONFIG_LINUX */ >> +} >> + >> static SaveVMHandlers savevm_ram_handlers = { >> .save_setup = ram_save_setup, >> .save_live_iterate = ram_save_iterate, >> diff --git a/migration/ram.h b/migration/ram.h >> index 1a9ff90304..c25540cb93 100644 >> --- a/migration/ram.h >> +++ b/migration/ram.h >> @@ -82,5 +82,7 @@ void colo_incoming_start_dirty_log(void); >> /* Background snapshot */ >> bool ram_write_tracking_available(void); >> bool ram_write_tracking_compatible(void); >> +int ram_write_tracking_start(void); >> +void ram_write_tracking_stop(void); >> >> #endif >> diff --git a/migration/trace-events b/migration/trace-events >> index 75de5004ac..668c562fed 100644 >> --- a/migration/trace-events >> +++ b/migration/trace-events >> @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" >> save_xbzrle_page_overflow(void) "" >> ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" >> ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 >> +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" >> +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" >> >> # multifd.c >> multifd_new_send_channel_async(uint8_t id) "channel %d" >> -- >> 2.25.1 >>
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > On 19.01.2021 20:49, Dr. David Alan Gilbert wrote: > > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > > > In this particular implementation the same single migration > > > thread is responsible for both normal linear dirty page > > > migration and procesing UFFD page fault events. > > > > > > Processing write faults includes reading UFFD file descriptor, > > > finding respective RAM block and saving faulting page to > > > the migration stream. After page has been saved, write protection > > > can be removed. Since asynchronous version of qemu_put_buffer() > > > is expected to be used to save pages, we also have to flush > > > migraion stream prior to un-protecting saved memory range. > > > > > > Write protection is being removed for any previously protected > > > memory chunk that has hit the migration stream. That's valid > > > for pages from linear page scan along with write fault pages. > > > > > > Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> > > > Acked-by: Peter Xu <peterx@redhat.com> > > I think this is OK, so: > > > > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > > > > I think I'd have preferred to have kept the #ifdef LINUX's out of there, > > and I suspect using the opaque's for hte pre/post hook is overly > > complex; but other wise OK. > > > > Dave > > Mm, I think it's impossible to completely move #ifdef LINUX out of there, > but I suspect all #ifdef LINUX code can be moved to a single place with > stubs for the #else case.. Suppose it would be better then now. > For pre/post hooks - not too complex, but for single #ifdef section it really > looks better to move this stuff to ram_save_host_page(). An add on patch to consolidate the ifdef'ing to one place would be interesting; I'll look at taking this series as is (other than the thread name fix up etc) when I get around to my next pull (although I need to review some other peoples patches first!) Dave > Andrey > > > > --- > > > include/exec/memory.h | 7 ++ > > > migration/ram.c | 269 +++++++++++++++++++++++++++++++++++++++-- > > > migration/ram.h | 2 + > > > migration/trace-events | 2 + > > > 4 files changed, 272 insertions(+), 8 deletions(-) > > > > > > diff --git a/include/exec/memory.h b/include/exec/memory.h > > > index b76b1256bf..1aa1c6a3f4 100644 > > > --- a/include/exec/memory.h > > > +++ b/include/exec/memory.h > > > @@ -150,6 +150,13 @@ typedef struct IOMMUTLBEvent { > > > #define RAM_PMEM (1 << 5) > > > +/* > > > + * UFFDIO_WRITEPROTECT is used on this RAMBlock to > > > + * support 'write-tracking' migration type. > > > + * Implies ram_state->ram_wt_enabled. > > > + */ > > > +#define RAM_UF_WRITEPROTECT (1 << 6) > > > + > > > static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, > > > IOMMUNotifierFlag flags, > > > hwaddr start, hwaddr end, > > > diff --git a/migration/ram.c b/migration/ram.c > > > index ae8de17153..5707382db1 100644 > > > --- a/migration/ram.c > > > +++ b/migration/ram.c > > > @@ -56,6 +56,11 @@ > > > #include "savevm.h" > > > #include "qemu/iov.h" > > > #include "multifd.h" > > > +#include "sysemu/runstate.h" > > > + > > > +#ifdef CONFIG_LINUX > > > +#include "qemu/userfaultfd.h" > > > +#endif > > > /***********************************************************/ > > > /* ram save/restore */ > > > @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { > > > struct RAMState { > > > /* QEMUFile used for this migration */ > > > QEMUFile *f; > > > + /* UFFD file descriptor, used in 'write-tracking' migration */ > > > + int uffdio_fd; > > > /* Last block that we have visited searching for dirty pages */ > > > RAMBlock *last_seen_block; > > > /* Last block from where we have sent data */ > > > @@ -1434,6 +1441,40 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) > > > return block; > > > } > > > +#ifdef CONFIG_LINUX > > > +/** > > > + * poll_fault_page: try to get next UFFD write fault page and, if pending fault > > > + * is found, return RAM block pointer and page offset > > > + * > > > + * Returns pointer to the RAMBlock containing faulting page, > > > + * NULL if no write faults are pending > > > + * > > > + * @rs: current RAM state > > > + * @offset: page offset from the beginning of the block > > > + */ > > > +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) > > > +{ > > > + struct uffd_msg uffd_msg; > > > + void *page_address; > > > + RAMBlock *bs; > > > + int res; > > > + > > > + if (!migrate_background_snapshot()) { > > > + return NULL; > > > + } > > > + > > > + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); > > > + if (res <= 0) { > > > + return NULL; > > > + } > > > + > > > + page_address = (void *) uffd_msg.arg.pagefault.address; > > > + bs = qemu_ram_block_from_host(page_address, false, offset); > > > + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); > > > + return bs; > > > +} > > > +#endif /* CONFIG_LINUX */ > > > + > > > /** > > > * get_queued_page: unqueue a page from the postcopy requests > > > * > > > @@ -1473,6 +1514,16 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) > > > } while (block && !dirty); > > > +#ifdef CONFIG_LINUX > > > + if (!block) { > > > + /* > > > + * Poll write faults too if background snapshot is enabled; that's > > > + * when we have vcpus got blocked by the write protected pages. > > > + */ > > > + block = poll_fault_page(rs, &offset); > > > + } > > > +#endif /* CONFIG_LINUX */ > > > + > > > if (block) { > > > /* > > > * As soon as we start servicing pages out of order, then we have > > > @@ -1746,6 +1797,53 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, > > > return pages; > > > } > > > +/** > > > + * ram_save_host_page_pre: ram_save_host_page() pre-notifier > > > + * > > > + * @rs: current RAM state > > > + * @pss: page-search-status structure > > > + * @opaque: pointer to receive opaque context value > > > + */ > > > +static inline > > > +void ram_save_host_page_pre(RAMState *rs, PageSearchStatus *pss, void **opaque) > > > +{ > > > + *(unsigned long *) opaque = pss->page; > > > +} > > > + > > > +/** > > > + * ram_save_host_page_post: ram_save_host_page() post-notifier > > > + * > > > + * @rs: current RAM state > > > + * @pss: page-search-status structure > > > + * @opaque: opaque context value > > > + * @res_override: pointer to the return value of ram_save_host_page(), > > > + * overwritten in case of an error > > > + */ > > > +static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, > > > + void *opaque, int *res_override) > > > +{ > > > +#ifdef CONFIG_LINUX > > > + /* Check if page is from UFFD-managed region. */ > > > + if (pss->block->flags & RAM_UF_WRITEPROTECT) { > > > + unsigned long page_from = (unsigned long) opaque; > > > + > > > + void *page_address = pss->block->host + (page_from << TARGET_PAGE_BITS); > > > + uint64_t run_length = (pss->page - page_from + 1) << TARGET_PAGE_BITS; > > > + int res; > > > + > > > + /* Flush async buffers before un-protect. */ > > > + qemu_fflush(rs->f); > > > + /* Un-protect memory range. */ > > > + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, > > > + false, false); > > > + /* We don't want to override existing error from ram_save_host_page(). */ > > > + if (res < 0 && *res_override >= 0) { > > > + *res_override = res; > > > + } > > > + } > > > +#endif /* CONFIG_LINUX */ > > > +} > > > + > > > /** > > > * ram_find_and_save_block: finds a dirty page and sends it to f > > > * > > > @@ -1790,7 +1888,11 @@ static int ram_find_and_save_block(RAMState *rs, bool last_stage) > > > } > > > if (found) { > > > + void *opaque; > > > + > > > + ram_save_host_page_pre(rs, &pss, &opaque); > > > pages = ram_save_host_page(rs, &pss, last_stage); > > > + ram_save_host_page_post(rs, &pss, opaque, &pages); > > > } > > > } while (!pages && again); > > > @@ -1880,10 +1982,13 @@ static void ram_save_cleanup(void *opaque) > > > RAMState **rsp = opaque; > > > RAMBlock *block; > > > - /* caller have hold iothread lock or is in a bh, so there is > > > - * no writing race against the migration bitmap > > > - */ > > > - memory_global_dirty_log_stop(); > > > + /* We don't use dirty log with background snapshots */ > > > + if (!migrate_background_snapshot()) { > > > + /* caller have hold iothread lock or is in a bh, so there is > > > + * no writing race against the migration bitmap > > > + */ > > > + memory_global_dirty_log_stop(); > > > + } > > > RAMBLOCK_FOREACH_NOT_IGNORED(block) { > > > g_free(block->clear_bmap); > > > @@ -2343,8 +2448,11 @@ static void ram_init_bitmaps(RAMState *rs) > > > WITH_RCU_READ_LOCK_GUARD() { > > > ram_list_init_bitmaps(); > > > - memory_global_dirty_log_start(); > > > - migration_bitmap_sync_precopy(rs); > > > + /* We don't use dirty log with background snapshots */ > > > + if (!migrate_background_snapshot()) { > > > + memory_global_dirty_log_start(); > > > + migration_bitmap_sync_precopy(rs); > > > + } > > > } > > > qemu_mutex_unlock_ramlist(); > > > qemu_mutex_unlock_iothread(); > > > @@ -3794,7 +3902,14 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) > > > */ > > > bool ram_write_tracking_available(void) > > > { > > > - /* TODO: implement */ > > > +#ifdef CONFIG_LINUX > > > + uint64_t uffd_features; > > > + int res; > > > + > > > + res = uffd_query_features(&uffd_features); > > > + return (res == 0 && > > > + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); > > > +#endif > > > return false; > > > } > > > @@ -3805,10 +3920,148 @@ bool ram_write_tracking_available(void) > > > */ > > > bool ram_write_tracking_compatible(void) > > > { > > > - /* TODO: implement */ > > > +#ifdef CONFIG_LINUX > > > + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); > > > + int uffd_fd; > > > + RAMBlock *bs; > > > + bool ret = false; > > > + > > > + /* Open UFFD file descriptor */ > > > + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); > > > + if (uffd_fd < 0) { > > > + return false; > > > + } > > > + > > > + RCU_READ_LOCK_GUARD(); > > > + > > > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > > > + uint64_t uffd_ioctls; > > > + > > > + /* Nothing to do with read-only and MMIO-writable regions */ > > > + if (bs->mr->readonly || bs->mr->rom_device) { > > > + continue; > > > + } > > > + /* Try to register block memory via UFFD-IO to track writes */ > > > + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, > > > + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { > > > + goto out; > > > + } > > > + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { > > > + goto out; > > > + } > > > + } > > > + ret = true; > > > + > > > +out: > > > + uffd_close_fd(uffd_fd); > > > + return ret; > > > +#endif > > > return false; > > > } > > > +/* > > > + * ram_write_tracking_start: start UFFD-WP memory tracking > > > + * > > > + * Returns 0 for success or negative value in case of error > > > + */ > > > +int ram_write_tracking_start(void) > > > +{ > > > +#ifdef CONFIG_LINUX > > > + int uffd_fd; > > > + RAMState *rs = ram_state; > > > + RAMBlock *bs; > > > + > > > + /* Open UFFD file descriptor */ > > > + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); > > > + if (uffd_fd < 0) { > > > + return uffd_fd; > > > + } > > > + rs->uffdio_fd = uffd_fd; > > > + > > > + RCU_READ_LOCK_GUARD(); > > > + > > > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > > > + /* Nothing to do with read-only and MMIO-writable regions */ > > > + if (bs->mr->readonly || bs->mr->rom_device) { > > > + continue; > > > + } > > > + > > > + /* Register block memory with UFFD to track writes */ > > > + if (uffd_register_memory(rs->uffdio_fd, bs->host, > > > + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { > > > + goto fail; > > > + } > > > + /* Apply UFFD write protection to the block memory range */ > > > + if (uffd_change_protection(rs->uffdio_fd, bs->host, > > > + bs->max_length, true, false)) { > > > + goto fail; > > > + } > > > + bs->flags |= RAM_UF_WRITEPROTECT; > > > + memory_region_ref(bs->mr); > > > + > > > + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, > > > + bs->host, bs->max_length); > > > + } > > > + > > > + return 0; > > > + > > > +fail: > > > + error_report("ram_write_tracking_start() failed: restoring initial memory state"); > > > + > > > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > > > + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { > > > + continue; > > > + } > > > + /* > > > + * In case some memory block failed to be write-protected > > > + * remove protection and unregister all succeeded RAM blocks > > > + */ > > > + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); > > > + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); > > > + /* Cleanup flags and remove reference */ > > > + bs->flags &= ~RAM_UF_WRITEPROTECT; > > > + memory_region_unref(bs->mr); > > > + } > > > + > > > + uffd_close_fd(uffd_fd); > > > +#endif /* CONFIG_LINUX */ > > > + rs->uffdio_fd = -1; > > > + return -1; > > > +} > > > + > > > +/** > > > + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection > > > + */ > > > +void ram_write_tracking_stop(void) > > > +{ > > > +#ifdef CONFIG_LINUX > > > + RAMState *rs = ram_state; > > > + RAMBlock *bs; > > > + > > > + RCU_READ_LOCK_GUARD(); > > > + > > > + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { > > > + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { > > > + continue; > > > + } > > > + /* Remove protection and unregister all affected RAM blocks */ > > > + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); > > > + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); > > > + > > > + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, > > > + bs->host, bs->max_length); > > > + > > > + /* Cleanup flags and remove reference */ > > > + bs->flags &= ~RAM_UF_WRITEPROTECT; > > > + memory_region_unref(bs->mr); > > > + } > > > + > > > + /* Finally close UFFD file descriptor */ > > > + uffd_close_fd(rs->uffdio_fd); > > > + rs->uffdio_fd = -1; > > > +#endif /* CONFIG_LINUX */ > > > +} > > > + > > > static SaveVMHandlers savevm_ram_handlers = { > > > .save_setup = ram_save_setup, > > > .save_live_iterate = ram_save_iterate, > > > diff --git a/migration/ram.h b/migration/ram.h > > > index 1a9ff90304..c25540cb93 100644 > > > --- a/migration/ram.h > > > +++ b/migration/ram.h > > > @@ -82,5 +82,7 @@ void colo_incoming_start_dirty_log(void); > > > /* Background snapshot */ > > > bool ram_write_tracking_available(void); > > > bool ram_write_tracking_compatible(void); > > > +int ram_write_tracking_start(void); > > > +void ram_write_tracking_stop(void); > > > #endif > > > diff --git a/migration/trace-events b/migration/trace-events > > > index 75de5004ac..668c562fed 100644 > > > --- a/migration/trace-events > > > +++ b/migration/trace-events > > > @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" > > > save_xbzrle_page_overflow(void) "" > > > ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" > > > ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 > > > +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" > > > +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" > > > # multifd.c > > > multifd_new_send_channel_async(uint8_t id) "channel %d" > > > -- > > > 2.25.1 > > > > > -- > Andrey Gruzdev, Principal Engineer > Virtuozzo GmbH +7-903-247-6397 > virtuzzo.com >
On 21.01.2021 13:09, Dr. David Alan Gilbert wrote: > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >> On 19.01.2021 20:49, Dr. David Alan Gilbert wrote: >>> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >>>> In this particular implementation the same single migration >>>> thread is responsible for both normal linear dirty page >>>> migration and procesing UFFD page fault events. >>>> >>>> Processing write faults includes reading UFFD file descriptor, >>>> finding respective RAM block and saving faulting page to >>>> the migration stream. After page has been saved, write protection >>>> can be removed. Since asynchronous version of qemu_put_buffer() >>>> is expected to be used to save pages, we also have to flush >>>> migraion stream prior to un-protecting saved memory range. >>>> >>>> Write protection is being removed for any previously protected >>>> memory chunk that has hit the migration stream. That's valid >>>> for pages from linear page scan along with write fault pages. >>>> >>>> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> >>>> Acked-by: Peter Xu <peterx@redhat.com> >>> I think this is OK, so: >>> >>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> >>> >>> I think I'd have preferred to have kept the #ifdef LINUX's out of there, >>> and I suspect using the opaque's for hte pre/post hook is overly >>> complex; but other wise OK. >>> >>> Dave >> Mm, I think it's impossible to completely move #ifdef LINUX out of there, >> but I suspect all #ifdef LINUX code can be moved to a single place with >> stubs for the #else case.. Suppose it would be better then now. >> For pre/post hooks - not too complex, but for single #ifdef section it really >> looks better to move this stuff to ram_save_host_page(). > An add on patch to consolidate the ifdef'ing to one place would be > interesting; I'll look at taking this series as is (other than the > thread name fix up etc) when I get around to my next pull (although I > need to review some other peoples patches first!) > > Dave I'll make it and prepare next series, hope to finish in 1-2 days. Thanks! Andrey >> Andrey >> >>>> --- >>>> include/exec/memory.h | 7 ++ >>>> migration/ram.c | 269 +++++++++++++++++++++++++++++++++++++++-- >>>> migration/ram.h | 2 + >>>> migration/trace-events | 2 + >>>> 4 files changed, 272 insertions(+), 8 deletions(-) >>>> >>>> diff --git a/include/exec/memory.h b/include/exec/memory.h >>>> index b76b1256bf..1aa1c6a3f4 100644 >>>> --- a/include/exec/memory.h >>>> +++ b/include/exec/memory.h >>>> @@ -150,6 +150,13 @@ typedef struct IOMMUTLBEvent { >>>> #define RAM_PMEM (1 << 5) >>>> +/* >>>> + * UFFDIO_WRITEPROTECT is used on this RAMBlock to >>>> + * support 'write-tracking' migration type. >>>> + * Implies ram_state->ram_wt_enabled. >>>> + */ >>>> +#define RAM_UF_WRITEPROTECT (1 << 6) >>>> + >>>> static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, >>>> IOMMUNotifierFlag flags, >>>> hwaddr start, hwaddr end, >>>> diff --git a/migration/ram.c b/migration/ram.c >>>> index ae8de17153..5707382db1 100644 >>>> --- a/migration/ram.c >>>> +++ b/migration/ram.c >>>> @@ -56,6 +56,11 @@ >>>> #include "savevm.h" >>>> #include "qemu/iov.h" >>>> #include "multifd.h" >>>> +#include "sysemu/runstate.h" >>>> + >>>> +#ifdef CONFIG_LINUX >>>> +#include "qemu/userfaultfd.h" >>>> +#endif >>>> /***********************************************************/ >>>> /* ram save/restore */ >>>> @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { >>>> struct RAMState { >>>> /* QEMUFile used for this migration */ >>>> QEMUFile *f; >>>> + /* UFFD file descriptor, used in 'write-tracking' migration */ >>>> + int uffdio_fd; >>>> /* Last block that we have visited searching for dirty pages */ >>>> RAMBlock *last_seen_block; >>>> /* Last block from where we have sent data */ >>>> @@ -1434,6 +1441,40 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) >>>> return block; >>>> } >>>> +#ifdef CONFIG_LINUX >>>> +/** >>>> + * poll_fault_page: try to get next UFFD write fault page and, if pending fault >>>> + * is found, return RAM block pointer and page offset >>>> + * >>>> + * Returns pointer to the RAMBlock containing faulting page, >>>> + * NULL if no write faults are pending >>>> + * >>>> + * @rs: current RAM state >>>> + * @offset: page offset from the beginning of the block >>>> + */ >>>> +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) >>>> +{ >>>> + struct uffd_msg uffd_msg; >>>> + void *page_address; >>>> + RAMBlock *bs; >>>> + int res; >>>> + >>>> + if (!migrate_background_snapshot()) { >>>> + return NULL; >>>> + } >>>> + >>>> + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); >>>> + if (res <= 0) { >>>> + return NULL; >>>> + } >>>> + >>>> + page_address = (void *) uffd_msg.arg.pagefault.address; >>>> + bs = qemu_ram_block_from_host(page_address, false, offset); >>>> + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); >>>> + return bs; >>>> +} >>>> +#endif /* CONFIG_LINUX */ >>>> + >>>> /** >>>> * get_queued_page: unqueue a page from the postcopy requests >>>> * >>>> @@ -1473,6 +1514,16 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) >>>> } while (block && !dirty); >>>> +#ifdef CONFIG_LINUX >>>> + if (!block) { >>>> + /* >>>> + * Poll write faults too if background snapshot is enabled; that's >>>> + * when we have vcpus got blocked by the write protected pages. >>>> + */ >>>> + block = poll_fault_page(rs, &offset); >>>> + } >>>> +#endif /* CONFIG_LINUX */ >>>> + >>>> if (block) { >>>> /* >>>> * As soon as we start servicing pages out of order, then we have >>>> @@ -1746,6 +1797,53 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, >>>> return pages; >>>> } >>>> +/** >>>> + * ram_save_host_page_pre: ram_save_host_page() pre-notifier >>>> + * >>>> + * @rs: current RAM state >>>> + * @pss: page-search-status structure >>>> + * @opaque: pointer to receive opaque context value >>>> + */ >>>> +static inline >>>> +void ram_save_host_page_pre(RAMState *rs, PageSearchStatus *pss, void **opaque) >>>> +{ >>>> + *(unsigned long *) opaque = pss->page; >>>> +} >>>> + >>>> +/** >>>> + * ram_save_host_page_post: ram_save_host_page() post-notifier >>>> + * >>>> + * @rs: current RAM state >>>> + * @pss: page-search-status structure >>>> + * @opaque: opaque context value >>>> + * @res_override: pointer to the return value of ram_save_host_page(), >>>> + * overwritten in case of an error >>>> + */ >>>> +static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, >>>> + void *opaque, int *res_override) >>>> +{ >>>> +#ifdef CONFIG_LINUX >>>> + /* Check if page is from UFFD-managed region. */ >>>> + if (pss->block->flags & RAM_UF_WRITEPROTECT) { >>>> + unsigned long page_from = (unsigned long) opaque; >>>> + >>>> + void *page_address = pss->block->host + (page_from << TARGET_PAGE_BITS); >>>> + uint64_t run_length = (pss->page - page_from + 1) << TARGET_PAGE_BITS; >>>> + int res; >>>> + >>>> + /* Flush async buffers before un-protect. */ >>>> + qemu_fflush(rs->f); >>>> + /* Un-protect memory range. */ >>>> + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, >>>> + false, false); >>>> + /* We don't want to override existing error from ram_save_host_page(). */ >>>> + if (res < 0 && *res_override >= 0) { >>>> + *res_override = res; >>>> + } >>>> + } >>>> +#endif /* CONFIG_LINUX */ >>>> +} >>>> + >>>> /** >>>> * ram_find_and_save_block: finds a dirty page and sends it to f >>>> * >>>> @@ -1790,7 +1888,11 @@ static int ram_find_and_save_block(RAMState *rs, bool last_stage) >>>> } >>>> if (found) { >>>> + void *opaque; >>>> + >>>> + ram_save_host_page_pre(rs, &pss, &opaque); >>>> pages = ram_save_host_page(rs, &pss, last_stage); >>>> + ram_save_host_page_post(rs, &pss, opaque, &pages); >>>> } >>>> } while (!pages && again); >>>> @@ -1880,10 +1982,13 @@ static void ram_save_cleanup(void *opaque) >>>> RAMState **rsp = opaque; >>>> RAMBlock *block; >>>> - /* caller have hold iothread lock or is in a bh, so there is >>>> - * no writing race against the migration bitmap >>>> - */ >>>> - memory_global_dirty_log_stop(); >>>> + /* We don't use dirty log with background snapshots */ >>>> + if (!migrate_background_snapshot()) { >>>> + /* caller have hold iothread lock or is in a bh, so there is >>>> + * no writing race against the migration bitmap >>>> + */ >>>> + memory_global_dirty_log_stop(); >>>> + } >>>> RAMBLOCK_FOREACH_NOT_IGNORED(block) { >>>> g_free(block->clear_bmap); >>>> @@ -2343,8 +2448,11 @@ static void ram_init_bitmaps(RAMState *rs) >>>> WITH_RCU_READ_LOCK_GUARD() { >>>> ram_list_init_bitmaps(); >>>> - memory_global_dirty_log_start(); >>>> - migration_bitmap_sync_precopy(rs); >>>> + /* We don't use dirty log with background snapshots */ >>>> + if (!migrate_background_snapshot()) { >>>> + memory_global_dirty_log_start(); >>>> + migration_bitmap_sync_precopy(rs); >>>> + } >>>> } >>>> qemu_mutex_unlock_ramlist(); >>>> qemu_mutex_unlock_iothread(); >>>> @@ -3794,7 +3902,14 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) >>>> */ >>>> bool ram_write_tracking_available(void) >>>> { >>>> - /* TODO: implement */ >>>> +#ifdef CONFIG_LINUX >>>> + uint64_t uffd_features; >>>> + int res; >>>> + >>>> + res = uffd_query_features(&uffd_features); >>>> + return (res == 0 && >>>> + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); >>>> +#endif >>>> return false; >>>> } >>>> @@ -3805,10 +3920,148 @@ bool ram_write_tracking_available(void) >>>> */ >>>> bool ram_write_tracking_compatible(void) >>>> { >>>> - /* TODO: implement */ >>>> +#ifdef CONFIG_LINUX >>>> + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); >>>> + int uffd_fd; >>>> + RAMBlock *bs; >>>> + bool ret = false; >>>> + >>>> + /* Open UFFD file descriptor */ >>>> + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); >>>> + if (uffd_fd < 0) { >>>> + return false; >>>> + } >>>> + >>>> + RCU_READ_LOCK_GUARD(); >>>> + >>>> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >>>> + uint64_t uffd_ioctls; >>>> + >>>> + /* Nothing to do with read-only and MMIO-writable regions */ >>>> + if (bs->mr->readonly || bs->mr->rom_device) { >>>> + continue; >>>> + } >>>> + /* Try to register block memory via UFFD-IO to track writes */ >>>> + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, >>>> + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { >>>> + goto out; >>>> + } >>>> + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { >>>> + goto out; >>>> + } >>>> + } >>>> + ret = true; >>>> + >>>> +out: >>>> + uffd_close_fd(uffd_fd); >>>> + return ret; >>>> +#endif >>>> return false; >>>> } >>>> +/* >>>> + * ram_write_tracking_start: start UFFD-WP memory tracking >>>> + * >>>> + * Returns 0 for success or negative value in case of error >>>> + */ >>>> +int ram_write_tracking_start(void) >>>> +{ >>>> +#ifdef CONFIG_LINUX >>>> + int uffd_fd; >>>> + RAMState *rs = ram_state; >>>> + RAMBlock *bs; >>>> + >>>> + /* Open UFFD file descriptor */ >>>> + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); >>>> + if (uffd_fd < 0) { >>>> + return uffd_fd; >>>> + } >>>> + rs->uffdio_fd = uffd_fd; >>>> + >>>> + RCU_READ_LOCK_GUARD(); >>>> + >>>> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >>>> + /* Nothing to do with read-only and MMIO-writable regions */ >>>> + if (bs->mr->readonly || bs->mr->rom_device) { >>>> + continue; >>>> + } >>>> + >>>> + /* Register block memory with UFFD to track writes */ >>>> + if (uffd_register_memory(rs->uffdio_fd, bs->host, >>>> + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { >>>> + goto fail; >>>> + } >>>> + /* Apply UFFD write protection to the block memory range */ >>>> + if (uffd_change_protection(rs->uffdio_fd, bs->host, >>>> + bs->max_length, true, false)) { >>>> + goto fail; >>>> + } >>>> + bs->flags |= RAM_UF_WRITEPROTECT; >>>> + memory_region_ref(bs->mr); >>>> + >>>> + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, >>>> + bs->host, bs->max_length); >>>> + } >>>> + >>>> + return 0; >>>> + >>>> +fail: >>>> + error_report("ram_write_tracking_start() failed: restoring initial memory state"); >>>> + >>>> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >>>> + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { >>>> + continue; >>>> + } >>>> + /* >>>> + * In case some memory block failed to be write-protected >>>> + * remove protection and unregister all succeeded RAM blocks >>>> + */ >>>> + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); >>>> + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); >>>> + /* Cleanup flags and remove reference */ >>>> + bs->flags &= ~RAM_UF_WRITEPROTECT; >>>> + memory_region_unref(bs->mr); >>>> + } >>>> + >>>> + uffd_close_fd(uffd_fd); >>>> +#endif /* CONFIG_LINUX */ >>>> + rs->uffdio_fd = -1; >>>> + return -1; >>>> +} >>>> + >>>> +/** >>>> + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection >>>> + */ >>>> +void ram_write_tracking_stop(void) >>>> +{ >>>> +#ifdef CONFIG_LINUX >>>> + RAMState *rs = ram_state; >>>> + RAMBlock *bs; >>>> + >>>> + RCU_READ_LOCK_GUARD(); >>>> + >>>> + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { >>>> + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { >>>> + continue; >>>> + } >>>> + /* Remove protection and unregister all affected RAM blocks */ >>>> + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); >>>> + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); >>>> + >>>> + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, >>>> + bs->host, bs->max_length); >>>> + >>>> + /* Cleanup flags and remove reference */ >>>> + bs->flags &= ~RAM_UF_WRITEPROTECT; >>>> + memory_region_unref(bs->mr); >>>> + } >>>> + >>>> + /* Finally close UFFD file descriptor */ >>>> + uffd_close_fd(rs->uffdio_fd); >>>> + rs->uffdio_fd = -1; >>>> +#endif /* CONFIG_LINUX */ >>>> +} >>>> + >>>> static SaveVMHandlers savevm_ram_handlers = { >>>> .save_setup = ram_save_setup, >>>> .save_live_iterate = ram_save_iterate, >>>> diff --git a/migration/ram.h b/migration/ram.h >>>> index 1a9ff90304..c25540cb93 100644 >>>> --- a/migration/ram.h >>>> +++ b/migration/ram.h >>>> @@ -82,5 +82,7 @@ void colo_incoming_start_dirty_log(void); >>>> /* Background snapshot */ >>>> bool ram_write_tracking_available(void); >>>> bool ram_write_tracking_compatible(void); >>>> +int ram_write_tracking_start(void); >>>> +void ram_write_tracking_stop(void); >>>> #endif >>>> diff --git a/migration/trace-events b/migration/trace-events >>>> index 75de5004ac..668c562fed 100644 >>>> --- a/migration/trace-events >>>> +++ b/migration/trace-events >>>> @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" >>>> save_xbzrle_page_overflow(void) "" >>>> ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" >>>> ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 >>>> +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" >>>> +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" >>>> # multifd.c >>>> multifd_new_send_channel_async(uint8_t id) "channel %d" >>>> -- >>>> 2.25.1 >>>> >> -- >> Andrey Gruzdev, Principal Engineer >> Virtuozzo GmbH +7-903-247-6397 >> virtuzzo.com >>
diff --git a/include/exec/memory.h b/include/exec/memory.h index b76b1256bf..1aa1c6a3f4 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -150,6 +150,13 @@ typedef struct IOMMUTLBEvent { #define RAM_PMEM (1 << 5) +/* + * UFFDIO_WRITEPROTECT is used on this RAMBlock to + * support 'write-tracking' migration type. + * Implies ram_state->ram_wt_enabled. + */ +#define RAM_UF_WRITEPROTECT (1 << 6) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, diff --git a/migration/ram.c b/migration/ram.c index ae8de17153..5707382db1 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -56,6 +56,11 @@ #include "savevm.h" #include "qemu/iov.h" #include "multifd.h" +#include "sysemu/runstate.h" + +#ifdef CONFIG_LINUX +#include "qemu/userfaultfd.h" +#endif /***********************************************************/ /* ram save/restore */ @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { struct RAMState { /* QEMUFile used for this migration */ QEMUFile *f; + /* UFFD file descriptor, used in 'write-tracking' migration */ + int uffdio_fd; /* Last block that we have visited searching for dirty pages */ RAMBlock *last_seen_block; /* Last block from where we have sent data */ @@ -1434,6 +1441,40 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) return block; } +#ifdef CONFIG_LINUX +/** + * poll_fault_page: try to get next UFFD write fault page and, if pending fault + * is found, return RAM block pointer and page offset + * + * Returns pointer to the RAMBlock containing faulting page, + * NULL if no write faults are pending + * + * @rs: current RAM state + * @offset: page offset from the beginning of the block + */ +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) +{ + struct uffd_msg uffd_msg; + void *page_address; + RAMBlock *bs; + int res; + + if (!migrate_background_snapshot()) { + return NULL; + } + + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); + if (res <= 0) { + return NULL; + } + + page_address = (void *) uffd_msg.arg.pagefault.address; + bs = qemu_ram_block_from_host(page_address, false, offset); + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); + return bs; +} +#endif /* CONFIG_LINUX */ + /** * get_queued_page: unqueue a page from the postcopy requests * @@ -1473,6 +1514,16 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) } while (block && !dirty); +#ifdef CONFIG_LINUX + if (!block) { + /* + * Poll write faults too if background snapshot is enabled; that's + * when we have vcpus got blocked by the write protected pages. + */ + block = poll_fault_page(rs, &offset); + } +#endif /* CONFIG_LINUX */ + if (block) { /* * As soon as we start servicing pages out of order, then we have @@ -1746,6 +1797,53 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, return pages; } +/** + * ram_save_host_page_pre: ram_save_host_page() pre-notifier + * + * @rs: current RAM state + * @pss: page-search-status structure + * @opaque: pointer to receive opaque context value + */ +static inline +void ram_save_host_page_pre(RAMState *rs, PageSearchStatus *pss, void **opaque) +{ + *(unsigned long *) opaque = pss->page; +} + +/** + * ram_save_host_page_post: ram_save_host_page() post-notifier + * + * @rs: current RAM state + * @pss: page-search-status structure + * @opaque: opaque context value + * @res_override: pointer to the return value of ram_save_host_page(), + * overwritten in case of an error + */ +static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, + void *opaque, int *res_override) +{ +#ifdef CONFIG_LINUX + /* Check if page is from UFFD-managed region. */ + if (pss->block->flags & RAM_UF_WRITEPROTECT) { + unsigned long page_from = (unsigned long) opaque; + + void *page_address = pss->block->host + (page_from << TARGET_PAGE_BITS); + uint64_t run_length = (pss->page - page_from + 1) << TARGET_PAGE_BITS; + int res; + + /* Flush async buffers before un-protect. */ + qemu_fflush(rs->f); + /* Un-protect memory range. */ + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, + false, false); + /* We don't want to override existing error from ram_save_host_page(). */ + if (res < 0 && *res_override >= 0) { + *res_override = res; + } + } +#endif /* CONFIG_LINUX */ +} + /** * ram_find_and_save_block: finds a dirty page and sends it to f * @@ -1790,7 +1888,11 @@ static int ram_find_and_save_block(RAMState *rs, bool last_stage) } if (found) { + void *opaque; + + ram_save_host_page_pre(rs, &pss, &opaque); pages = ram_save_host_page(rs, &pss, last_stage); + ram_save_host_page_post(rs, &pss, opaque, &pages); } } while (!pages && again); @@ -1880,10 +1982,13 @@ static void ram_save_cleanup(void *opaque) RAMState **rsp = opaque; RAMBlock *block; - /* caller have hold iothread lock or is in a bh, so there is - * no writing race against the migration bitmap - */ - memory_global_dirty_log_stop(); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + /* caller have hold iothread lock or is in a bh, so there is + * no writing race against the migration bitmap + */ + memory_global_dirty_log_stop(); + } RAMBLOCK_FOREACH_NOT_IGNORED(block) { g_free(block->clear_bmap); @@ -2343,8 +2448,11 @@ static void ram_init_bitmaps(RAMState *rs) WITH_RCU_READ_LOCK_GUARD() { ram_list_init_bitmaps(); - memory_global_dirty_log_start(); - migration_bitmap_sync_precopy(rs); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + memory_global_dirty_log_start(); + migration_bitmap_sync_precopy(rs); + } } qemu_mutex_unlock_ramlist(); qemu_mutex_unlock_iothread(); @@ -3794,7 +3902,14 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) */ bool ram_write_tracking_available(void) { - /* TODO: implement */ +#ifdef CONFIG_LINUX + uint64_t uffd_features; + int res; + + res = uffd_query_features(&uffd_features); + return (res == 0 && + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); +#endif return false; } @@ -3805,10 +3920,148 @@ bool ram_write_tracking_available(void) */ bool ram_write_tracking_compatible(void) { - /* TODO: implement */ +#ifdef CONFIG_LINUX + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); + int uffd_fd; + RAMBlock *bs; + bool ret = false; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); + if (uffd_fd < 0) { + return false; + } + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + uint64_t uffd_ioctls; + + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + /* Try to register block memory via UFFD-IO to track writes */ + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { + goto out; + } + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { + goto out; + } + } + ret = true; + +out: + uffd_close_fd(uffd_fd); + return ret; +#endif return false; } +/* + * ram_write_tracking_start: start UFFD-WP memory tracking + * + * Returns 0 for success or negative value in case of error + */ +int ram_write_tracking_start(void) +{ +#ifdef CONFIG_LINUX + int uffd_fd; + RAMState *rs = ram_state; + RAMBlock *bs; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); + if (uffd_fd < 0) { + return uffd_fd; + } + rs->uffdio_fd = uffd_fd; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + + /* Register block memory with UFFD to track writes */ + if (uffd_register_memory(rs->uffdio_fd, bs->host, + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { + goto fail; + } + /* Apply UFFD write protection to the block memory range */ + if (uffd_change_protection(rs->uffdio_fd, bs->host, + bs->max_length, true, false)) { + goto fail; + } + bs->flags |= RAM_UF_WRITEPROTECT; + memory_region_ref(bs->mr); + + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, + bs->host, bs->max_length); + } + + return 0; + +fail: + error_report("ram_write_tracking_start() failed: restoring initial memory state"); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { + continue; + } + /* + * In case some memory block failed to be write-protected + * remove protection and unregister all succeeded RAM blocks + */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + uffd_close_fd(uffd_fd); +#endif /* CONFIG_LINUX */ + rs->uffdio_fd = -1; + return -1; +} + +/** + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection + */ +void ram_write_tracking_stop(void) +{ +#ifdef CONFIG_LINUX + RAMState *rs = ram_state; + RAMBlock *bs; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { + continue; + } + /* Remove protection and unregister all affected RAM blocks */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, + bs->host, bs->max_length); + + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + /* Finally close UFFD file descriptor */ + uffd_close_fd(rs->uffdio_fd); + rs->uffdio_fd = -1; +#endif /* CONFIG_LINUX */ +} + static SaveVMHandlers savevm_ram_handlers = { .save_setup = ram_save_setup, .save_live_iterate = ram_save_iterate, diff --git a/migration/ram.h b/migration/ram.h index 1a9ff90304..c25540cb93 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -82,5 +82,7 @@ void colo_incoming_start_dirty_log(void); /* Background snapshot */ bool ram_write_tracking_available(void); bool ram_write_tracking_compatible(void); +int ram_write_tracking_start(void); +void ram_write_tracking_stop(void); #endif diff --git a/migration/trace-events b/migration/trace-events index 75de5004ac..668c562fed 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %d"