Message ID | 20180216131625.9639-23-dgilbert@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Feb 16, 2018 at 01:16:18PM +0000, Dr. David Alan Gilbert (git) wrote: > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com> > > Cause the vhost-user client to be woken up whenever: > a) We place a page in postcopy mode > b) We get a fault and the page has already been received > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > --- > migration/postcopy-ram.c | 14 ++++++++++---- > migration/trace-events | 1 + > 2 files changed, 11 insertions(+), 4 deletions(-) > > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > index 879711968c..13561703b5 100644 > --- a/migration/postcopy-ram.c > +++ b/migration/postcopy-ram.c > @@ -566,7 +566,11 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, > > trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb), > rb_offset); > - /* TODO: Check bitmap to see if we already have the page */ > + if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) { > + trace_postcopy_request_shared_page_present(pcfd->idstr, > + qemu_ram_get_idstr(rb), rb_offset); > + return postcopy_wake_shared(pcfd, client_addr, rb); > + } > if (rb != mis->last_rb) { > mis->last_rb = rb; > migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > @@ -863,7 +867,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, > } > > trace_postcopy_place_page(host); > - return 0; > + return postcopy_notify_shared_wake(rb, > + qemu_ram_block_host_offset(rb, host)); > } > > /* > @@ -887,6 +892,9 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, > > return -e; > } > + return postcopy_notify_shared_wake(rb, > + qemu_ram_block_host_offset(rb, > + host)); > } else { > /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */ > if (!mis->postcopy_tmp_zero_page) { > @@ -906,8 +914,6 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, > return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, > rb); > } > - > - return 0; > } Could there be race? E.g.: ram_load_thread page_fault_thread ----------------- ------------------- if (recv_bitmap_set()) wake() copy_page() recv_bitmap_set() wake() request_page() Then the last requested page may never be serviced? Thanks,
* Peter Xu (peterx@redhat.com) wrote: > On Fri, Feb 16, 2018 at 01:16:18PM +0000, Dr. David Alan Gilbert (git) wrote: > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com> > > > > Cause the vhost-user client to be woken up whenever: > > a) We place a page in postcopy mode > > b) We get a fault and the page has already been received > > > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > > --- > > migration/postcopy-ram.c | 14 ++++++++++---- > > migration/trace-events | 1 + > > 2 files changed, 11 insertions(+), 4 deletions(-) > > > > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > > index 879711968c..13561703b5 100644 > > --- a/migration/postcopy-ram.c > > +++ b/migration/postcopy-ram.c > > @@ -566,7 +566,11 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, > > > > trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb), > > rb_offset); > > - /* TODO: Check bitmap to see if we already have the page */ > > + if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) { > > + trace_postcopy_request_shared_page_present(pcfd->idstr, > > + qemu_ram_get_idstr(rb), rb_offset); > > + return postcopy_wake_shared(pcfd, client_addr, rb); > > + } > > if (rb != mis->last_rb) { > > mis->last_rb = rb; > > migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > > @@ -863,7 +867,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, > > } > > > > trace_postcopy_place_page(host); > > - return 0; > > + return postcopy_notify_shared_wake(rb, > > + qemu_ram_block_host_offset(rb, host)); > > } > > > > /* > > @@ -887,6 +892,9 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, > > > > return -e; > > } > > + return postcopy_notify_shared_wake(rb, > > + qemu_ram_block_host_offset(rb, > > + host)); > > } else { > > /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */ > > if (!mis->postcopy_tmp_zero_page) { > > @@ -906,8 +914,6 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, > > return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, > > rb); > > } > > - > > - return 0; > > } > > Could there be race? E.g.: > > ram_load_thread page_fault_thread > ----------------- ------------------- > > if (recv_bitmap_set()) > wake() > copy_page() > recv_bitmap_set() > wake() > request_page() > > Then the last requested page may never be serviced? The postcopy finishes when the last page is received, and thus when that also performs the wake() (from the load thread); so that's not a problem. You can get the case where a page that qemu has already received, still needs to be woken for the shared users (which is why we have the wake in the fault_thread). When the postcopy finishes, the client is sent a POSTCOPY_END, at which point it closes it's userfaultfd and it should wake everything remaining up; so any late requests shouldn't be a problem (the END is sent before the fault-thread quits). Dave > Thanks, > > -- > Peter Xu -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Tue, Mar 06, 2018 at 10:36:52AM +0000, Dr. David Alan Gilbert wrote: > * Peter Xu (peterx@redhat.com) wrote: > > On Fri, Feb 16, 2018 at 01:16:18PM +0000, Dr. David Alan Gilbert (git) wrote: > > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com> > > > > > > Cause the vhost-user client to be woken up whenever: > > > a) We place a page in postcopy mode > > > b) We get a fault and the page has already been received > > > > > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > > > --- > > > migration/postcopy-ram.c | 14 ++++++++++---- > > > migration/trace-events | 1 + > > > 2 files changed, 11 insertions(+), 4 deletions(-) > > > > > > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > > > index 879711968c..13561703b5 100644 > > > --- a/migration/postcopy-ram.c > > > +++ b/migration/postcopy-ram.c > > > @@ -566,7 +566,11 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, > > > > > > trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb), > > > rb_offset); > > > - /* TODO: Check bitmap to see if we already have the page */ > > > + if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) { > > > + trace_postcopy_request_shared_page_present(pcfd->idstr, > > > + qemu_ram_get_idstr(rb), rb_offset); > > > + return postcopy_wake_shared(pcfd, client_addr, rb); > > > + } > > > if (rb != mis->last_rb) { > > > mis->last_rb = rb; > > > migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > > > @@ -863,7 +867,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, > > > } > > > > > > trace_postcopy_place_page(host); > > > - return 0; > > > + return postcopy_notify_shared_wake(rb, > > > + qemu_ram_block_host_offset(rb, host)); > > > } > > > > > > /* > > > @@ -887,6 +892,9 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, > > > > > > return -e; > > > } > > > + return postcopy_notify_shared_wake(rb, > > > + qemu_ram_block_host_offset(rb, > > > + host)); > > > } else { > > > /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */ > > > if (!mis->postcopy_tmp_zero_page) { > > > @@ -906,8 +914,6 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, > > > return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, > > > rb); > > > } > > > - > > > - return 0; > > > } > > > > Could there be race? E.g.: > > > > ram_load_thread page_fault_thread > > ----------------- ------------------- > > > > if (recv_bitmap_set()) > > wake() > > copy_page() > > recv_bitmap_set() > > wake() > > request_page() > > > > Then the last requested page may never be serviced? > > The postcopy finishes when the last page is received, and thus when that > also performs the wake() (from the load thread); so that's not a > problem. > You can get the case where a page that qemu has already received, still > needs to be woken for the shared users (which is why we have the wake in > the fault_thread). > When the postcopy finishes, the client is sent a POSTCOPY_END, at which > point it closes it's userfaultfd and it should wake everything remaining > up; so any late requests shouldn't be a problem (the END is sent > before the fault-thread quits). Yeah now I think the race is invalid - the wake() in ram_load_thread will wake up the paused thread in this case. I misunderstood. Thanks,
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 879711968c..13561703b5 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -566,7 +566,11 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb), rb_offset); - /* TODO: Check bitmap to see if we already have the page */ + if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) { + trace_postcopy_request_shared_page_present(pcfd->idstr, + qemu_ram_get_idstr(rb), rb_offset); + return postcopy_wake_shared(pcfd, client_addr, rb); + } if (rb != mis->last_rb) { mis->last_rb = rb; migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), @@ -863,7 +867,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, } trace_postcopy_place_page(host); - return 0; + return postcopy_notify_shared_wake(rb, + qemu_ram_block_host_offset(rb, host)); } /* @@ -887,6 +892,9 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, return -e; } + return postcopy_notify_shared_wake(rb, + qemu_ram_block_host_offset(rb, + host)); } else { /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */ if (!mis->postcopy_tmp_zero_page) { @@ -906,8 +914,6 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb); } - - return 0; } /* diff --git a/migration/trace-events b/migration/trace-events index b0acaaa8a0..1e353a317f 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -199,6 +199,7 @@ postcopy_ram_incoming_cleanup_entry(void) "" postcopy_ram_incoming_cleanup_exit(void) "" postcopy_ram_incoming_cleanup_join(void) "" postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_offset) "for %s in %s offset 0x%"PRIx64 +postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64 postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s" save_xbzrle_page_skipping(void) ""