[v5,09/29] postcopy: Allow registering of fd handler

Message ID	20180312172124.56461-10-dgilbert@redhat.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org> From: "Dr. David Alan Gilbert (git)" <dgilbert@redhat.com> To: qemu-devel@nongnu.org, mst@redhat.com, maxime.coquelin@redhat.com, marcandre.lureau@redhat.com, peterx@redhat.com, quintela@redhat.com Date: Mon, 12 Mar 2018 17:21:04 +0000 Message-Id: <20180312172124.56461-10-dgilbert@redhat.com> In-Reply-To: <20180312172124.56461-1-dgilbert@redhat.com> References: <20180312172124.56461-1-dgilbert@redhat.com> Subject: [Qemu-devel] [PATCH v5 09/29] postcopy: Allow registering of fd handler Precedence: list Cc: aarcange@redhat.com Errors-To: qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org Sender: "Qemu-devel" <qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org>

Message ID

20180312172124.56461-10-dgilbert@redhat.com (mailing list archive)

State

New, archived

Headers

From: "Dr. David Alan Gilbert (git)" <dgilbert@redhat.com>
To: qemu-devel@nongnu.org, mst@redhat.com, maxime.coquelin@redhat.com,
	marcandre.lureau@redhat.com, peterx@redhat.com, quintela@redhat.com
Date: Mon, 12 Mar 2018 17:21:04 +0000
Message-Id: <20180312172124.56461-10-dgilbert@redhat.com>
In-Reply-To: <20180312172124.56461-1-dgilbert@redhat.com>
References: <20180312172124.56461-1-dgilbert@redhat.com>
Subject: [Qemu-devel] [PATCH v5 09/29] postcopy: Allow registering of fd
	handler
Precedence: list
Cc: aarcange@redhat.com
Errors-To: qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org
Sender: "Qemu-devel"
	<qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org>

Commit Message

Dr. David Alan Gilbert March 12, 2018, 5:21 p.m. UTC

From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Allow other userfaultfd's to be registered into the fault thread
so that handlers for shared memory can get responses.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
 migration/migration.c    |   6 ++
 migration/migration.h    |   2 +
 migration/postcopy-ram.c | 210 +++++++++++++++++++++++++++++++++++------------
 migration/postcopy-ram.h |  21 +++++
 migration/trace-events   |   2 +
 5 files changed, 188 insertions(+), 53 deletions(-)

Comments

Michael S. Tsirkin March 13, 2018, 8:34 p.m. UTC | #1

On Mon, Mar 12, 2018 at 05:21:04PM +0000, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Allow other userfaultfd's to be registered into the fault thread
> so that handlers for shared memory can get responses.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Reviewed-by: Peter Xu <peterx@redhat.com>

new blank line at EOF errors.

> ---
>  migration/migration.c    |   6 ++
>  migration/migration.h    |   2 +
>  migration/postcopy-ram.c | 210 +++++++++++++++++++++++++++++++++++------------
>  migration/postcopy-ram.h |  21 +++++
>  migration/trace-events   |   2 +
>  5 files changed, 188 insertions(+), 53 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 6a4780ef6f..1f22f463d3 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -155,6 +155,8 @@ MigrationIncomingState *migration_incoming_get_current(void)
>      if (!once) {
>          mis_current.state = MIGRATION_STATUS_NONE;
>          memset(&mis_current, 0, sizeof(MigrationIncomingState));
> +        mis_current.postcopy_remote_fds = g_array_new(FALSE, TRUE,
> +                                                   sizeof(struct PostCopyFD));
>          qemu_mutex_init(&mis_current.rp_mutex);
>          qemu_event_init(&mis_current.main_thread_load_event, false);
>          once = true;
> @@ -177,6 +179,10 @@ void migration_incoming_state_destroy(void)
>          qemu_fclose(mis->from_src_file);
>          mis->from_src_file = NULL;
>      }
> +    if (mis->postcopy_remote_fds) {
> +        g_array_free(mis->postcopy_remote_fds, TRUE);
> +        mis->postcopy_remote_fds = NULL;
> +    }
>  
>      qemu_event_reset(&mis->main_thread_load_event);
>  }
> diff --git a/migration/migration.h b/migration/migration.h
> index 08c5d2ded1..d02a759331 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -51,6 +51,8 @@ struct MigrationIncomingState {
>      QemuMutex rp_mutex;    /* We send replies from multiple threads */
>      void     *postcopy_tmp_page;
>      void     *postcopy_tmp_zero_page;
> +    /* PostCopyFD's for external userfaultfds & handlers of shared memory */
> +    GArray   *postcopy_remote_fds;
>  
>      QEMUBH *bh;
>  
> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> index 1089814d54..4ab1b7d36d 100644
> --- a/migration/postcopy-ram.c
> +++ b/migration/postcopy-ram.c
> @@ -533,29 +533,44 @@ static void *postcopy_ram_fault_thread(void *opaque)
>      MigrationIncomingState *mis = opaque;
>      struct uffd_msg msg;
>      int ret;
> +    size_t index;
>      RAMBlock *rb = NULL;
>      RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
>  
>      trace_postcopy_ram_fault_thread_entry();
>      qemu_sem_post(&mis->fault_thread_sem);
>  
> +    struct pollfd *pfd;
> +    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
> +
> +    pfd = g_new0(struct pollfd, pfd_len);
> +
> +    pfd[0].fd = mis->userfault_fd;
> +    pfd[0].events = POLLIN;
> +    pfd[1].fd = mis->userfault_event_fd;
> +    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
> +    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
> +    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
> +        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
> +                                                 struct PostCopyFD, index);
> +        pfd[2 + index].fd = pcfd->fd;
> +        pfd[2 + index].events = POLLIN;
> +        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
> +                                                  pcfd->fd);
> +    }
> +
>      while (true) {
>          ram_addr_t rb_offset;
> -        struct pollfd pfd[2];
> +        int poll_result;
>  
>          /*
>           * We're mainly waiting for the kernel to give us a faulting HVA,
>           * however we can be told to quit via userfault_quit_fd which is
>           * an eventfd
>           */
> -        pfd[0].fd = mis->userfault_fd;
> -        pfd[0].events = POLLIN;
> -        pfd[0].revents = 0;
> -        pfd[1].fd = mis->userfault_event_fd;
> -        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
> -        pfd[1].revents = 0;
> -
> -        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
> +
> +        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
> +        if (poll_result == -1) {
>              error_report("%s: userfault poll: %s", __func__, strerror(errno));
>              break;
>          }
> @@ -575,57 +590,117 @@ static void *postcopy_ram_fault_thread(void *opaque)
>              }
>          }
>  
> -        ret = read(mis->userfault_fd, &msg, sizeof(msg));
> -        if (ret != sizeof(msg)) {
> -            if (errno == EAGAIN) {
> -                /*
> -                 * if a wake up happens on the other thread just after
> -                 * the poll, there is nothing to read.
> -                 */
> -                continue;
> +        if (pfd[0].revents) {
> +            poll_result--;
> +            ret = read(mis->userfault_fd, &msg, sizeof(msg));
> +            if (ret != sizeof(msg)) {
> +                if (errno == EAGAIN) {
> +                    /*
> +                     * if a wake up happens on the other thread just after
> +                     * the poll, there is nothing to read.
> +                     */
> +                    continue;
> +                }
> +                if (ret < 0) {
> +                    error_report("%s: Failed to read full userfault "
> +                                 "message: %s",
> +                                 __func__, strerror(errno));
> +                    break;
> +                } else {
> +                    error_report("%s: Read %d bytes from userfaultfd "
> +                                 "expected %zd",
> +                                 __func__, ret, sizeof(msg));
> +                    break; /* Lost alignment, don't know what we'd read next */
> +                }
>              }
> -            if (ret < 0) {
> -                error_report("%s: Failed to read full userfault message: %s",
> -                             __func__, strerror(errno));
> -                break;
> -            } else {
> -                error_report("%s: Read %d bytes from userfaultfd expected %zd",
> -                             __func__, ret, sizeof(msg));
> -                break; /* Lost alignment, don't know what we'd read next */
> +            if (msg.event != UFFD_EVENT_PAGEFAULT) {
> +                error_report("%s: Read unexpected event %ud from userfaultfd",
> +                             __func__, msg.event);
> +                continue; /* It's not a page fault, shouldn't happen */
>              }
> -        }
> -        if (msg.event != UFFD_EVENT_PAGEFAULT) {
> -            error_report("%s: Read unexpected event %ud from userfaultfd",
> -                         __func__, msg.event);
> -            continue; /* It's not a page fault, shouldn't happen */
> -        }
>  
> -        rb = qemu_ram_block_from_host(
> -                 (void *)(uintptr_t)msg.arg.pagefault.address,
> -                 true, &rb_offset);
> -        if (!rb) {
> -            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
> -                         PRIx64, (uint64_t)msg.arg.pagefault.address);
> -            break;
> -        }
> +            rb = qemu_ram_block_from_host(
> +                     (void *)(uintptr_t)msg.arg.pagefault.address,
> +                     true, &rb_offset);
> +            if (!rb) {
> +                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
> +                             PRIx64, (uint64_t)msg.arg.pagefault.address);
> +                break;
> +            }
>  
> -        rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
> -        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
> +            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
> +            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
>                                                  qemu_ram_get_idstr(rb),
>                                                  rb_offset);
> +            /*
> +             * Send the request to the source - we want to request one
> +             * of our host page sizes (which is >= TPS)
> +             */
> +            if (rb != last_rb) {
> +                last_rb = rb;
> +                migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
> +                                         rb_offset, qemu_ram_pagesize(rb));
> +            } else {
> +                /* Save some space */
> +                migrate_send_rp_req_pages(mis, NULL,
> +                                         rb_offset, qemu_ram_pagesize(rb));
> +            }
> +        }
>  
> -        /*
> -         * Send the request to the source - we want to request one
> -         * of our host page sizes (which is >= TPS)
> -         */
> -        if (rb != last_rb) {
> -            last_rb = rb;
> -            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
> -                                     rb_offset, qemu_ram_pagesize(rb));
> -        } else {
> -            /* Save some space */
> -            migrate_send_rp_req_pages(mis, NULL,
> -                                     rb_offset, qemu_ram_pagesize(rb));
> +        /* Now handle any requests from external processes on shared memory */
> +        /* TODO: May need to handle devices deregistering during postcopy */
> +        for (index = 2; index < pfd_len && poll_result; index++) {
> +            if (pfd[index].revents) {
> +                struct PostCopyFD *pcfd =
> +                    &g_array_index(mis->postcopy_remote_fds,
> +                                   struct PostCopyFD, index - 2);
> +
> +                poll_result--;
> +                if (pfd[index].revents & POLLERR) {
> +                    error_report("%s: POLLERR on poll %zd fd=%d",
> +                                 __func__, index, pcfd->fd);
> +                    pfd[index].events = 0;
> +                    continue;
> +                }
> +
> +                ret = read(pcfd->fd, &msg, sizeof(msg));
> +                if (ret != sizeof(msg)) {
> +                    if (errno == EAGAIN) {
> +                        /*
> +                         * if a wake up happens on the other thread just after
> +                         * the poll, there is nothing to read.
> +                         */
> +                        continue;
> +                    }
> +                    if (ret < 0) {
> +                        error_report("%s: Failed to read full userfault "
> +                                     "message: %s (shared) revents=%d",
> +                                     __func__, strerror(errno),
> +                                     pfd[index].revents);
> +                        /*TODO: Could just disable this sharer */
> +                        break;
> +                    } else {
> +                        error_report("%s: Read %d bytes from userfaultfd "
> +                                     "expected %zd (shared)",
> +                                     __func__, ret, sizeof(msg));
> +                        /*TODO: Could just disable this sharer */
> +                        break; /*Lost alignment,don't know what we'd read next*/
> +                    }
> +                }
> +                if (msg.event != UFFD_EVENT_PAGEFAULT) {
> +                    error_report("%s: Read unexpected event %ud "
> +                                 "from userfaultfd (shared)",
> +                                 __func__, msg.event);
> +                    continue; /* It's not a page fault, shouldn't happen */
> +                }
> +                /* Call the device handler registered with us */
> +                ret = pcfd->handler(pcfd, &msg);
> +                if (ret) {
> +                    error_report("%s: Failed to resolve shared fault on %zd/%s",
> +                                 __func__, index, pcfd->idstr);
> +                    /* TODO: Fail? Disable this sharer? */
> +                }
> +            }
>          }
>      }
>      trace_postcopy_ram_fault_thread_exit();
> @@ -970,3 +1045,32 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
>  {
>      return atomic_xchg(&incoming_postcopy_state, new_state);
>  }
> +
> +/* Register a handler for external shared memory postcopy
> + * called on the destination.
> + */
> +void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
> +{
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +
> +    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
> +                                                  *pcfd);
> +}
> +
> +/* Unregister a handler for external shared memory postcopy
> + */
> +void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
> +{
> +    guint i;
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +    GArray *pcrfds = mis->postcopy_remote_fds;
> +
> +    for (i = 0; i < pcrfds->len; i++) {
> +        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
> +        if (cur->fd == pcfd->fd) {
> +            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
> +            return;
> +        }
> +    }
> +}
> +
> diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
> index 0421c98d57..f21eef6702 100644
> --- a/migration/postcopy-ram.h
> +++ b/migration/postcopy-ram.h
> @@ -143,4 +143,25 @@ void postcopy_remove_notifier(NotifierWithReturn *n);
>  /* Call the notifier list set by postcopy_add_start_notifier */
>  int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp);
>  
> +struct PostCopyFD;
> +
> +/* ufd is a pointer to the struct uffd_msg *TODO: more Portable! */
> +typedef int (*pcfdhandler)(struct PostCopyFD *pcfd, void *ufd);
> +
> +struct PostCopyFD {
> +    int fd;
> +    /* Data to pass to handler */
> +    void *data;
> +    /* Handler to be called whenever we get a poll event */
> +    pcfdhandler handler;
> +    /* A string to use in error messages */
> +    const char *idstr;
> +};
> +
> +/* Register a userfaultfd owned by an external process for
> + * shared memory.
> + */
> +void postcopy_register_shared_ufd(struct PostCopyFD *pcfd);
> +void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd);
> +
>  #endif
> diff --git a/migration/trace-events b/migration/trace-events
> index 93961dea16..1e617ad7a6 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -190,6 +190,8 @@ postcopy_place_page_zero(void *host_addr) "host=%p"
>  postcopy_ram_enable_notify(void) ""
>  postcopy_ram_fault_thread_entry(void) ""
>  postcopy_ram_fault_thread_exit(void) ""
> +postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d"
> +postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d"
>  postcopy_ram_fault_thread_quit(void) ""
>  postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx"
>  postcopy_ram_incoming_cleanup_closeuf(void) ""
> -- 
> 2.14.3

Dr. David Alan Gilbert March 14, 2018, 10:05 a.m. UTC | #2

* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Mon, Mar 12, 2018 at 05:21:04PM +0000, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Allow other userfaultfd's to be registered into the fault thread
> > so that handlers for shared memory can get responses.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > Reviewed-by: Peter Xu <peterx@redhat.com>
> 
> new blank line at EOF errors.

Done.

Dave

> > ---
> >  migration/migration.c    |   6 ++
> >  migration/migration.h    |   2 +
> >  migration/postcopy-ram.c | 210 +++++++++++++++++++++++++++++++++++------------
> >  migration/postcopy-ram.h |  21 +++++
> >  migration/trace-events   |   2 +
> >  5 files changed, 188 insertions(+), 53 deletions(-)
> > 
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 6a4780ef6f..1f22f463d3 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -155,6 +155,8 @@ MigrationIncomingState *migration_incoming_get_current(void)
> >      if (!once) {
> >          mis_current.state = MIGRATION_STATUS_NONE;
> >          memset(&mis_current, 0, sizeof(MigrationIncomingState));
> > +        mis_current.postcopy_remote_fds = g_array_new(FALSE, TRUE,
> > +                                                   sizeof(struct PostCopyFD));
> >          qemu_mutex_init(&mis_current.rp_mutex);
> >          qemu_event_init(&mis_current.main_thread_load_event, false);
> >          once = true;
> > @@ -177,6 +179,10 @@ void migration_incoming_state_destroy(void)
> >          qemu_fclose(mis->from_src_file);
> >          mis->from_src_file = NULL;
> >      }
> > +    if (mis->postcopy_remote_fds) {
> > +        g_array_free(mis->postcopy_remote_fds, TRUE);
> > +        mis->postcopy_remote_fds = NULL;
> > +    }
> >  
> >      qemu_event_reset(&mis->main_thread_load_event);
> >  }
> > diff --git a/migration/migration.h b/migration/migration.h
> > index 08c5d2ded1..d02a759331 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -51,6 +51,8 @@ struct MigrationIncomingState {
> >      QemuMutex rp_mutex;    /* We send replies from multiple threads */
> >      void     *postcopy_tmp_page;
> >      void     *postcopy_tmp_zero_page;
> > +    /* PostCopyFD's for external userfaultfds & handlers of shared memory */
> > +    GArray   *postcopy_remote_fds;
> >  
> >      QEMUBH *bh;
> >  
> > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> > index 1089814d54..4ab1b7d36d 100644
> > --- a/migration/postcopy-ram.c
> > +++ b/migration/postcopy-ram.c
> > @@ -533,29 +533,44 @@ static void *postcopy_ram_fault_thread(void *opaque)
> >      MigrationIncomingState *mis = opaque;
> >      struct uffd_msg msg;
> >      int ret;
> > +    size_t index;
> >      RAMBlock *rb = NULL;
> >      RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
> >  
> >      trace_postcopy_ram_fault_thread_entry();
> >      qemu_sem_post(&mis->fault_thread_sem);
> >  
> > +    struct pollfd *pfd;
> > +    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
> > +
> > +    pfd = g_new0(struct pollfd, pfd_len);
> > +
> > +    pfd[0].fd = mis->userfault_fd;
> > +    pfd[0].events = POLLIN;
> > +    pfd[1].fd = mis->userfault_event_fd;
> > +    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
> > +    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
> > +    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
> > +        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
> > +                                                 struct PostCopyFD, index);
> > +        pfd[2 + index].fd = pcfd->fd;
> > +        pfd[2 + index].events = POLLIN;
> > +        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
> > +                                                  pcfd->fd);
> > +    }
> > +
> >      while (true) {
> >          ram_addr_t rb_offset;
> > -        struct pollfd pfd[2];
> > +        int poll_result;
> >  
> >          /*
> >           * We're mainly waiting for the kernel to give us a faulting HVA,
> >           * however we can be told to quit via userfault_quit_fd which is
> >           * an eventfd
> >           */
> > -        pfd[0].fd = mis->userfault_fd;
> > -        pfd[0].events = POLLIN;
> > -        pfd[0].revents = 0;
> > -        pfd[1].fd = mis->userfault_event_fd;
> > -        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
> > -        pfd[1].revents = 0;
> > -
> > -        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
> > +
> > +        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
> > +        if (poll_result == -1) {
> >              error_report("%s: userfault poll: %s", __func__, strerror(errno));
> >              break;
> >          }
> > @@ -575,57 +590,117 @@ static void *postcopy_ram_fault_thread(void *opaque)
> >              }
> >          }
> >  
> > -        ret = read(mis->userfault_fd, &msg, sizeof(msg));
> > -        if (ret != sizeof(msg)) {
> > -            if (errno == EAGAIN) {
> > -                /*
> > -                 * if a wake up happens on the other thread just after
> > -                 * the poll, there is nothing to read.
> > -                 */
> > -                continue;
> > +        if (pfd[0].revents) {
> > +            poll_result--;
> > +            ret = read(mis->userfault_fd, &msg, sizeof(msg));
> > +            if (ret != sizeof(msg)) {
> > +                if (errno == EAGAIN) {
> > +                    /*
> > +                     * if a wake up happens on the other thread just after
> > +                     * the poll, there is nothing to read.
> > +                     */
> > +                    continue;
> > +                }
> > +                if (ret < 0) {
> > +                    error_report("%s: Failed to read full userfault "
> > +                                 "message: %s",
> > +                                 __func__, strerror(errno));
> > +                    break;
> > +                } else {
> > +                    error_report("%s: Read %d bytes from userfaultfd "
> > +                                 "expected %zd",
> > +                                 __func__, ret, sizeof(msg));
> > +                    break; /* Lost alignment, don't know what we'd read next */
> > +                }
> >              }
> > -            if (ret < 0) {
> > -                error_report("%s: Failed to read full userfault message: %s",
> > -                             __func__, strerror(errno));
> > -                break;
> > -            } else {
> > -                error_report("%s: Read %d bytes from userfaultfd expected %zd",
> > -                             __func__, ret, sizeof(msg));
> > -                break; /* Lost alignment, don't know what we'd read next */
> > +            if (msg.event != UFFD_EVENT_PAGEFAULT) {
> > +                error_report("%s: Read unexpected event %ud from userfaultfd",
> > +                             __func__, msg.event);
> > +                continue; /* It's not a page fault, shouldn't happen */
> >              }
> > -        }
> > -        if (msg.event != UFFD_EVENT_PAGEFAULT) {
> > -            error_report("%s: Read unexpected event %ud from userfaultfd",
> > -                         __func__, msg.event);
> > -            continue; /* It's not a page fault, shouldn't happen */
> > -        }
> >  
> > -        rb = qemu_ram_block_from_host(
> > -                 (void *)(uintptr_t)msg.arg.pagefault.address,
> > -                 true, &rb_offset);
> > -        if (!rb) {
> > -            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
> > -                         PRIx64, (uint64_t)msg.arg.pagefault.address);
> > -            break;
> > -        }
> > +            rb = qemu_ram_block_from_host(
> > +                     (void *)(uintptr_t)msg.arg.pagefault.address,
> > +                     true, &rb_offset);
> > +            if (!rb) {
> > +                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
> > +                             PRIx64, (uint64_t)msg.arg.pagefault.address);
> > +                break;
> > +            }
> >  
> > -        rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
> > -        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
> > +            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
> > +            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
> >                                                  qemu_ram_get_idstr(rb),
> >                                                  rb_offset);
> > +            /*
> > +             * Send the request to the source - we want to request one
> > +             * of our host page sizes (which is >= TPS)
> > +             */
> > +            if (rb != last_rb) {
> > +                last_rb = rb;
> > +                migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
> > +                                         rb_offset, qemu_ram_pagesize(rb));
> > +            } else {
> > +                /* Save some space */
> > +                migrate_send_rp_req_pages(mis, NULL,
> > +                                         rb_offset, qemu_ram_pagesize(rb));
> > +            }
> > +        }
> >  
> > -        /*
> > -         * Send the request to the source - we want to request one
> > -         * of our host page sizes (which is >= TPS)
> > -         */
> > -        if (rb != last_rb) {
> > -            last_rb = rb;
> > -            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
> > -                                     rb_offset, qemu_ram_pagesize(rb));
> > -        } else {
> > -            /* Save some space */
> > -            migrate_send_rp_req_pages(mis, NULL,
> > -                                     rb_offset, qemu_ram_pagesize(rb));
> > +        /* Now handle any requests from external processes on shared memory */
> > +        /* TODO: May need to handle devices deregistering during postcopy */
> > +        for (index = 2; index < pfd_len && poll_result; index++) {
> > +            if (pfd[index].revents) {
> > +                struct PostCopyFD *pcfd =
> > +                    &g_array_index(mis->postcopy_remote_fds,
> > +                                   struct PostCopyFD, index - 2);
> > +
> > +                poll_result--;
> > +                if (pfd[index].revents & POLLERR) {
> > +                    error_report("%s: POLLERR on poll %zd fd=%d",
> > +                                 __func__, index, pcfd->fd);
> > +                    pfd[index].events = 0;
> > +                    continue;
> > +                }
> > +
> > +                ret = read(pcfd->fd, &msg, sizeof(msg));
> > +                if (ret != sizeof(msg)) {
> > +                    if (errno == EAGAIN) {
> > +                        /*
> > +                         * if a wake up happens on the other thread just after
> > +                         * the poll, there is nothing to read.
> > +                         */
> > +                        continue;
> > +                    }
> > +                    if (ret < 0) {
> > +                        error_report("%s: Failed to read full userfault "
> > +                                     "message: %s (shared) revents=%d",
> > +                                     __func__, strerror(errno),
> > +                                     pfd[index].revents);
> > +                        /*TODO: Could just disable this sharer */
> > +                        break;
> > +                    } else {
> > +                        error_report("%s: Read %d bytes from userfaultfd "
> > +                                     "expected %zd (shared)",
> > +                                     __func__, ret, sizeof(msg));
> > +                        /*TODO: Could just disable this sharer */
> > +                        break; /*Lost alignment,don't know what we'd read next*/
> > +                    }
> > +                }
> > +                if (msg.event != UFFD_EVENT_PAGEFAULT) {
> > +                    error_report("%s: Read unexpected event %ud "
> > +                                 "from userfaultfd (shared)",
> > +                                 __func__, msg.event);
> > +                    continue; /* It's not a page fault, shouldn't happen */
> > +                }
> > +                /* Call the device handler registered with us */
> > +                ret = pcfd->handler(pcfd, &msg);
> > +                if (ret) {
> > +                    error_report("%s: Failed to resolve shared fault on %zd/%s",
> > +                                 __func__, index, pcfd->idstr);
> > +                    /* TODO: Fail? Disable this sharer? */
> > +                }
> > +            }
> >          }
> >      }
> >      trace_postcopy_ram_fault_thread_exit();
> > @@ -970,3 +1045,32 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
> >  {
> >      return atomic_xchg(&incoming_postcopy_state, new_state);
> >  }
> > +
> > +/* Register a handler for external shared memory postcopy
> > + * called on the destination.
> > + */
> > +void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
> > +{
> > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > +
> > +    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
> > +                                                  *pcfd);
> > +}
> > +
> > +/* Unregister a handler for external shared memory postcopy
> > + */
> > +void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
> > +{
> > +    guint i;
> > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > +    GArray *pcrfds = mis->postcopy_remote_fds;
> > +
> > +    for (i = 0; i < pcrfds->len; i++) {
> > +        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
> > +        if (cur->fd == pcfd->fd) {
> > +            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
> > +            return;
> > +        }
> > +    }
> > +}
> > +
> > diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
> > index 0421c98d57..f21eef6702 100644
> > --- a/migration/postcopy-ram.h
> > +++ b/migration/postcopy-ram.h
> > @@ -143,4 +143,25 @@ void postcopy_remove_notifier(NotifierWithReturn *n);
> >  /* Call the notifier list set by postcopy_add_start_notifier */
> >  int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp);
> >  
> > +struct PostCopyFD;
> > +
> > +/* ufd is a pointer to the struct uffd_msg *TODO: more Portable! */
> > +typedef int (*pcfdhandler)(struct PostCopyFD *pcfd, void *ufd);
> > +
> > +struct PostCopyFD {
> > +    int fd;
> > +    /* Data to pass to handler */
> > +    void *data;
> > +    /* Handler to be called whenever we get a poll event */
> > +    pcfdhandler handler;
> > +    /* A string to use in error messages */
> > +    const char *idstr;
> > +};
> > +
> > +/* Register a userfaultfd owned by an external process for
> > + * shared memory.
> > + */
> > +void postcopy_register_shared_ufd(struct PostCopyFD *pcfd);
> > +void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd);
> > +
> >  #endif
> > diff --git a/migration/trace-events b/migration/trace-events
> > index 93961dea16..1e617ad7a6 100644
> > --- a/migration/trace-events
> > +++ b/migration/trace-events
> > @@ -190,6 +190,8 @@ postcopy_place_page_zero(void *host_addr) "host=%p"
> >  postcopy_ram_enable_notify(void) ""
> >  postcopy_ram_fault_thread_entry(void) ""
> >  postcopy_ram_fault_thread_exit(void) ""
> > +postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d"
> > +postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d"
> >  postcopy_ram_fault_thread_quit(void) ""
> >  postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx"
> >  postcopy_ram_incoming_cleanup_closeuf(void) ""
> > -- 
> > 2.14.3
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

diff --git a/migration/migration.c b/migration/migration.c
index 6a4780ef6f..1f22f463d3 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -155,6 +155,8 @@  MigrationIncomingState *migration_incoming_get_current(void)
     if (!once) {
         mis_current.state = MIGRATION_STATUS_NONE;
         memset(&mis_current, 0, sizeof(MigrationIncomingState));
+        mis_current.postcopy_remote_fds = g_array_new(FALSE, TRUE,
+                                                   sizeof(struct PostCopyFD));
         qemu_mutex_init(&mis_current.rp_mutex);
         qemu_event_init(&mis_current.main_thread_load_event, false);
         once = true;
@@ -177,6 +179,10 @@  void migration_incoming_state_destroy(void)
         qemu_fclose(mis->from_src_file);
         mis->from_src_file = NULL;
     }
+    if (mis->postcopy_remote_fds) {
+        g_array_free(mis->postcopy_remote_fds, TRUE);
+        mis->postcopy_remote_fds = NULL;
+    }
 
     qemu_event_reset(&mis->main_thread_load_event);
 }
diff --git a/migration/migration.h b/migration/migration.h
index 08c5d2ded1..d02a759331 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -51,6 +51,8 @@  struct MigrationIncomingState {
     QemuMutex rp_mutex;    /* We send replies from multiple threads */
     void     *postcopy_tmp_page;
     void     *postcopy_tmp_zero_page;
+    /* PostCopyFD's for external userfaultfds & handlers of shared memory */
+    GArray   *postcopy_remote_fds;
 
     QEMUBH *bh;
 
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 1089814d54..4ab1b7d36d 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -533,29 +533,44 @@  static void *postcopy_ram_fault_thread(void *opaque)
     MigrationIncomingState *mis = opaque;
     struct uffd_msg msg;
     int ret;
+    size_t index;
     RAMBlock *rb = NULL;
     RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
 
     trace_postcopy_ram_fault_thread_entry();
     qemu_sem_post(&mis->fault_thread_sem);
 
+    struct pollfd *pfd;
+    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
+
+    pfd = g_new0(struct pollfd, pfd_len);
+
+    pfd[0].fd = mis->userfault_fd;
+    pfd[0].events = POLLIN;
+    pfd[1].fd = mis->userfault_event_fd;
+    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
+    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
+    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
+        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
+                                                 struct PostCopyFD, index);
+        pfd[2 + index].fd = pcfd->fd;
+        pfd[2 + index].events = POLLIN;
+        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
+                                                  pcfd->fd);
+    }
+
     while (true) {
         ram_addr_t rb_offset;
-        struct pollfd pfd[2];
+        int poll_result;
 
         /*
          * We're mainly waiting for the kernel to give us a faulting HVA,
          * however we can be told to quit via userfault_quit_fd which is
          * an eventfd
          */
-        pfd[0].fd = mis->userfault_fd;
-        pfd[0].events = POLLIN;
-        pfd[0].revents = 0;
-        pfd[1].fd = mis->userfault_event_fd;
-        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
-        pfd[1].revents = 0;
-
-        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
+
+        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
+        if (poll_result == -1) {
             error_report("%s: userfault poll: %s", __func__, strerror(errno));
             break;
         }
@@ -575,57 +590,117 @@  static void *postcopy_ram_fault_thread(void *opaque)
             }
         }
 
-        ret = read(mis->userfault_fd, &msg, sizeof(msg));
-        if (ret != sizeof(msg)) {
-            if (errno == EAGAIN) {
-                /*
-                 * if a wake up happens on the other thread just after
-                 * the poll, there is nothing to read.
-                 */
-                continue;
+        if (pfd[0].revents) {
+            poll_result--;
+            ret = read(mis->userfault_fd, &msg, sizeof(msg));
+            if (ret != sizeof(msg)) {
+                if (errno == EAGAIN) {
+                    /*
+                     * if a wake up happens on the other thread just after
+                     * the poll, there is nothing to read.
+                     */
+                    continue;
+                }
+                if (ret < 0) {
+                    error_report("%s: Failed to read full userfault "
+                                 "message: %s",
+                                 __func__, strerror(errno));
+                    break;
+                } else {
+                    error_report("%s: Read %d bytes from userfaultfd "
+                                 "expected %zd",
+                                 __func__, ret, sizeof(msg));
+                    break; /* Lost alignment, don't know what we'd read next */
+                }
             }
-            if (ret < 0) {
-                error_report("%s: Failed to read full userfault message: %s",
-                             __func__, strerror(errno));
-                break;
-            } else {
-                error_report("%s: Read %d bytes from userfaultfd expected %zd",
-                             __func__, ret, sizeof(msg));
-                break; /* Lost alignment, don't know what we'd read next */
+            if (msg.event != UFFD_EVENT_PAGEFAULT) {
+                error_report("%s: Read unexpected event %ud from userfaultfd",
+                             __func__, msg.event);
+                continue; /* It's not a page fault, shouldn't happen */
             }
-        }
-        if (msg.event != UFFD_EVENT_PAGEFAULT) {
-            error_report("%s: Read unexpected event %ud from userfaultfd",
-                         __func__, msg.event);
-            continue; /* It's not a page fault, shouldn't happen */
-        }
 
-        rb = qemu_ram_block_from_host(
-                 (void *)(uintptr_t)msg.arg.pagefault.address,
-                 true, &rb_offset);
-        if (!rb) {
-            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
-                         PRIx64, (uint64_t)msg.arg.pagefault.address);
-            break;
-        }
+            rb = qemu_ram_block_from_host(
+                     (void *)(uintptr_t)msg.arg.pagefault.address,
+                     true, &rb_offset);
+            if (!rb) {
+                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
+                             PRIx64, (uint64_t)msg.arg.pagefault.address);
+                break;
+            }
 
-        rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
-        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
+            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
+            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                 qemu_ram_get_idstr(rb),
                                                 rb_offset);
+            /*
+             * Send the request to the source - we want to request one
+             * of our host page sizes (which is >= TPS)
+             */
+            if (rb != last_rb) {
+                last_rb = rb;
+                migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
+                                         rb_offset, qemu_ram_pagesize(rb));
+            } else {
+                /* Save some space */
+                migrate_send_rp_req_pages(mis, NULL,
+                                         rb_offset, qemu_ram_pagesize(rb));
+            }
+        }
 
-        /*
-         * Send the request to the source - we want to request one
-         * of our host page sizes (which is >= TPS)
-         */
-        if (rb != last_rb) {
-            last_rb = rb;
-            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
-                                     rb_offset, qemu_ram_pagesize(rb));
-        } else {
-            /* Save some space */
-            migrate_send_rp_req_pages(mis, NULL,
-                                     rb_offset, qemu_ram_pagesize(rb));
+        /* Now handle any requests from external processes on shared memory */
+        /* TODO: May need to handle devices deregistering during postcopy */
+        for (index = 2; index < pfd_len && poll_result; index++) {
+            if (pfd[index].revents) {
+                struct PostCopyFD *pcfd =
+                    &g_array_index(mis->postcopy_remote_fds,
+                                   struct PostCopyFD, index - 2);
+
+                poll_result--;
+                if (pfd[index].revents & POLLERR) {
+                    error_report("%s: POLLERR on poll %zd fd=%d",
+                                 __func__, index, pcfd->fd);
+                    pfd[index].events = 0;
+                    continue;
+                }
+
+                ret = read(pcfd->fd, &msg, sizeof(msg));
+                if (ret != sizeof(msg)) {
+                    if (errno == EAGAIN) {
+                        /*
+                         * if a wake up happens on the other thread just after
+                         * the poll, there is nothing to read.
+                         */
+                        continue;
+                    }
+                    if (ret < 0) {
+                        error_report("%s: Failed to read full userfault "
+                                     "message: %s (shared) revents=%d",
+                                     __func__, strerror(errno),
+                                     pfd[index].revents);
+                        /*TODO: Could just disable this sharer */
+                        break;
+                    } else {
+                        error_report("%s: Read %d bytes from userfaultfd "
+                                     "expected %zd (shared)",
+                                     __func__, ret, sizeof(msg));
+                        /*TODO: Could just disable this sharer */
+                        break; /*Lost alignment,don't know what we'd read next*/
+                    }
+                }
+                if (msg.event != UFFD_EVENT_PAGEFAULT) {
+                    error_report("%s: Read unexpected event %ud "
+                                 "from userfaultfd (shared)",
+                                 __func__, msg.event);
+                    continue; /* It's not a page fault, shouldn't happen */
+                }
+                /* Call the device handler registered with us */
+                ret = pcfd->handler(pcfd, &msg);
+                if (ret) {
+                    error_report("%s: Failed to resolve shared fault on %zd/%s",
+                                 __func__, index, pcfd->idstr);
+                    /* TODO: Fail? Disable this sharer? */
+                }
+            }
         }
     }
     trace_postcopy_ram_fault_thread_exit();
@@ -970,3 +1045,32 @@  PostcopyState postcopy_state_set(PostcopyState new_state)
 {
     return atomic_xchg(&incoming_postcopy_state, new_state);
 }
+
+/* Register a handler for external shared memory postcopy
+ * called on the destination.
+ */
+void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
+                                                  *pcfd);
+}
+
+/* Unregister a handler for external shared memory postcopy
+ */
+void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
+{
+    guint i;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    GArray *pcrfds = mis->postcopy_remote_fds;
+
+    for (i = 0; i < pcrfds->len; i++) {
+        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
+        if (cur->fd == pcfd->fd) {
+            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
+            return;
+        }
+    }
+}
+
diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
index 0421c98d57..f21eef6702 100644
--- a/migration/postcopy-ram.h
+++ b/migration/postcopy-ram.h
@@ -143,4 +143,25 @@  void postcopy_remove_notifier(NotifierWithReturn *n);
 /* Call the notifier list set by postcopy_add_start_notifier */
 int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp);
 
+struct PostCopyFD;
+
+/* ufd is a pointer to the struct uffd_msg *TODO: more Portable! */
+typedef int (*pcfdhandler)(struct PostCopyFD *pcfd, void *ufd);
+
+struct PostCopyFD {
+    int fd;
+    /* Data to pass to handler */
+    void *data;
+    /* Handler to be called whenever we get a poll event */
+    pcfdhandler handler;
+    /* A string to use in error messages */
+    const char *idstr;
+};
+
+/* Register a userfaultfd owned by an external process for
+ * shared memory.
+ */
+void postcopy_register_shared_ufd(struct PostCopyFD *pcfd);
+void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd);
+
 #endif
diff --git a/migration/trace-events b/migration/trace-events
index 93961dea16..1e617ad7a6 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -190,6 +190,8 @@  postcopy_place_page_zero(void *host_addr) "host=%p"
 postcopy_ram_enable_notify(void) ""
 postcopy_ram_fault_thread_entry(void) ""
 postcopy_ram_fault_thread_exit(void) ""
+postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d"
+postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d"
 postcopy_ram_fault_thread_quit(void) ""
 postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx"
 postcopy_ram_incoming_cleanup_closeuf(void) ""

[v5,09/29] postcopy: Allow registering of fd handler

Commit Message

Comments

Patch