diff mbox

[v10,16/31] secondary vm suspend/resume/checkpoint code

Message ID 1456109555-28299-17-git-send-email-wency@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wen Congyang Feb. 22, 2016, 2:52 a.m. UTC
Secondary vm is running in colo mode. So we will do
the following things again and again:
1. Resume secondary vm
   a. Send CHECKPOINT_SVM_READY to master.
   b. If it is not the first resume, call libxl__checkpoint_devices_preresume().
   c. If it is the first resume(resume right after live migration),
      - call libxl__xc_domain_restore_done() to build the secondary vm.
      - enable secondary vm's logdirty.
      - call libxl__domain_resume() to resume secondary vm.
      - call libxl__checkpoint_devices_setup() to setup checkpoint devices.
   d. Send CHECKPOINT_SVM_RESUMED to master.
2. Wait a new checkpoint
   a. Call libxl__checkpoint_devices_commit().
   b. Read CHECKPOINT_NEW from master.
3. Suspend secondary vm
   a. Suspend secondary vm.
   b. Call libxl__checkpoint_devices_postsuspend().
   c. Send CHECKPOINT_SVM_SUSPENDED to master.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
---
 tools/libxc/xc_sr_common.h       |    2 +
 tools/libxc/xc_sr_save.c         |    3 +-
 tools/libxl/Makefile             |    1 +
 tools/libxl/libxl_colo.h         |   24 +
 tools/libxl/libxl_colo_restore.c | 1038 ++++++++++++++++++++++++++++++++++++++
 tools/libxl/libxl_create.c       |   37 ++
 tools/libxl/libxl_internal.h     |   19 +
 tools/libxl/libxl_save_callout.c |    7 +-
 tools/libxl/libxl_stream_read.c  |   12 +
 tools/libxl/libxl_types.idl      |    1 +
 10 files changed, 1142 insertions(+), 2 deletions(-)
 create mode 100644 tools/libxl/libxl_colo.h
 create mode 100644 tools/libxl/libxl_colo_restore.c

Comments

Wei Liu Feb. 25, 2016, 3:56 p.m. UTC | #1
On Mon, Feb 22, 2016 at 10:52:20AM +0800, Wen Congyang wrote:
> Secondary vm is running in colo mode. So we will do
> the following things again and again:
> 1. Resume secondary vm
>    a. Send CHECKPOINT_SVM_READY to master.
>    b. If it is not the first resume, call libxl__checkpoint_devices_preresume().
>    c. If it is the first resume(resume right after live migration),
>       - call libxl__xc_domain_restore_done() to build the secondary vm.
>       - enable secondary vm's logdirty.
>       - call libxl__domain_resume() to resume secondary vm.
>       - call libxl__checkpoint_devices_setup() to setup checkpoint devices.
>    d. Send CHECKPOINT_SVM_RESUMED to master.
> 2. Wait a new checkpoint
>    a. Call libxl__checkpoint_devices_commit().
>    b. Read CHECKPOINT_NEW from master.
> 3. Suspend secondary vm
>    a. Suspend secondary vm.
>    b. Call libxl__checkpoint_devices_postsuspend().
>    c. Send CHECKPOINT_SVM_SUSPENDED to master.
> 
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
> ---
>  tools/libxc/xc_sr_common.h       |    2 +
>  tools/libxc/xc_sr_save.c         |    3 +-
>  tools/libxl/Makefile             |    1 +
>  tools/libxl/libxl_colo.h         |   24 +
>  tools/libxl/libxl_colo_restore.c | 1038 ++++++++++++++++++++++++++++++++++++++
>  tools/libxl/libxl_create.c       |   37 ++
>  tools/libxl/libxl_internal.h     |   19 +
>  tools/libxl/libxl_save_callout.c |    7 +-
>  tools/libxl/libxl_stream_read.c  |   12 +
>  tools/libxl/libxl_types.idl      |    1 +

There are a bunch of TODOs in libxl_colo.c, but I don't think you're in a
better position to judge whether they should be blockers or not.

>  10 files changed, 1142 insertions(+), 2 deletions(-)
>  create mode 100644 tools/libxl/libxl_colo.h
>  create mode 100644 tools/libxl/libxl_colo_restore.c
> 
> diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
> index 5d9f497..2bfed64 100644
> --- a/tools/libxc/xc_sr_common.h
> +++ b/tools/libxc/xc_sr_common.h
> @@ -184,10 +184,12 @@ struct xc_sr_context
>       * migration stream
>       * 0: Plain VM
>       * 1: Remus
> +     * 2: COLO
>       */
>      enum {
>          MIG_STREAM_NONE, /* plain stream */
>          MIG_STREAM_REMUS,
> +        MIG_STREAM_COLO,
>      } migration_stream;
>  
>      union /* Common save or restore data. */
> diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
> index fe210cc..7393355 100644
> --- a/tools/libxc/xc_sr_save.c
> +++ b/tools/libxc/xc_sr_save.c
> @@ -846,7 +846,8 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
>  
>      /* If altering migration_stream update this assert too. */
>      assert(checkpointed_stream == MIG_STREAM_NONE ||
> -           checkpointed_stream == MIG_STREAM_REMUS);
> +           checkpointed_stream == MIG_STREAM_REMUS ||
> +           checkpointed_stream == MIG_STREAM_COLO);
>  
>      /*
>       * TODO: Find some time to better tweak the live migration algorithm.

[...]

> +
> +#include "libxl_osdeps.h" /* must come before any other headers */
> +
> +#include "libxl_internal.h"
> +#include "libxl_colo.h"
> +#include "libxl_sr_stream_format.h"
> +
> +enum {
> +    LIBXL_COLO_SETUPED,
> +    LIBXL_COLO_SUSPENDED,
> +    LIBXL_COLO_RESUMED,
> +};
> +
> +typedef struct libxl__colo_restore_checkpoint_state libxl__colo_restore_checkpoint_state;
> +struct libxl__colo_restore_checkpoint_state {
> +    libxl__domain_suspend_state dsps;
> +    libxl__logdirty_switch lds;
> +    libxl__colo_restore_state *crs;
> +    libxl__stream_write_state sws;
> +    int status;
> +    bool preresume;
> +    /* used for teardown */
> +    int teardown_devices;
> +    int saved_rc;
> +    char *state_file;
> +
> +    void (*callback)(libxl__egc *,
> +                     libxl__colo_restore_checkpoint_state *,
> +                     int);
> +};
> +

Shouldn't the enum and struct belong to libxl_colo.h ?

> +
> +static void libxl__colo_restore_domain_resume_callback(void *data);
> +static void libxl__colo_restore_domain_checkpoint_callback(void *data);
> +static void libxl__colo_restore_domain_wait_checkpoint_callback(void *data);
> +static void libxl__colo_restore_domain_suspend_callback(void *data);
> +
> +static const libxl__checkpoint_device_instance_ops *colo_restore_ops[] = {
> +    NULL,
> +};
> +

It would be helpful to list the callbacks at the beginning of the file
in the order they are supposed to occur.

See libxl_create.c for example. Search for "Event callbacks, in this
order".

I've tried to map the algorithm you described in commit message to all
the callbacks, but without some references it is just too time consuming
from my end.

I think what I'm going to do is to make sure the normal path that
doesn't use COLO is not broken and leave the internals to you and Ian (if
he fancies digging into the details).

[...]
> +
> +void libxl__colo_restore_setup(libxl__egc *egc,
> +                               libxl__colo_restore_state *crs)
> +{
> +    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
> +    libxl__colo_restore_checkpoint_state *crcs;
> +    int rc = ERROR_FAIL;
> +
> +    /* Convenience aliases */
> +    libxl__srm_restore_autogen_callbacks *const callbacks =
> +        &dcs->srs.shs.callbacks.restore.a;
> +    const int domid = crs->domid;
> +
> +    STATE_AO_GC(crs->ao);
> +
> +    GCNEW(crcs);
> +    crs->crcs = crcs;
> +    crcs->crs = crs;
> +
> +    /* setup dsps */
> +    crcs->dsps.ao = ao;
> +    crcs->dsps.domid = domid;
> +    if (init_dsps(&crcs->dsps))
> +        goto err;
> +
> +    callbacks->suspend = libxl__colo_restore_domain_suspend_callback;
> +    callbacks->postcopy = libxl__colo_restore_domain_resume_callback;
> +    callbacks->checkpoint = libxl__colo_restore_domain_checkpoint_callback;
> +    callbacks->wait_checkpoint = libxl__colo_restore_domain_wait_checkpoint_callback;
> +
> +    /*
> +     * Secondary vm is running in colo mode, so we need to call
> +     * libxl__xc_domain_restore_done() to create secondary vm.
> +     * But we will exit in domain_create_cb(). So replace the
> +     * callback here.
> +     */
> +    crs->saved_cb = dcs->callback;
> +    dcs->callback = libxl__colo_domain_create_cb;
> +    crcs->state_file = GCSPRINTF(LIBXL_DEVICE_MODEL_RESTORE_FILE".%d", domid);

Can you use a different name space from the normal one?

For example, you can put

 #define LIBXL_COLO_DEVICE_MODEL_RESTORE_FILE    XXXX

in libxl_colo.h and use it in all COLO code.


> +    crcs->status = LIBXL_COLO_SETUPED;
> +
> +    libxl__logdirty_init(&crcs->lds);
> +    crcs->lds.ao = ao;
> +
> +    crcs->sws.fd = crs->send_back_fd;
> +    crcs->sws.ao = ao;
> +    crcs->sws.back_channel = true;
> +
> +    dcs->cds.concrete_data = crs;
> +
> +    libxl__stream_write_start(egc, &crcs->sws);
> +
> +    rc = 0;
> +
> +out:
> +    crs->callback(egc, crs, rc);
> +    return;
> +
> +err:
> +    goto out;
> +}
> +
> +static void libxl__colo_domain_create_cb(libxl__egc *egc,
> +                                         libxl__domain_create_state *dcs,
> +                                         int rc, uint32_t domid)
> +{
> +    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
> +
> +    crcs->callback(egc, crcs, rc);
> +}
> +
> +
[...]
> +
> +static void colo_disable_logdirty_done(libxl__egc *egc,
> +                                       libxl__logdirty_switch *lds,
> +                                       int rc)
> +{
> +    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
> +
> +    EGC_GC;
> +
> +    if (rc)
> +        LOG(WARN, "cannot disable logdirty");
> +
> +    if (crcs->status == LIBXL_COLO_SUSPENDED) {
> +        /*
> +         * failover when reading state from master, so no need to
> +         * call libxl__domain_restore().

You need to update this comment to the right function name.

> +         */
> +        colo_resume_vm(egc, crcs, 0);
> +        return;
> +    }
> +
> +    /* If we cannot disable logdirty, we still can do failover */
> +    crcs->callback(egc, crcs, 0);
> +}
> +
[...]
>  
> +/* colo related structure */
> +typedef struct libxl__colo_restore_state libxl__colo_restore_state;
> +typedef void libxl__colo_callback(libxl__egc *,
> +                                  libxl__colo_restore_state *, int rc);
> +struct libxl__colo_restore_state {
> +    /* must set by caller of libxl__colo_(setup|teardown) */
> +    libxl__ao *ao;
> +    uint32_t domid;
> +    int send_back_fd;
> +    int recv_fd;
> +    int hvm;
> +    libxl__colo_callback *callback;
> +
> +    /* private, colo restore checkpoint state */
> +    libxl__domain_create_cb *saved_cb;
> +    void *crcs;
> +};
>  

And this should go to libxl_colo.h, too? And libxl_internal.h includes
libxl_colo.h?

I just don't want COLO structures and functions scattered in
different places.

>  struct libxl__domain_create_state {
>      /* filled in by user */
> @@ -3486,6 +3503,8 @@ struct libxl__domain_create_state {
>      /* private to domain_create */
>      int guest_domid;
>      libxl__domain_build_state build_state;
> +    libxl__colo_restore_state crs;
> +    libxl__checkpoint_devices_state cds;
>      libxl__bootloader_state bl;
>      libxl__stub_dm_spawn_state dmss;
>          /* If we're not doing stubdom, we use only dmss.dm,
> diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
> index 0d6949a..b1810b2 100644
> --- a/tools/libxl/libxl_save_callout.c
> +++ b/tools/libxl/libxl_save_callout.c
> @@ -15,6 +15,7 @@
>  #include "libxl_osdeps.h"
>  
>  #include "libxl_internal.h"
> +#include "libxl_colo.h"
>  
>  /* stream_fd is as from the caller (eventually, the application).
>   * It may be 0, 1 or 2, in which case we need to dup it elsewhere.
> @@ -68,7 +69,11 @@ void libxl__xc_domain_restore(libxl__egc *egc, libxl__domain_create_state *dcs,
>      shs->ao = ao;
>      shs->domid = domid;
>      shs->recv_callback = libxl__srm_callout_received_restore;
> -    shs->completion_callback = libxl__xc_domain_restore_done;
> +    if (dcs->restore_params.checkpointed_stream ==
> +                                                LIBXL_CHECKPOINTED_STREAM_COLO)

This is a very strange line wrap.

> +        shs->completion_callback = libxl__colo_restore_teardown;
> +    else
> +        shs->completion_callback = libxl__xc_domain_restore_done;
>      shs->caller_state = dcs;
>      shs->need_results = 1;
>  
> diff --git a/tools/libxl/libxl_stream_read.c b/tools/libxl/libxl_stream_read.c
> index 5d980d9..d6bd2fe 100644
> --- a/tools/libxl/libxl_stream_read.c
> +++ b/tools/libxl/libxl_stream_read.c
> @@ -846,6 +846,18 @@ void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
>       */
>      if (libxl__stream_read_inuse(stream)) {
>          switch (checkpointed_stream) {
> +        case LIBXL_CHECKPOINTED_STREAM_COLO:
> +            if (stream->completion_callback) {
> +                /*
> +                 * restore, just build the secondary vm, don't close
> +                 * the stream
> +                 */
> +                stream->completion_callback(egc, stream, 0);
> +            } else {
> +                /* failover, just close the stream */
> +                stream_complete(egc, stream, 0);
> +            }
> +            break;
>          case LIBXL_CHECKPOINTED_STREAM_REMUS:
>              /*
>               * Failover from primary. Domain state is currently at a
> diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
> index 632c009..33f4a90 100644
> --- a/tools/libxl/libxl_types.idl
> +++ b/tools/libxl/libxl_types.idl
> @@ -232,6 +232,7 @@ libxl_hdtype = Enumeration("hdtype", [
>  libxl_checkpointed_stream = Enumeration("checkpointed_stream", [
>      (0, "NONE"),
>      (1, "REMUS"),
> +    (2, "COLO"),
>      ])
>  
>  #
> -- 
> 2.5.0
> 
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
Wen Congyang Feb. 26, 2016, 2:30 a.m. UTC | #2
On 02/25/2016 11:56 PM, Wei Liu wrote:
> On Mon, Feb 22, 2016 at 10:52:20AM +0800, Wen Congyang wrote:
>> Secondary vm is running in colo mode. So we will do
>> the following things again and again:
>> 1. Resume secondary vm
>>    a. Send CHECKPOINT_SVM_READY to master.
>>    b. If it is not the first resume, call libxl__checkpoint_devices_preresume().
>>    c. If it is the first resume(resume right after live migration),
>>       - call libxl__xc_domain_restore_done() to build the secondary vm.
>>       - enable secondary vm's logdirty.
>>       - call libxl__domain_resume() to resume secondary vm.
>>       - call libxl__checkpoint_devices_setup() to setup checkpoint devices.
>>    d. Send CHECKPOINT_SVM_RESUMED to master.
>> 2. Wait a new checkpoint
>>    a. Call libxl__checkpoint_devices_commit().
>>    b. Read CHECKPOINT_NEW from master.
>> 3. Suspend secondary vm
>>    a. Suspend secondary vm.
>>    b. Call libxl__checkpoint_devices_postsuspend().
>>    c. Send CHECKPOINT_SVM_SUSPENDED to master.
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
>> ---
>>  tools/libxc/xc_sr_common.h       |    2 +
>>  tools/libxc/xc_sr_save.c         |    3 +-
>>  tools/libxl/Makefile             |    1 +
>>  tools/libxl/libxl_colo.h         |   24 +
>>  tools/libxl/libxl_colo_restore.c | 1038 ++++++++++++++++++++++++++++++++++++++
>>  tools/libxl/libxl_create.c       |   37 ++
>>  tools/libxl/libxl_internal.h     |   19 +
>>  tools/libxl/libxl_save_callout.c |    7 +-
>>  tools/libxl/libxl_stream_read.c  |   12 +
>>  tools/libxl/libxl_types.idl      |    1 +
> 
> There is a bunch of TODOs in libxl_colo.c but I don't think you're in a
> better position to judge whether they should be blocker or not.
> 
>>  10 files changed, 1142 insertions(+), 2 deletions(-)
>>  create mode 100644 tools/libxl/libxl_colo.h
>>  create mode 100644 tools/libxl/libxl_colo_restore.c
>>
>> diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
>> index 5d9f497..2bfed64 100644
>> --- a/tools/libxc/xc_sr_common.h
>> +++ b/tools/libxc/xc_sr_common.h
>> @@ -184,10 +184,12 @@ struct xc_sr_context
>>       * migration stream
>>       * 0: Plain VM
>>       * 1: Remus
>> +     * 2: COLO
>>       */
>>      enum {
>>          MIG_STREAM_NONE, /* plain stream */
>>          MIG_STREAM_REMUS,
>> +        MIG_STREAM_COLO,
>>      } migration_stream;
>>  
>>      union /* Common save or restore data. */
>> diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
>> index fe210cc..7393355 100644
>> --- a/tools/libxc/xc_sr_save.c
>> +++ b/tools/libxc/xc_sr_save.c
>> @@ -846,7 +846,8 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
>>  
>>      /* If altering migration_stream update this assert too. */
>>      assert(checkpointed_stream == MIG_STREAM_NONE ||
>> -           checkpointed_stream == MIG_STREAM_REMUS);
>> +           checkpointed_stream == MIG_STREAM_REMUS ||
>> +           checkpointed_stream == MIG_STREAM_COLO);
>>  
>>      /*
>>       * TODO: Find some time to better tweak the live migration algorithm.
> 
> [...]
> 
>> +
>> +#include "libxl_osdeps.h" /* must come before any other headers */
>> +
>> +#include "libxl_internal.h"
>> +#include "libxl_colo.h"
>> +#include "libxl_sr_stream_format.h"
>> +
>> +enum {
>> +    LIBXL_COLO_SETUPED,
>> +    LIBXL_COLO_SUSPENDED,
>> +    LIBXL_COLO_RESUMED,
>> +};
>> +
>> +typedef struct libxl__colo_restore_checkpoint_state libxl__colo_restore_checkpoint_state;
>> +struct libxl__colo_restore_checkpoint_state {
>> +    libxl__domain_suspend_state dsps;
>> +    libxl__logdirty_switch lds;
>> +    libxl__colo_restore_state *crs;
>> +    libxl__stream_write_state sws;
>> +    int status;
>> +    bool preresume;
>> +    /* used for teardown */
>> +    int teardown_devices;
>> +    int saved_rc;
>> +    char *state_file;
>> +
>> +    void (*callback)(libxl__egc *,
>> +                     libxl__colo_restore_checkpoint_state *,
>> +                     int);
>> +};
>> +
> 
> Shouldn't the enum and struct belong to libxl_colo.h ?

They are only used by the restore side. I think it is OK to move them to libxl_colo.h.

> 
>> +
>> +static void libxl__colo_restore_domain_resume_callback(void *data);
>> +static void libxl__colo_restore_domain_checkpoint_callback(void *data);
>> +static void libxl__colo_restore_domain_wait_checkpoint_callback(void *data);
>> +static void libxl__colo_restore_domain_suspend_callback(void *data);
>> +
>> +static const libxl__checkpoint_device_instance_ops *colo_restore_ops[] = {
>> +    NULL,
>> +};
>> +
> 
> It would be helpful to list the callbacks at the beginning of the time
> in the order they are supposed to occur.
> 
> See libxl_create.c for example. Search for "Event callbacks, in this
> order".

OK, will fix it in the next version.

> 
> I've tried to map the algorithm you described in commit message to all
> the callbacks, but without some references it is just too time consuming
> from my end.
> 
> I think what I'm going to do is to make sure the normal path that
> doesn't use COLO is not broken and leave the internal to you and Ian (if
> he fancies to dig into details).
> 
> [...]
>> +
>> +void libxl__colo_restore_setup(libxl__egc *egc,
>> +                               libxl__colo_restore_state *crs)
>> +{
>> +    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
>> +    libxl__colo_restore_checkpoint_state *crcs;
>> +    int rc = ERROR_FAIL;
>> +
>> +    /* Convenience aliases */
>> +    libxl__srm_restore_autogen_callbacks *const callbacks =
>> +        &dcs->srs.shs.callbacks.restore.a;
>> +    const int domid = crs->domid;
>> +
>> +    STATE_AO_GC(crs->ao);
>> +
>> +    GCNEW(crcs);
>> +    crs->crcs = crcs;
>> +    crcs->crs = crs;
>> +
>> +    /* setup dsps */
>> +    crcs->dsps.ao = ao;
>> +    crcs->dsps.domid = domid;
>> +    if (init_dsps(&crcs->dsps))
>> +        goto err;
>> +
>> +    callbacks->suspend = libxl__colo_restore_domain_suspend_callback;
>> +    callbacks->postcopy = libxl__colo_restore_domain_resume_callback;
>> +    callbacks->checkpoint = libxl__colo_restore_domain_checkpoint_callback;
>> +    callbacks->wait_checkpoint = libxl__colo_restore_domain_wait_checkpoint_callback;
>> +
>> +    /*
>> +     * Secondary vm is running in colo mode, so we need to call
>> +     * libxl__xc_domain_restore_done() to create secondary vm.
>> +     * But we will exit in domain_create_cb(). So replace the
>> +     * callback here.
>> +     */
>> +    crs->saved_cb = dcs->callback;
>> +    dcs->callback = libxl__colo_domain_create_cb;
>> +    crcs->state_file = GCSPRINTF(LIBXL_DEVICE_MODEL_RESTORE_FILE".%d", domid);
> 
> Can you use a different name space from the normal one?

We write the qemu state into the restore file in write_emulator_blob().

> 
> For example, you can put
> 
>  #define LIBXL_COLO_DEVICE_MODEL_RESTORE_FILE    XXXX

So if we use a different name space, we should do
#define LIBXL_COLO_DEVICE_MODEL_RESTORE_FILE       LIBXL_DEVICE_MODEL_RESTORE_FILE

In the COLO code, IIRC, no other code uses it.

> 
> in libxl_colo.h and use it in all COLO code.
> 
> 
>> +    crcs->status = LIBXL_COLO_SETUPED;
>> +
>> +    libxl__logdirty_init(&crcs->lds);
>> +    crcs->lds.ao = ao;
>> +
>> +    crcs->sws.fd = crs->send_back_fd;
>> +    crcs->sws.ao = ao;
>> +    crcs->sws.back_channel = true;
>> +
>> +    dcs->cds.concrete_data = crs;
>> +
>> +    libxl__stream_write_start(egc, &crcs->sws);
>> +
>> +    rc = 0;
>> +
>> +out:
>> +    crs->callback(egc, crs, rc);
>> +    return;
>> +
>> +err:
>> +    goto out;
>> +}
>> +
>> +static void libxl__colo_domain_create_cb(libxl__egc *egc,
>> +                                         libxl__domain_create_state *dcs,
>> +                                         int rc, uint32_t domid)
>> +{
>> +    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
>> +
>> +    crcs->callback(egc, crcs, rc);
>> +}
>> +
>> +
> [...]
>> +
>> +static void colo_disable_logdirty_done(libxl__egc *egc,
>> +                                       libxl__logdirty_switch *lds,
>> +                                       int rc)
>> +{
>> +    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
>> +
>> +    EGC_GC;
>> +
>> +    if (rc)
>> +        LOG(WARN, "cannot disable logdirty");
>> +
>> +    if (crcs->status == LIBXL_COLO_SUSPENDED) {
>> +        /*
>> +         * failover when reading state from master, so no need to
>> +         * call libxl__domain_restore().
> 
> You need to update this comment to the right function name.

OK, will fix it in the next version.

> 
>> +         */
>> +        colo_resume_vm(egc, crcs, 0);
>> +        return;
>> +    }
>> +
>> +    /* If we cannot disable logdirty, we still can do failover */
>> +    crcs->callback(egc, crcs, 0);
>> +}
>> +
> [...]
>>  
>> +/* colo related structure */
>> +typedef struct libxl__colo_restore_state libxl__colo_restore_state;
>> +typedef void libxl__colo_callback(libxl__egc *,
>> +                                  libxl__colo_restore_state *, int rc);
>> +struct libxl__colo_restore_state {
>> +    /* must set by caller of libxl__colo_(setup|teardown) */
>> +    libxl__ao *ao;
>> +    uint32_t domid;
>> +    int send_back_fd;
>> +    int recv_fd;
>> +    int hvm;
>> +    libxl__colo_callback *callback;
>> +
>> +    /* private, colo restore checkpoint state */
>> +    libxl__domain_create_cb *saved_cb;
>> +    void *crcs;
>> +};
>>  
> 
> And this should go to libxl_colo.h, too? And libxl_internal.h includes
> libxl_colo.h?
> 
> I just don't want to colo structures and functions scatter in
> different places.

OK, will fix it in the next version.

> 
>>  struct libxl__domain_create_state {
>>      /* filled in by user */
>> @@ -3486,6 +3503,8 @@ struct libxl__domain_create_state {
>>      /* private to domain_create */
>>      int guest_domid;
>>      libxl__domain_build_state build_state;
>> +    libxl__colo_restore_state crs;
>> +    libxl__checkpoint_devices_state cds;
>>      libxl__bootloader_state bl;
>>      libxl__stub_dm_spawn_state dmss;
>>          /* If we're not doing stubdom, we use only dmss.dm,
>> diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
>> index 0d6949a..b1810b2 100644
>> --- a/tools/libxl/libxl_save_callout.c
>> +++ b/tools/libxl/libxl_save_callout.c
>> @@ -15,6 +15,7 @@
>>  #include "libxl_osdeps.h"
>>  
>>  #include "libxl_internal.h"
>> +#include "libxl_colo.h"
>>  
>>  /* stream_fd is as from the caller (eventually, the application).
>>   * It may be 0, 1 or 2, in which case we need to dup it elsewhere.
>> @@ -68,7 +69,11 @@ void libxl__xc_domain_restore(libxl__egc *egc, libxl__domain_create_state *dcs,
>>      shs->ao = ao;
>>      shs->domid = domid;
>>      shs->recv_callback = libxl__srm_callout_received_restore;
>> -    shs->completion_callback = libxl__xc_domain_restore_done;
>> +    if (dcs->restore_params.checkpointed_stream ==
>> +                                                LIBXL_CHECKPOINTED_STREAM_COLO)
> 
> This is very strange line wrap.

Yes, will fix it in the next version.

Thanks
Wen Congyang

> 
>> +        shs->completion_callback = libxl__colo_restore_teardown;
>> +    else
>> +        shs->completion_callback = libxl__xc_domain_restore_done;
>>      shs->caller_state = dcs;
>>      shs->need_results = 1;
>>  
>> diff --git a/tools/libxl/libxl_stream_read.c b/tools/libxl/libxl_stream_read.c
>> index 5d980d9..d6bd2fe 100644
>> --- a/tools/libxl/libxl_stream_read.c
>> +++ b/tools/libxl/libxl_stream_read.c
>> @@ -846,6 +846,18 @@ void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
>>       */
>>      if (libxl__stream_read_inuse(stream)) {
>>          switch (checkpointed_stream) {
>> +        case LIBXL_CHECKPOINTED_STREAM_COLO:
>> +            if (stream->completion_callback) {
>> +                /*
>> +                 * restore, just build the secondary vm, don't close
>> +                 * the stream
>> +                 */
>> +                stream->completion_callback(egc, stream, 0);
>> +            } else {
>> +                /* failover, just close the stream */
>> +                stream_complete(egc, stream, 0);
>> +            }
>> +            break;
>>          case LIBXL_CHECKPOINTED_STREAM_REMUS:
>>              /*
>>               * Failover from primary. Domain state is currently at a
>> diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
>> index 632c009..33f4a90 100644
>> --- a/tools/libxl/libxl_types.idl
>> +++ b/tools/libxl/libxl_types.idl
>> @@ -232,6 +232,7 @@ libxl_hdtype = Enumeration("hdtype", [
>>  libxl_checkpointed_stream = Enumeration("checkpointed_stream", [
>>      (0, "NONE"),
>>      (1, "REMUS"),
>> +    (2, "COLO"),
>>      ])
>>  
>>  #
>> -- 
>> 2.5.0
>>
>>
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
> 
> 
> .
>
Wen Congyang March 1, 2016, 10:06 a.m. UTC | #3
On 02/25/2016 11:56 PM, Wei Liu wrote:
> On Mon, Feb 22, 2016 at 10:52:20AM +0800, Wen Congyang wrote:
>> Secondary vm is running in colo mode. So we will do
>> the following things again and again:
>> 1. Resume secondary vm
>>    a. Send CHECKPOINT_SVM_READY to master.
>>    b. If it is not the first resume, call libxl__checkpoint_devices_preresume().
>>    c. If it is the first resume(resume right after live migration),
>>       - call libxl__xc_domain_restore_done() to build the secondary vm.
>>       - enable secondary vm's logdirty.
>>       - call libxl__domain_resume() to resume secondary vm.
>>       - call libxl__checkpoint_devices_setup() to setup checkpoint devices.
>>    d. Send CHECKPOINT_SVM_RESUMED to master.
>> 2. Wait a new checkpoint
>>    a. Call libxl__checkpoint_devices_commit().
>>    b. Read CHECKPOINT_NEW from master.
>> 3. Suspend secondary vm
>>    a. Suspend secondary vm.
>>    b. Call libxl__checkpoint_devices_postsuspend().
>>    c. Send CHECKPOINT_SVM_SUSPENDED to master.
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
>> ---
>>  tools/libxc/xc_sr_common.h       |    2 +
>>  tools/libxc/xc_sr_save.c         |    3 +-
>>  tools/libxl/Makefile             |    1 +
>>  tools/libxl/libxl_colo.h         |   24 +
>>  tools/libxl/libxl_colo_restore.c | 1038 ++++++++++++++++++++++++++++++++++++++
>>  tools/libxl/libxl_create.c       |   37 ++
>>  tools/libxl/libxl_internal.h     |   19 +
>>  tools/libxl/libxl_save_callout.c |    7 +-
>>  tools/libxl/libxl_stream_read.c  |   12 +
>>  tools/libxl/libxl_types.idl      |    1 +
> 
> There is a bunch of TODOs in libxl_colo.c but I don't think you're in a
> better position to judge whether they should be blocker or not.
> 
>>  10 files changed, 1142 insertions(+), 2 deletions(-)
>>  create mode 100644 tools/libxl/libxl_colo.h
>>  create mode 100644 tools/libxl/libxl_colo_restore.c
>>
>> diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
>> index 5d9f497..2bfed64 100644
>> --- a/tools/libxc/xc_sr_common.h
>> +++ b/tools/libxc/xc_sr_common.h
>> @@ -184,10 +184,12 @@ struct xc_sr_context
>>       * migration stream
>>       * 0: Plain VM
>>       * 1: Remus
>> +     * 2: COLO
>>       */
>>      enum {
>>          MIG_STREAM_NONE, /* plain stream */
>>          MIG_STREAM_REMUS,
>> +        MIG_STREAM_COLO,
>>      } migration_stream;
>>  
>>      union /* Common save or restore data. */
>> diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
>> index fe210cc..7393355 100644
>> --- a/tools/libxc/xc_sr_save.c
>> +++ b/tools/libxc/xc_sr_save.c
>> @@ -846,7 +846,8 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
>>  
>>      /* If altering migration_stream update this assert too. */
>>      assert(checkpointed_stream == MIG_STREAM_NONE ||
>> -           checkpointed_stream == MIG_STREAM_REMUS);
>> +           checkpointed_stream == MIG_STREAM_REMUS ||
>> +           checkpointed_stream == MIG_STREAM_COLO);
>>  
>>      /*
>>       * TODO: Find some time to better tweak the live migration algorithm.
> 
> [...]
> 
>> +
>> +#include "libxl_osdeps.h" /* must come before any other headers */
>> +
>> +#include "libxl_internal.h"
>> +#include "libxl_colo.h"
>> +#include "libxl_sr_stream_format.h"
>> +
>> +enum {
>> +    LIBXL_COLO_SETUPED,
>> +    LIBXL_COLO_SUSPENDED,
>> +    LIBXL_COLO_RESUMED,
>> +};
>> +
>> +typedef struct libxl__colo_restore_checkpoint_state libxl__colo_restore_checkpoint_state;
>> +struct libxl__colo_restore_checkpoint_state {
>> +    libxl__domain_suspend_state dsps;
>> +    libxl__logdirty_switch lds;
>> +    libxl__colo_restore_state *crs;
>> +    libxl__stream_write_state sws;
>> +    int status;
>> +    bool preresume;
>> +    /* used for teardown */
>> +    int teardown_devices;
>> +    int saved_rc;
>> +    char *state_file;
>> +
>> +    void (*callback)(libxl__egc *,
>> +                     libxl__colo_restore_checkpoint_state *,
>> +                     int);
>> +};
>> +
> 
> Shouldn't the enum and struct belong to libxl_colo.h ?

If we include libxl_colo.h in libxl_internal.h, we cannot move this into libxl_colo.h, because
this structure needs libxl__domain_suspend_state, libxl__logdirty_switch, ...
We cannot just forward-declare them, because this structure needs to know their sizes.

> 
>> +
>> +static void libxl__colo_restore_domain_resume_callback(void *data);
>> +static void libxl__colo_restore_domain_checkpoint_callback(void *data);
>> +static void libxl__colo_restore_domain_wait_checkpoint_callback(void *data);
>> +static void libxl__colo_restore_domain_suspend_callback(void *data);
>> +
>> +static const libxl__checkpoint_device_instance_ops *colo_restore_ops[] = {
>> +    NULL,
>> +};
>> +
> 
> It would be helpful to list the callbacks at the beginning of the time
> in the order they are supposed to occur.
> 
> See libxl_create.c for example. Search for "Event callbacks, in this
> order".
> 
> I've tried to map the algorithm you described in commit message to all
> the callbacks, but without some references it is just too time consuming
> from my end.
> 
> I think what I'm going to do is to make sure the normal path that
> doesn't use COLO is not broken and leave the internal to you and Ian (if
> he fancies to dig into details).
> 
> [...]
>> +
>> +void libxl__colo_restore_setup(libxl__egc *egc,
>> +                               libxl__colo_restore_state *crs)
>> +{
>> +    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
>> +    libxl__colo_restore_checkpoint_state *crcs;
>> +    int rc = ERROR_FAIL;
>> +
>> +    /* Convenience aliases */
>> +    libxl__srm_restore_autogen_callbacks *const callbacks =
>> +        &dcs->srs.shs.callbacks.restore.a;
>> +    const int domid = crs->domid;
>> +
>> +    STATE_AO_GC(crs->ao);
>> +
>> +    GCNEW(crcs);
>> +    crs->crcs = crcs;
>> +    crcs->crs = crs;
>> +
>> +    /* setup dsps */
>> +    crcs->dsps.ao = ao;
>> +    crcs->dsps.domid = domid;
>> +    if (init_dsps(&crcs->dsps))
>> +        goto err;
>> +
>> +    callbacks->suspend = libxl__colo_restore_domain_suspend_callback;
>> +    callbacks->postcopy = libxl__colo_restore_domain_resume_callback;
>> +    callbacks->checkpoint = libxl__colo_restore_domain_checkpoint_callback;
>> +    callbacks->wait_checkpoint = libxl__colo_restore_domain_wait_checkpoint_callback;
>> +
>> +    /*
>> +     * Secondary vm is running in colo mode, so we need to call
>> +     * libxl__xc_domain_restore_done() to create secondary vm.
>> +     * But we will exit in domain_create_cb(). So replace the
>> +     * callback here.
>> +     */
>> +    crs->saved_cb = dcs->callback;
>> +    dcs->callback = libxl__colo_domain_create_cb;
>> +    crcs->state_file = GCSPRINTF(LIBXL_DEVICE_MODEL_RESTORE_FILE".%d", domid);
> 
> Can you use a different name space from the normal one?
> 
> For example, you can put
> 
>  #define LIBXL_COLO_DEVICE_MODEL_RESTORE_FILE    XXXX
> 
> in libxl_colo.h and use it in all COLO code.
> 
> 
>> +    crcs->status = LIBXL_COLO_SETUPED;
>> +
>> +    libxl__logdirty_init(&crcs->lds);
>> +    crcs->lds.ao = ao;
>> +
>> +    crcs->sws.fd = crs->send_back_fd;
>> +    crcs->sws.ao = ao;
>> +    crcs->sws.back_channel = true;
>> +
>> +    dcs->cds.concrete_data = crs;
>> +
>> +    libxl__stream_write_start(egc, &crcs->sws);
>> +
>> +    rc = 0;
>> +
>> +out:
>> +    crs->callback(egc, crs, rc);
>> +    return;
>> +
>> +err:
>> +    goto out;
>> +}
>> +
>> +static void libxl__colo_domain_create_cb(libxl__egc *egc,
>> +                                         libxl__domain_create_state *dcs,
>> +                                         int rc, uint32_t domid)
>> +{
>> +    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
>> +
>> +    crcs->callback(egc, crcs, rc);
>> +}
>> +
>> +
> [...]
>> +
>> +static void colo_disable_logdirty_done(libxl__egc *egc,
>> +                                       libxl__logdirty_switch *lds,
>> +                                       int rc)
>> +{
>> +    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
>> +
>> +    EGC_GC;
>> +
>> +    if (rc)
>> +        LOG(WARN, "cannot disable logdirty");
>> +
>> +    if (crcs->status == LIBXL_COLO_SUSPENDED) {
>> +        /*
>> +         * failover when reading state from master, so no need to
>> +         * call libxl__domain_restore().
> 
> You need to update this comment to the right function name.
> 
>> +         */
>> +        colo_resume_vm(egc, crcs, 0);
>> +        return;
>> +    }
>> +
>> +    /* If we cannot disable logdirty, we still can do failover */
>> +    crcs->callback(egc, crcs, 0);
>> +}
>> +
> [...]
>>  
>> +/* colo related structure */
>> +typedef struct libxl__colo_restore_state libxl__colo_restore_state;
>> +typedef void libxl__colo_callback(libxl__egc *,
>> +                                  libxl__colo_restore_state *, int rc);
>> +struct libxl__colo_restore_state {
>> +    /* must set by caller of libxl__colo_(setup|teardown) */
>> +    libxl__ao *ao;
>> +    uint32_t domid;
>> +    int send_back_fd;
>> +    int recv_fd;
>> +    int hvm;
>> +    libxl__colo_callback *callback;
>> +
>> +    /* private, colo restore checkpoint state */
>> +    libxl__domain_create_cb *saved_cb;
>> +    void *crcs;
>> +};
>>  
> 
> And this should go to libxl_colo.h, too? And libxl_internal.h includes
> libxl_colo.h?

If we do so, we should declare libxl__ao, libxl__domain_create_cb in libxl_colo.h

Thanks
Wen Congyang

> 
> I just don't want colo structures and functions scattered in
> different places.
> 
>>  struct libxl__domain_create_state {
>>      /* filled in by user */
>> @@ -3486,6 +3503,8 @@ struct libxl__domain_create_state {
>>      /* private to domain_create */
>>      int guest_domid;
>>      libxl__domain_build_state build_state;
>> +    libxl__colo_restore_state crs;
>> +    libxl__checkpoint_devices_state cds;
>>      libxl__bootloader_state bl;
>>      libxl__stub_dm_spawn_state dmss;
>>          /* If we're not doing stubdom, we use only dmss.dm,
>> diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
>> index 0d6949a..b1810b2 100644
>> --- a/tools/libxl/libxl_save_callout.c
>> +++ b/tools/libxl/libxl_save_callout.c
>> @@ -15,6 +15,7 @@
>>  #include "libxl_osdeps.h"
>>  
>>  #include "libxl_internal.h"
>> +#include "libxl_colo.h"
>>  
>>  /* stream_fd is as from the caller (eventually, the application).
>>   * It may be 0, 1 or 2, in which case we need to dup it elsewhere.
>> @@ -68,7 +69,11 @@ void libxl__xc_domain_restore(libxl__egc *egc, libxl__domain_create_state *dcs,
>>      shs->ao = ao;
>>      shs->domid = domid;
>>      shs->recv_callback = libxl__srm_callout_received_restore;
>> -    shs->completion_callback = libxl__xc_domain_restore_done;
>> +    if (dcs->restore_params.checkpointed_stream ==
>> +                                                LIBXL_CHECKPOINTED_STREAM_COLO)
> 
> This is very strange line wrap.
> 
>> +        shs->completion_callback = libxl__colo_restore_teardown;
>> +    else
>> +        shs->completion_callback = libxl__xc_domain_restore_done;
>>      shs->caller_state = dcs;
>>      shs->need_results = 1;
>>  
>> diff --git a/tools/libxl/libxl_stream_read.c b/tools/libxl/libxl_stream_read.c
>> index 5d980d9..d6bd2fe 100644
>> --- a/tools/libxl/libxl_stream_read.c
>> +++ b/tools/libxl/libxl_stream_read.c
>> @@ -846,6 +846,18 @@ void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
>>       */
>>      if (libxl__stream_read_inuse(stream)) {
>>          switch (checkpointed_stream) {
>> +        case LIBXL_CHECKPOINTED_STREAM_COLO:
>> +            if (stream->completion_callback) {
>> +                /*
>> +                 * restore, just build the secondary vm, don't close
>> +                 * the stream
>> +                 */
>> +                stream->completion_callback(egc, stream, 0);
>> +            } else {
>> +                /* failover, just close the stream */
>> +                stream_complete(egc, stream, 0);
>> +            }
>> +            break;
>>          case LIBXL_CHECKPOINTED_STREAM_REMUS:
>>              /*
>>               * Failover from primary. Domain state is currently at a
>> diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
>> index 632c009..33f4a90 100644
>> --- a/tools/libxl/libxl_types.idl
>> +++ b/tools/libxl/libxl_types.idl
>> @@ -232,6 +232,7 @@ libxl_hdtype = Enumeration("hdtype", [
>>  libxl_checkpointed_stream = Enumeration("checkpointed_stream", [
>>      (0, "NONE"),
>>      (1, "REMUS"),
>> +    (2, "COLO"),
>>      ])
>>  
>>  #
>> -- 
>> 2.5.0
>>
>>
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
> 
> 
> .
>
diff mbox

Patch

diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index 5d9f497..2bfed64 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -184,10 +184,12 @@  struct xc_sr_context
      * migration stream
      * 0: Plain VM
      * 1: Remus
+     * 2: COLO
      */
     enum {
         MIG_STREAM_NONE, /* plain stream */
         MIG_STREAM_REMUS,
+        MIG_STREAM_COLO,
     } migration_stream;
 
     union /* Common save or restore data. */
diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
index fe210cc..7393355 100644
--- a/tools/libxc/xc_sr_save.c
+++ b/tools/libxc/xc_sr_save.c
@@ -846,7 +846,8 @@  int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
 
     /* If altering migration_stream update this assert too. */
     assert(checkpointed_stream == MIG_STREAM_NONE ||
-           checkpointed_stream == MIG_STREAM_REMUS);
+           checkpointed_stream == MIG_STREAM_REMUS ||
+           checkpointed_stream == MIG_STREAM_COLO);
 
     /*
      * TODO: Find some time to better tweak the live migration algorithm.
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index 789a12e..d8612eb 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -65,6 +65,7 @@  LIBXL_OBJS-y += libxl_no_convert_callout.o
 endif
 
 LIBXL_OBJS-y += libxl_remus.o libxl_checkpoint_device.o libxl_remus_disk_drbd.o
+LIBXL_OBJS-y += libxl_colo_restore.o
 
 LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
 LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o libxl_libfdt_compat.o
diff --git a/tools/libxl/libxl_colo.h b/tools/libxl/libxl_colo.h
new file mode 100644
index 0000000..8bea1a2
--- /dev/null
+++ b/tools/libxl/libxl_colo.h
@@ -0,0 +1,24 @@ 
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#ifndef LIBXL_COLO_H
+#define LIBXL_COLO_H
+
+extern void libxl__colo_restore_setup(libxl__egc *egc,
+                                      libxl__colo_restore_state *crs);
+extern void libxl__colo_restore_teardown(libxl__egc *egc, void *dcs_void,
+                                         int ret, int retval, int errnoval);
+
+#endif
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
new file mode 100644
index 0000000..a417e6e
--- /dev/null
+++ b/tools/libxl/libxl_colo_restore.c
@@ -0,0 +1,1038 @@ 
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *         Yang Hongyang <hongyang.yang@easystack.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+#include "libxl_colo.h"
+#include "libxl_sr_stream_format.h"
+
+enum {
+    LIBXL_COLO_SETUPED,
+    LIBXL_COLO_SUSPENDED,
+    LIBXL_COLO_RESUMED,
+};
+
+typedef struct libxl__colo_restore_checkpoint_state libxl__colo_restore_checkpoint_state;
+struct libxl__colo_restore_checkpoint_state {
+    libxl__domain_suspend_state dsps;
+    libxl__logdirty_switch lds;
+    libxl__colo_restore_state *crs;
+    libxl__stream_write_state sws;
+    int status;
+    bool preresume;
+    /* used for teardown */
+    int teardown_devices;
+    int saved_rc;
+    char *state_file;
+
+    void (*callback)(libxl__egc *,
+                     libxl__colo_restore_checkpoint_state *,
+                     int);
+};
+
+
+static void libxl__colo_restore_domain_resume_callback(void *data);
+static void libxl__colo_restore_domain_checkpoint_callback(void *data);
+static void libxl__colo_restore_domain_wait_checkpoint_callback(void *data);
+static void libxl__colo_restore_domain_suspend_callback(void *data);
+
+static const libxl__checkpoint_device_instance_ops *colo_restore_ops[] = {
+    NULL,
+};
+
+/* ===================== colo: common functions ===================== */
+static void colo_enable_logdirty(libxl__colo_restore_state *crs, libxl__egc *egc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    const uint32_t domid = crs->domid;
+    libxl__logdirty_switch *const lds = &crcs->lds;
+
+    EGC_GC;
+
+    /* we need to know which pages are dirty to restore the guest */
+    if (xc_shadow_control(CTX->xch, domid,
+                          XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                          NULL, 0, NULL, 0, NULL) < 0) {
+        LOG(ERROR, "cannot enable secondary vm's logdirty");
+        lds->callback(egc, lds, ERROR_FAIL);
+        return;
+    }
+
+    if (crs->hvm) {
+        libxl__domain_common_switch_qemu_logdirty(egc, domid, 1, lds);
+        return;
+    }
+
+    lds->callback(egc, lds, 0);
+}
+
+static void colo_disable_logdirty(libxl__colo_restore_state *crs,
+                                  libxl__egc *egc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    const uint32_t domid = crs->domid;
+    libxl__logdirty_switch *const lds = &crcs->lds;
+
+    EGC_GC;
+
+    /* turn off log-dirty tracking; a failure here is only logged */
+    if (xc_shadow_control(CTX->xch, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                          NULL, 0, NULL, 0, NULL) < 0)
+        LOG(WARN, "cannot disable secondary vm's logdirty");
+
+    if (crs->hvm) {
+        libxl__domain_common_switch_qemu_logdirty(egc, domid, 0, lds);
+        return;
+    }
+
+    lds->callback(egc, lds, 0);
+}
+
+static void colo_resume_vm(libxl__egc *egc,
+                           libxl__colo_restore_checkpoint_state *crcs,
+                           int restore_device_model)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    int rc;
+
+    /* Convenience aliases */
+    libxl__colo_restore_state *const crs = crcs->crs;
+
+    EGC_GC;
+
+    if (!crs->saved_cb) {
+        /* TODO: sync mmu for hvm? */
+        if (restore_device_model) {
+            rc = libxl__domain_restore_device_model(gc, crs->domid,
+                                                    crcs->state_file);
+            if (rc) {
+                LOG(ERROR, "cannot restore device model for secondary vm");
+                crcs->callback(egc, crcs, rc);
+                return;
+            }
+        }
+        rc = libxl__domain_resume(gc, crs->domid, 0);
+        if (rc)
+            LOG(ERROR, "cannot resume secondary vm");
+
+        crcs->callback(egc, crcs, rc);
+        return;
+    }
+
+    /*
+     * TODO: get store gfn and console gfn
+     *  We should call the callback restore_results in
+     *  xc_domain_restore() before resuming the guest.
+     */
+    libxl__xc_domain_restore_done(egc, dcs, 0, 0, 0);
+
+    return;
+}
+
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* init device subkind-specific state in the libxl ctx */
+    int rc;
+    STATE_AO_GC(cds->ao);
+
+    rc = 0;
+    return rc;
+}
+
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* cleanup device subkind-specific state in the libxl ctx */
+    STATE_AO_GC(cds->ao);
+}
+
+
+/* ================ colo: setup restore environment ================ */
+static void libxl__colo_domain_create_cb(libxl__egc *egc,
+                                         libxl__domain_create_state *dcs,
+                                         int rc, uint32_t domid);
+
+static int init_dsps(libxl__domain_suspend_state *dsps)
+{
+    int rc = ERROR_FAIL;
+    libxl_domain_type type;
+
+    STATE_AO_GC(dsps->ao);
+
+    libxl__xswait_init(&dsps->pvcontrol);
+    libxl__ev_evtchn_init(&dsps->guest_evtchn);
+    libxl__ev_xswatch_init(&dsps->guest_watch);
+    libxl__ev_time_init(&dsps->guest_timeout);
+
+    type = libxl__domain_type(gc, dsps->domid);
+    if (type == LIBXL_DOMAIN_TYPE_INVALID)
+        goto out;
+
+    dsps->type = type;
+
+    dsps->guest_evtchn.port = -1;
+    dsps->guest_evtchn_lockfd = -1;
+    dsps->guest_responded = 0;
+    dsps->dm_savefile = libxl__device_model_savefile(gc, dsps->domid);
+
+    /* Secondary vm is not created, so we cannot get evtchn port */
+
+    rc = 0;
+
+out:
+    return rc;
+}
+
+void libxl__colo_restore_setup(libxl__egc *egc,
+                               libxl__colo_restore_state *crs)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs;
+    int rc = ERROR_FAIL;
+
+    /* Convenience aliases */
+    libxl__srm_restore_autogen_callbacks *const callbacks =
+        &dcs->srs.shs.callbacks.restore.a;
+    const int domid = crs->domid;
+
+    STATE_AO_GC(crs->ao);
+
+    GCNEW(crcs);
+    crs->crcs = crcs;
+    crcs->crs = crs;
+
+    /* setup dsps */
+    crcs->dsps.ao = ao;
+    crcs->dsps.domid = domid;
+    if (init_dsps(&crcs->dsps))
+        goto err;
+
+    callbacks->suspend = libxl__colo_restore_domain_suspend_callback;
+    callbacks->postcopy = libxl__colo_restore_domain_resume_callback;
+    callbacks->checkpoint = libxl__colo_restore_domain_checkpoint_callback;
+    callbacks->wait_checkpoint = libxl__colo_restore_domain_wait_checkpoint_callback;
+
+    /*
+     * Secondary vm is running in colo mode, so we need to call
+     * libxl__xc_domain_restore_done() to create secondary vm.
+     * But we will exit in domain_create_cb(). So replace the
+     * callback here.
+     */
+    crs->saved_cb = dcs->callback;
+    dcs->callback = libxl__colo_domain_create_cb;
+    crcs->state_file = GCSPRINTF(LIBXL_DEVICE_MODEL_RESTORE_FILE".%d", domid);
+    crcs->status = LIBXL_COLO_SETUPED;
+
+    libxl__logdirty_init(&crcs->lds);
+    crcs->lds.ao = ao;
+
+    crcs->sws.fd = crs->send_back_fd;
+    crcs->sws.ao = ao;
+    crcs->sws.back_channel = true;
+
+    dcs->cds.concrete_data = crs;
+
+    libxl__stream_write_start(egc, &crcs->sws);
+
+    rc = 0;
+
+out:
+    crs->callback(egc, crs, rc);
+    return;
+
+err:
+    goto out;
+}
+
+static void libxl__colo_domain_create_cb(libxl__egc *egc,
+                                         libxl__domain_create_state *dcs,
+                                         int rc, uint32_t domid)
+{
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+
+    crcs->callback(egc, crcs, rc);
+}
+
+
+/* ================ colo: teardown restore environment ================ */
+static void colo_restore_teardown_devices_done(libxl__egc *egc,
+    libxl__checkpoint_devices_state *cds, int rc);
+static void do_failover(libxl__egc *egc, libxl__colo_restore_state *crs);
+static void do_failover_done(libxl__egc *egc,
+                             libxl__colo_restore_checkpoint_state* crcs,
+                             int rc);
+static void colo_disable_logdirty_done(libxl__egc *egc,
+                                       libxl__logdirty_switch *lds,
+                                       int rc);
+static void libxl__colo_restore_teardown_done(libxl__egc *egc,
+                                              libxl__colo_restore_state *crs,
+                                              int rc);
+
+void libxl__colo_restore_teardown(libxl__egc *egc, void *dcs_void,
+                                  int ret, int retval, int errnoval)
+{
+    libxl__domain_create_state *dcs = dcs_void;
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+    int rc = 1;
+
+    /* convenience aliases */
+    libxl__colo_restore_state *const crs = &dcs->crs;
+    EGC_GC;
+
+    if (ret == 0 && retval == 0)
+        rc = 0;
+
+    LOG(INFO, "%s", rc ? "colo fails" : "failover");
+
+    libxl__stream_write_abort(egc, &crcs->sws, 1);
+    if (crs->saved_cb) {
+        /* crcs->status is LIBXL_COLO_SETUPED */
+        dcs->srs.completion_callback = NULL;
+    }
+    libxl__xc_domain_restore_done(egc, dcs, ret, retval, errnoval);
+
+    crcs->saved_rc = rc;
+    if (!crcs->teardown_devices) {
+        colo_restore_teardown_devices_done(egc, &dcs->cds, 0);
+        return;
+    }
+
+    dcs->cds.callback = colo_restore_teardown_devices_done;
+    libxl__checkpoint_devices_teardown(egc, &dcs->cds);
+}
+
+static void colo_restore_teardown_devices_done(libxl__egc *egc,
+    libxl__checkpoint_devices_state *cds, int rc)
+{
+    libxl__colo_restore_state *crs = cds->concrete_data;
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    EGC_GC;
+
+    if (rc)
+        LOG(ERROR, "COLO: failed to teardown device for guest with domid %u,"
+            " rc %d", cds->domid, rc);
+
+    if (crcs->teardown_devices)
+        cleanup_device_subkind(cds);
+
+    rc = crcs->saved_rc;
+    if (!rc) {
+        crcs->callback = do_failover_done;
+        do_failover(egc, crs);
+        return;
+    }
+
+    libxl__colo_restore_teardown_done(egc, crs, rc);
+}
+
+static void do_failover(libxl__egc *egc, libxl__colo_restore_state *crs)
+{
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    const int status = crcs->status;
+    libxl__logdirty_switch *const lds = &crcs->lds;
+
+    EGC_GC;
+
+    switch(status) {
+    case LIBXL_COLO_SETUPED:
+        /*
+         * We will come here only when reading emulator xenstore data or
+         * emulator context fails, and libxl__xc_domain_restore_done()
+         * is not called. In this case, the migration is not finished,
+         * so we cannot do failover.
+         */
+        LOG(ERROR, "migration fails");
+        crcs->callback(egc, crcs, ERROR_FAIL);
+        return;
+    case LIBXL_COLO_SUSPENDED:
+    case LIBXL_COLO_RESUMED:
+        /* disable logdirty first */
+        lds->callback = colo_disable_logdirty_done;
+        colo_disable_logdirty(crs, egc);
+        return;
+    default:
+        LOG(ERROR, "invalid status: %d", status);
+        crcs->callback(egc, crcs, ERROR_FAIL);
+    }
+}
+
+static void do_failover_done(libxl__egc *egc,
+                             libxl__colo_restore_checkpoint_state* crcs,
+                             int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+
+    /* Convenience aliases */
+    libxl__colo_restore_state *const crs = crcs->crs;
+
+    EGC_GC;
+
+    if (rc)
+        LOG(ERROR, "cannot do failover");
+
+    libxl__colo_restore_teardown_done(egc, crs, rc);
+}
+
+static void colo_disable_logdirty_done(libxl__egc *egc,
+                                       libxl__logdirty_switch *lds,
+                                       int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+
+    EGC_GC;
+
+    if (rc)
+        LOG(WARN, "cannot disable logdirty");
+
+    if (crcs->status == LIBXL_COLO_SUSPENDED) {
+        /*
+         * failover when reading state from master, so no need to
+         * call libxl__domain_restore_device_model().
+         */
+        colo_resume_vm(egc, crcs, 0);
+        return;
+    }
+
+    /* If we cannot disable logdirty, we still can do failover */
+    crcs->callback(egc, crcs, 0);
+}
+
+static void libxl__colo_restore_teardown_done(libxl__egc *egc,
+                                              libxl__colo_restore_state *crs,
+                                              int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    EGC_GC;
+
+    /* convenience aliases */
+    const int domid = crs->domid;
+    const libxl_ctx *const ctx = libxl__gc_owner(gc);
+    xc_interface *const xch = ctx->xch;
+
+    if (!rc)
+        /* failover, no need to destroy the secondary vm */
+        goto out;
+
+    xc_domain_destroy(xch, domid);
+
+out:
+    if (crs->saved_cb) {
+        dcs->callback = crs->saved_cb;
+        crs->saved_cb = NULL;
+    }
+
+    dcs->callback(egc, dcs, rc, crs->domid);
+}
+
+/*
+ * checkpoint callbacks are called in the following order:
+ * 1. checkpoint
+ * 2. resume
+ * 3. wait checkpoint
+ * 4. suspend
+ */
+static void colo_common_write_stream_done(libxl__egc *egc,
+                                          libxl__stream_write_state *stream,
+                                          int rc);
+static void colo_common_read_stream_done(libxl__egc *egc,
+                                         libxl__stream_read_state *stream,
+                                         int rc);
+/* ======================== colo: checkpoint ======================= */
+/*
+ * Do the following things when resuming secondary vm:
+ *  1. read emulator xenstore data
+ *  2. read emulator context
+ *  3. REC_TYPE_CHECKPOINT_END
+ */
+static void libxl__colo_restore_domain_checkpoint_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__stream_read_state *srs = CONTAINER_OF(shs, *srs, shs);
+    libxl__domain_create_state *dcs = CONTAINER_OF(srs, *dcs, srs);
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+
+    crcs->callback = NULL;
+    dcs->srs.checkpoint_callback = colo_common_read_stream_done;
+    libxl__stream_read_start_checkpoint(shs->egc, &dcs->srs);
+}
+
+
+/* ===================== colo: resume secondary vm ===================== */
+/*
+ * Do the following things when resuming secondary vm the first time:
+ *  1. resume secondary vm
+ *  2. enable log dirty
+ *  3. setup checkpoint devices
+ *  4. write CHECKPOINT_SVM_READY
+ *  5. unpause secondary vm
+ *  6. write CHECKPOINT_SVM_RESUMED
+ *
+ * Do the following things when resuming secondary vm:
+ *  1. write CHECKPOINT_SVM_READY
+ *  2. resume secondary vm
+ *  3. write CHECKPOINT_SVM_RESUMED
+ */
+static void colo_send_svm_ready(libxl__egc *egc,
+                                libxl__colo_restore_checkpoint_state *crcs);
+static void colo_send_svm_ready_done(libxl__egc *egc,
+                                     libxl__colo_restore_checkpoint_state *crcs,
+                                     int rc);
+static void colo_restore_preresume_cb(libxl__egc *egc,
+                                      libxl__checkpoint_devices_state *cds,
+                                      int rc);
+static void colo_restore_resume_vm(libxl__egc *egc,
+                                   libxl__colo_restore_checkpoint_state *crcs);
+static void colo_resume_vm_done(libxl__egc *egc,
+                                libxl__colo_restore_checkpoint_state *crcs,
+                                int rc);
+static void colo_write_svm_resumed(libxl__egc *egc,
+                                   libxl__colo_restore_checkpoint_state *crcs);
+static void colo_enable_logdirty_done(libxl__egc *egc,
+                                      libxl__logdirty_switch *lds,
+                                      int retval);
+static void colo_reenable_logdirty(libxl__egc *egc,
+                                   libxl__logdirty_switch *lds,
+                                   int rc);
+static void colo_reenable_logdirty_done(libxl__egc *egc,
+                                        libxl__logdirty_switch *lds,
+                                        int rc);
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+                                          libxl__colo_restore_state *crs);
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc);
+static void colo_unpause_svm(libxl__egc *egc,
+                             libxl__colo_restore_checkpoint_state *crcs);
+
+static void libxl__colo_restore_domain_resume_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__stream_read_state *srs = CONTAINER_OF(shs, *srs, shs);
+    libxl__domain_create_state *dcs = CONTAINER_OF(srs, *dcs, srs);
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+
+    if (crcs->teardown_devices)
+        colo_send_svm_ready(shs->egc, crcs);
+    else
+        colo_restore_resume_vm(shs->egc, crcs);
+}
+
+static void colo_send_svm_ready(libxl__egc *egc,
+                               libxl__colo_restore_checkpoint_state *crcs)
+{
+    libxl_sr_checkpoint_state srcs = { .id = CHECKPOINT_SVM_READY };
+
+    crcs->callback = colo_send_svm_ready_done;
+    crcs->sws.checkpoint_callback = colo_common_write_stream_done;
+    libxl__stream_write_checkpoint_state(egc, &crcs->sws, &srcs);
+}
+
+static void colo_send_svm_ready_done(libxl__egc *egc,
+                                     libxl__colo_restore_checkpoint_state *crcs,
+                                     int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+
+    /* Convenience aliases */
+    libxl__checkpoint_devices_state *cds = &dcs->cds;
+
+    if (!crcs->preresume) {
+        crcs->preresume = true;
+        colo_unpause_svm(egc, crcs);
+        return;
+    }
+
+    cds->callback = colo_restore_preresume_cb;
+    libxl__checkpoint_devices_preresume(egc, cds);
+}
+
+static void colo_restore_preresume_cb(libxl__egc *egc,
+                                      libxl__checkpoint_devices_state *cds,
+                                      int rc)
+{
+    libxl__colo_restore_state *crs = cds->concrete_data;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    if (rc) {
+        LOG(ERROR, "preresume fails");
+        goto out;
+    }
+
+    colo_restore_resume_vm(egc, crcs);
+
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+static void colo_restore_resume_vm(libxl__egc *egc,
+                                   libxl__colo_restore_checkpoint_state *crcs)
+{
+
+    crcs->callback = colo_resume_vm_done;
+    colo_resume_vm(egc, crcs, 1);
+}
+
+static void colo_resume_vm_done(libxl__egc *egc,
+                                libxl__colo_restore_checkpoint_state *crcs,
+                                int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+
+    /* Convenience aliases */
+    libxl__colo_restore_state *const crs = crcs->crs;
+    libxl__logdirty_switch *const lds = &crcs->lds;
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    if (rc) {
+        LOG(ERROR, "cannot resume secondary vm");
+        goto out;
+    }
+
+    crcs->status = LIBXL_COLO_RESUMED;
+
+    /* avoid calling stream->completion_callback() more than once */
+    if (crs->saved_cb) {
+        dcs->callback = crs->saved_cb;
+        crs->saved_cb = NULL;
+
+        dcs->srs.completion_callback = NULL;
+
+        lds->callback = colo_enable_logdirty_done;
+        colo_enable_logdirty(crs, egc);
+        return;
+    }
+
+    colo_write_svm_resumed(egc, crcs);
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+static void colo_write_svm_resumed(libxl__egc *egc,
+                                   libxl__colo_restore_checkpoint_state *crcs)
+{
+    libxl_sr_checkpoint_state resumed = { .id = CHECKPOINT_SVM_RESUMED };
+
+    crcs->sws.checkpoint_callback = colo_common_write_stream_done;
+    crcs->callback = NULL; /* no follow-up step after the write */
+    libxl__stream_write_checkpoint_state(egc, &crcs->sws, &resumed);
+}
+
+/*
+ * Callback of the logdirty switch armed in colo_resume_vm_done(), i.e. the
+ * first attempt at enabling logdirty: on failure disable it and retry, on
+ * success move on to setting up the checkpoint devices.
+ */
+static void colo_enable_logdirty_done(libxl__egc *egc,
+                                      libxl__logdirty_switch *lds,
+                                      int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+    libxl__colo_restore_state *const crs = crcs->crs;
+
+    EGC_GC;
+
+    if (!rc) {
+        colo_setup_checkpoint_devices(egc, crs);
+        return;
+    }
+
+    /* log-dirty already enabled? No test op, so disable then reenable it */
+    lds->callback = colo_reenable_logdirty;
+    colo_disable_logdirty(crs, egc);
+}
+
+/*
+ * Callback armed in colo_enable_logdirty_done() before the call to
+ * colo_disable_logdirty(). Unless rc reports a failure, logdirty is
+ * enabled once more with colo_reenable_logdirty_done() as the callback.
+ */
+static void colo_reenable_logdirty(libxl__egc *egc,
+                                   libxl__logdirty_switch *lds,
+                                   int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+
+    /* Convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    if (rc) {
+        LOG(ERROR, "cannot enable logdirty");
+        libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+        return;
+    }
+
+    lds->callback = colo_reenable_logdirty_done;
+    colo_enable_logdirty(crcs->crs, egc);
+}
+
+/*
+ * Result of the retried logdirty enable: on success go on to set up the
+ * checkpoint devices, otherwise log the failure and report 0.
+ */
+static void colo_reenable_logdirty_done(libxl__egc *egc,
+                                        libxl__logdirty_switch *lds,
+                                        int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+
+    /* Convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    if (!rc) {
+        colo_setup_checkpoint_devices(egc, crcs->crs);
+        return;
+    }
+
+    LOG(ERROR, "cannot enable logdirty");
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+/*
+ * We cannot setup checkpoint devices in libxl__colo_restore_setup(),
+ * because the guest is not ready. A failure of init_device_subkind() is
+ * logged and reported as result 0.
+ */
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+                                          libxl__colo_restore_state *crs)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    libxl__checkpoint_devices_state *cds = &dcs->cds;
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    STATE_AO_GC(crs->ao);
+
+    /* TODO: disk/nic support */
+    cds->device_kind_flags = 0;
+    cds->callback = colo_restore_setup_cds_done;
+    cds->ao = ao;
+    cds->domid = crs->domid;
+    cds->ops = colo_restore_ops;
+
+    if (init_device_subkind(cds)) {
+        LOG(ERROR, "COLO: failed to init device subkind for domid %u",
+            cds->domid);
+        libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+        return;
+    }
+
+    crcs->teardown_devices = 1;
+    libxl__checkpoint_devices_setup(egc, cds);
+}
+
+/*
+ * Completion of libxl__checkpoint_devices_setup(): on success continue
+ * with colo_send_svm_ready(), on failure log the error and report 0.
+ */
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc)
+{
+    libxl__colo_restore_state *crs = cds->concrete_data;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    if (!rc) {
+        colo_send_svm_ready(egc, crcs);
+        return;
+    }
+
+    LOG(ERROR, "COLO: failed to setup device for guest with domid %u",
+        cds->domid);
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+/*
+ * Unpause the secondary vm and, if that works, send
+ * CHECKPOINT_SVM_RESUMED; otherwise log the failure and report 0.
+ */
+static void colo_unpause_svm(libxl__egc *egc,
+                             libxl__colo_restore_checkpoint_state *crcs)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    int rc;
+
+    /* Convenience aliases */
+    const uint32_t domid = crcs->crs->domid;
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    /* We have enabled secondary vm's logdirty, so we can unpause it now */
+    rc = libxl_domain_unpause(CTX, domid);
+    if (!rc) {
+        colo_write_svm_resumed(egc, crcs);
+        return;
+    }
+
+    LOG(ERROR, "cannot unpause secondary vm");
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+
+/* ===================== colo: wait new checkpoint ===================== */
+static void colo_restore_commit_cb(libxl__egc *egc,
+                                   libxl__checkpoint_devices_state *cds,
+                                   int rc);
+static void colo_stream_read_done(libxl__egc *egc,
+                                  libxl__colo_restore_checkpoint_state *crcs,
+                                  int id); /* id of the record read */
+
+/* Commit the checkpoint devices; data is the libxl__save_helper_state. */
+static void libxl__colo_restore_domain_wait_checkpoint_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__stream_read_state *srs = CONTAINER_OF(shs, *srs, shs);
+    libxl__domain_create_state *dcs = CONTAINER_OF(srs, *dcs, srs);
+    libxl__checkpoint_devices_state *const cds = &dcs->cds;
+
+    /* colo_restore_commit_cb() is installed as the commit's callback */
+    cds->callback = colo_restore_commit_cb;
+    libxl__checkpoint_devices_commit(shs->egc, cds);
+}
+
+/*
+ * On a successful commit, start reading the next checkpoint record.
+ */
+static void colo_restore_commit_cb(libxl__egc *egc,
+                                   libxl__checkpoint_devices_state *cds,
+                                   int rc)
+{
+    libxl__colo_restore_state *crs = cds->concrete_data;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+    libxl__stream_read_state *const srs = &dcs->srs;
+
+    EGC_GC;
+
+    if (rc) {
+        LOG(ERROR, "commit fails");
+        libxl__xc_domain_saverestore_async_callback_done(egc, &srs->shs, 0);
+        return;
+    }
+
+    crcs->callback = colo_stream_read_done;
+    srs->checkpoint_callback = colo_common_read_stream_done;
+    libxl__stream_read_checkpoint_state(egc, srs);
+}
+
+/*
+ * crcs->callback for the checkpoint-state read: the result reported is 1
+ * if the record read is CHECKPOINT_NEW and 0 (after logging) otherwise.
+ */
+static void colo_stream_read_done(libxl__egc *egc,
+                                  libxl__colo_restore_checkpoint_state *crcs,
+                                  int id)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    const int ok = (id == CHECKPOINT_NEW);
+
+    EGC_GC;
+
+    if (!ok) {
+        LOG(ERROR, "invalid section: %d", id);
+    }
+
+    libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->srs.shs, ok);
+}
+
+
+/* ===================== colo: suspend secondary vm ===================== */
+/*
+ * Do the following things when suspending secondary vm:
+ *  1. suspend secondary vm, then libxl__checkpoint_devices_postsuspend()
+ *  2. send CHECKPOINT_SVM_SUSPENDED
+ */
+static void colo_suspend_vm_done(libxl__egc *egc,
+                                 libxl__domain_suspend_state *dsps,
+                                 int rc);
+static void colo_restore_postsuspend_cb(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc);
+
+/*
+ * Start suspending the secondary vm; data is the libxl__save_helper_state.
+ * colo_suspend_vm_done() is installed as dsps->callback_common_done.
+ */
+static void libxl__colo_restore_domain_suspend_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__stream_read_state *srs = CONTAINER_OF(shs, *srs, shs);
+    libxl__domain_create_state *dcs = CONTAINER_OF(srs, *dcs, srs);
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+
+    STATE_AO_GC(dcs->ao);
+
+    /* suspend secondary vm */
+    crcs->dsps.callback_common_done = colo_suspend_vm_done;
+    libxl__domain_suspend(shs->egc, &crcs->dsps);
+}
+
+/*
+ * Completion of libxl__domain_suspend(): on success record the suspended
+ * status and run the checkpoint devices' postsuspend; on failure report 0.
+ */
+static void colo_suspend_vm_done(libxl__egc *egc,
+                                 libxl__domain_suspend_state *dsps,
+                                 int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(dsps, *crcs, dsps);
+    libxl__colo_restore_state *crs = crcs->crs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    /* Convenience aliases */
+    libxl__checkpoint_devices_state *cds = &dcs->cds;
+
+    EGC_GC;
+
+    if (rc) {
+        LOG(ERROR, "cannot suspend secondary vm");
+        libxl__xc_domain_saverestore_async_callback_done(egc,
+                                                         &dcs->srs.shs, 0);
+        return;
+    }
+
+    crcs->status = LIBXL_COLO_SUSPENDED;
+    cds->callback = colo_restore_postsuspend_cb;
+    libxl__checkpoint_devices_postsuspend(egc, cds);
+}
+
+/*
+ * Postsuspend finished: send CHECKPOINT_SVM_SUSPENDED, or report 0.
+ */
+static void colo_restore_postsuspend_cb(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc)
+{
+    libxl__colo_restore_state *crs = cds->concrete_data;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+    libxl_sr_checkpoint_state suspended = { .id = CHECKPOINT_SVM_SUSPENDED };
+
+    EGC_GC;
+
+    if (rc) {
+        LOG(ERROR, "postsuspend fails");
+        libxl__xc_domain_saverestore_async_callback_done(egc,
+                                                         &dcs->srs.shs, 0);
+        return;
+    }
+
+    crcs->callback = NULL;
+    crcs->sws.checkpoint_callback = colo_common_write_stream_done;
+    libxl__stream_write_checkpoint_state(egc, &crcs->sws, &suspended);
+}
+
+
+/* ===================== colo: common callback ===================== */
+/*
+ * checkpoint_callback of crcs->sws. A negative rc is reported as result 2.
+ * Otherwise crcs->callback, if set, is called with 0; if it is NULL the
+ * call is finished with result 1.
+ */
+static void colo_common_write_stream_done(libxl__egc *egc,
+                                          libxl__stream_write_state *stream,
+                                          int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs =
+        CONTAINER_OF(stream, *crcs, sws);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    if (rc < 0) {
+        /* TODO: it may be an internal error, but we don't know */
+        LOG(ERROR, "sending data fails");
+        libxl__xc_domain_saverestore_async_callback_done(egc, shs, 2);
+        return;
+    }
+
+    if (crcs->callback) {
+        crcs->callback(egc, crcs, 0);
+        return;
+    }
+
+    /* Everything is OK */
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 1);
+}
+
+/*
+ * checkpoint_callback of dcs->srs. A negative rc is reported as result 2.
+ * Otherwise rc (the record id) is handed to crcs->callback if one is set;
+ * with no callback the call is finished with result 1.
+ */
+static void colo_common_read_stream_done(libxl__egc *egc,
+                                         libxl__stream_read_state *stream,
+                                         int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(stream, *dcs, srs);
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+    libxl__save_helper_state *const shs = &dcs->srs.shs;
+
+    EGC_GC;
+
+    if (rc < 0) {
+        /* TODO: it may be an internal error, but we don't know */
+        LOG(ERROR, "reading data fails");
+        libxl__xc_domain_saverestore_async_callback_done(egc, shs, 2);
+        return;
+    }
+
+    if (crcs->callback) {
+        /* rc contains the id */
+        crcs->callback(egc, crcs, rc);
+        return;
+    }
+
+    /* Everything is OK */
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 1);
+}
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index fc746fb..39458b7 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -19,6 +19,7 @@ 
 
 #include "libxl_internal.h"
 #include "libxl_arch.h"
+#include "libxl_colo.h"
 
 #include <xc_dom.h>
 #include <xenguest.h>
@@ -981,6 +982,23 @@  static void domcreate_console_available(libxl__egc *egc,
                                         dcs->aop_console_how.for_event));
 }
 
+/* crs->callback for libxl__colo_restore_setup(): start the stream read. */
+static void libxl__colo_restore_setup_done(libxl__egc *egc,
+                                           libxl__colo_restore_state *crs,
+                                           int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    EGC_GC;
+
+    if (!rc) {
+        libxl__stream_read_start(egc, &dcs->srs);
+        return;
+    }
+
+    LOG(ERROR, "colo restore setup fails: %d", rc);
+    domcreate_stream_done(egc, &dcs->srs, rc);
+}
+
 static void domcreate_bootloader_done(libxl__egc *egc,
                                       libxl__bootloader_state *bl,
                                       int rc)
@@ -994,6 +1012,8 @@  static void domcreate_bootloader_done(libxl__egc *egc,
     const int restore_fd = dcs->restore_fd;
     libxl__domain_build_state *const state = &dcs->build_state;
     const int checkpointed_stream = dcs->restore_params.checkpointed_stream;
+    libxl__colo_restore_state *const crs = &dcs->crs;
+    libxl_domain_build_info *const info = &d_config->b_info;
 
     if (rc) {
         domcreate_rebuild_done(egc, dcs, rc);
@@ -1022,6 +1042,13 @@  static void domcreate_bootloader_done(libxl__egc *egc,
 
     /* Restore */
 
+    /* COLO only supports HVM now */
+    if (info->type != LIBXL_DOMAIN_TYPE_HVM &&
+        checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO) {
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
     rc = libxl__build_pre(gc, domid, d_config, state);
     if (rc)
         goto out;
@@ -1035,6 +1062,16 @@  static void domcreate_bootloader_done(libxl__egc *egc,
 
     if (restore_fd >= 0) {
         switch (checkpointed_stream) {
+        case LIBXL_CHECKPOINTED_STREAM_COLO:
+            /* colo restore setup */
+            crs->ao = ao;
+            crs->domid = domid;
+            crs->send_back_fd = dcs->send_back_fd;
+            crs->recv_fd = restore_fd;
+            crs->hvm = (info->type == LIBXL_DOMAIN_TYPE_HVM);
+            crs->callback = libxl__colo_restore_setup_done;
+            libxl__colo_restore_setup(egc, crs);
+            break;
         case LIBXL_CHECKPOINTED_STREAM_REMUS:
             libxl__remus_restore_setup(egc, dcs);
             /* fall through */
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 33b658d..b9ca81c 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3470,6 +3470,23 @@  libxl__stream_read_inuse(const libxl__stream_read_state *stream)
     return stream->running;
 }
 
+/* colo restore state, embedded in libxl__domain_create_state as crs */
+typedef struct libxl__colo_restore_state libxl__colo_restore_state;
+typedef void libxl__colo_callback(libxl__egc *,
+                                  libxl__colo_restore_state *, int rc);
+struct libxl__colo_restore_state {
+    /* must be set by caller of libxl__colo_restore_(setup|teardown) */
+    libxl__ao *ao;
+    uint32_t domid;
+    int send_back_fd;
+    int recv_fd;
+    int hvm; /* non-zero for an HVM guest */
+    libxl__colo_callback *callback;
+
+    /* private, colo restore checkpoint state */
+    libxl__domain_create_cb *saved_cb;
+    void *crcs; /* points to a libxl__colo_restore_checkpoint_state */
+};
 
 struct libxl__domain_create_state {
     /* filled in by user */
@@ -3486,6 +3503,8 @@  struct libxl__domain_create_state {
     /* private to domain_create */
     int guest_domid;
     libxl__domain_build_state build_state;
+    libxl__colo_restore_state crs;
+    libxl__checkpoint_devices_state cds;
     libxl__bootloader_state bl;
     libxl__stub_dm_spawn_state dmss;
         /* If we're not doing stubdom, we use only dmss.dm,
diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
index 0d6949a..b1810b2 100644
--- a/tools/libxl/libxl_save_callout.c
+++ b/tools/libxl/libxl_save_callout.c
@@ -15,6 +15,7 @@ 
 #include "libxl_osdeps.h"
 
 #include "libxl_internal.h"
+#include "libxl_colo.h"
 
 /* stream_fd is as from the caller (eventually, the application).
  * It may be 0, 1 or 2, in which case we need to dup it elsewhere.
@@ -68,7 +69,11 @@  void libxl__xc_domain_restore(libxl__egc *egc, libxl__domain_create_state *dcs,
     shs->ao = ao;
     shs->domid = domid;
     shs->recv_callback = libxl__srm_callout_received_restore;
-    shs->completion_callback = libxl__xc_domain_restore_done;
+    if (dcs->restore_params.checkpointed_stream ==
+                                                LIBXL_CHECKPOINTED_STREAM_COLO)
+        shs->completion_callback = libxl__colo_restore_teardown;
+    else
+        shs->completion_callback = libxl__xc_domain_restore_done;
     shs->caller_state = dcs;
     shs->need_results = 1;
 
diff --git a/tools/libxl/libxl_stream_read.c b/tools/libxl/libxl_stream_read.c
index 5d980d9..d6bd2fe 100644
--- a/tools/libxl/libxl_stream_read.c
+++ b/tools/libxl/libxl_stream_read.c
@@ -846,6 +846,18 @@  void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
      */
     if (libxl__stream_read_inuse(stream)) {
         switch (checkpointed_stream) {
+        case LIBXL_CHECKPOINTED_STREAM_COLO:
+            if (stream->completion_callback) {
+                /*
+                 * restore, just build the secondary vm, don't close
+                 * the stream
+                 */
+                stream->completion_callback(egc, stream, 0);
+            } else {
+                /* failover, just close the stream */
+                stream_complete(egc, stream, 0);
+            }
+            break;
         case LIBXL_CHECKPOINTED_STREAM_REMUS:
             /*
              * Failover from primary. Domain state is currently at a
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 632c009..33f4a90 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -232,6 +232,7 @@  libxl_hdtype = Enumeration("hdtype", [
 libxl_checkpointed_stream = Enumeration("checkpointed_stream", [
     (0, "NONE"),
     (1, "REMUS"),
+    (2, "COLO"),
     ])
 
 #