diff mbox

[v10,22/31] implement the cmdline for COLO

Message ID 1456109555-28299-23-git-send-email-wency@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wen Congyang Feb. 22, 2016, 2:52 a.m. UTC
Add a new option -c to the command 'xl remus'. To use COLO HA
instead of Remus HA, pass the -c option.

Update the man pages to reflect the addition of the new option to
the 'xl remus' command.

Also add a new option -c to the internal command 'xl migrate-receive'.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
---
 docs/man/xl.pod.1         | 12 +++++++--
 tools/libxl/libxl.c       | 23 ++++++++++++++++--
 tools/libxl/xl_cmdimpl.c  | 62 ++++++++++++++++++++++++++++++++++++-----------
 tools/libxl/xl_cmdtable.c |  4 ++-
 4 files changed, 82 insertions(+), 19 deletions(-)

Comments

Wei Liu March 2, 2016, 3:03 p.m. UTC | #1
On Mon, Feb 22, 2016 at 10:52:26AM +0800, Wen Congyang wrote:
[...]
> +    if (libxl_defbool_val(info->colo)) {
> +        if (libxl_defbool_val(info->compression)) {

This can be simplified as

       if (libxl_defbool_val(xxx) && libxl_defbool_val(yyy))

> +            LOG(ERROR, "cannot use memory checkpoint compression in COLO mode");
> +            rc = ERROR_FAIL;
> +            goto out;
> +        }
> +    }
> +
>      if (!libxl_defbool_val(info->allow_unsafe) &&
>          (libxl_defbool_val(info->blackhole) ||
>           !libxl_defbool_val(info->netbuf) ||
> @@ -876,7 +892,10 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
>      dss->live = 1;
>      dss->debug = 0;
>      dss->remus = info;
> -    dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_REMUS;
> +    if (libxl_defbool_val(info->colo))
> +        dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_COLO;
> +    else
> +        dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_REMUS;
>  
>      assert(info);
>  
> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> index df7268b..0dc7220 100644
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -4440,6 +4440,8 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>      char rc_buf;
>      char *migration_domname;
>      struct domain_create dom_info;
> +    const char *ha = checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO ?
> +                     "COLO" : "Remus";
>  
>      signal(SIGPIPE, SIG_IGN);
>      /* if we get SIGPIPE we'd rather just have it as an error */
> @@ -4460,6 +4462,9 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>      dom_info.send_back_fd = send_fd;
>      dom_info.migration_domname_r = &migration_domname;
>      dom_info.checkpointed_stream = checkpointed;
> +    if (checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO)
> +        /* COLO uses stdout to send control message to master */
> +        dom_info.quiet = 1;
>  

It seems that dom_info->quiet affects stderr, not stdout. See the only
place that checks this in xl_cmdimpl.c.

>      rc = create_domain(&dom_info);
>      if (rc < 0) {
> @@ -4472,11 +4477,12 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>  
>      switch (checkpointed) {
>      case LIBXL_CHECKPOINTED_STREAM_REMUS:
> +    case LIBXL_CHECKPOINTED_STREAM_COLO:
>          /* If we are here, it means that the sender (primary) has crashed.
>           * TODO: Split-Brain Check.
>           */
> -        fprintf(stderr, "migration target: Remus Failover for domain %u\n",
> -                domid);
> +        fprintf(stderr, "migration target: %s Failover for domain %u\n",
> +                ha, domid);
>  
>          /*
>           * If domain renaming fails, lets just continue (as we need the domain
> @@ -4492,16 +4498,20 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>              rc = libxl_domain_rename(ctx, domid, migration_domname,
>                                       common_domname);
>              if (rc)
> -                fprintf(stderr, "migration target (Remus): "
> +                fprintf(stderr, "migration target (%s): "
>                          "Failed to rename domain from %s to %s:%d\n",
> -                        migration_domname, common_domname, rc);
> +                        ha, migration_domname, common_domname, rc);
>          }
>  
> +        if (checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO)
> +            /* The guest is running after failover in COLO mode */
> +            exit(rc ? -ERROR_FAIL: 0);
> +
>          rc = libxl_domain_unpause(ctx, domid);
>          if (rc)
> -            fprintf(stderr, "migration target (Remus): "
> +            fprintf(stderr, "migration target (%s): "
>                      "Failed to unpause domain %s (id: %u):%d\n",
> -                    common_domname, domid, rc);
> +                    ha, common_domname, domid, rc);
>  
>          exit(rc ? -ERROR_FAIL: 0);
>      default:
> @@ -4649,7 +4659,7 @@ int main_migrate_receive(int argc, char **argv)
>      libxl_checkpointed_stream checkpointed = LIBXL_CHECKPOINTED_STREAM_NONE;
>      int opt;
>  
> -    SWITCH_FOREACH_OPT(opt, "Fedr", NULL, "migrate-receive", 0) {
> +    SWITCH_FOREACH_OPT(opt, "Fedrc", NULL, "migrate-receive", 0) {
>      case 'F':
>          daemonize = 0;
>          break;
> @@ -4663,6 +4673,9 @@ int main_migrate_receive(int argc, char **argv)
>      case 'r':
>          checkpointed = LIBXL_CHECKPOINTED_STREAM_REMUS;
>          break;
> +    case 'c':
> +        checkpointed = LIBXL_CHECKPOINTED_STREAM_COLO;
> +        break;
>      }
>  
>      if (argc-optind != 0) {
> @@ -8032,11 +8045,8 @@ int main_remus(int argc, char **argv)
>      int config_len;
>  
>      memset(&r_info, 0, sizeof(libxl_domain_remus_info));
> -    /* Defaults */
> -    r_info.interval = 200;
> -    libxl_defbool_setdefault(&r_info.blackhole, false);
>  
> -    SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:e", NULL, "remus", 2) {
> +    SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:ec", NULL, "remus", 2) {
>      case 'i':
>          r_info.interval = atoi(optarg);
>          break;
> @@ -8064,11 +8074,32 @@ int main_remus(int argc, char **argv)
>      case 'e':
>          daemonize = 0;
>          break;
> +    case 'c':
> +        libxl_defbool_set(&r_info.colo, true);
>      }
>  
>      domid = find_domain(argv[optind]);
>      host = argv[optind + 1];
>  
> +    /* Defaults */
> +    libxl_defbool_setdefault(&r_info.blackhole, false);
> +    libxl_defbool_setdefault(&r_info.colo, false);
> +    if (!libxl_defbool_val(r_info.colo) && !r_info.interval)
> +        r_info.interval = 200;
> +
> +    if (libxl_defbool_val(r_info.colo)) {
> +        if (r_info.interval || libxl_defbool_val(r_info.blackhole)) {
> +            perror("Option -c conflicts with -i or -b");
> +            exit(-1);
> +        }
> +
> +        if (libxl_defbool_is_default(r_info.compression)) {
> +            perror("COLO can't be used with memory compression. "
> +                   "Disable memory checkpoint compression now...");
> +            libxl_defbool_set(&r_info.compression, false);
> +        }
> +    }
> +

I don't think I'm entirely happy with how these things are arranged.
Remus and COLO don't seem to have a set of consistent APIs that
arbitrary users can call.

But for the sake of not growing this series any longer, let's leave it
like this for the moment. I think COLO at best is going to be (as you
stated in the manpage) experimental at this stage.


>      if (!r_info.netbufscript)
>          r_info.netbufscript = default_remus_netbufscript;
>  
> @@ -8083,8 +8114,9 @@ int main_remus(int argc, char **argv)
>          if (!ssh_command[0]) {
>              rune = host;
>          } else {
> -            xasprintf(&rune, "exec %s %s xl migrate-receive -r %s",
> +            xasprintf(&rune, "exec %s %s xl migrate-receive %s %s",
>                        ssh_command, host,
> +                      libxl_defbool_val(r_info.colo) ? "-c" : "-r",
>                        daemonize ? "" : " -e");
>          }
>  
> @@ -8112,7 +8144,8 @@ int main_remus(int argc, char **argv)
>       * domain to force failover
>       */
>      if (libxl_domain_info(ctx, 0, domid)) {
> -        fprintf(stderr, "Remus: Primary domain has been destroyed.\n");
> +        fprintf(stderr, "%s: Primary domain has been destroyed.\n",
> +                libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
>          close(send_fd);
>          return 0;
>      }
> @@ -8124,7 +8157,8 @@ int main_remus(int argc, char **argv)
>      if (rc == ERROR_GUEST_TIMEDOUT)
>          fprintf(stderr, "Failed to suspend domain at primary.\n");
>      else {
> -        fprintf(stderr, "Remus: Backup failed? resuming domain at primary.\n");
> +        fprintf(stderr, "%s: Backup failed? resuming domain at primary.\n",
> +                libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
>          libxl_domain_resume(ctx, domid, 1, 0);
>      }
>  
> diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
> index fdc1ac6..b6b630c 100644
> --- a/tools/libxl/xl_cmdtable.c
> +++ b/tools/libxl/xl_cmdtable.c
> @@ -499,7 +499,9 @@ struct cmd_spec cmd_table[] = {
>        "-b                      Replicate memory checkpoints to /dev/null (blackhole).\n"
>        "                        Works only in unsafe mode.\n"
>        "-n                      Disable network output buffering. Works only in unsafe mode.\n"
> -      "-d                      Disable disk replication. Works only in unsafe mode."
> +      "-d                      Disable disk replication. Works only in unsafe mode.\n"
> +      "-c                      Enable COLO HA. It is conflict with -i and -b, and memory\n"
> +      "                        checkpoint must be disabled"
>      },
>  #endif
>      { "devd",
> -- 
> 2.5.0
> 
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
Wen Congyang March 3, 2016, 1:30 a.m. UTC | #2
On 03/02/2016 11:03 PM, Wei Liu wrote:
> On Mon, Feb 22, 2016 at 10:52:26AM +0800, Wen Congyang wrote:
> [...]
>> +    if (libxl_defbool_val(info->colo)) {
>> +        if (libxl_defbool_val(info->compression)) {
> 
> This can be simplified as
> 
>        if (libxl_defbool_val(xxx) && libxl_defbool_val(yyy))

OK, I will fix it in the next version.

> 
>> +            LOG(ERROR, "cannot use memory checkpoint compression in COLO mode");
>> +            rc = ERROR_FAIL;
>> +            goto out;
>> +        }
>> +    }
>> +
>>      if (!libxl_defbool_val(info->allow_unsafe) &&
>>          (libxl_defbool_val(info->blackhole) ||
>>           !libxl_defbool_val(info->netbuf) ||
>> @@ -876,7 +892,10 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
>>      dss->live = 1;
>>      dss->debug = 0;
>>      dss->remus = info;
>> -    dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_REMUS;
>> +    if (libxl_defbool_val(info->colo))
>> +        dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_COLO;
>> +    else
>> +        dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_REMUS;
>>  
>>      assert(info);
>>  
>> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
>> index df7268b..0dc7220 100644
>> --- a/tools/libxl/xl_cmdimpl.c
>> +++ b/tools/libxl/xl_cmdimpl.c
>> @@ -4440,6 +4440,8 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>>      char rc_buf;
>>      char *migration_domname;
>>      struct domain_create dom_info;
>> +    const char *ha = checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO ?
>> +                     "COLO" : "Remus";
>>  
>>      signal(SIGPIPE, SIG_IGN);
>>      /* if we get SIGPIPE we'd rather just have it as an error */
>> @@ -4460,6 +4462,9 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>>      dom_info.send_back_fd = send_fd;
>>      dom_info.migration_domname_r = &migration_domname;
>>      dom_info.checkpointed_stream = checkpointed;
>> +    if (checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO)
>> +        /* COLO uses stdout to send control message to master */
>> +        dom_info.quiet = 1;
>>  
> 
> It seems that dom_info->quiet affects stderr, not stdout. See the only
> place that checks this in xl_cmdimpl.c.
> 
>>      rc = create_domain(&dom_info);
>>      if (rc < 0) {
>> @@ -4472,11 +4477,12 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>>  
>>      switch (checkpointed) {
>>      case LIBXL_CHECKPOINTED_STREAM_REMUS:
>> +    case LIBXL_CHECKPOINTED_STREAM_COLO:
>>          /* If we are here, it means that the sender (primary) has crashed.
>>           * TODO: Split-Brain Check.
>>           */
>> -        fprintf(stderr, "migration target: Remus Failover for domain %u\n",
>> -                domid);
>> +        fprintf(stderr, "migration target: %s Failover for domain %u\n",
>> +                ha, domid);
>>  
>>          /*
>>           * If domain renaming fails, lets just continue (as we need the domain
>> @@ -4492,16 +4498,20 @@ static void migrate_receive(int debug, int daemonize, int monitor,
>>              rc = libxl_domain_rename(ctx, domid, migration_domname,
>>                                       common_domname);
>>              if (rc)
>> -                fprintf(stderr, "migration target (Remus): "
>> +                fprintf(stderr, "migration target (%s): "
>>                          "Failed to rename domain from %s to %s:%d\n",
>> -                        migration_domname, common_domname, rc);
>> +                        ha, migration_domname, common_domname, rc);
>>          }
>>  
>> +        if (checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO)
>> +            /* The guest is running after failover in COLO mode */
>> +            exit(rc ? -ERROR_FAIL: 0);
>> +
>>          rc = libxl_domain_unpause(ctx, domid);
>>          if (rc)
>> -            fprintf(stderr, "migration target (Remus): "
>> +            fprintf(stderr, "migration target (%s): "
>>                      "Failed to unpause domain %s (id: %u):%d\n",
>> -                    common_domname, domid, rc);
>> +                    ha, common_domname, domid, rc);
>>  
>>          exit(rc ? -ERROR_FAIL: 0);
>>      default:
>> @@ -4649,7 +4659,7 @@ int main_migrate_receive(int argc, char **argv)
>>      libxl_checkpointed_stream checkpointed = LIBXL_CHECKPOINTED_STREAM_NONE;
>>      int opt;
>>  
>> -    SWITCH_FOREACH_OPT(opt, "Fedr", NULL, "migrate-receive", 0) {
>> +    SWITCH_FOREACH_OPT(opt, "Fedrc", NULL, "migrate-receive", 0) {
>>      case 'F':
>>          daemonize = 0;
>>          break;
>> @@ -4663,6 +4673,9 @@ int main_migrate_receive(int argc, char **argv)
>>      case 'r':
>>          checkpointed = LIBXL_CHECKPOINTED_STREAM_REMUS;
>>          break;
>> +    case 'c':
>> +        checkpointed = LIBXL_CHECKPOINTED_STREAM_COLO;
>> +        break;
>>      }
>>  
>>      if (argc-optind != 0) {
>> @@ -8032,11 +8045,8 @@ int main_remus(int argc, char **argv)
>>      int config_len;
>>  
>>      memset(&r_info, 0, sizeof(libxl_domain_remus_info));
>> -    /* Defaults */
>> -    r_info.interval = 200;
>> -    libxl_defbool_setdefault(&r_info.blackhole, false);
>>  
>> -    SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:e", NULL, "remus", 2) {
>> +    SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:ec", NULL, "remus", 2) {
>>      case 'i':
>>          r_info.interval = atoi(optarg);
>>          break;
>> @@ -8064,11 +8074,32 @@ int main_remus(int argc, char **argv)
>>      case 'e':
>>          daemonize = 0;
>>          break;
>> +    case 'c':
>> +        libxl_defbool_set(&r_info.colo, true);
>>      }
>>  
>>      domid = find_domain(argv[optind]);
>>      host = argv[optind + 1];
>>  
>> +    /* Defaults */
>> +    libxl_defbool_setdefault(&r_info.blackhole, false);
>> +    libxl_defbool_setdefault(&r_info.colo, false);
>> +    if (!libxl_defbool_val(r_info.colo) && !r_info.interval)
>> +        r_info.interval = 200;
>> +
>> +    if (libxl_defbool_val(r_info.colo)) {
>> +        if (r_info.interval || libxl_defbool_val(r_info.blackhole)) {
>> +            perror("Option -c conflicts with -i or -b");
>> +            exit(-1);
>> +        }
>> +
>> +        if (libxl_defbool_is_default(r_info.compression)) {
>> +            perror("COLO can't be used with memory compression. "
>> +                   "Disable memory checkpoint compression now...");
>> +            libxl_defbool_set(&r_info.compression, false);
>> +        }
>> +    }
>> +
> 
> I don't think I'm entirely happy with how these things are arranged.
> Remus and COLO don't seem to have a set of consistent APIs that
> arbitrary users can call.
> 
> But for the sake of not growing this series any longer let's leave it
> like this for the moment. I think COLO at best is going to be (as you
> stated in manpage) experimental at this stage.

Yes, it is experimental now.

Thanks
Wen Congyang

> 
> 
>>      if (!r_info.netbufscript)
>>          r_info.netbufscript = default_remus_netbufscript;
>>  
>> @@ -8083,8 +8114,9 @@ int main_remus(int argc, char **argv)
>>          if (!ssh_command[0]) {
>>              rune = host;
>>          } else {
>> -            xasprintf(&rune, "exec %s %s xl migrate-receive -r %s",
>> +            xasprintf(&rune, "exec %s %s xl migrate-receive %s %s",
>>                        ssh_command, host,
>> +                      libxl_defbool_val(r_info.colo) ? "-c" : "-r",
>>                        daemonize ? "" : " -e");
>>          }
>>  
>> @@ -8112,7 +8144,8 @@ int main_remus(int argc, char **argv)
>>       * domain to force failover
>>       */
>>      if (libxl_domain_info(ctx, 0, domid)) {
>> -        fprintf(stderr, "Remus: Primary domain has been destroyed.\n");
>> +        fprintf(stderr, "%s: Primary domain has been destroyed.\n",
>> +                libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
>>          close(send_fd);
>>          return 0;
>>      }
>> @@ -8124,7 +8157,8 @@ int main_remus(int argc, char **argv)
>>      if (rc == ERROR_GUEST_TIMEDOUT)
>>          fprintf(stderr, "Failed to suspend domain at primary.\n");
>>      else {
>> -        fprintf(stderr, "Remus: Backup failed? resuming domain at primary.\n");
>> +        fprintf(stderr, "%s: Backup failed? resuming domain at primary.\n",
>> +                libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
>>          libxl_domain_resume(ctx, domid, 1, 0);
>>      }
>>  
>> diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
>> index fdc1ac6..b6b630c 100644
>> --- a/tools/libxl/xl_cmdtable.c
>> +++ b/tools/libxl/xl_cmdtable.c
>> @@ -499,7 +499,9 @@ struct cmd_spec cmd_table[] = {
>>        "-b                      Replicate memory checkpoints to /dev/null (blackhole).\n"
>>        "                        Works only in unsafe mode.\n"
>>        "-n                      Disable network output buffering. Works only in unsafe mode.\n"
>> -      "-d                      Disable disk replication. Works only in unsafe mode."
>> +      "-d                      Disable disk replication. Works only in unsafe mode.\n"
>> +      "-c                      Enable COLO HA. It is conflict with -i and -b, and memory\n"
>> +      "                        checkpoint must be disabled"
>>      },
>>  #endif
>>      { "devd",
>> -- 
>> 2.5.0
>>
>>
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
> 
> 
> .
>
diff mbox

Patch

diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index 4279c7c..1c6dd87 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -447,12 +447,15 @@  Print huge (!) amount of debug during the migration process.
 
 =item B<remus> [I<OPTIONS>] I<domain-id> I<host>
 
-Enable Remus HA for domain. By default B<xl> relies on ssh as a transport
-mechanism between the two hosts.
+Enable Remus HA or COLO HA for domain. By default B<xl> relies on ssh as a
+transport mechanism between the two hosts.
 
 N.B: Remus support in xl is still in experimental (proof-of-concept) phase.
      Disk replication support is limited to DRBD disks.
 
+     COLO support in xl is still in experimental (proof-of-concept) phase.
+     There is no support for network or disk at the moment.
+
 B<OPTIONS>
 
 =over 4
@@ -498,6 +501,11 @@  Disable network output buffering. Requires enabling unsafe mode.
 
 Disable disk replication. Requires enabling unsafe mode.
 
+=item B<-c>
+
+Enable COLO HA. This conflicts with B<-i> and B<-b>, and memory
+checkpoint compression must be disabled.
+
 =back
 
 =item B<pause> I<domain-id>
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 7236175..12df81a 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -849,12 +849,28 @@  int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
         goto out;
     }
 
+    /* The caller must set this defbool */
+    if (libxl_defbool_is_default(info->colo)) {
+        LOG(ERROR, "colo mode must be enabled/disabled");
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
     libxl_defbool_setdefault(&info->allow_unsafe, false);
     libxl_defbool_setdefault(&info->blackhole, false);
-    libxl_defbool_setdefault(&info->compression, true);
+    libxl_defbool_setdefault(&info->compression,
+                             !libxl_defbool_val(info->colo));
     libxl_defbool_setdefault(&info->netbuf, true);
     libxl_defbool_setdefault(&info->diskbuf, true);
 
+    if (libxl_defbool_val(info->colo)) {
+        if (libxl_defbool_val(info->compression)) {
+            LOG(ERROR, "cannot use memory checkpoint compression in COLO mode");
+            rc = ERROR_FAIL;
+            goto out;
+        }
+    }
+
     if (!libxl_defbool_val(info->allow_unsafe) &&
         (libxl_defbool_val(info->blackhole) ||
          !libxl_defbool_val(info->netbuf) ||
@@ -876,7 +892,10 @@  int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
     dss->live = 1;
     dss->debug = 0;
     dss->remus = info;
-    dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_REMUS;
+    if (libxl_defbool_val(info->colo))
+        dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_COLO;
+    else
+        dss->checkpointed_stream = LIBXL_CHECKPOINTED_STREAM_REMUS;
 
     assert(info);
 
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index df7268b..0dc7220 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -4440,6 +4440,8 @@  static void migrate_receive(int debug, int daemonize, int monitor,
     char rc_buf;
     char *migration_domname;
     struct domain_create dom_info;
+    const char *ha = checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO ?
+                     "COLO" : "Remus";
 
     signal(SIGPIPE, SIG_IGN);
     /* if we get SIGPIPE we'd rather just have it as an error */
@@ -4460,6 +4462,9 @@  static void migrate_receive(int debug, int daemonize, int monitor,
     dom_info.send_back_fd = send_fd;
     dom_info.migration_domname_r = &migration_domname;
     dom_info.checkpointed_stream = checkpointed;
+    if (checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO)
+        /* COLO uses stdout to send control message to master */
+        dom_info.quiet = 1;
 
     rc = create_domain(&dom_info);
     if (rc < 0) {
@@ -4472,11 +4477,12 @@  static void migrate_receive(int debug, int daemonize, int monitor,
 
     switch (checkpointed) {
     case LIBXL_CHECKPOINTED_STREAM_REMUS:
+    case LIBXL_CHECKPOINTED_STREAM_COLO:
         /* If we are here, it means that the sender (primary) has crashed.
          * TODO: Split-Brain Check.
          */
-        fprintf(stderr, "migration target: Remus Failover for domain %u\n",
-                domid);
+        fprintf(stderr, "migration target: %s Failover for domain %u\n",
+                ha, domid);
 
         /*
          * If domain renaming fails, lets just continue (as we need the domain
@@ -4492,16 +4498,20 @@  static void migrate_receive(int debug, int daemonize, int monitor,
             rc = libxl_domain_rename(ctx, domid, migration_domname,
                                      common_domname);
             if (rc)
-                fprintf(stderr, "migration target (Remus): "
+                fprintf(stderr, "migration target (%s): "
                         "Failed to rename domain from %s to %s:%d\n",
-                        migration_domname, common_domname, rc);
+                        ha, migration_domname, common_domname, rc);
         }
 
+        if (checkpointed == LIBXL_CHECKPOINTED_STREAM_COLO)
+            /* The guest is running after failover in COLO mode */
+            exit(rc ? -ERROR_FAIL: 0);
+
         rc = libxl_domain_unpause(ctx, domid);
         if (rc)
-            fprintf(stderr, "migration target (Remus): "
+            fprintf(stderr, "migration target (%s): "
                     "Failed to unpause domain %s (id: %u):%d\n",
-                    common_domname, domid, rc);
+                    ha, common_domname, domid, rc);
 
         exit(rc ? -ERROR_FAIL: 0);
     default:
@@ -4649,7 +4659,7 @@  int main_migrate_receive(int argc, char **argv)
     libxl_checkpointed_stream checkpointed = LIBXL_CHECKPOINTED_STREAM_NONE;
     int opt;
 
-    SWITCH_FOREACH_OPT(opt, "Fedr", NULL, "migrate-receive", 0) {
+    SWITCH_FOREACH_OPT(opt, "Fedrc", NULL, "migrate-receive", 0) {
     case 'F':
         daemonize = 0;
         break;
@@ -4663,6 +4673,9 @@  int main_migrate_receive(int argc, char **argv)
     case 'r':
         checkpointed = LIBXL_CHECKPOINTED_STREAM_REMUS;
         break;
+    case 'c':
+        checkpointed = LIBXL_CHECKPOINTED_STREAM_COLO;
+        break;
     }
 
     if (argc-optind != 0) {
@@ -8032,11 +8045,8 @@  int main_remus(int argc, char **argv)
     int config_len;
 
     memset(&r_info, 0, sizeof(libxl_domain_remus_info));
-    /* Defaults */
-    r_info.interval = 200;
-    libxl_defbool_setdefault(&r_info.blackhole, false);
 
-    SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:e", NULL, "remus", 2) {
+    SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:ec", NULL, "remus", 2) {
     case 'i':
         r_info.interval = atoi(optarg);
         break;
@@ -8064,11 +8074,32 @@  int main_remus(int argc, char **argv)
     case 'e':
         daemonize = 0;
         break;
+    case 'c':
+        libxl_defbool_set(&r_info.colo, true);
     }
 
     domid = find_domain(argv[optind]);
     host = argv[optind + 1];
 
+    /* Defaults */
+    libxl_defbool_setdefault(&r_info.blackhole, false);
+    libxl_defbool_setdefault(&r_info.colo, false);
+    if (!libxl_defbool_val(r_info.colo) && !r_info.interval)
+        r_info.interval = 200;
+
+    if (libxl_defbool_val(r_info.colo)) {
+        if (r_info.interval || libxl_defbool_val(r_info.blackhole)) {
+            perror("Option -c conflicts with -i or -b");
+            exit(-1);
+        }
+
+        if (libxl_defbool_is_default(r_info.compression)) {
+            perror("COLO can't be used with memory compression. "
+                   "Disable memory checkpoint compression now...");
+            libxl_defbool_set(&r_info.compression, false);
+        }
+    }
+
     if (!r_info.netbufscript)
         r_info.netbufscript = default_remus_netbufscript;
 
@@ -8083,8 +8114,9 @@  int main_remus(int argc, char **argv)
         if (!ssh_command[0]) {
             rune = host;
         } else {
-            xasprintf(&rune, "exec %s %s xl migrate-receive -r %s",
+            xasprintf(&rune, "exec %s %s xl migrate-receive %s %s",
                       ssh_command, host,
+                      libxl_defbool_val(r_info.colo) ? "-c" : "-r",
                       daemonize ? "" : " -e");
         }
 
@@ -8112,7 +8144,8 @@  int main_remus(int argc, char **argv)
      * domain to force failover
      */
     if (libxl_domain_info(ctx, 0, domid)) {
-        fprintf(stderr, "Remus: Primary domain has been destroyed.\n");
+        fprintf(stderr, "%s: Primary domain has been destroyed.\n",
+                libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
         close(send_fd);
         return 0;
     }
@@ -8124,7 +8157,8 @@  int main_remus(int argc, char **argv)
     if (rc == ERROR_GUEST_TIMEDOUT)
         fprintf(stderr, "Failed to suspend domain at primary.\n");
     else {
-        fprintf(stderr, "Remus: Backup failed? resuming domain at primary.\n");
+        fprintf(stderr, "%s: Backup failed? resuming domain at primary.\n",
+                libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
         libxl_domain_resume(ctx, domid, 1, 0);
     }
 
diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
index fdc1ac6..b6b630c 100644
--- a/tools/libxl/xl_cmdtable.c
+++ b/tools/libxl/xl_cmdtable.c
@@ -499,7 +499,9 @@  struct cmd_spec cmd_table[] = {
       "-b                      Replicate memory checkpoints to /dev/null (blackhole).\n"
       "                        Works only in unsafe mode.\n"
       "-n                      Disable network output buffering. Works only in unsafe mode.\n"
-      "-d                      Disable disk replication. Works only in unsafe mode."
+      "-d                      Disable disk replication. Works only in unsafe mode.\n"
+      "-c                      Enable COLO HA. It is conflict with -i and -b, and memory\n"
+      "                        checkpoint must be disabled"
     },
 #endif
     { "devd",