@@ -133,6 +133,14 @@ struct restore_callbacks {
*/
int (*wait_checkpoint)(void* data);
+ /*
+ * callback to send store gfn and console gfn to xl
+ * if we want to resume vm before xc_domain_save()
+ * exits.
+ */
+ void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
+ void *data);
+
/* to be provided as the last argument to each callback function */
void* data;
};
@@ -214,6 +214,10 @@ struct xc_sr_context
struct xc_sr_restore_ops ops;
struct restore_callbacks *callbacks;
+ int send_back_fd;
+ unsigned long p2m_size;
+ xc_hypercall_buffer_t dirty_bitmap_hbuf;
+
/* From Image Header. */
uint32_t format_version;
@@ -222,13 +226,13 @@ struct xc_sr_context
uint32_t guest_page_size;
/* Plain VM, or checkpoints over time. */
- bool checkpointed;
+ int checkpointed;
/* Currently buffering records between a checkpoint */
bool buffer_all_records;
/*
- * With Remus, we buffer the records sent by the primary at checkpoint,
+ * With Remus/COLO, we buffer the records sent by the primary at checkpoint,
* in case the primary will fail, we can recover from the last
* checkpoint state.
* This should be enough for most of the cases because primary only send
@@ -411,6 +411,92 @@ static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
return rc;
}
+/*
+ * Send a CHECKPOINT_DIRTY_PFN_LIST record back to the primary, listing the
+ * pfns the secondary guest has dirtied since the last checkpoint, so the
+ * primary only needs to resend pages which have genuinely diverged.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc = -1;
+    unsigned count, written;
+    uint64_t i, *pfns = NULL;
+    struct iovec *iov = NULL;
+    xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size };
+    struct xc_sr_record rec =
+    {
+        .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST,
+    };
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
+
+    /* Retrieve and atomically clear the guest's logdirty bitmap. */
+    if ( xc_shadow_control(
+             xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+             HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size,
+             NULL, 0, &stats) != ctx->restore.p2m_size )
+    {
+        PERROR("Failed to retrieve logdirty bitmap");
+        goto err;
+    }
+
+    /* First pass: size the pfn list. */
+    for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ )
+    {
+        if ( test_bit(i, dirty_bitmap) )
+            count++;
+    }
+
+    pfns = malloc(count * sizeof(*pfns));
+    if ( !pfns )
+    {
+        ERROR("Unable to allocate %zu bytes of memory for dirty pfn list",
+              count * sizeof(*pfns));
+        goto err;
+    }
+
+    /* Second pass: collect the dirty pfns. */
+    for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i )
+    {
+        if ( !test_bit(i, dirty_bitmap) )
+            continue;
+
+        /*
+         * Defend against the count changing under our feet: check *before*
+         * writing, so we can never store past the 'count' entries allocated
+         * above.
+         */
+        if ( written >= count )
+        {
+            ERROR("Dirty pfn list exceed");
+            goto err;
+        }
+
+        pfns[written++] = i;
+    }
+
+    /* iovec[] for writev(). */
+    iov = malloc(3 * sizeof(*iov));
+    if ( !iov )
+    {
+        ERROR("Unable to allocate memory for sending dirty bitmap");
+        goto err;
+    }
+
+    rec.length = count * sizeof(*pfns);
+
+    iov[0].iov_base = &rec.type;
+    iov[0].iov_len = sizeof(rec.type);
+
+    iov[1].iov_base = &rec.length;
+    iov[1].iov_len = sizeof(rec.length);
+
+    iov[2].iov_base = pfns;
+    iov[2].iov_len = count * sizeof(*pfns);
+
+    if ( writev_exact(ctx->restore.send_back_fd, iov, 3) )
+    {
+        PERROR("Failed to write dirty bitmap to stream");
+        goto err;
+    }
+
+    rc = 0;
+ err:
+    /* Free on every path, success included; free(NULL) is a no-op. */
+    free(iov);
+    free(pfns);
+    return rc;
+}
+
static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
static int handle_checkpoint(struct xc_sr_context *ctx)
{
@@ -460,6 +546,53 @@ static int handle_checkpoint(struct xc_sr_context *ctx)
else
ctx->restore.buffer_all_records = true;
+ if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+ {
+#define HANDLE_CALLBACK_RETURN_VALUE(ret) \
+ do { \
+ if ( ret == 1 ) \
+ rc = 0; /* Success */ \
+ else \
+ { \
+ if ( ret == 2 ) \
+ rc = BROKEN_CHANNEL; \
+ else \
+ rc = -1; /* Some unspecified error */ \
+ goto err; \
+ } \
+ } while (0)
+
+ /* COLO */
+
+ /* We need to resume guest */
+ rc = ctx->restore.ops.stream_complete(ctx);
+ if ( rc )
+ goto err;
+
+ ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn,
+ ctx->restore.console_gfn,
+ ctx->restore.callbacks->data);
+
+ /* Resume secondary vm */
+ ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+ /* Wait for a new checkpoint */
+ ret = ctx->restore.callbacks->wait_checkpoint(
+ ctx->restore.callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+ /* suspend secondary vm */
+ ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+#undef HANDLE_CALLBACK_RETURN_VALUE
+
+ rc = send_checkpoint_dirty_pfn_list(ctx);
+ if ( rc )
+ goto err;
+ }
+
err:
return rc;
}
@@ -529,6 +662,21 @@ static int setup(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->restore.dirty_bitmap_hbuf);
+
+ if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+ {
+ dirty_bitmap = xc_hypercall_buffer_alloc_pages(xch, dirty_bitmap,
+ NRPAGES(bitmap_size(ctx->restore.p2m_size)));
+
+ if ( !dirty_bitmap )
+ {
+ ERROR("Unable to allocate memory for dirty bitmap");
+ rc = -1;
+ goto err;
+ }
+ }
rc = ctx->restore.ops.setup(ctx);
if ( rc )
@@ -562,10 +710,15 @@ static void cleanup(struct xc_sr_context *ctx)
{
xc_interface *xch = ctx->xch;
unsigned i;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->restore.dirty_bitmap_hbuf);
for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
free(ctx->restore.buffered_records[i].data);
+ if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+ xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
+ NRPAGES(bitmap_size(ctx->restore.p2m_size)));
free(ctx->restore.buffered_records);
free(ctx->restore.populated_pfns);
if ( ctx->restore.ops.cleanup(ctx) )
@@ -631,6 +784,15 @@ static int restore(struct xc_sr_context *ctx)
} while ( rec.type != REC_TYPE_END );
remus_failover:
+
+ if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+ {
+ /* With COLO, we have already called stream_complete */
+ rc = 0;
+ IPRINTF("COLO Failover");
+ goto done;
+ }
+
/*
* With Remus, if we reach here, there must be some error on primary,
* failover from the last checkpoint state.
@@ -667,6 +829,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
xc_migration_stream_t stream_type,
struct restore_callbacks *callbacks, int send_back_fd)
{
+ xen_pfn_t nr_pfns;
struct xc_sr_context ctx =
{
.xch = xch,
@@ -680,11 +843,21 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
ctx.restore.xenstore_domid = store_domid;
ctx.restore.checkpointed = stream_type;
ctx.restore.callbacks = callbacks;
+ ctx.restore.send_back_fd = send_back_fd;
/* Sanity checks for callbacks. */
if ( stream_type )
assert(callbacks->checkpoint);
+ if ( ctx.restore.checkpointed == XC_MIG_STREAM_COLO )
+ {
+ /* this is COLO restore */
+ assert(callbacks->suspend &&
+ callbacks->postcopy &&
+ callbacks->wait_checkpoint &&
+ callbacks->restore_results);
+ }
+
DPRINTF("fd %d, dom %u, hvm %u, pae %u, superpages %d"
", stream_type %d", io_fd, dom, hvm, pae,
superpages, stream_type);
@@ -706,6 +879,14 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
if ( read_headers(&ctx) )
return -1;
+ if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
+ {
+ PERROR("Unable to obtain the guest p2m size");
+ return -1;
+ }
+
+ ctx.restore.p2m_size = nr_pfns;
+
if ( ctx.dominfo.hvm )
{
ctx.restore.ops = restore_ops_x86_hvm;
@@ -126,11 +126,6 @@ static void colo_resume_vm(libxl__egc *egc,
return;
}
- /*
- * TODO: get store gfn and console gfn
- * We should call the callback restore_results in
- * xc_domain_restore() before resuming the guest.
- */
libxl__xc_domain_restore_done(egc, dcs, 0, 0, 0);
return;
@@ -1017,6 +1017,8 @@ static void domcreate_bootloader_done(libxl__egc *egc,
const int checkpointed_stream = dcs->restore_params.checkpointed_stream;
libxl__colo_restore_state *const crs = &dcs->crs;
libxl_domain_build_info *const info = &d_config->b_info;
+ libxl__srm_restore_autogen_callbacks *const callbacks =
+ &dcs->srs.shs.callbacks.restore.a;
if (rc) {
domcreate_rebuild_done(egc, dcs, rc);
@@ -1044,6 +1046,7 @@ static void domcreate_bootloader_done(libxl__egc *egc,
}
/* Restore */
+ callbacks->restore_results = libxl__srm_callout_callback_restore_results;
/* COLO only supports HVM now because it does not work very
* well with pv drivers:
@@ -29,8 +29,8 @@ our @msgs = (
[ 6, 'srcxA', "wait_checkpoint", [] ],
[ 7, 'scxA', "switch_qemu_logdirty", [qw(int domid
unsigned enable)] ],
- [ 8, 'r', "restore_results", ['unsigned long', 'store_mfn',
- 'unsigned long', 'console_mfn'] ],
+ [ 8, 'rcx', "restore_results", ['unsigned long', 'store_gfn',
+ 'unsigned long', 'console_gfn'] ],
[ 9, 'srW', "complete", [qw(int retval
int errnoval)] ],
);