[v11,16/27] libxc/restore: support COLO restore

Message ID	1457080891-26054-17-git-send-email-xiecl.fnst@cn.fujitsu.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <xen-devel-bounces@lists.xen.org> From: Changlong Xie <xiecl.fnst@cn.fujitsu.com> To: xen devel <xen-devel@lists.xen.org>, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>, Andrew Cooper <andrew.cooper3@citrix.com>, Ian Campbell <ian.campbell@citrix.com>, Ian Jackson <ian.jackson@eu.citrix.com>, Wei Liu <wei.liu2@citrix.com> Date: Fri, 4 Mar 2016 16:41:20 +0800 Message-ID: <1457080891-26054-17-git-send-email-xiecl.fnst@cn.fujitsu.com> In-Reply-To: <1457080891-26054-1-git-send-email-xiecl.fnst@cn.fujitsu.com> References: <1457080891-26054-1-git-send-email-xiecl.fnst@cn.fujitsu.com> MIME-Version: 1.0 Cc: Lars Kurth <lars.kurth@citrix.com>, Changlong Xie <xiecl.fnst@cn.fujitsu.com>, Wen Congyang <wency@cn.fujitsu.com>, Gui Jianfeng <guijianfeng@cn.fujitsu.com>, Jiang Yunhong <yunhong.jiang@intel.com>, Dong Eddie <eddie.dong@intel.com>, Anthony Perard <anthony.perard@citrix.com>, Shriram Rajagopalan <rshriram@cs.ubc.ca>, Yang Hongyang <hongyang.yang@easystack.cn> Subject: [Xen-devel] [PATCH v11 16/27] libxc/restore: support COLO restore Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: xen-devel-bounces@lists.xen.org Sender: "Xen-devel" <xen-devel-bounces@lists.xen.org>

diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index cb0bbcf..67cb540 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -127,6 +127,14 @@ struct restore_callbacks {
      */
     int (*wait_checkpoint)(void* data);
 
+    /*
+     * callback to send store gfn and console gfn to xl
+     * if we want to resume vm before xc_domain_restore()
+     * exits.
+     */
+    void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
+                            void *data);
+
     /* to be provided as the last argument to each callback function */
     void* data;
 };
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index 2bfed64..29ab4eb 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -226,6 +226,10 @@ struct xc_sr_context
             struct xc_sr_restore_ops ops;
             struct restore_callbacks *callbacks;
 
+            int send_back_fd;
+            unsigned long p2m_size;
+            xc_hypercall_buffer_t dirty_bitmap_hbuf;
+
             /* From Image Header. */
             uint32_t format_version;
 
@@ -234,13 +238,13 @@ struct xc_sr_context
             uint32_t guest_page_size;
 
             /* Plain VM, or checkpoints over time. */
-            bool checkpointed;
+            int checkpointed;
 
             /* Currently buffering records between a checkpoint */
             bool buffer_all_records;
 
 /*
- * With Remus, we buffer the records sent by the primary at checkpoint,
+ * With Remus/COLO, we buffer the records sent by the primary at checkpoint,
  * in case the primary will fail, we can recover from the last
  * checkpoint state.
  * This should be enough for most of the cases because primary only send
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
index aef9bca..24c4719 100644
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -411,6 +411,101 @@ static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
     return rc;
 }
 
+/*
+ * Send checkpoint dirty pfn list to primary.
+ *
+ * Retrieves the log-dirty bitmap via XEN_DOMCTL_SHADOW_OP_CLEAN,
+ * collects the pfn of every set bit, and writes them as a single
+ * REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST record to send_back_fd.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc = -1;
+    unsigned count, written;
+    uint64_t i, *pfns = NULL;
+    struct iovec *iov = NULL;
+    xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size };
+    struct xc_sr_record rec =
+    {
+        .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST,
+    };
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
+
+    if ( xc_shadow_control(
+             xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+             HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size,
+             NULL, 0, &stats) != ctx->restore.p2m_size )
+    {
+        PERROR("Failed to retrieve logdirty bitmap");
+        goto err;
+    }
+
+    for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ )
+    {
+        if ( test_bit(i, dirty_bitmap) )
+            count++;
+    }
+
+    pfns = malloc(count * sizeof(*pfns));
+    if ( !pfns )
+    {
+        ERROR("Unable to allocate %zu bytes of memory for dirty pfn list",
+              count * sizeof(*pfns));
+        goto err;
+    }
+
+    for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i )
+    {
+        if ( !test_bit(i, dirty_bitmap) )
+            continue;
+
+        /* pfns[] holds exactly 'count' entries; never index past the end. */
+        if ( written >= count )
+        {
+            ERROR("Dirty pfn list exceed");
+            goto err;
+        }
+
+        pfns[written++] = i;
+    }
+
+    /* iovec[] for writev(). */
+    iov = malloc(3 * sizeof(*iov));
+    if ( !iov )
+    {
+        ERROR("Unable to allocate memory for sending dirty bitmap");
+        goto err;
+    }
+
+    rec.length = count * sizeof(*pfns);
+
+    iov[0].iov_base = &rec.type;
+    iov[0].iov_len = sizeof(rec.type);
+
+    iov[1].iov_base = &rec.length;
+    iov[1].iov_len = sizeof(rec.length);
+
+    iov[2].iov_base = pfns;
+    iov[2].iov_len = count * sizeof(*pfns);
+
+    if ( writev_exact(ctx->restore.send_back_fd, iov, 3) )
+    {
+        PERROR("Failed to write dirty bitmap to stream");
+        goto err;
+    }
+
+    rc = 0;
+ err:
+    /* Both start as NULL and free(NULL) is a no-op: safe on every path. */
+    free(pfns);
+    free(iov);
+    return rc;
+}
+
 static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
 static int handle_checkpoint(struct xc_sr_context *ctx)
 {
@@ -460,6 +555,53 @@ static int handle_checkpoint(struct xc_sr_context *ctx)
     else
         ctx->restore.buffer_all_records = true;
 
+    if ( ctx->restore.checkpointed == MIG_STREAM_COLO )
+    {
+#define HANDLE_CALLBACK_RETURN_VALUE(ret)                   \
+    do {                                                    \
+        if ( (ret) == 1 )                                   \
+            rc = 0; /* Success */                           \
+        else                                                \
+        {                                                   \
+            if ( (ret) == 2 )                               \
+                rc = BROKEN_CHANNEL;                        \
+            else                                            \
+                rc = -1; /* Some unspecified error */       \
+            goto err;                                       \
+        }                                                   \
+    } while (0)
+
+        /* COLO */
+
+        /* We need to resume guest */
+        rc = ctx->restore.ops.stream_complete(ctx);
+        if ( rc )
+            goto err;
+
+        ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn,
+                                                ctx->restore.console_gfn,
+                                                ctx->restore.callbacks->data);
+
+        /* Resume secondary vm */
+        ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+        /* Wait for a new checkpoint */
+        ret = ctx->restore.callbacks->wait_checkpoint(
+                                                ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+        /* suspend secondary vm */
+        ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+#undef HANDLE_CALLBACK_RETURN_VALUE
+
+        rc = send_checkpoint_dirty_pfn_list(ctx);
+        if ( rc )
+            goto err;
+    }
+
  err:
     return rc;
 }
@@ -529,6 +671,21 @@ static int setup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
     int rc;
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
+
+    if ( ctx->restore.checkpointed == MIG_STREAM_COLO )
+    {
+        dirty_bitmap = xc_hypercall_buffer_alloc_pages(xch, dirty_bitmap,
+                                NRPAGES(bitmap_size(ctx->restore.p2m_size)));
+
+        if ( !dirty_bitmap )
+        {
+            ERROR("Unable to allocate memory for dirty bitmap");
+            rc = -1;
+            goto err;
+        }
+    }
 
     rc = ctx->restore.ops.setup(ctx);
     if ( rc )
@@ -562,10 +719,15 @@ static void cleanup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
     unsigned i;
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
 
     for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
         free(ctx->restore.buffered_records[i].data);
 
+    if ( ctx->restore.checkpointed == MIG_STREAM_COLO )
+        xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
+                                   NRPAGES(bitmap_size(ctx->restore.p2m_size)));
     free(ctx->restore.buffered_records);
     free(ctx->restore.populated_pfns);
     if ( ctx->restore.ops.cleanup(ctx) )
@@ -631,6 +793,15 @@ static int restore(struct xc_sr_context *ctx)
     } while ( rec.type != REC_TYPE_END );
 
  remus_failover:
+
+    if ( ctx->restore.checkpointed == MIG_STREAM_COLO )
+    {
+        /* With COLO, we have already called stream_complete */
+        rc = 0;
+        IPRINTF("COLO Failover");
+        goto done;
+    }
+
     /*
      * With Remus, if we reach here, there must be some error on primary,
      * failover from the last checkpoint state.
@@ -667,6 +838,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
                       int checkpointed_stream,
                       struct restore_callbacks *callbacks, int send_back_fd)
 {
+    xen_pfn_t nr_pfns;
     struct xc_sr_context ctx =
         {
             .xch = xch,
@@ -680,11 +852,21 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     ctx.restore.xenstore_domid = store_domid;
     ctx.restore.checkpointed = checkpointed_stream;
     ctx.restore.callbacks = callbacks;
+    ctx.restore.send_back_fd = send_back_fd;
 
     /* Sanity checks for callbacks. */
     if ( checkpointed_stream )
         assert(callbacks->checkpoint);
 
+    if ( ctx.restore.checkpointed == MIG_STREAM_COLO )
+    {
+        /* this is COLO restore */
+        assert(callbacks->suspend &&
+               callbacks->postcopy &&
+               callbacks->wait_checkpoint &&
+               callbacks->restore_results);
+    }
+
     DPRINTF("fd %d, dom %u, hvm %u, pae %u, superpages %d"
             ", checkpointed_stream %d", io_fd, dom, hvm, pae,
             superpages, checkpointed_stream);
@@ -706,6 +888,14 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     if ( read_headers(&ctx) )
         return -1;
 
+    if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
+    {
+        PERROR("Unable to obtain the guest p2m size");
+        return -1;
+    }
+
+    ctx.restore.p2m_size = nr_pfns;
+
     if ( ctx.dominfo.hvm )
     {
         ctx.restore.ops = restore_ops_x86_hvm;
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
index 829687e..b0dd52b 100644
--- a/tools/libxl/libxl_colo_restore.c
+++ b/tools/libxl/libxl_colo_restore.c
@@ -125,11 +125,6 @@ static void colo_resume_vm(libxl__egc *egc,
         return;
     }
 
-    /*
-     * TODO: get store gfn and console gfn
-     * We should call the callback restore_results in
-     * xc_domain_restore() before resuming the guest.
-     */
     libxl__xc_domain_restore_done(egc, dcs, 0, 0, 0);
 
     return;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 7301ac4..3610a39 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1013,6 +1013,8 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     const int checkpointed_stream = dcs->restore_params.checkpointed_stream;
     libxl__colo_restore_state *const crs = &dcs->crs;
     libxl_domain_build_info *const info = &d_config->b_info;
+    libxl__srm_restore_autogen_callbacks *const callbacks =
+        &dcs->srs.shs.callbacks.restore.a;
 
     if (rc) {
         domcreate_rebuild_done(egc, dcs, rc);
@@ -1040,6 +1042,7 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     }
 
     /* Restore */
+    callbacks->restore_results = libxl__srm_callout_callback_restore_results;
 
     /* COLO only supports HVM now */
     if (info->type != LIBXL_DOMAIN_TYPE_HVM &&
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 6016706..c2243f2 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -29,8 +29,8 @@ our @msgs = (
     [  6, 'srcxA',  "wait_checkpoint", [] ],
     [  7, 'scxA',   "switch_qemu_logdirty",  [qw(int domid
                                               unsigned enable)] ],
-    [  8, 'r',      "restore_results",       ['unsigned long', 'store_mfn',
-                                              'unsigned long', 'console_mfn'] ],
+    [  8, 'rcx',    "restore_results",       ['unsigned long', 'store_gfn',
+                                              'unsigned long', 'console_gfn'] ],
     [  9, 'srW',    "complete",              [qw(int retval
                                                  int errnoval)] ],
 );

[v11,16/27] libxc/restore: support COLO restore

Commit Message

Comments

Patch