diff mbox series

[10/10] tools/xenstored: Delay new transaction while Live-Update is pending

Message ID 20210616144324.31652-11-julien@xen.org (mailing list archive)
State New
Headers show
Series tools/xenstored: Bug fixes + Improve Live-Update | expand

Commit Message

Julien Grall June 16, 2021, 2:43 p.m. UTC
From: Julien Grall <jgrall@amazon.com>

At the moment, Live-Update will, by default, not proceed if there are
in-flight transactions. It is possible force it by passing -F but this
will break any connection with in-flight transactions.

There are PV drivers out that may never terminate some transaction. On
host running such guest, we would need to use -F. Unfortunately, this
also risks to break well-behaving guests (and even dom0) because
Live-Update will happen as soon as the timeout is hit.

Ideally, we would want to preserve transactions but this requires
some work and a lot of testing to be able to use it in production.

As a stop gap, we want to limit the damage of -F. This patch will delay
any transactions that are started after Live-Update has been requested.

If the request cannot be delayed, the connection will be stalled to
avoid loosing requests.

If the connection has already a pending transaction before Live-Update,
then new transaction will not be delayed. This is to avoid the connection
to stall.

With this stop gap in place, domains with long running transactions will
still break when using -F, but other domains which starts a transaction
in the middle of Live-Update will continue to work.

Signed-off-by: Julien Grall <jgrall@amazon.com>
---
 tools/xenstore/xenstored_control.c | 10 ++++++
 tools/xenstore/xenstored_control.h |  2 ++
 tools/xenstore/xenstored_core.c    | 49 +++++++++++++++++++++++++++++-
 tools/xenstore/xenstored_core.h    |  3 ++
 4 files changed, 63 insertions(+), 1 deletion(-)

Comments

Luca Fancellu June 21, 2021, 9:30 a.m. UTC | #1
> On 16 Jun 2021, at 15:43, Julien Grall <julien@xen.org> wrote:
> 
> From: Julien Grall <jgrall@amazon.com>
> 
> At the moment, Live-Update will, by default, not proceed if there are
> in-flight transactions. It is possible force it by passing -F but this
> will break any connection with in-flight transactions.
> 
> There are PV drivers out that may never terminate some transaction. On
> host running such guest, we would need to use -F. Unfortunately, this
> also risks to break well-behaving guests (and even dom0) because
> Live-Update will happen as soon as the timeout is hit.
> 
> Ideally, we would want to preserve transactions but this requires
> some work and a lot of testing to be able to use it in production.
> 
> As a stop gap, we want to limit the damage of -F. This patch will delay
> any transactions that are started after Live-Update has been requested.
> 
> If the request cannot be delayed, the connection will be stalled to
> avoid loosing requests.
> 
> If the connection has already a pending transaction before Live-Update,
> then new transaction will not be delayed. This is to avoid the connection
> to stall.
> 
> With this stop gap in place, domains with long running transactions will
> still break when using -F, but other domains which starts a transaction
> in the middle of Live-Update will continue to work.
> 
> Signed-off-by: Julien Grall <jgrall@amazon.com>

Reviewed-by: Luca Fancellu <luca.fancellu@arm.com>

> ---
> tools/xenstore/xenstored_control.c | 10 ++++++
> tools/xenstore/xenstored_control.h |  2 ++
> tools/xenstore/xenstored_core.c    | 49 +++++++++++++++++++++++++++++-
> tools/xenstore/xenstored_core.h    |  3 ++
> 4 files changed, 63 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c
> index 1c24d4869eab..a045f102a420 100644
> --- a/tools/xenstore/xenstored_control.c
> +++ b/tools/xenstore/xenstored_control.c
> @@ -131,6 +131,11 @@ unsigned int lu_write_response(FILE *fp)
> 	return sizeof(msg) + msg.len;
> }
> 
> +bool lu_is_pending(void)
> +{
> +	return lu_status != NULL;
> +}
> +
> #else
> struct connection *lu_get_connection(void)
> {
> @@ -142,6 +147,11 @@ unsigned int lu_write_response(FILE *fp)
> 	/* Unsupported */
> 	return 0;
> }
> +
> +bool lu_is_pending(void)
> +{
> +	return false;
> +}
> #endif
> 
> struct cmd_s {
> diff --git a/tools/xenstore/xenstored_control.h b/tools/xenstore/xenstored_control.h
> index 27d7f19e4b7f..98b6fbcea2b1 100644
> --- a/tools/xenstore/xenstored_control.h
> +++ b/tools/xenstore/xenstored_control.h
> @@ -23,3 +23,5 @@ struct connection *lu_get_connection(void);
> 
> /* Write the "OK" response for the live-update command */
> unsigned int lu_write_response(FILE *fp);
> +
> +bool lu_is_pending(void);
> diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
> index 9eca58682b51..10b53af76ac5 100644
> --- a/tools/xenstore/xenstored_core.c
> +++ b/tools/xenstore/xenstored_core.c
> @@ -338,7 +338,20 @@ static int destroy_conn(void *_conn)
> 
> static bool conn_can_read(struct connection *conn)
> {
> -	return conn->funcs->can_read(conn) && !conn->is_ignored;
> +	if (!conn->funcs->can_read(conn))
> +		return false;
> +
> +	if (conn->is_ignored)
> +		return false;
> +
> +	/*
> +	 * For stalled connection, we want to process the pending
> +	 * command as soon as live-update has aborted.
> +	 */
> +	if (conn->is_stalled)
> +		return !lu_is_pending();
> +
> +	return true;
> }
> 
> static bool conn_can_write(struct connection *conn)
> @@ -417,6 +430,12 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout)
> 			if (!list_empty(&conn->out_list))
> 				events |= POLLOUT;
> 			conn->pollfd_idx = set_fd(conn->fd, events);
> +			/*
> +			 * For stalled connection, we want to process the
> +			 * pending command as soon as live-update has aborted.
> +			 */
> +			if (conn->is_stalled && !lu_is_pending())
> +				*ptimeout = 0;
> 		}
> 	}
> }
> @@ -1524,6 +1543,9 @@ static bool process_delayed_message(struct delayed_request *req)
> 	struct connection *conn = req->data;
> 	struct buffered_data *saved_in = conn->in;
> 
> +	if (lu_is_pending())
> +		return false;
> +
> 	/*
> 	 * Part of process_message() expects conn->in to contains the
> 	 * processed response. So save the current conn->in and restore it
> @@ -1543,6 +1565,30 @@ static void consider_message(struct connection *conn)
> 			sockmsg_string(conn->in->hdr.msg.type),
> 			conn->in->hdr.msg.len, conn);
> 
> +	conn->is_stalled = false;
> +	/*
> +	 * Currently, Live-Update is not supported if there is active
> +	 * transactions. In order to reduce the number of retry, delay
> +	 * any new request to start a transaction if Live-Update is pending
> +	 * and there are no transactions in-flight.
> +	 *
> +	 * If we can't delay the request, then mark the connection as
> +	 * stalled. This will ignore new requests until Live-Update happened
> +	 * or it was aborted.
> +	 */
> +	if (lu_is_pending() && conn->transaction_started == 0 &&
> +	    conn->in->hdr.msg.type == XS_TRANSACTION_START) {
> +		trace("Delaying transaction start for connection %p req_id %u\n",
> +		      conn, conn->in->hdr.msg.req_id);
> +
> +		if (delay_request(conn, conn->in, process_delayed_message,
> +				  conn, false) != 0) {
> +			trace("Stalling connection %p\n", conn);
> +			conn->is_stalled = true;
> +		}
> +		return;
> +	}
> +
> 	process_message(conn, conn->in);
> 
> 	assert(conn->in == NULL);
> @@ -1629,6 +1675,7 @@ struct connection *new_connection(const struct interface_funcs *funcs)
> 	new->pollfd_idx = -1;
> 	new->funcs = funcs;
> 	new->is_ignored = false;
> +	new->is_stalled = false;
> 	new->transaction_started = 0;
> 	INIT_LIST_HEAD(&new->out_list);
> 	INIT_LIST_HEAD(&new->watches);
> diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
> index dac517156993..258f6ff38279 100644
> --- a/tools/xenstore/xenstored_core.h
> +++ b/tools/xenstore/xenstored_core.h
> @@ -110,6 +110,9 @@ struct connection
> 	/* Is this connection ignored? */
> 	bool is_ignored;
> 
> +	/* Is the connection stalled? */
> +	bool is_stalled;
> +
> 	/* Buffered incoming data. */
> 	struct buffered_data *in;
> 
> -- 
> 2.17.1
> 
>
Juergen Gross June 24, 2021, 9:23 a.m. UTC | #2
On 16.06.21 16:43, Julien Grall wrote:
> From: Julien Grall <jgrall@amazon.com>
> 
> At the moment, Live-Update will, by default, not proceed if there are
> in-flight transactions. It is possible force it by passing -F but this
> will break any connection with in-flight transactions.
> 
> There are PV drivers out that may never terminate some transaction. On
> host running such guest, we would need to use -F. Unfortunately, this
> also risks to break well-behaving guests (and even dom0) because
> Live-Update will happen as soon as the timeout is hit.
> 
> Ideally, we would want to preserve transactions but this requires
> some work and a lot of testing to be able to use it in production.
> 
> As a stop gap, we want to limit the damage of -F. This patch will delay
> any transactions that are started after Live-Update has been requested.
> 
> If the request cannot be delayed, the connection will be stalled to
> avoid loosing requests.
> 
> If the connection has already a pending transaction before Live-Update,
> then new transaction will not be delayed. This is to avoid the connection
> to stall.
> 
> With this stop gap in place, domains with long running transactions will
> still break when using -F, but other domains which starts a transaction
> in the middle of Live-Update will continue to work.
> 
> Signed-off-by: Julien Grall <jgrall@amazon.com>

Reviewed-by: Juergen Gross <jgross@suse.com>


Juergen
diff mbox series

Patch

diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c
index 1c24d4869eab..a045f102a420 100644
--- a/tools/xenstore/xenstored_control.c
+++ b/tools/xenstore/xenstored_control.c
@@ -131,6 +131,11 @@  unsigned int lu_write_response(FILE *fp)
 	return sizeof(msg) + msg.len;
 }
 
+bool lu_is_pending(void)
+{
+	return lu_status != NULL;
+}
+
 #else
 struct connection *lu_get_connection(void)
 {
@@ -142,6 +147,11 @@  unsigned int lu_write_response(FILE *fp)
 	/* Unsupported */
 	return 0;
 }
+
+bool lu_is_pending(void)
+{
+	return false;
+}
 #endif
 
 struct cmd_s {
diff --git a/tools/xenstore/xenstored_control.h b/tools/xenstore/xenstored_control.h
index 27d7f19e4b7f..98b6fbcea2b1 100644
--- a/tools/xenstore/xenstored_control.h
+++ b/tools/xenstore/xenstored_control.h
@@ -23,3 +23,5 @@  struct connection *lu_get_connection(void);
 
 /* Write the "OK" response for the live-update command */
 unsigned int lu_write_response(FILE *fp);
+
+bool lu_is_pending(void);
diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
index 9eca58682b51..10b53af76ac5 100644
--- a/tools/xenstore/xenstored_core.c
+++ b/tools/xenstore/xenstored_core.c
@@ -338,7 +338,20 @@  static int destroy_conn(void *_conn)
 
 static bool conn_can_read(struct connection *conn)
 {
-	return conn->funcs->can_read(conn) && !conn->is_ignored;
+	if (!conn->funcs->can_read(conn))
+		return false;
+
+	if (conn->is_ignored)
+		return false;
+
+	/*
+	 * For stalled connection, we want to process the pending
+	 * command as soon as live-update has aborted.
+	 */
+	if (conn->is_stalled)
+		return !lu_is_pending();
+
+	return true;
 }
 
 static bool conn_can_write(struct connection *conn)
@@ -417,6 +430,12 @@  static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout)
 			if (!list_empty(&conn->out_list))
 				events |= POLLOUT;
 			conn->pollfd_idx = set_fd(conn->fd, events);
+			/*
+			 * For stalled connection, we want to process the
+			 * pending command as soon as live-update has aborted.
+			 */
+			if (conn->is_stalled && !lu_is_pending())
+				*ptimeout = 0;
 		}
 	}
 }
@@ -1524,6 +1543,9 @@  static bool process_delayed_message(struct delayed_request *req)
 	struct connection *conn = req->data;
 	struct buffered_data *saved_in = conn->in;
 
+	if (lu_is_pending())
+		return false;
+
 	/*
 	 * Part of process_message() expects conn->in to contains the
 	 * processed response. So save the current conn->in and restore it
@@ -1543,6 +1565,30 @@  static void consider_message(struct connection *conn)
 			sockmsg_string(conn->in->hdr.msg.type),
 			conn->in->hdr.msg.len, conn);
 
+	conn->is_stalled = false;
+	/*
+	 * Currently, Live-Update is not supported if there is active
+	 * transactions. In order to reduce the number of retry, delay
+	 * any new request to start a transaction if Live-Update is pending
+	 * and there are no transactions in-flight.
+	 *
+	 * If we can't delay the request, then mark the connection as
+	 * stalled. This will ignore new requests until Live-Update happened
+	 * or it was aborted.
+	 */
+	if (lu_is_pending() && conn->transaction_started == 0 &&
+	    conn->in->hdr.msg.type == XS_TRANSACTION_START) {
+		trace("Delaying transaction start for connection %p req_id %u\n",
+		      conn, conn->in->hdr.msg.req_id);
+
+		if (delay_request(conn, conn->in, process_delayed_message,
+				  conn, false) != 0) {
+			trace("Stalling connection %p\n", conn);
+			conn->is_stalled = true;
+		}
+		return;
+	}
+
 	process_message(conn, conn->in);
 
 	assert(conn->in == NULL);
@@ -1629,6 +1675,7 @@  struct connection *new_connection(const struct interface_funcs *funcs)
 	new->pollfd_idx = -1;
 	new->funcs = funcs;
 	new->is_ignored = false;
+	new->is_stalled = false;
 	new->transaction_started = 0;
 	INIT_LIST_HEAD(&new->out_list);
 	INIT_LIST_HEAD(&new->watches);
diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
index dac517156993..258f6ff38279 100644
--- a/tools/xenstore/xenstored_core.h
+++ b/tools/xenstore/xenstored_core.h
@@ -110,6 +110,9 @@  struct connection
 	/* Is this connection ignored? */
 	bool is_ignored;
 
+	/* Is the connection stalled? */
+	bool is_stalled;
+
 	/* Buffered incoming data. */
 	struct buffered_data *in;