diff mbox series

xenbus: Avoid deadlock during suspend due to open transactions

Message ID 20190508102807.7096-1-ross.lagerwall@citrix.com (mailing list archive)
State Superseded
Headers show
Series xenbus: Avoid deadlock during suspend due to open transactions | expand

Commit Message

Ross Lagerwall May 8, 2019, 10:28 a.m. UTC
During a suspend/resume, the xenwatch thread waits for all outstanding
xenstore requests and transactions to complete. This does not work
correctly for transactions started by userspace because it waits for
them to complete after freezing userspace threads which means the
transactions has no way of completing, resulting in a deadlock. This is
trivial to reproduce by running this script and then suspending the VM:

    import pyxs, time
    c = pyxs.client.Client(xen_bus_path="/dev/xen/xenbus")
    c.connect()
    c.transaction()
    time.sleep(3600)

Even if this deadlock were resolved, misbehaving userspace should not
prevent a VM from being migrated. So, instead of waiting for these
transactions to complete, ignore them during suspend and mark them as
aborted during the return path. If the caller commits the transaction,
return EAGAIN so that they try again. If the caller discards the
transaction, return OK since no changes were made anyway.

This only affects users of the xenbus file interface. In-kernel users of
xenbus are assumed to be well-behaved and complete all transactions
before freezing.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
---
 drivers/xen/xenbus/xenbus.h              |  2 +
 drivers/xen/xenbus/xenbus_dev_frontend.c | 60 ++++++++++++++++++++++++
 drivers/xen/xenbus/xenbus_xs.c           | 16 ++++++-
 3 files changed, 77 insertions(+), 1 deletion(-)

Comments

Jürgen Groß May 10, 2019, 6:20 a.m. UTC | #1
On 08/05/2019 12:28, Ross Lagerwall wrote:
> During a suspend/resume, the xenwatch thread waits for all outstanding
> xenstore requests and transactions to complete. This does not work
> correctly for transactions started by userspace because it waits for
> them to complete after freezing userspace threads which means the
> transactions has no way of completing, resulting in a deadlock. This is
> trivial to reproduce by running this script and then suspending the VM:
> 
>     import pyxs, time
>     c = pyxs.client.Client(xen_bus_path="/dev/xen/xenbus")
>     c.connect()
>     c.transaction()
>     time.sleep(3600)
> 
> Even if this deadlock were resolved, misbehaving userspace should not
> prevent a VM from being migrated. So, instead of waiting for these
> transactions to complete, ignore them during suspend and mark them as
> aborted during the return path. If the caller commits the transaction,
> return EAGAIN so that they try again. If the caller discards the
> transaction, return OK since no changes were made anyway.
> 
> This only affects users of the xenbus file interface. In-kernel users of
> xenbus are assumed to be well-behaved and complete all transactions
> before freezing.
> 
> Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>

I think this can be done much easier:

Add a bool "user_req" to struct xb_req_data set for user requests and
a generation count to struct xenbus_transaction_holder which will be
initialized from a global counter being incremented at every
suspend/resume cycle.

Don't increment xs_state_users for user transactions and abort user
transactions in case its generation count doesn't match the global
counter.


Juergen

> ---
>  drivers/xen/xenbus/xenbus.h              |  2 +
>  drivers/xen/xenbus/xenbus_dev_frontend.c | 60 ++++++++++++++++++++++++
>  drivers/xen/xenbus/xenbus_xs.c           | 16 ++++++-
>  3 files changed, 77 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
> index 092981171df1..a977e1396149 100644
> --- a/drivers/xen/xenbus/xenbus.h
> +++ b/drivers/xen/xenbus/xenbus.h
> @@ -133,4 +133,6 @@ void xenbus_ring_ops_init(void);
>  int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par);
>  void xenbus_dev_queue_reply(struct xb_req_data *req);
>  
> +unsigned int xenbus_file_abort_trans(bool abort);
> +
>  #endif
> diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
> index 0782ff3c2273..623218a2a165 100644
> --- a/drivers/xen/xenbus/xenbus_dev_frontend.c
> +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
> @@ -69,6 +69,7 @@
>  struct xenbus_transaction_holder {
>  	struct list_head list;
>  	struct xenbus_transaction handle;
> +	bool aborted;
>  };
>  
>  /*
> @@ -113,8 +114,49 @@ struct xenbus_file_priv {
>  	wait_queue_head_t read_waitq;
>  
>  	struct kref kref;
> +
> +	struct list_head file_list;
>  };
>  
> +static DEFINE_SPINLOCK(file_list_lock);
> +static LIST_HEAD(file_list);
> +
> +static void register_xenbus_file(struct xenbus_file_priv *u)
> +{
> +	spin_lock(&file_list_lock);
> +	list_add(&u->file_list, &file_list);
> +	spin_unlock(&file_list_lock);
> +}
> +
> +static void unregister_xenbus_file(struct xenbus_file_priv *u)
> +{
> +	spin_lock(&file_list_lock);
> +	list_del(&u->file_list);
> +	spin_unlock(&file_list_lock);
> +}
> +
> +unsigned int xenbus_file_abort_trans(bool abort)
> +{
> +	struct xenbus_file_priv *u;
> +	struct xenbus_transaction_holder *trans;
> +	unsigned int count = 0;
> +
> +	spin_lock(&file_list_lock);
> +	list_for_each_entry(u, &file_list, file_list) {
> +		mutex_lock(&u->msgbuffer_mutex);
> +		list_for_each_entry(trans, &u->transactions, list) {
> +			if (!trans->aborted) {
> +				count++;
> +				trans->aborted = abort;
> +			}
> +		}
> +		mutex_unlock(&u->msgbuffer_mutex);
> +	}
> +	spin_unlock(&file_list_lock);
> +
> +	return count;
> +}
> +
>  /* Read out any raw xenbus messages queued up. */
>  static ssize_t xenbus_file_read(struct file *filp,
>  			       char __user *ubuf,
> @@ -306,6 +348,8 @@ static void xenbus_file_free(struct kref *kref)
>  
>  	u = container_of(kref, struct xenbus_file_priv, kref);
>  
> +	unregister_xenbus_file(u);
> +
>  	/*
>  	 * No need for locking here because there are no other users,
>  	 * by definition.
> @@ -449,6 +493,20 @@ static int xenbus_write_transaction(unsigned msg_type,
>  		 !(msg->hdr.len == 2 &&
>  		   (!strcmp(msg->body, "T") || !strcmp(msg->body, "F"))))
>  		return xenbus_command_reply(u, XS_ERROR, "EINVAL");
> +	else if (msg_type == XS_TRANSACTION_END) {
> +		trans = xenbus_get_transaction(u, msg->hdr.tx_id);
> +		if (trans && trans->aborted) {
> +			list_del(&trans->list);
> +			kfree(trans);
> +			if (!strcmp(msg->body, "T"))
> +				return xenbus_command_reply(u, XS_ERROR,
> +							    "EAGAIN");
> +			else
> +				return xenbus_command_reply(u,
> +							    XS_TRANSACTION_END,
> +							    "OK");
> +		}
> +	}
>  
>  	rc = xenbus_dev_request_and_reply(&msg->hdr, u);
>  	if (rc && trans) {
> @@ -640,6 +698,8 @@ static int xenbus_file_open(struct inode *inode, struct file *filp)
>  
>  	filp->private_data = u;
>  
> +	register_xenbus_file(u);
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
> index 49a3874ae6bb..9abff635fc20 100644
> --- a/drivers/xen/xenbus/xenbus_xs.c
> +++ b/drivers/xen/xenbus/xenbus_xs.c
> @@ -95,12 +95,23 @@ static pid_t xenwatch_pid;
>  static DEFINE_MUTEX(xenwatch_mutex);
>  static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
>  
> +static unsigned int xs_state_count_users(void)
> +{
> +	unsigned int count;
> +
> +	spin_lock(&xs_state_lock);
> +	count = xs_state_users - xenbus_file_abort_trans(false);
> +	spin_unlock(&xs_state_lock);
> +
> +	return count;
> +}
> +
>  static void xs_suspend_enter(void)
>  {
>  	spin_lock(&xs_state_lock);
>  	xs_suspend_active++;
>  	spin_unlock(&xs_state_lock);
> -	wait_event(xs_state_exit_wq, xs_state_users == 0);
> +	wait_event(xs_state_exit_wq, xs_state_count_users() == 0);
>  }
>  
>  static void xs_suspend_exit(void)
> @@ -838,6 +849,9 @@ void xs_resume(void)
>  
>  	mutex_unlock(&xs_response_mutex);
>  
> +	spin_lock(&xs_state_lock);
> +	xs_state_users -= xenbus_file_abort_trans(true);
> +	spin_unlock(&xs_state_lock);
>  	xs_suspend_exit();
>  
>  	/* No need for watches_lock: the xs_watch_rwsem is sufficient. */
>
diff mbox series

Patch

diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
index 092981171df1..a977e1396149 100644
--- a/drivers/xen/xenbus/xenbus.h
+++ b/drivers/xen/xenbus/xenbus.h
@@ -133,4 +133,6 @@  void xenbus_ring_ops_init(void);
 int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par);
 void xenbus_dev_queue_reply(struct xb_req_data *req);
 
+unsigned int xenbus_file_abort_trans(bool abort);
+
 #endif
diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
index 0782ff3c2273..623218a2a165 100644
--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -69,6 +69,7 @@ 
 struct xenbus_transaction_holder {
 	struct list_head list;
 	struct xenbus_transaction handle;
+	bool aborted;
 };
 
 /*
@@ -113,8 +114,49 @@  struct xenbus_file_priv {
 	wait_queue_head_t read_waitq;
 
 	struct kref kref;
+
+	struct list_head file_list;
 };
 
+static DEFINE_SPINLOCK(file_list_lock);
+static LIST_HEAD(file_list);
+
+static void register_xenbus_file(struct xenbus_file_priv *u)
+{
+	spin_lock(&file_list_lock);
+	list_add(&u->file_list, &file_list);
+	spin_unlock(&file_list_lock);
+}
+
+static void unregister_xenbus_file(struct xenbus_file_priv *u)
+{
+	spin_lock(&file_list_lock);
+	list_del(&u->file_list);
+	spin_unlock(&file_list_lock);
+}
+
+unsigned int xenbus_file_abort_trans(bool abort)
+{
+	struct xenbus_file_priv *u;
+	struct xenbus_transaction_holder *trans;
+	unsigned int count = 0;
+
+	spin_lock(&file_list_lock);
+	list_for_each_entry(u, &file_list, file_list) {
+		mutex_lock(&u->msgbuffer_mutex);
+		list_for_each_entry(trans, &u->transactions, list) {
+			if (!trans->aborted) {
+				count++;
+				trans->aborted = abort;
+			}
+		}
+		mutex_unlock(&u->msgbuffer_mutex);
+	}
+	spin_unlock(&file_list_lock);
+
+	return count;
+}
+
 /* Read out any raw xenbus messages queued up. */
 static ssize_t xenbus_file_read(struct file *filp,
 			       char __user *ubuf,
@@ -306,6 +348,8 @@  static void xenbus_file_free(struct kref *kref)
 
 	u = container_of(kref, struct xenbus_file_priv, kref);
 
+	unregister_xenbus_file(u);
+
 	/*
 	 * No need for locking here because there are no other users,
 	 * by definition.
@@ -449,6 +493,20 @@  static int xenbus_write_transaction(unsigned msg_type,
 		 !(msg->hdr.len == 2 &&
 		   (!strcmp(msg->body, "T") || !strcmp(msg->body, "F"))))
 		return xenbus_command_reply(u, XS_ERROR, "EINVAL");
+	else if (msg_type == XS_TRANSACTION_END) {
+		trans = xenbus_get_transaction(u, msg->hdr.tx_id);
+		if (trans && trans->aborted) {
+			list_del(&trans->list);
+			kfree(trans);
+			if (!strcmp(msg->body, "T"))
+				return xenbus_command_reply(u, XS_ERROR,
+							    "EAGAIN");
+			else
+				return xenbus_command_reply(u,
+							    XS_TRANSACTION_END,
+							    "OK");
+		}
+	}
 
 	rc = xenbus_dev_request_and_reply(&msg->hdr, u);
 	if (rc && trans) {
@@ -640,6 +698,8 @@  static int xenbus_file_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = u;
 
+	register_xenbus_file(u);
+
 	return 0;
 }
 
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 49a3874ae6bb..9abff635fc20 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -95,12 +95,23 @@  static pid_t xenwatch_pid;
 static DEFINE_MUTEX(xenwatch_mutex);
 static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
 
+static unsigned int xs_state_count_users(void)
+{
+	unsigned int count;
+
+	spin_lock(&xs_state_lock);
+	count = xs_state_users - xenbus_file_abort_trans(false);
+	spin_unlock(&xs_state_lock);
+
+	return count;
+}
+
 static void xs_suspend_enter(void)
 {
 	spin_lock(&xs_state_lock);
 	xs_suspend_active++;
 	spin_unlock(&xs_state_lock);
-	wait_event(xs_state_exit_wq, xs_state_users == 0);
+	wait_event(xs_state_exit_wq, xs_state_count_users() == 0);
 }
 
 static void xs_suspend_exit(void)
@@ -838,6 +849,9 @@  void xs_resume(void)
 
 	mutex_unlock(&xs_response_mutex);
 
+	spin_lock(&xs_state_lock);
+	xs_state_users -= xenbus_file_abort_trans(true);
+	spin_unlock(&xs_state_lock);
 	xs_suspend_exit();
 
 	/* No need for watches_lock: the xs_watch_rwsem is sufficient. */