
[01/23] multipathd: uxlsnr: avoid deadlock on exit

Message ID: 20200924134054.14632-2-mwilck@suse.com (mailing list archive)
State: Not Applicable, archived
Delegated to: Christophe Varoqui
Series: libmultipath: improve cleanup on exit

Commit Message

Martin Wilck Sept. 24, 2020, 1:40 p.m. UTC
From: Martin Wilck <mwilck@suse.com>

The uxlsnr thread wouldn't always release the client lock when
cancelled, causing a deadlock in uxsock_cleanup(). While this wasn't
caused by commit 3d611a2, the deadlock seems to have become much
more likely after that patch. Fixing it requires treating
reallocation failure of the pollfd array differently: we now simply
ignore any clients above the last valid pfd index. That is a minor
problem, as we're in an OOM situation anyway.

Moreover, client_lock is not a "struct lock" but a plain
pthread_mutex_t, so it must be released with cleanup_mutex() rather
than cleanup_lock().

Fixes: 3d611a2 ("multipathd: cancel threads early during shutdown")
Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 multipathd/uxlsnr.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)
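
For readers unfamiliar with the idiom: the fix hinges on registering a
cancellation cleanup handler before taking the mutex, so the lock is
released even if the thread is cancelled inside the critical section. A
minimal, self-contained sketch of the pattern (the cleanup_mutex()
helper here is modelled on the one the patch pulls in via util.h; the
real helper's definition may differ):

#include <pthread.h>

static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

/* handler type required by pthread_cleanup_push(): void (*)(void *) */
static void cleanup_mutex(void *arg)
{
	pthread_mutex_unlock((pthread_mutex_t *)arg);
}

static void locked_section(void)
{
	pthread_cleanup_push(cleanup_mutex, &client_lock);
	pthread_mutex_lock(&client_lock);

	/* ... work that may hit a cancellation point ... */

	/*
	 * pop(1) runs the handler (unlocking the mutex) on the normal
	 * path; on cancellation the handler runs automatically.
	 */
	pthread_cleanup_pop(1);
}

Note that pthread_cleanup_push() and pthread_cleanup_pop() expand to a
matching '{' ... '}' pair and must appear in the same lexical scope,
which is why the patch replaces the bare pthread_mutex_unlock() calls
with pthread_cleanup_pop(1) instead of merely adding a handler.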

Comments

Benjamin Marzinski Sept. 26, 2020, 1:52 a.m. UTC | #1
On Thu, Sep 24, 2020 at 03:40:32PM +0200, mwilck@suse.com wrote:
> From: Martin Wilck <mwilck@suse.com>
> 
> The uxlsnr wouldn't always release the client lock when cancelled,
> causing a deadlock in uxsock_cleanup(). While this hasn't been
> caused by commit 3d611a2, the deadlock seems to have become much
> more likely after that patch. Solving this means that we have to
> treat reallocation failure of the pollfd array differently.
> We will now just ignore any clients above the last valid pfd index.
> That's a minor problem, as we're in an OOM situation anyway.
> 
> Moreover, client_lock is not a "struct lock", but a plain
> pthread_mutex_t.
> 
> Fixes: 3d611a2 ("multipathd: cancel threads early during shutdown")
> Signed-off-by: Martin Wilck <mwilck@suse.com>
> ---
>  multipathd/uxlsnr.c | 17 ++++++++++-------
>  1 file changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/multipathd/uxlsnr.c b/multipathd/uxlsnr.c
> index 1c5ce9d..d47ba1a 100644
> --- a/multipathd/uxlsnr.c
> +++ b/multipathd/uxlsnr.c
> @@ -35,6 +35,7 @@
>  #include "config.h"
>  #include "mpath_cmd.h"
>  #include "time-util.h"
> +#include "util.h"
>  
>  #include "main.h"
>  #include "cli.h"
> @@ -116,7 +117,7 @@ static void _dead_client(struct client *c)
>  
>  static void dead_client(struct client *c)
>  {
> -	pthread_cleanup_push(cleanup_lock, &client_lock);
> +	pthread_cleanup_push(cleanup_mutex, &client_lock);
>  	pthread_mutex_lock(&client_lock);
>  	_dead_client(c);
>  	pthread_cleanup_pop(1);
> @@ -306,6 +307,7 @@ void * uxsock_listen(uxsock_trigger_fn uxsock_trigger, long ux_sock,
>  
>  		/* setup for a poll */
>  		pthread_mutex_lock(&client_lock);
> +		pthread_cleanup_push(cleanup_mutex, &client_lock);
>  		num_clients = 0;
>  		list_for_each_entry(c, &clients, node) {
>  			num_clients++;
> @@ -322,14 +324,13 @@ void * uxsock_listen(uxsock_trigger_fn uxsock_trigger, long ux_sock,
>  						sizeof(struct pollfd));
>  			}
>  			if (!new) {
> -				pthread_mutex_unlock(&client_lock);
>  				condlog(0, "%s: failed to realloc %d poll fds",
>  					"uxsock", 2 + num_clients);
> -				sched_yield();
> -				continue;
> +				num_clients = old_clients;

O.k. I'm getting way into the theoretical weeds here, but I believe that
realloc() is technically allowed to return NULL when it shrinks
allocated memory. In that case num_clients would be too big. Later in
this function, when we loop through num_clients entries:

                for (i = 2; i < num_clients + 2; i++) {
                        if (polls[i].revents & POLLIN) {
 
We could look at a stale polls entry, since its revents field doesn't
get cleared. It's also possible that the fd of this stale entry matches
the fd of an existing client. Then we could try to get a packet from a
client that isn't sending one, and kill that client. Yeah, this will
almost certainly never happen. But if we just zeroed out the revents
field, or looped over the actual number of structures we polled, it
couldn't happen at all.

-Ben
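
Either mitigation is a small change; an illustrative sketch of both
(editor's example only, not code from the posted patch — handle_client()
is a hypothetical stand-in for the real packet handling):

#include <poll.h>

static void poll_clients(struct pollfd *polls, int num_clients,
			 void (*handle_client)(int fd))
{
	int i, n_polled;

	/* Option 1: zero out revents left over from earlier iterations */
	for (i = 0; i < 2 + num_clients; i++)
		polls[i].revents = 0;

	/* Remember how many entries were actually handed to poll() */
	n_polled = 2 + num_clients;
	if (poll(polls, n_polled, -1) <= 0)
		return;

	/* Option 2: loop only over the entries that were really polled */
	for (i = 2; i < n_polled; i++)
		if (polls[i].revents & POLLIN)
			handle_client(polls[i].fd);
}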

> +			} else {
> +				old_clients = num_clients;
> +				polls = new;
>  			}
> -			old_clients = num_clients;
> -			polls = new;
>  		}
>  		polls[0].fd = ux_sock;
>  		polls[0].events = POLLIN;
> @@ -347,8 +348,10 @@ void * uxsock_listen(uxsock_trigger_fn uxsock_trigger, long ux_sock,
>  			polls[i].fd = c->fd;
>  			polls[i].events = POLLIN;
>  			i++;
> +			if (i >= 2 + num_clients)
> +				break;
>  		}
> -		pthread_mutex_unlock(&client_lock);
> +		pthread_cleanup_pop(1);
>  
>  		/* most of our life is spent in this call */
>  		poll_count = ppoll(polls, i, &sleep_time, &mask);
> -- 
> 2.28.0

Martin Wilck Sept. 26, 2020, 1 p.m. UTC | #2
On Fri, 2020-09-25 at 20:52 -0500, Benjamin Marzinski wrote:
> 
> O.k. I'm getting way into the theoretical weeds here, but I believe
> that
> realloc() is technically allowed to return NULL when it shrinks
> allocated memory. In this case num_clients would be too big.  Later
> in
> this function, when we loop through num_clients
> 
>                 for (i = 2; i < num_clients + 2; i++) {
>                         if (polls[i].revents & POLLIN) {
>  
> We could look at an unused polls entry, since its revents doesn't get
> cleared. It's also possible that the fd of this unused entry matches
> the
> fd of an existing client. Then we could try to get a packet from a
> client that isn't sending one, and kill that client. Yeah, this will
> almost certainly never happen.  But we could just zero out the
> revents
> field, or loop over the actual number of structures we polled, and
> then
> it can't happen.

I'll fix this up in this patch, and add another patch to sanitize this;
calling realloc() when the number of fds shrinks really isn't buying us
much.

Martin
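
For illustration, a grow-only strategy along the lines Martin hints at
could look like this (editor's sketch, under the assumption that the
follow-up simply stops shrinking the array; the actual follow-up patch
may differ):

#include <poll.h>
#include <stdlib.h>

/*
 * Grow-only reallocation: keep the larger array rather than shrinking
 * it, so realloc() is only ever asked to grow. Returns 0 on success,
 * -1 on allocation failure (the caller keeps the old, smaller array
 * and polls only the clients that still fit).
 */
static int grow_polls(struct pollfd **polls, int *n_alloced, int needed)
{
	struct pollfd *new;

	if (needed <= *n_alloced)
		return 0;	/* already big enough; nothing to do */

	new = realloc(*polls, needed * sizeof(**polls));
	if (!new)
		return -1;
	*polls = new;
	*n_alloced = needed;
	return 0;
}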



Patch

diff --git a/multipathd/uxlsnr.c b/multipathd/uxlsnr.c
index 1c5ce9d..d47ba1a 100644
--- a/multipathd/uxlsnr.c
+++ b/multipathd/uxlsnr.c
@@ -35,6 +35,7 @@ 
 #include "config.h"
 #include "mpath_cmd.h"
 #include "time-util.h"
+#include "util.h"
 
 #include "main.h"
 #include "cli.h"
@@ -116,7 +117,7 @@  static void _dead_client(struct client *c)
 
 static void dead_client(struct client *c)
 {
-	pthread_cleanup_push(cleanup_lock, &client_lock);
+	pthread_cleanup_push(cleanup_mutex, &client_lock);
 	pthread_mutex_lock(&client_lock);
 	_dead_client(c);
 	pthread_cleanup_pop(1);
@@ -306,6 +307,7 @@  void * uxsock_listen(uxsock_trigger_fn uxsock_trigger, long ux_sock,
 
 		/* setup for a poll */
 		pthread_mutex_lock(&client_lock);
+		pthread_cleanup_push(cleanup_mutex, &client_lock);
 		num_clients = 0;
 		list_for_each_entry(c, &clients, node) {
 			num_clients++;
@@ -322,14 +324,13 @@  void * uxsock_listen(uxsock_trigger_fn uxsock_trigger, long ux_sock,
 						sizeof(struct pollfd));
 			}
 			if (!new) {
-				pthread_mutex_unlock(&client_lock);
 				condlog(0, "%s: failed to realloc %d poll fds",
 					"uxsock", 2 + num_clients);
-				sched_yield();
-				continue;
+				num_clients = old_clients;
+			} else {
+				old_clients = num_clients;
+				polls = new;
 			}
-			old_clients = num_clients;
-			polls = new;
 		}
 		polls[0].fd = ux_sock;
 		polls[0].events = POLLIN;
@@ -347,8 +348,10 @@  void * uxsock_listen(uxsock_trigger_fn uxsock_trigger, long ux_sock,
 			polls[i].fd = c->fd;
 			polls[i].events = POLLIN;
 			i++;
+			if (i >= 2 + num_clients)
+				break;
 		}
-		pthread_mutex_unlock(&client_lock);
+		pthread_cleanup_pop(1);
 
 		/* most of our life is spent in this call */
 		poll_count = ppoll(polls, i, &sleep_time, &mask);