diff mbox series

libtirpc deadlocks and race conditions patch

Message ID ec9a624b-d08d-d623-2e93-705a986f7422@cfa.harvard.edu (mailing list archive)
State New, archived
Headers show
Series libtirpc deadlocks and race conditions patch | expand

Commit Message

Attila Kovacs July 20, 2022, 7:20 p.m. UTC
Hi,

Attached is the patch for the previously mentioned issues of 
clnt_create() deadlocking on errors, and cond_signal() being unprotected 
against asyncronous calls to clnt_destroy().

cheers,

-- A.


On 7/20/22 14:16, Attila Kovacs wrote:
> 
> More mutex issues -- this time more likely to do with the connect 
> deadlock we see:
> 
> in clnt_dg.c, in clnt_dg_create():
> 
> 
> clnt_fd_lock is locked on L171, but then not released if jumping to the 
> err1 label on an error (L175 and L180). This means that those errors 
> will deadlock all further operations that require clnt_fd_lock access.
> 
> Same in clnt_vc.c in clnt_vc_create, on lines 215, 222, and 230 
> respectively.
> 
> -- A.
> 
> 
> 
> 
> 
> On 7/20/22 12:09, Attila Kovacs wrote:
>> Hi Steve,
>>
>>
>> We are using the tirpc library in an MT environment. We are 
>> experiencing occasional deadlocks when calling clnt_create_timed() 
>> concurrently from multiple threads. When this happens, all connect 
>> threads hang, with each thread taking up 100% CPU.
>>
>> I was peeking at the code (release 1.3.2), and some potential problems 
>> I see is how waiting / signaling is implemented on cu->cu_fd_lock.cv 
>> in clnt_dg.c and clnt_vc.c.
>>
>> 1. In cnlt_dg_freeres() and clnt_vc_freeres(), cond_signal() is called 
>> after unlocking the mutex (clnt_fd_lock). The manual of 
>> pthread_cond_signal() allows that, but mentions that for consistent 
>> scheduling, cond_signal() should be called with the waiting mutex 
>> locked.  (i.e. lock -> signal -> unlock, rather than lock -> unlock -> 
>> signal).
>>
>> One of the dangers of signaling with the unlocked mutex, is that in 
>> MT, another thread can lock the mutex before the signal call is made, 
>> and potentially destroy the condition variable (e.g. an asynchronous 
>> call to clnt_*_destroy()). Thus, by the time the signal() is called in 
>> clnt*_freeres(), both the condition may be invalid.
>>
>> The proper sequence here should be:
>>
>>      [...]
>>      mutex_lock(&clnt_fd_lock);
>>      while (ct->ct_fd_lock->active)
>>          cond_wait(&ct->ct_fd_lock->cv, &clnt_fd_lock);
>>      xdrs->x_op = XDR_FREE;
>>      dummy = (*xdr_res)(xdrs, res_ptr);
>>      thr_sigsetmask(SIG_SETMASK, &(mask), NULL);
>>      cond_signal(&ct->ct_fd_lock->cv);
>>      mutex_unlock(&clnt_fd_lock);
>>
>>
>> 2. Similar issue in the macro release_fd_lock() in both the vc and dg 
>> sources. Here again the sequence ought to be:
>>
>> #define release_fd_lock(fd_lock, mask) {    \
>>      mutex_lock(&clnt_fd_lock);    \
>>      fd_lock->active = FALSE;    \
>>      thr_sigsetmask(SIG_SETMASK, &(mask), (sigset_t *) NULL); \
>>      cond_signal(&fd_lock->cv);    \
>>      mutex_unlock(&clnt_fd_lock);    \
>> }
>>
>>
>> 3. The use of cond_wait() in clnt_dg.c and clnt_vc.c is also 
>> unprotected against the asynchronous destruction of the condition 
>> variable, since cond_wait() releases the mutex as it enters the 
>> waiting state. So, when it is called again in the while() loop, the 
>> condtion might no longer exist. Thus, before calling cond_wait(), one 
>> should check that the client is valid (not destroyed) inside the wait 
>> loop.
>>
>>
>> I'm not sure if these particular issues are the cause of the deadlocks 
>> we see, but I think they are problematic enough on their own, and 
>> perhaps should be fixed in the next release.
>>
>> Thanks,
>>
>> -- Attila
diff mbox series

Patch

diff --git a/src/clnt_dg.c b/src/clnt_dg.c
index b3d82e7..7c5d22e 100644
--- a/src/clnt_dg.c
+++ b/src/clnt_dg.c
@@ -101,9 +101,9 @@  extern mutex_t clnt_fd_lock;
 #define	release_fd_lock(fd_lock, mask) {	\
 	mutex_lock(&clnt_fd_lock);	\
 	fd_lock->active = FALSE;	\
-	mutex_unlock(&clnt_fd_lock);	\
 	thr_sigsetmask(SIG_SETMASK, &(mask), NULL); \
 	cond_signal(&fd_lock->cv);	\
+	mutex_unlock(&clnt_fd_lock);    \
 }
 
 static const char mem_err_clnt_dg[] = "clnt_dg_create: out of memory";
@@ -172,12 +172,15 @@  clnt_dg_create(fd, svcaddr, program, version, sendsz, recvsz)
 	if (dg_fd_locks == (fd_locks_t *) NULL) {
 		dg_fd_locks = fd_locks_init();
 		if (dg_fd_locks == (fd_locks_t *) NULL) {
+			mutex_unlock(&clnt_fd_lock);
 			goto err1;
 		}
 	}
 	fd_lock = fd_lock_create(fd, dg_fd_locks);
-	if (fd_lock == (fd_lock_t *) NULL)
+	if (fd_lock == (fd_lock_t *) NULL) {
+		mutex_unlock(&clnt_fd_lock);
 		goto err1;
+	}
 
 	mutex_unlock(&clnt_fd_lock);
 	thr_sigsetmask(SIG_SETMASK, &(mask), NULL);
@@ -573,9 +576,9 @@  clnt_dg_freeres(cl, xdr_res, res_ptr)
 	cu->cu_fd_lock->active = TRUE;
 	xdrs->x_op = XDR_FREE;
 	dummy = (*xdr_res)(xdrs, res_ptr);
-	mutex_unlock(&clnt_fd_lock);
 	thr_sigsetmask(SIG_SETMASK, &mask, NULL);
 	cond_signal(&cu->cu_fd_lock->cv);
+	mutex_unlock(&clnt_fd_lock);
 	return (dummy);
 }
 
diff --git a/src/clnt_vc.c b/src/clnt_vc.c
index a07e297..3c73e65 100644
--- a/src/clnt_vc.c
+++ b/src/clnt_vc.c
@@ -153,9 +153,9 @@  extern mutex_t  clnt_fd_lock;
 #define release_fd_lock(fd_lock, mask) {	\
 	mutex_lock(&clnt_fd_lock);	\
 	fd_lock->active = FALSE;	\
-	mutex_unlock(&clnt_fd_lock);	\
 	thr_sigsetmask(SIG_SETMASK, &(mask), (sigset_t *) NULL);	\
 	cond_signal(&fd_lock->cv);	\
+	mutex_unlock(&clnt_fd_lock);    \
 }
 
 static const char clnt_vc_errstr[] = "%s : %s";
@@ -216,7 +216,9 @@  clnt_vc_create(fd, raddr, prog, vers, sendsz, recvsz)
 	if (vc_fd_locks == (fd_locks_t *) NULL) {
 		vc_fd_locks = fd_locks_init();
 		if (vc_fd_locks == (fd_locks_t *) NULL) {
-			struct rpc_createerr *ce = &get_rpc_createerr();
+			struct rpc_createerr *ce;
+			mutex_unlock(&clnt_fd_lock);
+			ce = &get_rpc_createerr();
 			ce->cf_stat = RPC_SYSTEMERROR;
 			ce->cf_error.re_errno = errno;
 			goto err;
@@ -224,7 +226,9 @@  clnt_vc_create(fd, raddr, prog, vers, sendsz, recvsz)
 	}
 	fd_lock = fd_lock_create(fd, vc_fd_locks);
 	if (fd_lock == (fd_lock_t *) NULL) {
-		struct rpc_createerr *ce = &get_rpc_createerr();
+		struct rpc_createerr *ce;
+		mutex_unlock(&clnt_fd_lock);
+		ce = &get_rpc_createerr();
 		ce->cf_stat = RPC_SYSTEMERROR;
 		ce->cf_error.re_errno = errno;
 		goto err;
@@ -495,9 +499,9 @@  clnt_vc_freeres(cl, xdr_res, res_ptr)
 		cond_wait(&ct->ct_fd_lock->cv, &clnt_fd_lock);
 	xdrs->x_op = XDR_FREE;
 	dummy = (*xdr_res)(xdrs, res_ptr);
-	mutex_unlock(&clnt_fd_lock);
 	thr_sigsetmask(SIG_SETMASK, &(mask), NULL);
 	cond_signal(&ct->ct_fd_lock->cv);
+	mutex_unlock(&clnt_fd_lock);
 
 	return dummy;
 }