diff mbox

nfs: fix a deadlock in nfs client initialization

Message ID 20171120214118.4240-1-smayhew@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Scott Mayhew Nov. 20, 2017, 9:41 p.m. UTC
The following deadlock can occur between a process waiting for a client
to initialize while walking the client list and another process
waiting for the nfs_clid_init_mutex so it can initialize that client:

Process 1                               Process 2
---------                               ---------
spin_lock(&nn->nfs_client_lock);
list_add_tail(&CLIENTA->cl_share_link,
        &nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
                                        spin_lock(&nn->nfs_client_lock);
                                        list_add_tail(&CLIENTB->cl_share_link,
                                                &nn->nfs_client_list);
                                        spin_unlock(&nn->nfs_client_lock);
                                        mutex_lock(&nfs_clid_init_mutex);
                                        nfs41_walk_client_list(clp, result, cred);
                                        nfs_wait_client_init_complete(CLIENTA);
(waiting for nfs_clid_init_mutex)

Add and initialize the client with the nfs_clid_init_mutex held in
order to prevent that deadlock.

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
---
 fs/nfs/client.c    | 21 +++++++++++++++++++--
 fs/nfs/nfs4state.c |  4 ----
 2 files changed, 19 insertions(+), 6 deletions(-)

Comments

Schumaker, Anna Nov. 29, 2017, 8:16 p.m. UTC | #1
Hi Scott,

On 11/20/2017 04:41 PM, Scott Mayhew wrote:
> The following deadlock can occur between a process waiting for a client
> to initialize in while walking the client list and another process
> waiting for the nfs_clid_init_mutex so it can initialize that client:
> 
> Process 1                               Process 2
> ---------                               ---------
> spin_lock(&nn->nfs_client_lock);
> list_add_tail(&CLIENTA->cl_share_link,
>         &nn->nfs_client_list);
> spin_unlock(&nn->nfs_client_lock);
>                                         spin_lock(&nn->nfs_client_lock);
>                                         list_add_tail(&CLIENTB->cl_share_link,
>                                                 &nn->nfs_client_list);
>                                         spin_unlock(&nn->nfs_client_lock);
>                                         mutex_lock(&nfs_clid_init_mutex);
>                                         nfs41_walk_client_list(clp, result, cred);
>                                         nfs_wait_client_init_complete(CLIENTA);
> (waiting for nfs_clid_init_mutex)
> 
> Add and initilize the client with the nfs_clid_init_mutex held in
> order to prevent that deadlock.
> 
> Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> ---
>  fs/nfs/client.c    | 21 +++++++++++++++++++--
>  fs/nfs/nfs4state.c |  4 ----
>  2 files changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/nfs/client.c b/fs/nfs/client.c
> index 0ac2fb1..db38c47 100644
> --- a/fs/nfs/client.c
> +++ b/fs/nfs/client.c
> @@ -60,6 +60,7 @@ static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
>  static DEFINE_SPINLOCK(nfs_version_lock);
>  static DEFINE_MUTEX(nfs_version_mutex);
>  static LIST_HEAD(nfs_versions);
> +static DEFINE_MUTEX(nfs_clid_init_mutex);
>  
>  /*
>   * RPC cruft for NFS
> @@ -386,7 +387,7 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
>   */
>  struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
>  {
> -	struct nfs_client *clp, *new = NULL;
> +	struct nfs_client *clp, *new = NULL, *result = NULL;
>  	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
>  	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
>  
> @@ -407,11 +408,27 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
>  			return nfs_found_client(cl_init, clp);
>  		}
>  		if (new) {
> +			/* add and initialize the client with the
> +			 * nfs_clid_init_mutex held to prevent a deadlock
> +			 * with the server trunking detection
> +			 */
> +			spin_unlock(&nn->nfs_client_lock);
> +			mutex_lock(&nfs_clid_init_mutex);
> +			spin_lock(&nn->nfs_client_lock);
> +			clp = nfs_match_client(cl_init);
> +			if (clp) {
> +				spin_unlock(&nn->nfs_client_lock);
> +				mutex_unlock(&nfs_clid_init_mutex);
> +				new->rpc_ops->free_client(new);
> +				return nfs_found_client(cl_init, clp);
> +			}

Is there any reason you can't just grab the nfs_clid_init_mutex at the very beginning of the do/while loop, right before getting the nn->nfs_client_lock?  Then you wouldn't need to repeat the nfs_match_client() check, which might help clean the code up a bit.

Thanks,
Anna


>  			list_add_tail(&new->cl_share_link,
>  					&nn->nfs_client_list);
>  			spin_unlock(&nn->nfs_client_lock);
>  			new->cl_flags = cl_init->init_flags;
> -			return rpc_ops->init_client(new, cl_init);
> +			result = rpc_ops->init_client(new, cl_init);
> +			mutex_unlock(&nfs_clid_init_mutex);
> +			return result;
>  		}
>  
>  		spin_unlock(&nn->nfs_client_lock);
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index 54fd56d..668164e 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -77,8 +77,6 @@ const nfs4_stateid invalid_stateid = {
>  	.type = NFS4_INVALID_STATEID_TYPE,
>  };
>  
> -static DEFINE_MUTEX(nfs_clid_init_mutex);
> -
>  int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
>  {
>  	struct nfs4_setclientid_res clid = {
> @@ -2164,7 +2162,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
>  	clnt = clp->cl_rpcclient;
>  	i = 0;
>  
> -	mutex_lock(&nfs_clid_init_mutex);
>  again:
>  	status  = -ENOENT;
>  	cred = nfs4_get_clid_cred(clp);
> @@ -2232,7 +2229,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
>  	}
>  
>  out_unlock:
> -	mutex_unlock(&nfs_clid_init_mutex);
>  	dprintk("NFS: %s: status = %d\n", __func__, status);
>  	return status;
>  }
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust Nov. 29, 2017, 8:50 p.m. UTC | #2
T24gTW9uLCAyMDE3LTExLTIwIGF0IDE2OjQxIC0wNTAwLCBTY290dCBNYXloZXcgd3JvdGU6DQo+
IFRoZSBmb2xsb3dpbmcgZGVhZGxvY2sgY2FuIG9jY3VyIGJldHdlZW4gYSBwcm9jZXNzIHdhaXRp
bmcgZm9yIGENCj4gY2xpZW50DQo+IHRvIGluaXRpYWxpemUgaW4gd2hpbGUgd2Fsa2luZyB0aGUg
Y2xpZW50IGxpc3QgYW5kIGFub3RoZXIgcHJvY2Vzcw0KPiB3YWl0aW5nIGZvciB0aGUgbmZzX2Ns
aWRfaW5pdF9tdXRleCBzbyBpdCBjYW4gaW5pdGlhbGl6ZSB0aGF0IGNsaWVudDoNCj4gDQo+IFBy
b2Nlc3MgMSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBQcm9jZXNzIDINCj4gLS0tLS0t
LS0tICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIC0tLS0tLS0tLQ0KPiBzcGluX2xvY2so
Jm5uLT5uZnNfY2xpZW50X2xvY2spOw0KPiBsaXN0X2FkZF90YWlsKCZDTElFTlRBLT5jbF9zaGFy
ZV9saW5rLA0KPiAgICAgICAgICZubi0+bmZzX2NsaWVudF9saXN0KTsNCj4gc3Bpbl91bmxvY2so
Jm5uLT5uZnNfY2xpZW50X2xvY2spOw0KPiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
ICAgICAgICAgc3Bpbl9sb2NrKCZubi0NCj4gPm5mc19jbGllbnRfbG9jayk7DQo+ICAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBsaXN0X2FkZF90YWlsKCZDTElFTlRCLQ0K
PiA+Y2xfc2hhcmVfbGluaywNCj4gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
ICAgICAgICAgICAgJm5uLQ0KPiA+bmZzX2NsaWVudF9saXN0KTsNCj4gICAgICAgICAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgIHNwaW5fdW5sb2NrKCZubi0NCj4gPm5mc19jbGllbnRf
bG9jayk7DQo+ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBtdXRleF9s
b2NrKCZuZnNfY2xpZF9pbml0X211dA0KPiBleCk7DQo+ICAgICAgICAgICAgICAgICAgICAgICAg
ICAgICAgICAgICAgICAgICBuZnM0MV93YWxrX2NsaWVudF9saXN0KGNscCwNCj4gcmVzdWx0LCBj
cmVkKTsNCj4gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIG5mc193YWl0
X2NsaWVudF9pbml0X2NvbXBsZXRlDQo+IChDTElFTlRBKTsNCj4gKHdhaXRpbmcgZm9yIG5mc19j
bGlkX2luaXRfbXV0ZXgpDQo+IA0KPiBBZGQgYW5kIGluaXRpbGl6ZSB0aGUgY2xpZW50IHdpdGgg
dGhlIG5mc19jbGlkX2luaXRfbXV0ZXggaGVsZCBpbg0KPiBvcmRlciB0byBwcmV2ZW50IHRoYXQg
ZGVhZGxvY2suDQo+IA0KPiBTaWduZWQtb2ZmLWJ5OiBTY290dCBNYXloZXcgPHNtYXloZXdAcmVk
aGF0LmNvbT4NCj4gLS0tDQo+ICBmcy9uZnMvY2xpZW50LmMgICAgfCAyMSArKysrKysrKysrKysr
KysrKysrLS0NCj4gIGZzL25mcy9uZnM0c3RhdGUuYyB8ICA0IC0tLS0NCj4gIDIgZmlsZXMgY2hh
bmdlZCwgMTkgaW5zZXJ0aW9ucygrKSwgNiBkZWxldGlvbnMoLSkNCj4gDQo+IGRpZmYgLS1naXQg
YS9mcy9uZnMvY2xpZW50LmMgYi9mcy9uZnMvY2xpZW50LmMNCj4gaW5kZXggMGFjMmZiMS4uZGIz
OGM0NyAxMDA2NDQNCj4gLS0tIGEvZnMvbmZzL2NsaWVudC5jDQo+ICsrKyBiL2ZzL25mcy9jbGll
bnQuYw0KPiBAQCAtNjAsNiArNjAsNyBAQCBzdGF0aWMNCj4gREVDTEFSRV9XQUlUX1FVRVVFX0hF
QUQobmZzX2NsaWVudF9hY3RpdmVfd3EpOw0KPiAgc3RhdGljIERFRklORV9TUElOTE9DSyhuZnNf
dmVyc2lvbl9sb2NrKTsNCj4gIHN0YXRpYyBERUZJTkVfTVVURVgobmZzX3ZlcnNpb25fbXV0ZXgp
Ow0KPiAgc3RhdGljIExJU1RfSEVBRChuZnNfdmVyc2lvbnMpOw0KPiArc3RhdGljIERFRklORV9N
VVRFWChuZnNfY2xpZF9pbml0X211dGV4KTsNCj4gIA0KPiAgLyoNCj4gICAqIFJQQyBjcnVmdCBm
b3IgTkZTDQo+IEBAIC0zODYsNyArMzg3LDcgQEAgbmZzX2ZvdW5kX2NsaWVudChjb25zdCBzdHJ1
Y3QgbmZzX2NsaWVudF9pbml0ZGF0YQ0KPiAqY2xfaW5pdCwNCj4gICAqLw0KPiAgc3RydWN0IG5m
c19jbGllbnQgKm5mc19nZXRfY2xpZW50KGNvbnN0IHN0cnVjdCBuZnNfY2xpZW50X2luaXRkYXRh
DQo+ICpjbF9pbml0KQ0KPiAgew0KPiAtCXN0cnVjdCBuZnNfY2xpZW50ICpjbHAsICpuZXcgPSBO
VUxMOw0KPiArCXN0cnVjdCBuZnNfY2xpZW50ICpjbHAsICpuZXcgPSBOVUxMLCAqcmVzdWx0ID0g
TlVMTDsNCj4gIAlzdHJ1Y3QgbmZzX25ldCAqbm4gPSBuZXRfZ2VuZXJpYyhjbF9pbml0LT5uZXQs
IG5mc19uZXRfaWQpOw0KPiAgCWNvbnN0IHN0cnVjdCBuZnNfcnBjX29wcyAqcnBjX29wcyA9IGNs
X2luaXQtPm5mc19tb2QtDQo+ID5ycGNfb3BzOw0KPiAgDQo+IEBAIC00MDcsMTEgKzQwOCwyNyBA
QCBzdHJ1Y3QgbmZzX2NsaWVudCAqbmZzX2dldF9jbGllbnQoY29uc3Qgc3RydWN0DQo+IG5mc19j
bGllbnRfaW5pdGRhdGEgKmNsX2luaXQpDQo+ICAJCQlyZXR1cm4gbmZzX2ZvdW5kX2NsaWVudChj
bF9pbml0LCBjbHApOw0KPiAgCQl9DQo+ICAJCWlmIChuZXcpIHsNCj4gKwkJCS8qIGFkZCBhbmQg
aW5pdGlhbGl6ZSB0aGUgY2xpZW50IHdpdGggdGhlDQo+ICsJCQkgKiBuZnNfY2xpZF9pbml0X211
dGV4IGhlbGQgdG8gcHJldmVudCBhDQo+IGRlYWRsb2NrDQo+ICsJCQkgKiB3aXRoIHRoZSBzZXJ2
ZXIgdHJ1bmtpbmcgZGV0ZWN0aW9uDQo+ICsJCQkgKi8NCj4gKwkJCXNwaW5fdW5sb2NrKCZubi0+
bmZzX2NsaWVudF9sb2NrKTsNCj4gKwkJCW11dGV4X2xvY2soJm5mc19jbGlkX2luaXRfbXV0ZXgp
Ow0KPiArCQkJc3Bpbl9sb2NrKCZubi0+bmZzX2NsaWVudF9sb2NrKTsNCj4gKwkJCWNscCA9IG5m
c19tYXRjaF9jbGllbnQoY2xfaW5pdCk7DQo+ICsJCQlpZiAoY2xwKSB7DQo+ICsJCQkJc3Bpbl91
bmxvY2soJm5uLT5uZnNfY2xpZW50X2xvY2spOw0KPiArCQkJCW11dGV4X3VubG9jaygmbmZzX2Ns
aWRfaW5pdF9tdXRleCk7DQo+ICsJCQkJbmV3LT5ycGNfb3BzLT5mcmVlX2NsaWVudChuZXcpOw0K
PiArCQkJCXJldHVybiBuZnNfZm91bmRfY2xpZW50KGNsX2luaXQsDQo+IGNscCk7DQo+ICsJCQl9
DQo+ICAJCQlsaXN0X2FkZF90YWlsKCZuZXctPmNsX3NoYXJlX2xpbmssDQo+ICAJCQkJCSZubi0+
bmZzX2NsaWVudF9saXN0KTsNCj4gIAkJCXNwaW5fdW5sb2NrKCZubi0+bmZzX2NsaWVudF9sb2Nr
KTsNCj4gIAkJCW5ldy0+Y2xfZmxhZ3MgPSBjbF9pbml0LT5pbml0X2ZsYWdzOw0KPiAtCQkJcmV0
dXJuIHJwY19vcHMtPmluaXRfY2xpZW50KG5ldywgY2xfaW5pdCk7DQo+ICsJCQlyZXN1bHQgPSBy
cGNfb3BzLT5pbml0X2NsaWVudChuZXcsIGNsX2luaXQpOw0KPiArCQkJbXV0ZXhfdW5sb2NrKCZu
ZnNfY2xpZF9pbml0X211dGV4KTsNCj4gKwkJCXJldHVybiByZXN1bHQ7DQo+ICAJCX0NCj4gIA0K
PiAgCQlzcGluX3VubG9jaygmbm4tPm5mc19jbGllbnRfbG9jayk7DQo+IGRpZmYgLS1naXQgYS9m
cy9uZnMvbmZzNHN0YXRlLmMgYi9mcy9uZnMvbmZzNHN0YXRlLmMNCj4gaW5kZXggNTRmZDU2ZC4u
NjY4MTY0ZSAxMDA2NDQNCj4gLS0tIGEvZnMvbmZzL25mczRzdGF0ZS5jDQo+ICsrKyBiL2ZzL25m
cy9uZnM0c3RhdGUuYw0KPiBAQCAtNzcsOCArNzcsNiBAQCBjb25zdCBuZnM0X3N0YXRlaWQgaW52
YWxpZF9zdGF0ZWlkID0gew0KPiAgCS50eXBlID0gTkZTNF9JTlZBTElEX1NUQVRFSURfVFlQRSwN
Cj4gIH07DQo+ICANCj4gLXN0YXRpYyBERUZJTkVfTVVURVgobmZzX2NsaWRfaW5pdF9tdXRleCk7
DQo+IC0NCj4gIGludCBuZnM0X2luaXRfY2xpZW50aWQoc3RydWN0IG5mc19jbGllbnQgKmNscCwg
c3RydWN0IHJwY19jcmVkDQo+ICpjcmVkKQ0KPiAgew0KPiAgCXN0cnVjdCBuZnM0X3NldGNsaWVu
dGlkX3JlcyBjbGlkID0gew0KPiBAQCAtMjE2NCw3ICsyMTYyLDYgQEAgaW50IG5mczRfZGlzY292
ZXJfc2VydmVyX3RydW5raW5nKHN0cnVjdA0KPiBuZnNfY2xpZW50ICpjbHAsDQo+ICAJY2xudCA9
IGNscC0+Y2xfcnBjY2xpZW50Ow0KPiAgCWkgPSAwOw0KPiAgDQo+IC0JbXV0ZXhfbG9jaygmbmZz
X2NsaWRfaW5pdF9tdXRleCk7DQo+ICBhZ2FpbjoNCj4gIAlzdGF0dXMgID0gLUVOT0VOVDsNCj4g
IAljcmVkID0gbmZzNF9nZXRfY2xpZF9jcmVkKGNscCk7DQo+IEBAIC0yMjMyLDcgKzIyMjksNiBA
QCBpbnQgbmZzNF9kaXNjb3Zlcl9zZXJ2ZXJfdHJ1bmtpbmcoc3RydWN0DQo+IG5mc19jbGllbnQg
KmNscCwNCj4gIAl9DQo+ICANCj4gIG91dF91bmxvY2s6DQo+IC0JbXV0ZXhfdW5sb2NrKCZuZnNf
Y2xpZF9pbml0X211dGV4KTsNCj4gIAlkcHJpbnRrKCJORlM6ICVzOiBzdGF0dXMgPSAlZFxuIiwg
X19mdW5jX18sIHN0YXR1cyk7DQo+ICAJcmV0dXJuIHN0YXR1czsNCj4gIH0NCg0KWW91ciBpbml0
aWFsIGZpeCB3YXMgZmluZS4gSXQganVzdCBuZWVkZWQgMiBjaGFuZ2VzOg0KDQoxKSBSZW1vdmUg
dGhlIHRlc3QgZm9yICdjbHAtPmNsX21pbm9ydmVyc2lvbiA+IDAnLg0KMikgUmVmY291bnQgdGhl
IHN0cnVjdCBuZnNfY2xpZW50IHdoaWxlIHlvdSBhcmUgd2FpdGluZyBmb3IgaXQgdG8gYmUNCmlu
aXRpYWxpc2VkLg0KDQotLSANClRyb25kIE15a2xlYnVzdA0KTGludXggTkZTIGNsaWVudCBtYWlu
dGFpbmVyLCBQcmltYXJ5RGF0YQ0KdHJvbmQubXlrbGVidXN0QHByaW1hcnlkYXRhLmNvbQ0K

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Mayhew Nov. 30, 2017, 2:46 p.m. UTC | #3
On Wed, 29 Nov 2017, Trond Myklebust wrote:

> On Mon, 2017-11-20 at 16:41 -0500, Scott Mayhew wrote:
> > The following deadlock can occur between a process waiting for a
> > client
> > to initialize in while walking the client list and another process
> > waiting for the nfs_clid_init_mutex so it can initialize that client:
> > 
> > Process 1                               Process 2
> > ---------                               ---------
> > spin_lock(&nn->nfs_client_lock);
> > list_add_tail(&CLIENTA->cl_share_link,
> >         &nn->nfs_client_list);
> > spin_unlock(&nn->nfs_client_lock);
> >                                         spin_lock(&nn-
> > >nfs_client_lock);
> >                                         list_add_tail(&CLIENTB-
> > >cl_share_link,
> >                                                 &nn-
> > >nfs_client_list);
> >                                         spin_unlock(&nn-
> > >nfs_client_lock);
> >                                         mutex_lock(&nfs_clid_init_mut
> > ex);
> >                                         nfs41_walk_client_list(clp,
> > result, cred);
> >                                         nfs_wait_client_init_complete
> > (CLIENTA);
> > (waiting for nfs_clid_init_mutex)
> > 
> > Add and initilize the client with the nfs_clid_init_mutex held in
> > order to prevent that deadlock.
> > 
> > Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> > ---
> >  fs/nfs/client.c    | 21 +++++++++++++++++++--
> >  fs/nfs/nfs4state.c |  4 ----
> >  2 files changed, 19 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/nfs/client.c b/fs/nfs/client.c
> > index 0ac2fb1..db38c47 100644
> > --- a/fs/nfs/client.c
> > +++ b/fs/nfs/client.c
> > @@ -60,6 +60,7 @@ static
> > DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
> >  static DEFINE_SPINLOCK(nfs_version_lock);
> >  static DEFINE_MUTEX(nfs_version_mutex);
> >  static LIST_HEAD(nfs_versions);
> > +static DEFINE_MUTEX(nfs_clid_init_mutex);
> >  
> >  /*
> >   * RPC cruft for NFS
> > @@ -386,7 +387,7 @@ nfs_found_client(const struct nfs_client_initdata
> > *cl_init,
> >   */
> >  struct nfs_client *nfs_get_client(const struct nfs_client_initdata
> > *cl_init)
> >  {
> > -	struct nfs_client *clp, *new = NULL;
> > +	struct nfs_client *clp, *new = NULL, *result = NULL;
> >  	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
> >  	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod-
> > >rpc_ops;
> >  
> > @@ -407,11 +408,27 @@ struct nfs_client *nfs_get_client(const struct
> > nfs_client_initdata *cl_init)
> >  			return nfs_found_client(cl_init, clp);
> >  		}
> >  		if (new) {
> > +			/* add and initialize the client with the
> > +			 * nfs_clid_init_mutex held to prevent a
> > deadlock
> > +			 * with the server trunking detection
> > +			 */
> > +			spin_unlock(&nn->nfs_client_lock);
> > +			mutex_lock(&nfs_clid_init_mutex);
> > +			spin_lock(&nn->nfs_client_lock);
> > +			clp = nfs_match_client(cl_init);
> > +			if (clp) {
> > +				spin_unlock(&nn->nfs_client_lock);
> > +				mutex_unlock(&nfs_clid_init_mutex);
> > +				new->rpc_ops->free_client(new);
> > +				return nfs_found_client(cl_init,
> > clp);
> > +			}
> >  			list_add_tail(&new->cl_share_link,
> >  					&nn->nfs_client_list);
> >  			spin_unlock(&nn->nfs_client_lock);
> >  			new->cl_flags = cl_init->init_flags;
> > -			return rpc_ops->init_client(new, cl_init);
> > +			result = rpc_ops->init_client(new, cl_init);
> > +			mutex_unlock(&nfs_clid_init_mutex);
> > +			return result;
> >  		}
> >  
> >  		spin_unlock(&nn->nfs_client_lock);
> > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> > index 54fd56d..668164e 100644
> > --- a/fs/nfs/nfs4state.c
> > +++ b/fs/nfs/nfs4state.c
> > @@ -77,8 +77,6 @@ const nfs4_stateid invalid_stateid = {
> >  	.type = NFS4_INVALID_STATEID_TYPE,
> >  };
> >  
> > -static DEFINE_MUTEX(nfs_clid_init_mutex);
> > -
> >  int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred
> > *cred)
> >  {
> >  	struct nfs4_setclientid_res clid = {
> > @@ -2164,7 +2162,6 @@ int nfs4_discover_server_trunking(struct
> > nfs_client *clp,
> >  	clnt = clp->cl_rpcclient;
> >  	i = 0;
> >  
> > -	mutex_lock(&nfs_clid_init_mutex);
> >  again:
> >  	status  = -ENOENT;
> >  	cred = nfs4_get_clid_cred(clp);
> > @@ -2232,7 +2229,6 @@ int nfs4_discover_server_trunking(struct
> > nfs_client *clp,
> >  	}
> >  
> >  out_unlock:
> > -	mutex_unlock(&nfs_clid_init_mutex);
> >  	dprintk("NFS: %s: status = %d\n", __func__, status);
> >  	return status;
> >  }
> 
> Your initial fix was fine. It just needed 2 changes:
> 
> 1) Remove the test for 'clp->cl_minorversion > 0'.
> 2) Refcount the struct nfs_client while you are waiting for it to be
> initialised.

I guess I didn't explain it well enough in the email but I actually did
try that already and I had problems with a multi homed NFS server.
Once the first process finds and uses an existing client, who's going to
mark the old client ready so that the other processes finish waiting? 
Maybe what I need is to mark the old client ready with an error like
-EPERM or something, i.e.

---8<---
        error = nfs4_discover_server_trunking(clp, &old);
        if (error < 0)
                goto error;

        if (clp != old)
                clp->cl_preserve_clid = true;
        nfs_mark_client_ready(clp, -EPERM);
        nfs_put_client(clp);
        clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
        return old;
---8<---

I'll test that to see if that makes any difference.

-Scott
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, PrimaryData
> trond.myklebust@primarydata.com
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0ac2fb1..db38c47 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -60,6 +60,7 @@  static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
 static DEFINE_SPINLOCK(nfs_version_lock);
 static DEFINE_MUTEX(nfs_version_mutex);
 static LIST_HEAD(nfs_versions);
+static DEFINE_MUTEX(nfs_clid_init_mutex);
 
 /*
  * RPC cruft for NFS
@@ -386,7 +387,7 @@  nfs_found_client(const struct nfs_client_initdata *cl_init,
  */
 struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
 {
-	struct nfs_client *clp, *new = NULL;
+	struct nfs_client *clp, *new = NULL, *result = NULL;
 	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
 	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
 
@@ -407,11 +408,27 @@  struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
 			return nfs_found_client(cl_init, clp);
 		}
 		if (new) {
+			/* add and initialize the client with the
+			 * nfs_clid_init_mutex held to prevent a deadlock
+			 * with the server trunking detection
+			 */
+			spin_unlock(&nn->nfs_client_lock);
+			mutex_lock(&nfs_clid_init_mutex);
+			spin_lock(&nn->nfs_client_lock);
+			clp = nfs_match_client(cl_init);
+			if (clp) {
+				spin_unlock(&nn->nfs_client_lock);
+				mutex_unlock(&nfs_clid_init_mutex);
+				new->rpc_ops->free_client(new);
+				return nfs_found_client(cl_init, clp);
+			}
 			list_add_tail(&new->cl_share_link,
 					&nn->nfs_client_list);
 			spin_unlock(&nn->nfs_client_lock);
 			new->cl_flags = cl_init->init_flags;
-			return rpc_ops->init_client(new, cl_init);
+			result = rpc_ops->init_client(new, cl_init);
+			mutex_unlock(&nfs_clid_init_mutex);
+			return result;
 		}
 
 		spin_unlock(&nn->nfs_client_lock);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 54fd56d..668164e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -77,8 +77,6 @@  const nfs4_stateid invalid_stateid = {
 	.type = NFS4_INVALID_STATEID_TYPE,
 };
 
-static DEFINE_MUTEX(nfs_clid_init_mutex);
-
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	struct nfs4_setclientid_res clid = {
@@ -2164,7 +2162,6 @@  int nfs4_discover_server_trunking(struct nfs_client *clp,
 	clnt = clp->cl_rpcclient;
 	i = 0;
 
-	mutex_lock(&nfs_clid_init_mutex);
 again:
 	status  = -ENOENT;
 	cred = nfs4_get_clid_cred(clp);
@@ -2232,7 +2229,6 @@  int nfs4_discover_server_trunking(struct nfs_client *clp,
 	}
 
 out_unlock:
-	mutex_unlock(&nfs_clid_init_mutex);
 	dprintk("NFS: %s: status = %d\n", __func__, status);
 	return status;
 }