[DRAFT,2/2] ocfs2: fix deadlock caused by recursive cluster locking
diff mbox

Message ID 1476854382-28101-3-git-send-email-zren@suse.com
State New
Headers show

Commit Message

Zhen Ren Oct. 19, 2016, 5:19 a.m. UTC
The deadlock issue happens when running discontiguous block
group testing on multiple nodes. The easier way to reproduce
is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
nodes at the same time by pssh.

This is indeed another deadlock caused by: commit 743b5f1434f5
("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
had been explained well by Tariq Saeed in this thread:

https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html

For this case, the ocfs2_inode_lock() is misused recursively as below:

do_sys_open
 do_filp_open
  path_openat
   may_open
    inode_permission
     __inode_permission
      ocfs2_permission  <====== ocfs2_inode_lock()
       generic_permission
        get_acl
         ocfs2_iop_get_acl  <====== ocfs2_inode_lock()
          ocfs2_inode_lock_full_nested <===== deadlock if a remote EX request
comes between two ocfs2_inode_lock()

Fix by checking if the cluster lock has been acquired aready in the call-chain
path.

Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()")
Signed-off-by: Eric Ren <zren@suse.com>
---
 fs/ocfs2/acl.c | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

Comments

piaojun Oct. 31, 2016, 10:55 a.m. UTC | #1
Hi Eric,

On 2016-10-19 13:19, Eric Ren wrote:
> The deadlock issue happens when running discontiguous block
> group testing on multiple nodes. The easier way to reproduce
> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
> nodes at the same time by pssh.
> 
> This is indeed another deadlock caused by: commit 743b5f1434f5
> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
> had been explained well by Tariq Saeed in this thread:
> 
> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html
> 
> For this case, the ocfs2_inode_lock() is misused recursively as below:
> 
> do_sys_open
>  do_filp_open
>   path_openat
>    may_open
>     inode_permission
>      __inode_permission
>       ocfs2_permission  <====== ocfs2_inode_lock()
>        generic_permission
>         get_acl
>          ocfs2_iop_get_acl  <====== ocfs2_inode_lock()
>           ocfs2_inode_lock_full_nested <===== deadlock if a remote EX request
Do you mean another node wants to get ex of the inode? or another process?
> comes between two ocfs2_inode_lock()
> 
> Fix by checking if the cluster lock has been acquired aready in the call-chain
> path.
> 
> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()")
> Signed-off-by: Eric Ren <zren@suse.com>
> ---
>  fs/ocfs2/acl.c | 39 +++++++++++++++++++++++++++------------
>  1 file changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
> index bed1fcb..7e3544e 100644
> --- a/fs/ocfs2/acl.c
> +++ b/fs/ocfs2/acl.c
> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>  int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  {
>  	struct buffer_head *bh = NULL;
> +	struct ocfs2_holder *oh;
> +	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>  	int status = 0;
>  
> -	status = ocfs2_inode_lock(inode, &bh, 1);
> -	if (status < 0) {
> -		if (status != -ENOENT)
> -			mlog_errno(status);
> -		return status;
> +	oh = ocfs2_is_locked_by_me(lockres);
> +	if (!oh) {
> +		status = ocfs2_inode_lock(inode, &bh, 1);
> +		if (status < 0) {
> +			if (status != -ENOENT)
> +				mlog_errno(status);
> +			return status;
> +		}
>  	}
> +
>  	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
> -	ocfs2_inode_unlock(inode, 1);
> +
> +	if (!oh)
> +		ocfs2_inode_unlock(inode, 1);
>  	brelse(bh);
>  	return status;
>  }
> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
>  	struct ocfs2_super *osb;
>  	struct buffer_head *di_bh = NULL;
>  	struct posix_acl *acl;
> +	struct ocfs2_holder *oh;
> +	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>  	int ret;
>  
>  	osb = OCFS2_SB(inode->i_sb);
>  	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>  		return NULL;
> -	ret = ocfs2_inode_lock(inode, &di_bh, 0);
> -	if (ret < 0) {
> -		if (ret != -ENOENT)
> -			mlog_errno(ret);
> -		return ERR_PTR(ret);
> +
> +	oh = ocfs2_is_locked_by_me(lockres);
> +	if (!oh) {
> +		ret = ocfs2_inode_lock(inode, &di_bh, 0);
> +		if (ret < 0) {
> +			if (ret != -ENOENT)
> +				mlog_errno(ret);
> +			return ERR_PTR(ret);
> +		}
>  	}
>  
>  	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>  
> -	ocfs2_inode_unlock(inode, 0);
> +	if (!oh)
> +		ocfs2_inode_unlock(inode, 0);
>  	brelse(di_bh);
>  	return acl;
>  }
>
Zhen Ren Nov. 1, 2016, 1:45 a.m. UTC | #2
Hi,

On 10/31/2016 06:55 PM, piaojun wrote:
> Hi Eric,
>
> On 2016-10-19 13:19, Eric Ren wrote:
>> The deadlock issue happens when running discontiguous block
>> group testing on multiple nodes. The easier way to reproduce
>> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
>> nodes at the same time by pssh.
>>
>> This is indeed another deadlock caused by: commit 743b5f1434f5
>> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
>> had been explained well by Tariq Saeed in this thread:
>>
>> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html
>>
>> For this case, the ocfs2_inode_lock() is misused recursively as below:
>>
>> do_sys_open
>>   do_filp_open
>>    path_openat
>>     may_open
>>      inode_permission
>>       __inode_permission
>>        ocfs2_permission  <====== ocfs2_inode_lock()
>>         generic_permission
>>          get_acl
>>           ocfs2_iop_get_acl  <====== ocfs2_inode_lock()
>>            ocfs2_inode_lock_full_nested <===== deadlock if a remote EX request
> Do you mean another node wants to get ex of the inode? or another process?
Remote EX request means "another node wants to get ex of the inode";-)

Eric
>> comes between two ocfs2_inode_lock()
>>
>> Fix by checking if the cluster lock has been acquired aready in the call-chain
>> path.
>>
>> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()")
>> Signed-off-by: Eric Ren <zren@suse.com>
>> ---
>>   fs/ocfs2/acl.c | 39 +++++++++++++++++++++++++++------------
>>   1 file changed, 27 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
>> index bed1fcb..7e3544e 100644
>> --- a/fs/ocfs2/acl.c
>> +++ b/fs/ocfs2/acl.c
>> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>>   int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>   {
>>   	struct buffer_head *bh = NULL;
>> +	struct ocfs2_holder *oh;
>> +	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>   	int status = 0;
>>   
>> -	status = ocfs2_inode_lock(inode, &bh, 1);
>> -	if (status < 0) {
>> -		if (status != -ENOENT)
>> -			mlog_errno(status);
>> -		return status;
>> +	oh = ocfs2_is_locked_by_me(lockres);
>> +	if (!oh) {
>> +		status = ocfs2_inode_lock(inode, &bh, 1);
>> +		if (status < 0) {
>> +			if (status != -ENOENT)
>> +				mlog_errno(status);
>> +			return status;
>> +		}
>>   	}
>> +
>>   	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
>> -	ocfs2_inode_unlock(inode, 1);
>> +
>> +	if (!oh)
>> +		ocfs2_inode_unlock(inode, 1);
>>   	brelse(bh);
>>   	return status;
>>   }
>> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
>>   	struct ocfs2_super *osb;
>>   	struct buffer_head *di_bh = NULL;
>>   	struct posix_acl *acl;
>> +	struct ocfs2_holder *oh;
>> +	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>   	int ret;
>>   
>>   	osb = OCFS2_SB(inode->i_sb);
>>   	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>>   		return NULL;
>> -	ret = ocfs2_inode_lock(inode, &di_bh, 0);
>> -	if (ret < 0) {
>> -		if (ret != -ENOENT)
>> -			mlog_errno(ret);
>> -		return ERR_PTR(ret);
>> +
>> +	oh = ocfs2_is_locked_by_me(lockres);
>> +	if (!oh) {
>> +		ret = ocfs2_inode_lock(inode, &di_bh, 0);
>> +		if (ret < 0) {
>> +			if (ret != -ENOENT)
>> +				mlog_errno(ret);
>> +			return ERR_PTR(ret);
>> +		}
>>   	}
>>   
>>   	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>>   
>> -	ocfs2_inode_unlock(inode, 0);
>> +	if (!oh)
>> +		ocfs2_inode_unlock(inode, 0);
>>   	brelse(di_bh);
>>   	return acl;
>>   }
>>
>
Zhen Ren Nov. 9, 2016, 4:55 a.m. UTC | #3
Hi all,

On 10/19/2016 01:19 PM, Eric Ren wrote:
> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
> index bed1fcb..7e3544e 100644
> --- a/fs/ocfs2/acl.c
> +++ b/fs/ocfs2/acl.c
> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>   int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>   {
>   	struct buffer_head *bh = NULL;
> +	struct ocfs2_holder *oh;
> +	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>   	int status = 0;
>   
> -	status = ocfs2_inode_lock(inode, &bh, 1);
> -	if (status < 0) {
> -		if (status != -ENOENT)
> -			mlog_errno(status);
> -		return status;
> +	oh = ocfs2_is_locked_by_me(lockres);
> +	if (!oh) {
> +		status = ocfs2_inode_lock(inode, &bh, 1);
> +		if (status < 0) {
> +			if (status != -ENOENT)
> +				mlog_errno(status);
> +			return status;
> +		}
>   	}
This is wrong. We also depend ocfs2_inode_lock() pass out "bh" for later use.

So, we may need another function something like ocfs2_inode_getbh():
      if (!oh)
         ocfs2_inode_lock();
    else
        ocfs2_inode_getbh();

Eric
> +
>   	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
> -	ocfs2_inode_unlock(inode, 1);
> +
> +	if (!oh)
> +		ocfs2_inode_unlock(inode, 1);
>   	brelse(bh);
>   	return status;
>   }
> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
>   	struct ocfs2_super *osb;
>   	struct buffer_head *di_bh = NULL;
>   	struct posix_acl *acl;
> +	struct ocfs2_holder *oh;
> +	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>   	int ret;
>   
>   	osb = OCFS2_SB(inode->i_sb);
>   	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>   		return NULL;
> -	ret = ocfs2_inode_lock(inode, &di_bh, 0);
> -	if (ret < 0) {
> -		if (ret != -ENOENT)
> -			mlog_errno(ret);
> -		return ERR_PTR(ret);
> +
> +	oh = ocfs2_is_locked_by_me(lockres);
> +	if (!oh) {
> +		ret = ocfs2_inode_lock(inode, &di_bh, 0);
> +		if (ret < 0) {
> +			if (ret != -ENOENT)
> +				mlog_errno(ret);
> +			return ERR_PTR(ret);
> +		}
>   	}
>   
>   	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>   
> -	ocfs2_inode_unlock(inode, 0);
> +	if (!oh)
> +		ocfs2_inode_unlock(inode, 0);
>   	brelse(di_bh);
>   	return acl;
>   }
piaojun Nov. 10, 2016, 10:49 a.m. UTC | #4
Hi Eric,

On 2016-11-1 9:45, Eric Ren wrote:
> Hi,
> 
> On 10/31/2016 06:55 PM, piaojun wrote:
>> Hi Eric,
>>
>> On 2016-10-19 13:19, Eric Ren wrote:
>>> The deadlock issue happens when running discontiguous block
>>> group testing on multiple nodes. The easier way to reproduce
>>> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
>>> nodes at the same time by pssh.
>>>
>>> This is indeed another deadlock caused by: commit 743b5f1434f5
>>> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
>>> had been explained well by Tariq Saeed in this thread:
>>>
>>> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html
>>>
>>> For this case, the ocfs2_inode_lock() is misused recursively as below:
>>>
>>> do_sys_open
>>>   do_filp_open
>>>    path_openat
>>>     may_open
>>>      inode_permission
>>>       __inode_permission
>>>        ocfs2_permission  <====== ocfs2_inode_lock()
>>>         generic_permission
>>>          get_acl
>>>           ocfs2_iop_get_acl  <====== ocfs2_inode_lock()
>>>            ocfs2_inode_lock_full_nested <===== deadlock if a remote EX request
>> Do you mean another node wants to get ex of the inode? or another process?
> Remote EX request means "another node wants to get ex of the inode";-)
> 
> Eric
If another node wants to get ex, it will get blocked as this node has
got pr. Why will the ex request make this node get blocked? Expect your
detailed description.

thanks,
Jun
>>> comes between two ocfs2_inode_lock()
>>>
>>> Fix by checking if the cluster lock has been acquired aready in the call-chain
>>> path.
>>>
>>> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()")
>>> Signed-off-by: Eric Ren <zren@suse.com>
>>> ---
>>>   fs/ocfs2/acl.c | 39 +++++++++++++++++++++++++++------------
>>>   1 file changed, 27 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
>>> index bed1fcb..7e3544e 100644
>>> --- a/fs/ocfs2/acl.c
>>> +++ b/fs/ocfs2/acl.c
>>> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>>>   int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>>   {
>>>       struct buffer_head *bh = NULL;
>>> +    struct ocfs2_holder *oh;
>>> +    struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>>       int status = 0;
>>>   -    status = ocfs2_inode_lock(inode, &bh, 1);
>>> -    if (status < 0) {
>>> -        if (status != -ENOENT)
>>> -            mlog_errno(status);
>>> -        return status;
>>> +    oh = ocfs2_is_locked_by_me(lockres);
>>> +    if (!oh) {
>>> +        status = ocfs2_inode_lock(inode, &bh, 1);
>>> +        if (status < 0) {
>>> +            if (status != -ENOENT)
>>> +                mlog_errno(status);
>>> +            return status;
>>> +        }
>>>       }
>>> +
>>>       status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
>>> -    ocfs2_inode_unlock(inode, 1);
>>> +
>>> +    if (!oh)
>>> +        ocfs2_inode_unlock(inode, 1);
>>>       brelse(bh);
>>>       return status;
>>>   }
>>> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
>>>       struct ocfs2_super *osb;
>>>       struct buffer_head *di_bh = NULL;
>>>       struct posix_acl *acl;
>>> +    struct ocfs2_holder *oh;
>>> +    struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>>       int ret;
>>>         osb = OCFS2_SB(inode->i_sb);
>>>       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>>>           return NULL;
>>> -    ret = ocfs2_inode_lock(inode, &di_bh, 0);
>>> -    if (ret < 0) {
>>> -        if (ret != -ENOENT)
>>> -            mlog_errno(ret);
>>> -        return ERR_PTR(ret);
>>> +
>>> +    oh = ocfs2_is_locked_by_me(lockres);
>>> +    if (!oh) {
>>> +        ret = ocfs2_inode_lock(inode, &di_bh, 0);
>>> +        if (ret < 0) {
>>> +            if (ret != -ENOENT)
>>> +                mlog_errno(ret);
>>> +            return ERR_PTR(ret);
>>> +        }
>>>       }
>>>         acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>>>   -    ocfs2_inode_unlock(inode, 0);
>>> +    if (!oh)
>>> +        ocfs2_inode_unlock(inode, 0);
>>>       brelse(di_bh);
>>>       return acl;
>>>   }
>>>
>>
> 
> 
> .
>
Zhen Ren Nov. 11, 2016, 1:56 a.m. UTC | #5
Hi,

On 11/10/2016 06:49 PM, piaojun wrote:
> Hi Eric,
>
> On 2016-11-1 9:45, Eric Ren wrote:
>> Hi,
>>
>> On 10/31/2016 06:55 PM, piaojun wrote:
>>> Hi Eric,
>>>
>>> On 2016-10-19 13:19, Eric Ren wrote:
>>>> The deadlock issue happens when running discontiguous block
>>>> group testing on multiple nodes. The easier way to reproduce
>>>> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
>>>> nodes at the same time by pssh.
>>>>
>>>> This is indeed another deadlock caused by: commit 743b5f1434f5
>>>> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
>>>> had been explained well by Tariq Saeed in this thread:
>>>>
>>>> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html
>>>>
>>>> For this case, the ocfs2_inode_lock() is misused recursively as below:
>>>>
>>>> do_sys_open
>>>>    do_filp_open
>>>>     path_openat
>>>>      may_open
>>>>       inode_permission
>>>>        __inode_permission
>>>>         ocfs2_permission  <====== ocfs2_inode_lock()
>>>>          generic_permission
>>>>           get_acl
>>>>            ocfs2_iop_get_acl  <====== ocfs2_inode_lock()
>>>>             ocfs2_inode_lock_full_nested <===== deadlock if a remote EX request
>>> Do you mean another node wants to get ex of the inode? or another process?
>> Remote EX request means "another node wants to get ex of the inode";-)
>>
>> Eric
> If another node wants to get ex, it will get blocked as this node has
> got pr. Why will the ex request make this node get blocked? Expect your
> detailed description.
Did you look at this link I mentioned above?

OCFS2_LOCK_BLOCKED flag of this lockres is set in BAST (ocfs2_generic_handle_bast) when 
downconvert is needed
on behalf of remote lock request.

The recursive cluster lock (the second one) will be blocked in __ocfs2_cluster_lock() 
because of OCFS2_LOCK_BLOCKED.
But the downconvert cannot be done, why? because there is no chance for the first cluster 
lock on this node to be unlocked -
we blocked ourselves in the code path.

Eric
>
> thanks,
> Jun
>>>> comes between two ocfs2_inode_lock()
>>>>
>>>> Fix by checking if the cluster lock has been acquired aready in the call-chain
>>>> path.
>>>>
>>>> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()")
>>>> Signed-off-by: Eric Ren <zren@suse.com>
>>>> ---
>>>>    fs/ocfs2/acl.c | 39 +++++++++++++++++++++++++++------------
>>>>    1 file changed, 27 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
>>>> index bed1fcb..7e3544e 100644
>>>> --- a/fs/ocfs2/acl.c
>>>> +++ b/fs/ocfs2/acl.c
>>>> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>>>>    int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>>>    {
>>>>        struct buffer_head *bh = NULL;
>>>> +    struct ocfs2_holder *oh;
>>>> +    struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>>>        int status = 0;
>>>>    -    status = ocfs2_inode_lock(inode, &bh, 1);
>>>> -    if (status < 0) {
>>>> -        if (status != -ENOENT)
>>>> -            mlog_errno(status);
>>>> -        return status;
>>>> +    oh = ocfs2_is_locked_by_me(lockres);
>>>> +    if (!oh) {
>>>> +        status = ocfs2_inode_lock(inode, &bh, 1);
>>>> +        if (status < 0) {
>>>> +            if (status != -ENOENT)
>>>> +                mlog_errno(status);
>>>> +            return status;
>>>> +        }
>>>>        }
>>>> +
>>>>        status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
>>>> -    ocfs2_inode_unlock(inode, 1);
>>>> +
>>>> +    if (!oh)
>>>> +        ocfs2_inode_unlock(inode, 1);
>>>>        brelse(bh);
>>>>        return status;
>>>>    }
>>>> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
>>>>        struct ocfs2_super *osb;
>>>>        struct buffer_head *di_bh = NULL;
>>>>        struct posix_acl *acl;
>>>> +    struct ocfs2_holder *oh;
>>>> +    struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>>>        int ret;
>>>>          osb = OCFS2_SB(inode->i_sb);
>>>>        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>>>>            return NULL;
>>>> -    ret = ocfs2_inode_lock(inode, &di_bh, 0);
>>>> -    if (ret < 0) {
>>>> -        if (ret != -ENOENT)
>>>> -            mlog_errno(ret);
>>>> -        return ERR_PTR(ret);
>>>> +
>>>> +    oh = ocfs2_is_locked_by_me(lockres);
>>>> +    if (!oh) {
>>>> +        ret = ocfs2_inode_lock(inode, &di_bh, 0);
>>>> +        if (ret < 0) {
>>>> +            if (ret != -ENOENT)
>>>> +                mlog_errno(ret);
>>>> +            return ERR_PTR(ret);
>>>> +        }
>>>>        }
>>>>          acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>>>>    -    ocfs2_inode_unlock(inode, 0);
>>>> +    if (!oh)
>>>> +        ocfs2_inode_unlock(inode, 0);
>>>>        brelse(di_bh);
>>>>        return acl;
>>>>    }
>>>>
>>
>> .
>>
>
piaojun Nov. 14, 2016, 5:42 a.m. UTC | #6
Hi Eric,

On 2016-11-11 9:56, Eric Ren wrote:
> Hi,
> 
> On 11/10/2016 06:49 PM, piaojun wrote:
>> Hi Eric,
>>
>> On 2016-11-1 9:45, Eric Ren wrote:
>>> Hi,
>>>
>>> On 10/31/2016 06:55 PM, piaojun wrote:
>>>> Hi Eric,
>>>>
>>>> On 2016-10-19 13:19, Eric Ren wrote:
>>>>> The deadlock issue happens when running discontiguous block
>>>>> group testing on multiple nodes. The easier way to reproduce
>>>>> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
>>>>> nodes at the same time by pssh.
>>>>>
>>>>> This is indeed another deadlock caused by: commit 743b5f1434f5
>>>>> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
>>>>> had been explained well by Tariq Saeed in this thread:
>>>>>
>>>>> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html
>>>>>
>>>>> For this case, the ocfs2_inode_lock() is misused recursively as below:
>>>>>
>>>>> do_sys_open
>>>>>    do_filp_open
>>>>>     path_openat
>>>>>      may_open
>>>>>       inode_permission
>>>>>        __inode_permission
>>>>>         ocfs2_permission  <====== ocfs2_inode_lock()
>>>>>          generic_permission
>>>>>           get_acl
>>>>>            ocfs2_iop_get_acl  <====== ocfs2_inode_lock()
>>>>>             ocfs2_inode_lock_full_nested <===== deadlock if a remote EX request
>>>> Do you mean another node wants to get ex of the inode? or another process?
>>> Remote EX request means "another node wants to get ex of the inode";-)
>>>
>>> Eric
>> If another node wants to get ex, it will get blocked as this node has
>> got pr. Why will the ex request make this node get blocked? Expect your
>> detailed description.
> Did you look at this link I mentioned above?
> 
> OCFS2_LOCK_BLOCKED flag of this lockres is set in BAST (ocfs2_generic_handle_bast) when downconvert is needed
> on behalf of remote lock request.
> 
> The recursive cluster lock (the second one) will be blocked in __ocfs2_cluster_lock() because of OCFS2_LOCK_BLOCKED.
> But the downconvert cannot be done, why? because there is no chance for the first cluster lock on this node to be unlocked -
> we blocked ourselves in the code path.
> 
> Eric
You clear my doubt. I will look through your solution.

thanks
Jun
>>
>> thanks,
>> Jun
>>>>> comes between two ocfs2_inode_lock()
>>>>>
>>>>> Fix by checking if the cluster lock has been acquired aready in the call-chain
>>>>> path.
>>>>>
>>>>> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()")
>>>>> Signed-off-by: Eric Ren <zren@suse.com>
>>>>> ---
>>>>>    fs/ocfs2/acl.c | 39 +++++++++++++++++++++++++++------------
>>>>>    1 file changed, 27 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
>>>>> index bed1fcb..7e3544e 100644
>>>>> --- a/fs/ocfs2/acl.c
>>>>> +++ b/fs/ocfs2/acl.c
>>>>> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>>>>>    int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>>>>    {
>>>>>        struct buffer_head *bh = NULL;
>>>>> +    struct ocfs2_holder *oh;
>>>>> +    struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>>>>        int status = 0;
>>>>>    -    status = ocfs2_inode_lock(inode, &bh, 1);
>>>>> -    if (status < 0) {
>>>>> -        if (status != -ENOENT)
>>>>> -            mlog_errno(status);
>>>>> -        return status;
>>>>> +    oh = ocfs2_is_locked_by_me(lockres);
>>>>> +    if (!oh) {
>>>>> +        status = ocfs2_inode_lock(inode, &bh, 1);
>>>>> +        if (status < 0) {
>>>>> +            if (status != -ENOENT)
>>>>> +                mlog_errno(status);
>>>>> +            return status;
>>>>> +        }
>>>>>        }
>>>>> +
>>>>>        status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
>>>>> -    ocfs2_inode_unlock(inode, 1);
>>>>> +
>>>>> +    if (!oh)
>>>>> +        ocfs2_inode_unlock(inode, 1);
>>>>>        brelse(bh);
>>>>>        return status;
>>>>>    }
>>>>> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
>>>>>        struct ocfs2_super *osb;
>>>>>        struct buffer_head *di_bh = NULL;
>>>>>        struct posix_acl *acl;
>>>>> +    struct ocfs2_holder *oh;
>>>>> +    struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
>>>>>        int ret;
>>>>>          osb = OCFS2_SB(inode->i_sb);
>>>>>        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>>>>>            return NULL;
>>>>> -    ret = ocfs2_inode_lock(inode, &di_bh, 0);
>>>>> -    if (ret < 0) {
>>>>> -        if (ret != -ENOENT)
>>>>> -            mlog_errno(ret);
>>>>> -        return ERR_PTR(ret);
>>>>> +
>>>>> +    oh = ocfs2_is_locked_by_me(lockres);
>>>>> +    if (!oh) {
>>>>> +        ret = ocfs2_inode_lock(inode, &di_bh, 0);
>>>>> +        if (ret < 0) {
>>>>> +            if (ret != -ENOENT)
>>>>> +                mlog_errno(ret);
>>>>> +            return ERR_PTR(ret);
>>>>> +        }
>>>>>        }
>>>>>          acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>>>>>    -    ocfs2_inode_unlock(inode, 0);
>>>>> +    if (!oh)
>>>>> +        ocfs2_inode_unlock(inode, 0);
>>>>>        brelse(di_bh);
>>>>>        return acl;
>>>>>    }
>>>>>
>>>
>>> .
>>>
>>
> 
> 
> .
>
Zhen Ren Nov. 14, 2016, 10:03 a.m. UTC | #7
Hi,

On 11/14/2016 01:42 PM, piaojun wrote:
> Hi Eric,
>
>
> OCFS2_LOCK_BLOCKED flag of this lockres is set in BAST (ocfs2_generic_handle_bast) when downconvert is needed
> on behalf of remote lock request.
>
> The recursive cluster lock (the second one) will be blocked in __ocfs2_cluster_lock() because of OCFS2_LOCK_BLOCKED.
> But the downconvert cannot be done, why? because there is no chance for the first cluster lock on this node to be unlocked -
> we blocked ourselves in the code path.
>
> Eric
> You clear my doubt. I will look through your solution.

Thanks for your attention. Actually, I tried different versions of draft patch locally. 
Either of them can satisfy myself so far.
Some rules I'd like to follow:
1) check and avoid recursive cluster locking, rather than allow it which Junxiao had tried 
before;
2) Just keep track of lock resource that meets the following requirements:
      a. normal inodes (non systemfile);
      b. inode metadata lockres (not open, rw lockres);
why? to avoid more special cluster locking usecases, like journal systemfile, "LOST+FOUND" 
open lockres, that lock/unlock
operations are performed by different processes, making tracking task more tricky.
3) There is another problem if we follow "check + avoid" pattern, which I have mentioned in 
this thread:
"""
This is wrong. We also depend ocfs2_inode_lock() pass out "bh" for later use.

So, we may need another function something like ocfs2_inode_getbh():
      if (!oh)
         ocfs2_inode_lock();
    else
        ocfs2_inode_getbh();
"""

Hope we can work out a nice solution for this tricky issue ;-)

Eric

>
Zhen Ren Nov. 15, 2016, 2:13 a.m. UTC | #8
Hi,
> Thanks for your attention. Actually, I tried different versions of draft patch locally.
> Either of them can satisfy myself so far.

Sorry, I meat "neither of them".

Eric
> Some rules I'd like to follow:
> 1) check and avoid recursive cluster locking, rather than allow it which Junxiao had tried
> before;
> 2) Just keep track of lock resource that meets the following requirements:
>        a. normal inodes (non systemfile);
>        b. inode metadata lockres (not open, rw lockres);
> why? to avoid more special cluster locking usecases, like journal systemfile, "LOST+FOUND"
> open lockres, that lock/unlock
> operations are performed by different processes, making tracking task more tricky.
> 3) There is another problem if we follow "check + avoid" pattern, which I have mentioned in
> this thread:
> """
> This is wrong. We also depend ocfs2_inode_lock() pass out "bh" for later use.
>
> So, we may need another function something like ocfs2_inode_getbh():
>        if (!oh)
>           ocfs2_inode_lock();
>      else
>          ocfs2_inode_getbh();
> """
>
> Hope we can work out a nice solution for this tricky issue ;-)
>
> Eric
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>

Patch
diff mbox

diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index bed1fcb..7e3544e 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -283,16 +283,24 @@  int ocfs2_set_acl(handle_t *handle,
 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	struct buffer_head *bh = NULL;
+	struct ocfs2_holder *oh;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	int status = 0;
 
-	status = ocfs2_inode_lock(inode, &bh, 1);
-	if (status < 0) {
-		if (status != -ENOENT)
-			mlog_errno(status);
-		return status;
+	oh = ocfs2_is_locked_by_me(lockres);
+	if (!oh) {
+		status = ocfs2_inode_lock(inode, &bh, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			return status;
+		}
 	}
+
 	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
-	ocfs2_inode_unlock(inode, 1);
+
+	if (!oh)
+		ocfs2_inode_unlock(inode, 1);
 	brelse(bh);
 	return status;
 }
@@ -302,21 +310,28 @@  struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 	struct ocfs2_super *osb;
 	struct buffer_head *di_bh = NULL;
 	struct posix_acl *acl;
+	struct ocfs2_holder *oh;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	int ret;
 
 	osb = OCFS2_SB(inode->i_sb);
 	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
 		return NULL;
-	ret = ocfs2_inode_lock(inode, &di_bh, 0);
-	if (ret < 0) {
-		if (ret != -ENOENT)
-			mlog_errno(ret);
-		return ERR_PTR(ret);
+
+	oh = ocfs2_is_locked_by_me(lockres);
+	if (!oh) {
+		ret = ocfs2_inode_lock(inode, &di_bh, 0);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				mlog_errno(ret);
+			return ERR_PTR(ret);
+		}
 	}
 
 	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
 
-	ocfs2_inode_unlock(inode, 0);
+	if (!oh)
+		ocfs2_inode_unlock(inode, 0);
 	brelse(di_bh);
 	return acl;
 }