[v2] ocfs2/dlm: fix race between convert and recovery
diff mbox

Message ID 55FBF4BA.30000@huawei.com
State New
Headers show

Commit Message

Joseph Qi Sept. 18, 2015, 11:25 a.m. UTC
There is a race window between dlmconvert_remote and
dlm_move_lockres_to_recovery_list, which can leave a lock with
OCFS2_LOCK_BUSY set on the granted list, causing the system to hang.

dlmconvert_remote
{
        spin_lock(&res->spinlock);
        list_move_tail(&lock->list, &res->converting);
        lock->convert_pending = 1;
        spin_unlock(&res->spinlock);

        status = dlm_send_remote_convert_request();
        >>>>>> race window: the master has queued the ast and returned
               DLM_NORMAL, and then goes down before sending the ast.
               This node detects that the master is down and calls
               dlm_move_lockres_to_recovery_list, which will revert the
               lock to the granted list.
               Then OCFS2_LOCK_BUSY won't be cleared, as the new master
               won't send the ast any more because it thinks the convert
               has already been authorized.

        spin_lock(&res->spinlock);
        lock->convert_pending = 0;
        if (status != DLM_NORMAL)
                dlm_revert_pending_convert(res, lock);
        spin_unlock(&res->spinlock);
}

In this case, just leave the lock in the convert list and the new master
will take care of it after recovery.  And if the convert request returns
anything other than DLM_NORMAL, the convert thread will do the revert
itself. So remove the revert logic in dlm_move_lockres_to_recovery_list.

changelog since v1:
Clean up convert_pending since it is now useless.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
Cc: <stable@vger.kernel.org>
---
 fs/ocfs2/dlm/dlmcommon.h   |  1 -
 fs/ocfs2/dlm/dlmconvert.c  |  2 --
 fs/ocfs2/dlm/dlmdebug.c    |  7 +++----
 fs/ocfs2/dlm/dlmlock.c     |  1 -
 fs/ocfs2/dlm/dlmrecovery.c | 10 +---------
 5 files changed, 4 insertions(+), 17 deletions(-)

Comments

Junxiao Bi Sept. 20, 2015, 7:20 a.m. UTC | #1
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>

> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
> 
> There is a race window between dlmconvert_remote and
> dlm_move_lockres_to_recovery_list, which will cause a lock with
> OCFS2_LOCK_BUSY in grant list, thus system hangs.
> 
> dlmconvert_remote
> {
>        spin_lock(&res->spinlock);
>        list_move_tail(&lock->list, &res->converting);
>        lock->convert_pending = 1;
>        spin_unlock(&res->spinlock);
> 
>        status = dlm_send_remote_convert_request();
>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>               and then down before sending ast.
>               this node detects master down and calls
>               dlm_move_lockres_to_recovery_list, which will revert the
>               lock to grant list.
>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>               send ast any more because it thinks already be authorized.
> 
>        spin_lock(&res->spinlock);
>        lock->convert_pending = 0;
>        if (status != DLM_NORMAL)
>                dlm_revert_pending_convert(res, lock);
>        spin_unlock(&res->spinlock);
> }
> 
> In this case, just leave it in convert list and new master will take care
> of it after recovery.  And if convert request returns other than
> DLM_NORMAL, convert thread will do the revert itself. So remove the
> revert logic in dlm_move_lockres_to_recovery_list.
> 
> changelog since v1:
> Clean up convert_pending since it is now useless.
> 
> Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
> Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Cc: <stable@vger.kernel.org>
> ---
> fs/ocfs2/dlm/dlmcommon.h   |  1 -
> fs/ocfs2/dlm/dlmconvert.c  |  2 --
> fs/ocfs2/dlm/dlmdebug.c    |  7 +++----
> fs/ocfs2/dlm/dlmlock.c     |  1 -
> fs/ocfs2/dlm/dlmrecovery.c | 10 +---------
> 5 files changed, 4 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
> index e88ccf8..25853d7 100644
> --- a/fs/ocfs2/dlm/dlmcommon.h
> +++ b/fs/ocfs2/dlm/dlmcommon.h
> @@ -369,7 +369,6 @@ struct dlm_lock
> 	struct dlm_lockstatus *lksb;
> 	unsigned ast_pending:1,
> 		 bast_pending:1,
> -		 convert_pending:1,
> 		 lock_pending:1,
> 		 cancel_pending:1,
> 		 unlock_pending:1,
> diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
> index e36d63f..82ba8d5 100644
> --- a/fs/ocfs2/dlm/dlmconvert.c
> +++ b/fs/ocfs2/dlm/dlmconvert.c
> @@ -291,7 +291,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
> 	/* move lock to local convert queue */
> 	/* do not alter lock refcount.  switching lists. */
> 	list_move_tail(&lock->list, &res->converting);
> -	lock->convert_pending = 1;
> 	lock->ml.convert_type = type;
> 
> 	if (flags & LKM_VALBLK) {
> @@ -315,7 +314,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
> 
> 	spin_lock(&res->spinlock);
> 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
> -	lock->convert_pending = 0;
> 	/* if it failed, move it back to granted queue */
> 	if (status != DLM_NORMAL) {
> 		if (status != DLM_NOTQUEUED)
> diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
> index 8251360..710e94c 100644
> --- a/fs/ocfs2/dlm/dlmdebug.c
> +++ b/fs/ocfs2/dlm/dlmdebug.c
> @@ -77,7 +77,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
> 
> 	printk("    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
> 	       "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
> -	       "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
> +	       "pending=(lock=%c,cancel=%c,unlock=%c)\n",
> 	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
> 	       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
> 	       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
> @@ -86,7 +86,6 @@ static void __dlm_print_lock(struct dlm_lock *lock)
> 	       (lock->ast_pending ? 'y' : 'n'),
> 	       (list_empty(&lock->bast_list) ? 'y' : 'n'),
> 	       (lock->bast_pending ? 'y' : 'n'),
> -	       (lock->convert_pending ? 'y' : 'n'),
> 	       (lock->lock_pending ? 'y' : 'n'),
> 	       (lock->cancel_pending ? 'y' : 'n'),
> 	       (lock->unlock_pending ? 'y' : 'n'));
> @@ -502,7 +501,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
> 
> #define DEBUG_LOCK_VERSION	1
> 	spin_lock(&lock->spinlock);
> -	out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
> +	out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,"
> 		       "%d,%d,%d,%d\n",
> 		       DEBUG_LOCK_VERSION,
> 		       list_type, lock->ml.type, lock->ml.convert_type,
> @@ -512,7 +511,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
> 		       !list_empty(&lock->ast_list),
> 		       !list_empty(&lock->bast_list),
> 		       lock->ast_pending, lock->bast_pending,
> -		       lock->convert_pending, lock->lock_pending,
> +		       lock->lock_pending,
> 		       lock->cancel_pending, lock->unlock_pending,
> 		       atomic_read(&lock->lock_refs.refcount));
> 	spin_unlock(&lock->spinlock);
> diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
> index 66c2a49..857dd15 100644
> --- a/fs/ocfs2/dlm/dlmlock.c
> +++ b/fs/ocfs2/dlm/dlmlock.c
> @@ -411,7 +411,6 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
> 	newlock->ml.cookie = cpu_to_be64(cookie);
> 	newlock->ast_pending = 0;
> 	newlock->bast_pending = 0;
> -	newlock->convert_pending = 0;
> 	newlock->lock_pending = 0;
> 	newlock->unlock_pending = 0;
> 	newlock->cancel_pending = 0;
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index 3d90ad7..2402ef7 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -2062,15 +2062,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
> 		queue = dlm_list_idx_to_ptr(res, i);
> 		list_for_each_entry_safe(lock, next, queue, list) {
> 			dlm_lock_get(lock);
> -			if (lock->convert_pending) {
> -				/* move converting lock back to granted */
> -				BUG_ON(i != DLM_CONVERTING_LIST);
> -				mlog(0, "node died with convert pending "
> -				     "on %.*s. move back to granted list.\n",
> -				     res->lockname.len, res->lockname.name);
> -				dlm_revert_pending_convert(res, lock);
> -				lock->convert_pending = 0;
> -			} else if (lock->lock_pending) {
> +			if (lock->lock_pending) {
> 				/* remove pending lock requests completely */
> 				BUG_ON(i != DLM_BLOCKED_LIST);
> 				mlog(0, "node died with lock pending "
> -- 
> 1.8.4.3
> 
>
Joseph Qi Sept. 21, 2015, 9:23 a.m. UTC | #2
Hi Junxiao,
This solution may have a problem in the following scenario:
Consider that dlm_send_remote_convert_request has taken too much time,
dlm_move_lockres_to_recovery_list runs first, and the new master will see
this node currently in the convert list after recovery.
Then dlm_send_remote_convert_request returns something other than
DLM_NORMAL, and this node will revert the lock to the granted list and
retry the convert. This will make this node and the master inconsistent.
I will try to find another solution to resolve the race issue.

On 2015/9/20 15:20, Junxiao Bi wrote:
> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
> 
>> > ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>> > 
>> > There is a race window between dlmconvert_remote and
>> > dlm_move_lockres_to_recovery_list, which will cause a lock with
>> > OCFS2_LOCK_BUSY in grant list, thus system hangs.
>> > 
>> > dlmconvert_remote
>> > {
>> >        spin_lock(&res->spinlock);
>> >        list_move_tail(&lock->list, &res->converting);
>> >        lock->convert_pending = 1;
>> >        spin_unlock(&res->spinlock);
>> > 
>> >        status = dlm_send_remote_convert_request();
>>>>>>>> >>>>>>> race window, master has queued ast and return DLM_NORMAL,
>> >               and then down before sending ast.
>> >               this node detects master down and calls
>> >               dlm_move_lockres_to_recovery_list, which will revert the
>> >               lock to grant list.
>> >               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>> >               send ast any more because it thinks already be authorized.
>> > 
>> >        spin_lock(&res->spinlock);
>> >        lock->convert_pending = 0;
>> >        if (status != DLM_NORMAL)
>> >                dlm_revert_pending_convert(res, lock);
>> >        spin_unlock(&res->spinlock);
>> > }
>> > 
>> > In this case, just leave it in convert list and new master will take care
>> > of it after recovery.  And if convert request returns other than
>> > DLM_NORMAL, convert thread will do the revert itself. So remove the
>> > revert logic in dlm_move_lockres_to_recovery_list.
>> > 
>> > changelog since v1:
>> > Clean up convert_pending since it is now useless.
Junxiao Bi Sept. 22, 2015, 12:55 a.m. UTC | #3
On 09/21/2015 05:23 PM, Joseph Qi wrote:
> Hi Junxiao,
> This solution may have problem in the following scenario:
> Consider dlm_send_remote_convert_request has taken too much time, and
> dlm_move_lockres_to_recovery_list runs first and new master will see
> this node is currently in convert list after recovery.
> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
> it will revert it to grant list, then retry convert. This will makes
> this node and master inconsistent.
> I will try to find another solution to resolve the race issue.

If the master is down, there is no need to retry the convert. Maybe check
the return value of dlm_send_remote_convert_request(): if it is
DLM_RECOVERING, don't retry; otherwise retry?

Thanks,
Junxiao.

> 
> On 2015/9/20 15:20, Junxiao Bi wrote:
>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>
>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>
>>>> There is a race window between dlmconvert_remote and
>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>
>>>> dlmconvert_remote
>>>> {
>>>>        spin_lock(&res->spinlock);
>>>>        list_move_tail(&lock->list, &res->converting);
>>>>        lock->convert_pending = 1;
>>>>        spin_unlock(&res->spinlock);
>>>>
>>>>        status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>               and then down before sending ast.
>>>>               this node detects master down and calls
>>>>               dlm_move_lockres_to_recovery_list, which will revert the
>>>>               lock to grant list.
>>>>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>               send ast any more because it thinks already be authorized.
>>>>
>>>>        spin_lock(&res->spinlock);
>>>>        lock->convert_pending = 0;
>>>>        if (status != DLM_NORMAL)
>>>>                dlm_revert_pending_convert(res, lock);
>>>>        spin_unlock(&res->spinlock);
>>>> }
>>>>
>>>> In this case, just leave it in convert list and new master will take care
>>>> of it after recovery.  And if convert request returns other than
>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>
>>>> changelog since v1:
>>>> Clean up convert_pending since it is now useless.
> 
>
Joseph Qi Sept. 22, 2015, 12:23 p.m. UTC | #4
Hi Junxiao,

On 2015/9/22 8:55, Junxiao Bi wrote:
> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>> Hi Junxiao,
>> This solution may have problem in the following scenario:
>> Consider dlm_send_remote_convert_request has taken too much time, and
>> dlm_move_lockres_to_recovery_list runs first and new master will see
>> this node is currently in convert list after recovery.
>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>> it will revert it to grant list, then retry convert. This will makes
>> this node and master inconsistent.
>> I will try to find another solution to resolve the race issue.
> 
> If master is down, no need retry convert. May check the return value of
> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
> otherwise retry?

I want to keep the original logic. And to fix the race case I described,
how about the following idea?

Check for status DLM_NORMAL. If res->state is currently
DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, meaning
it is still in recovery), or the res master has changed (meaning the new
master has finished the recovery), reset the status to DLM_RECOVERING,
just like the check at the beginning of dlmconvert_remote. The lock is
then in the granted list and the outer caller will retry.

> 
> Thanks,
> Junxiao.
> 
>>
>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>
>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>
>>>>> There is a race window between dlmconvert_remote and
>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>
>>>>> dlmconvert_remote
>>>>> {	
>>>>>        spin_lock(&res->spinlock);
>>>>>        list_move_tail(&lock->list, &res->converting);
>>>>>        lock->convert_pending = 1;
>>>>>        spin_unlock(&res->spinlock);
>>>>>
>>>>>        status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>               and then down before sending ast.
>>>>>               this node detects master down and calls
>>>>>               dlm_move_lockres_to_recovery_list, which will revert the
>>>>>               lock to grant list.
>>>>>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>               send ast any more because it thinks already be authorized.
>>>>>
>>>>>        spin_lock(&res->spinlock);
>>>>>        lock->convert_pending = 0;
>>>>>        if (status != DLM_NORMAL)
>>>>>                dlm_revert_pending_convert(res, lock);
>>>>>        spin_unlock(&res->spinlock);
>>>>> }
>>>>>
>>>>> In this case, just leave it in convert list and new master will take care
>>>>> of it after recovery.  And if convert request returns other than
>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>
>>>>> changelog since v1:
>>>>> Clean up convert_pending since it is now useless.
>>
>>
> 
> 
> .
>
Junxiao Bi Sept. 23, 2015, 1:25 a.m. UTC | #5
On 09/22/2015 08:23 PM, Joseph Qi wrote:
> Hi Junxiao,
> 
> On 2015/9/22 8:55, Junxiao Bi wrote:
>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>> Hi Junxiao,
>>> This solution may have problem in the following scenario:
>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>> this node is currently in convert list after recovery.
>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>> it will revert it to grant list, then retry convert. This will makes
>>> this node and master inconsistent.
>>> I will try to find another solution to resolve the race issue.
>>
>> If master is down, no need retry convert. May check the return value of
>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>> otherwise retry?
> 
> I want to keep the original logic. And for fixing the race case I
> described, how about the following idea?
> 
> Check the status DLM_NORMAL. If res->state is currently
> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
> still in recovery) or res master changed (means new master has finished
> the recovery), reset the status to DLM_RECOVERING, just like the check
> at the beginning of dlmconvert_remote. Then it is now in grant list and
> outer will retry.
How does this idea fix the race window you described in the patch log?
The lock is reverted to the granted list but
dlm_send_remote_convert_request() returns DLM_NORMAL.

Thanks,
Junxiao.
> 
>>
>> Thanks,
>> Junxiao.
>>
>>>
>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>
>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>
>>>>>> There is a race window between dlmconvert_remote and
>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>
>>>>>> dlmconvert_remote
>>>>>> {	
>>>>>>        spin_lock(&res->spinlock);
>>>>>>        list_move_tail(&lock->list, &res->converting);
>>>>>>        lock->convert_pending = 1;
>>>>>>        spin_unlock(&res->spinlock);
>>>>>>
>>>>>>        status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>               and then down before sending ast.
>>>>>>               this node detects master down and calls
>>>>>>               dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>               lock to grant list.
>>>>>>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>               send ast any more because it thinks already be authorized.
>>>>>>
>>>>>>        spin_lock(&res->spinlock);
>>>>>>        lock->convert_pending = 0;
>>>>>>        if (status != DLM_NORMAL)
>>>>>>                dlm_revert_pending_convert(res, lock);
>>>>>>        spin_unlock(&res->spinlock);
>>>>>> }
>>>>>>
>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>> of it after recovery.  And if convert request returns other than
>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>
>>>>>> changelog since v1:
>>>>>> Clean up convert_pending since it is now useless.
>>>
>>>
>>
>>
>> .
>>
> 
>
Joseph Qi Sept. 23, 2015, 1:47 a.m. UTC | #6
On 2015/9/23 9:25, Junxiao Bi wrote:
> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>> Hi Junxiao,
>>
>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>> Hi Junxiao,
>>>> This solution may have problem in the following scenario:
>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>> this node is currently in convert list after recovery.
>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>> it will revert it to grant list, then retry convert. This will makes
>>>> this node and master inconsistent.
>>>> I will try to find another solution to resolve the race issue.
>>>
>>> If master is down, no need retry convert. May check the return value of
>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>> otherwise retry?
>>
>> I want to keep the original logic. And for fixing the race case I
>> described, how about the following idea?
>>
>> Check the status DLM_NORMAL. If res->state is currently
>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>> still in recovery) or res master changed (means new master has finished
>> the recovery), reset the status to DLM_RECOVERING, just like the check
>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>> outer will retry.
> How this idea fix the race windows you described in patch log? Lock is
> reverted to granted list but dlm_send_remote_convert_request() return
> DLM_NORMAL.
> 
As I described in the former mail, the status is now reset to
DLM_RECOVERING, so the caller will retry. The original way will just wait
forever.

Thanks
Joseph

> Thanks,
> Junxiao.
>>
>>>
>>> Thanks,
>>> Junxiao.
>>>
>>>>
>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>
>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>
>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>
>>>>>>> dlmconvert_remote
>>>>>>> {	
>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>        list_move_tail(&lock->list, &res->converting);
>>>>>>>        lock->convert_pending = 1;
>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>>
>>>>>>>        status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>>               and then down before sending ast.
>>>>>>>               this node detects master down and calls
>>>>>>>               dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>>               lock to grant list.
>>>>>>>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>>               send ast any more because it thinks already be authorized.
>>>>>>>
>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>        lock->convert_pending = 0;
>>>>>>>        if (status != DLM_NORMAL)
>>>>>>>                dlm_revert_pending_convert(res, lock);
>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>> }
>>>>>>>
>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>> of it after recovery.  And if convert request returns other than
>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>
>>>>>>> changelog since v1:
>>>>>>> Clean up convert_pending since it is now useless.
>>>>
>>>>
>>>
>>>
>>> .
>>>
>>
>>
> 
> 
> .
>
Junxiao Bi Sept. 23, 2015, 1:59 a.m. UTC | #7
On 09/23/2015 09:47 AM, Joseph Qi wrote:
> On 2015/9/23 9:25, Junxiao Bi wrote:
>> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>>> Hi Junxiao,
>>>
>>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>>> Hi Junxiao,
>>>>> This solution may have problem in the following scenario:
>>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>>> this node is currently in convert list after recovery.
>>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>>> it will revert it to grant list, then retry convert. This will makes
>>>>> this node and master inconsistent.
>>>>> I will try to find another solution to resolve the race issue.
>>>>
>>>> If master is down, no need retry convert. May check the return value of
>>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>>> otherwise retry?
>>>
>>> I want to keep the original logic. And for fixing the race case I
>>> described, how about the following idea?
>>>
>>> Check the status DLM_NORMAL. If res->state is currently
>>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>>> still in recovery) or res master changed (means new master has finished
>>> the recovery), reset the status to DLM_RECOVERING, just like the check
>>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>>> outer will retry.
>> How this idea fix the race windows you described in patch log? Lock is
>> reverted to granted list but dlm_send_remote_convert_request() return
>> DLM_NORMAL.
>>
> As I described in the former mail, status is now reset to DLM_RECOVERING,
> caller will retry. And the original way will just wait forever.
I mean the convert request was sent to the res master successfully and
dlm_send_remote_convert_request() has returned DLM_NORMAL, but the res
master panics before sending the ast. It looks like the convert will not
be retried in this case.

Thanks,
Junxiao.
> 
> Thanks
> Joseph
> 
>> Thanks,
>> Junxiao.
>>>
>>>>
>>>> Thanks,
>>>> Junxiao.
>>>>
>>>>>
>>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>>
>>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>>
>>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>>
>>>>>>>> dlmconvert_remote
>>>>>>>> {	
>>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>>        list_move_tail(&lock->list, &res->converting);
>>>>>>>>        lock->convert_pending = 1;
>>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>>>
>>>>>>>>        status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>>>               and then down before sending ast.
>>>>>>>>               this node detects master down and calls
>>>>>>>>               dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>>>               lock to grant list.
>>>>>>>>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>>>               send ast any more because it thinks already be authorized.
>>>>>>>>
>>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>>        lock->convert_pending = 0;
>>>>>>>>        if (status != DLM_NORMAL)
>>>>>>>>                dlm_revert_pending_convert(res, lock);
>>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>>> }
>>>>>>>>
>>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>>> of it after recovery.  And if convert request returns other than
>>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>>
>>>>>>>> changelog since v1:
>>>>>>>> Clean up convert_pending since it is now useless.
>>>>>
>>>>>
>>>>
>>>>
>>>> .
>>>>
>>>
>>>
>>
>>
>> .
>>
> 
>
Joseph Qi Sept. 23, 2015, 7:48 a.m. UTC | #8
On 2015/9/23 9:59, Junxiao Bi wrote:
> On 09/23/2015 09:47 AM, Joseph Qi wrote:
>> On 2015/9/23 9:25, Junxiao Bi wrote:
>>> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>>>> Hi Junxiao,
>>>>
>>>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>>>> Hi Junxiao,
>>>>>> This solution may have problem in the following scenario:
>>>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>>>> this node is currently in convert list after recovery.
>>>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>>>> it will revert it to grant list, then retry convert. This will makes
>>>>>> this node and master inconsistent.
>>>>>> I will try to find another solution to resolve the race issue.
>>>>>
>>>>> If master is down, no need retry convert. May check the return value of
>>>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>>>> otherwise retry?
>>>>
>>>> I want to keep the original logic. And for fixing the race case I
>>>> described, how about the following idea?
>>>>
>>>> Check the status DLM_NORMAL. If res->state is currently
>>>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>>>> still in recovery) or res master changed (means new master has finished
>>>> the recovery), reset the status to DLM_RECOVERING, just like the check
>>>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>>>> outer will retry.
>>> How this idea fix the race windows you described in patch log? Lock is
>>> reverted to granted list but dlm_send_remote_convert_request() return
>>> DLM_NORMAL.
>>>
>> As I described in the former mail, status is now reset to DLM_RECOVERING,
>> caller will retry. And the original way will just wait forever.
> I mean convert request was sent to res master successfully,
> dlm_send_remote_convert_request() have return DLM_NORMAL, but before res
> master send ast it panic. Looks convert will not retry in this case.
> 
The original way behaves just as you said.
My idea is to let dlmlock retry by resetting the status to DLM_RECOVERING
in such a case.

> Thanks,
> Junxiao.
>>
>> Thanks
>> Joseph
>>
>>> Thanks,
>>> Junxiao.
>>>>
>>>>>
>>>>> Thanks,
>>>>> Junxiao.
>>>>>
>>>>>>
>>>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>>>
>>>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>>>
>>>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>>>
>>>>>>>>> dlmconvert_remote
>>>>>>>>> {	
>>>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>>>        list_move_tail(&lock->list, &res->converting);
>>>>>>>>>        lock->convert_pending = 1;
>>>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>>>>
>>>>>>>>>        status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>>>>               and then down before sending ast.
>>>>>>>>>               this node detects master down and calls
>>>>>>>>>               dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>>>>               lock to grant list.
>>>>>>>>>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>>>>               send ast any more because it thinks already be authorized.
>>>>>>>>>
>>>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>>>        lock->convert_pending = 0;
>>>>>>>>>        if (status != DLM_NORMAL)
>>>>>>>>>                dlm_revert_pending_convert(res, lock);
>>>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>>>> of it after recovery.  And if convert request returns other than
>>>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>>>
>>>>>>>>> changelog since v1:
>>>>>>>>> Clean up convert_pending since it is now useless.
>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>> .
>>>>>
>>>>
>>>>
>>>
>>>
>>> .
>>>
>>
>>
> 
> 
> .
>
Junxiao Bi Sept. 23, 2015, 8:42 a.m. UTC | #9
On 09/23/2015 03:48 PM, Joseph Qi wrote:
> On 2015/9/23 9:59, Junxiao Bi wrote:
>> On 09/23/2015 09:47 AM, Joseph Qi wrote:
>>> On 2015/9/23 9:25, Junxiao Bi wrote:
>>>> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>>>>> Hi Junxiao,
>>>>>
>>>>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>>>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>>>>> Hi Junxiao,
>>>>>>> This solution may have problem in the following scenario:
>>>>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>>>>> this node is currently in convert list after recovery.
>>>>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>>>>> it will revert it to grant list, then retry convert. This will make
>>>>>>> this node and the master inconsistent.
>>>>>>> I will try to find another solution to resolve the race issue.
>>>>>>
>>>>>> If master is down, no need retry convert. May check the return value of
>>>>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>>>>> otherwise retry?
>>>>>
>>>>> I want to keep the original logic. And for fixing the race case I
>>>>> described, how about the following idea?
>>>>>
>>>>> Check the status DLM_NORMAL. If res->state is currently
>>>>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>>>>> still in recovery) or res master changed (means new master has finished
>>>>> the recovery), reset the status to DLM_RECOVERING, just like the check
>>>>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>>>>> outer will retry.
>>>> How does this idea fix the race window you described in the patch log? The
>>>> lock is reverted to the granted list but dlm_send_remote_convert_request()
>>>> returns DLM_NORMAL.
>>>>
>>> As I described in the former mail, status is now reset to DLM_RECOVERING,
>>> caller will retry. And the original way will just wait forever.
>> I mean the convert request was sent to the res master successfully, and
>> dlm_send_remote_convert_request() has returned DLM_NORMAL, but the res
>> master panics before sending the ast. It looks like the convert will not be
>> retried in this case.
>>
> The original way is just what you said.
> And my idea is letting dlmlock retry by resetting status to DLM_RECOVERING in
> such a case.
So convert_pending is still dropped? If so, when
dlm_send_remote_convert_request() returns DLM_RECOVERING, the status should be
reset to DLM_NORMAL and the lock should not be reverted to the granted list.
Retry is not needed.

Thanks,
Junxiao.

> 
>> Thanks,
>> Junxiao.
>>>
>>> Thanks
>>> Joseph
>>>
>>>> Thanks,
>>>> Junxiao.
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Junxiao.
>>>>>>
>>>>>>>
>>>>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>>>>
>>>>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>>>>
>>>>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>>>>
>>>>>>>>>> dlmconvert_remote
>>>>>>>>>> {	
>>>>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>>>>        list_move_tail(&lock->list, &res->converting);
>>>>>>>>>>        lock->convert_pending = 1;
>>>>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>>>>>
>>>>>>>>>>        status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>>>>>               and then down before sending ast.
>>>>>>>>>>               this node detects master down and calls
>>>>>>>>>>               dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>>>>>               lock to grant list.
>>>>>>>>>>               Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>>>>>               send ast any more because it thinks already be authorized.
>>>>>>>>>>
>>>>>>>>>>        spin_lock(&res->spinlock);
>>>>>>>>>>        lock->convert_pending = 0;
>>>>>>>>>>        if (status != DLM_NORMAL)
>>>>>>>>>>                dlm_revert_pending_convert(res, lock);
>>>>>>>>>>        spin_unlock(&res->spinlock);
>>>>>>>>>> }
>>>>>>>>>>
>>>>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>>>>> of it after recovery.  And if convert request returns other than
>>>>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>>>>
>>>>>>>>>> changelog since v1:
>>>>>>>>>> Clean up convert_pending since it is now useless.
>>>>>>>
>>>>>>>
>>>>>>
>>>>>>
>>>>>> .
>>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>> .
>>>>
>>>
>>>
>>
>>
>> .
>>
> 
>

Patch
diff mbox

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8..25853d7 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -369,7 +369,6 @@  struct dlm_lock
 	struct dlm_lockstatus *lksb;
 	unsigned ast_pending:1,
 		 bast_pending:1,
-		 convert_pending:1,
 		 lock_pending:1,
 		 cancel_pending:1,
 		 unlock_pending:1,
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63f..82ba8d5 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -291,7 +291,6 @@  enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 	/* move lock to local convert queue */
 	/* do not alter lock refcount.  switching lists. */
 	list_move_tail(&lock->list, &res->converting);
-	lock->convert_pending = 1;
 	lock->ml.convert_type = type;

 	if (flags & LKM_VALBLK) {
@@ -315,7 +314,6 @@  enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,

 	spin_lock(&res->spinlock);
 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
-	lock->convert_pending = 0;
 	/* if it failed, move it back to granted queue */
 	if (status != DLM_NORMAL) {
 		if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 8251360..710e94c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -77,7 +77,7 @@  static void __dlm_print_lock(struct dlm_lock *lock)

 	printk("    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
 	       "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
-	       "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+	       "pending=(lock=%c,cancel=%c,unlock=%c)\n",
 	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
 	       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 	       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
@@ -86,7 +86,6 @@  static void __dlm_print_lock(struct dlm_lock *lock)
 	       (lock->ast_pending ? 'y' : 'n'),
 	       (list_empty(&lock->bast_list) ? 'y' : 'n'),
 	       (lock->bast_pending ? 'y' : 'n'),
-	       (lock->convert_pending ? 'y' : 'n'),
 	       (lock->lock_pending ? 'y' : 'n'),
 	       (lock->cancel_pending ? 'y' : 'n'),
 	       (lock->unlock_pending ? 'y' : 'n'));
@@ -502,7 +501,7 @@  static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)

 #define DEBUG_LOCK_VERSION	1
 	spin_lock(&lock->spinlock);
-	out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+	out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,"
 		       "%d,%d,%d,%d\n",
 		       DEBUG_LOCK_VERSION,
 		       list_type, lock->ml.type, lock->ml.convert_type,
@@ -512,7 +511,7 @@  static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
 		       !list_empty(&lock->ast_list),
 		       !list_empty(&lock->bast_list),
 		       lock->ast_pending, lock->bast_pending,
-		       lock->convert_pending, lock->lock_pending,
+		       lock->lock_pending,
 		       lock->cancel_pending, lock->unlock_pending,
 		       atomic_read(&lock->lock_refs.refcount));
 	spin_unlock(&lock->spinlock);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 66c2a49..857dd15 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -411,7 +411,6 @@  static void dlm_init_lock(struct dlm_lock *newlock, int type,
 	newlock->ml.cookie = cpu_to_be64(cookie);
 	newlock->ast_pending = 0;
 	newlock->bast_pending = 0;
-	newlock->convert_pending = 0;
 	newlock->lock_pending = 0;
 	newlock->unlock_pending = 0;
 	newlock->cancel_pending = 0;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 3d90ad7..2402ef7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2062,15 +2062,7 @@  void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 		queue = dlm_list_idx_to_ptr(res, i);
 		list_for_each_entry_safe(lock, next, queue, list) {
 			dlm_lock_get(lock);
-			if (lock->convert_pending) {
-				/* move converting lock back to granted */
-				BUG_ON(i != DLM_CONVERTING_LIST);
-				mlog(0, "node died with convert pending "
-				     "on %.*s. move back to granted list.\n",
-				     res->lockname.len, res->lockname.name);
-				dlm_revert_pending_convert(res, lock);
-				lock->convert_pending = 0;
-			} else if (lock->lock_pending) {
+			if (lock->lock_pending) {
 				/* remove pending lock requests completely */
 				BUG_ON(i != DLM_BLOCKED_LIST);
 				mlog(0, "node died with lock pending "