diff mbox series

fs/ocfs2: fix race in ocfs2_dentry_attach_lock

Message ID 20190503170102.14775-1-wen.gang.wang@oracle.com (mailing list archive)
State New, archived
Headers show
Series fs/ocfs2: fix race in ocfs2_dentry_attach_lock | expand

Commit Message

Wengang Wang May 3, 2019, 5:01 p.m. UTC
ocfs2_dentry_attach_lock() can be executed in parallel threads against the
same dentry. Make that race safe.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
---
 fs/ocfs2/dcache.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

Comments

Wengang Wang May 3, 2019, 5:03 p.m. UTC | #1
Daniel,

You could try this patch.
I haven't confirmed yet whether the patch helped, or whether the problem
simply didn't happen again where it was seen (though I am sure the race
is there).

Anyone who is interested is welcome to pre-review this patch.

Thanks,
Wengang

Wengang Wang May 3, 2019, 5:06 p.m. UTC | #2
The pr_err() line is there to help verify whether this patch helps; it is
for debugging purposes only and will be removed in the final patch version.

thanks,
wengang

Changwei Ge May 5, 2019, 7:15 a.m. UTC | #3
Hi Wengang,


We actually met some strange problems around dentry cluster lock
attachment before. I suppose solidifying the surrounding logic makes sense.

Can you make your change log richer with a little more analysis?

If you could paste a dlm lock resource and ocfs2 lock resource snapshot
from the coredump, that would be great, and I can help figure out what went wrong.


Thanks,

Changwei



Wengang Wang May 6, 2019, 5:55 p.m. UTC | #4
Hi Changwei,

Actually, I'd planned to post the details once I had confirmed that the
patch fixes the problem (I don't have that confirmation yet).
But since you are interested, here they are.

The in-core dlm_lock_resource and the dlm_locks on its granted list look
like this:

<struct dlm_lock_resource 0xffff9c7e9ad57cc0>
--------------------------------------------------------------------------------------------------->
     lockname                      N000000000064e69f009a7bb5 hex: 4e303030303030303030303634653639660000009a7bb500000
     owner                         2
     dlm                           34F9A5BE6F2A481B8DE25ABAE8BBC68D
     state                         NOSTATE
     *dlm_locks in granted list:
         <dlm_lock 0xffff9c7e0429d700>-------------------------------------------------------------->
           ml.node                       1
           lockres                       0xffff9c7e9ad57cc0
           refs                          2
           lksb ffff9c7e05093720/ containing ocfs2_lock_res ffff9c7e050936a0
           convert_pending               0
           ml.list                       0
           ml.flags                      0
           type   3   convert_type   -1
           ml.highest_blocked            -1
<-------------------------------------------------------------------------------------------
         <dlm_lock 0xffff9c7e02f3ee80>-------------------------------------------------------------->
           ml.node                       0
           lockres                       0xffff9c7e9ad57cc0
           refs                          2
           lksb ffff9c7e30658a20/ containing ocfs2_lock_res ffff9c7e306589a0
           convert_pending               0
           ml.list                       0
           ml.flags                      0
           type   3   convert_type   -1
           ml.highest_blocked            -1
<-------------------------------------------------------------------------------------------
         <dlm_lock 0xffff9c7d3ef3fd80>-------------------------------------------------------------->
           ml.node                       2
           lockres                       0xffff9c7e9ad57cc0
           refs                          2
           lksb ffff9c7dbffe2898/ containing ocfs2_lock_res ffff9c7dbffe2818
           convert_pending               0
           ml.list                       0
           ml.flags                      0
           type   3   convert_type   -1
           ml.highest_blocked            -1
<-------------------------------------------------------------------------------------------
         <dlm_lock 0xffff9c7e41c97880>-------------------------------------------------------------->
           ml.node                       2
           lockres                       0xffff9c7e9ad57cc0
           refs                          2
           lksb ffff9c7dbffad298/ containing ocfs2_lock_res ffff9c7dbffad218
           convert_pending               0
           ml.list                       0
           ml.flags                      0
           type   3   convert_type   -1
           ml.highest_blocked            -1
<-------------------------------------------------------------------------------------------
         4 dlm_lock in granted list

You can see there are two granted dlm_locks for node 2 (0xffff9c7d3ef3fd80
and 0xffff9c7e41c97880).
Normally, the ocfs2 FS layer wouldn't request a duplicate new lock against
the same lock resource; it would request converts after the first new-lock
request.
So the existence of two dlm_locks for the same lock resource from the same
node is a hint that the ocfs2 FS layer is racing.

From the lksb fields, we know the containing ocfs2_lock_res objects
(ffff9c7dbffe2818 and ffff9c7dbffad218):

crash> ocfs2 struct ocfs2_lock_res ffff9c7dbffe2818 | egrep "holder|flag|name"
     l_name                        N000000000064e69f009a7bb5 hex: 4e303030303030303030303634653639660000009a7bb500000
     l_flags                       |OCFS2_LOCK_ATTACHED|OCFS2_LOCK_INITIALIZED
     l_ro_holders                  1
     l_ex_holders                  0
crash> ocfs2 struct ocfs2_lock_res ffff9c7dbffad218 | egrep "holder|flag|name"
     l_name                        N000000000064e69f009a7bb5 hex: 4e303030303030303030303634653639660000009a7bb500000
     l_flags                       |OCFS2_LOCK_ATTACHED|OCFS2_LOCK_INITIALIZED
     l_ro_holders                  0
     l_ex_holders                  0

The one at ffff9c7dbffad218 is the victim, with l_ro_holders at 0.

I think the race goes like this:

                thread A                                  thread B

  (A1) enter ocfs2_dentry_attach_lock,
  seeing dentry->d_fsdata is NULL,
  and no alias found by
  ocfs2_find_local_alias,
  kmalloc a new ocfs2_dentry_lock
  structure to local variable "dl", dl1

               .....

                                                (B1) enter ocfs2_dentry_attach_lock,
                                                seeing dentry->d_fsdata is NULL,
                                                and no alias found by
                                                ocfs2_find_local_alias, so kmalloc
                                                a new ocfs2_dentry_lock structure
                                                to local variable "dl", dl2

                                                        ......

  (A2) set dentry->d_fsdata with dl1

              ......

  (A3) ocfs2_dentry_lock() succeeded
  and dl1->dl_lockres.l_ro_holders
  increased to 1.

              ......

                                                (B2) set dentry->d_fsdata with dl2

                                                            .......

                                                (B3) ocfs2_dentry_lock() succeeded
                                                and dl2->dl_lockres.l_ro_holders
                                                increased to 1.

                                                        ......

  (A4) ocfs2_dentry_unlock() called
  and dl2->dl_lockres.l_ro_holders
  decreased to 0.

             ....

                                                (B4) ocfs2_dentry_unlock() called,
                                                decreasing dl2->dl_lockres.l_ro_holders,
                                                but sees it's zero now, panic


The point is that for thread A, ro_holders was increased on dl1 at step A3
but decreased on dl2 at step A4, because by then dentry->d_fsdata had been
overwritten with dl2. That leaves dl1 with ro_holders stuck at 1, and
causes an extra, unexpected decrement on dl2, bringing its ro_holders to 0.
Step B2 also caused a memory leak (dl1 is overwritten and never freed, but
that's not the topic here). Step B4 then hit the panic on finding
ro_holders on dl2 already zero.

Going back to the vmcore, let's find dl1 and dl2:

crash> kmem ffff9c7dbffe2818
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff9c7e97c03080      512      36007     36128   1129    16k kmalloc-512
   SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
   ffffeeaec2fff800  ffff9c7dbffe0000     0     32         25     7
   FREE / [ALLOCATED]
   [ffff9c7dbffe2800]

       PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
ffffeeaec2fff880  bffe2000 dead000000000400        0  0 fffffc0000000
crash> kmem ffff9c7dbffad218
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff9c7e97c03080      512      36007     36128   1129    16k kmalloc-512
   SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
   ffffeeaec2ffeb00  ffff9c7dbffac000     0     32         23     9
   FREE / [ALLOCATED]
   [ffff9c7dbffad200]

       PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
ffffeeaec2ffeb40  bffad000 dead0000ffffffff        0  0 fffffc0000000

crash> struct ocfs2_dentry_lock -x ffff9c7dbffe2800 |egrep "parent|inode"
   dl_parent_blkno = 0x64e69f,
   dl_inode = 0xffff9c7d13c624c0,
crash> struct ocfs2_dentry_lock -x ffff9c7dbffad200 |egrep "parent|inode"
   dl_parent_blkno = 0x64e69f,
   dl_inode = 0xffff9c7d13c624c0,

ffff9c7dbffe2800 is dl1 and ffff9c7dbffad200 is dl2; they match in the
dl_parent_blkno and dl_inode fields.

Verifying dentry:
crash> struct dentry ffff9c7d8b659780 |grep 24c0
   d_inode = 0xffff9c7d13c624c0,
crash> struct dentry ffff9c7d8b659780 |grep fsdata
   d_fsdata = 0xffff9c7dbffad200,

d_fsdata matches dl2 well.

Above is the analysis (hoping the formatting survives the email). The fix
is self-explanatory.

Thanks,
Wengang

Changwei Ge May 7, 2019, 3:29 a.m. UTC | #6
Hi Wengang,


You indeed shared a wonderful analysis. :)

I understand the race now, but I still have some thoughts on your
draft patch. Please check them out inline.


On 2019/5/7 1:55 a.m., Wengang Wang wrote:
> You can see there are two granted dlm_locks for node 2 (0xffff9c7d3ef3fd80
> and 0xffff9c7e41c97880).
> Normally, the ocfs2 FS layer wouldn't request a duplicate new lock against
> the same lock resource; it would request converts after the first new-lock
> request.
> So the existence of two dlm_locks for the same lock resource from the same
> node is a hint that the ocfs2 FS layer is racing.


I agree with your opinion that locking against the same lock resource
(shared inode, etc.) should involve only a single ocfs2 lock resource object.

So do you think we should add some status check in the dlm layer to assert
against insane cluster locking logic? For instance, just check whether a
node asks to be granted twice, which is obviously unacceptable and unnecessary.
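
For illustration only, such a sanity check could look roughly like the
sketch below. dlm_assert_no_duplicate_grant is a made-up name and this is
not proposed code, though the granted list, res->spinlock and lock->ml.node
are real o2dlm fields:

	static void dlm_assert_no_duplicate_grant(struct dlm_lock_resource *res,
						  u8 node)
	{
		struct dlm_lock *lock;

		/* caller must hold the resource spinlock */
		assert_spin_locked(&res->spinlock);

		/* a node should never hold two granted locks on one resource */
		list_for_each_entry(lock, &res->granted, list)
			BUG_ON(lock->ml.node == node);
	}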


>>> @@ -243,11 +244,15 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
>>>  	if (!inode)
>>>  		return 0;
>>>  
>>> +	spin_lock(&dentry_attach_lock);
>>>  	if (d_really_is_negative(dentry) && dentry->d_fsdata) {
>>>  		/* Converting a negative dentry to positive
>>>  		   Clear dentry->d_fsdata */
>>>  		dentry->d_fsdata = dl = NULL;
>>> +	} else {
>>> +		dl = dentry->d_fsdata;
>>>  	}
>>> +	spin_unlock(&dentry_attach_lock);
>>>

I don't understand why acquring dentry_attach_lock here. Is there any 
race you want to close here?



>>>  	if (dl) {
>>>  		mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
>>> @@ -310,10 +315,20 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
>>>  
>>>  out_attach:
>>>  	spin_lock(&dentry_attach_lock);
>>> -	dentry->d_fsdata = dl;
>>> -	dl->dl_count++;
>>> +	/* d_fsdata could be set by parallel thread */
>>> +	if (unlikely(dentry->d_fsdata)) {
>>> +		pr_err("race found in ocfs2_dentry_attach_lock dentry=%p\n", dentry);
>>> +		if (!alias)
>>> +			dl_free_on_race = dl;
>>> +	} else {
>>> +		dentry->d_fsdata = dl;
>>> +		dl->dl_count++;
>>> +		if (!alias)
>>> +			new_dl = 1;
>>> +	}
>>>  	spin_unlock(&dentry_attach_lock);
>>>  
>>> +	kfree(dl_free_on_race);


Do we also have to do something for 'dl->dl_inode = igrab(inode)' ?

And if we don't have to allocate new lock resource for racing thread, 
that would be better.

How about just make the racing one wait somehow?
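
To sketch one shape that idea could take (purely illustrative, not the
posted patch; whether the loser may simply adopt the winner's dl like
this, and the igrab() cleanup, are exactly the open questions):

	struct ocfs2_dentry_lock *drop = NULL;

	spin_lock(&dentry_attach_lock);
	if (dentry->d_fsdata) {
		/* lost the race: adopt the dl the winner attached */
		if (!alias)
			drop = dl;	/* ours was freshly allocated */
		dl = dentry->d_fsdata;
	} else {
		dentry->d_fsdata = dl;
		dl->dl_count++;
	}
	spin_unlock(&dentry_attach_lock);

	if (drop) {
		iput(drop->dl_inode);	/* undo the earlier igrab() */
		kfree(drop);
	}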


Thanks,

Changwei


Daniel Sobe May 7, 2019, 1:52 p.m. UTC | #7
Hi Wengang,

I have applied your patch to the latest Debian kernel in "stretch-backports" and have been using it for 1-2 days. Today is the first time that I saw the pr_err() message:

May 07 15:23:19 drs1p001 kernel: race found in ocfs2_dentry_attach_lock dentry=0000000016c2f15c

I don't know whether it helps, but I thought I'd report it anyway. I don't remember any special activity at that time, just the same stuff I was doing all day.

Regards,

Daniel

Daniel Sobe May 7, 2019, 2:14 p.m. UTC | #8
About 30 minutes later this happened; I don't know whether it is related. Could it be due to not running the latest 4.19.x kernel?

May  7 15:51:07 drs1p001 kernel: [99859.583930] list_del corruption. prev->next should be ffff94bb01bda710, but was ffff94bacceda7c0
May  7 15:51:07 drs1p001 kernel: [99859.583951] ------------[ cut here ]------------
May  7 15:51:07 drs1p001 kernel: [99859.583951] kernel BUG at /kernelbuild/linux-4.19.28/lib/list_debug.c:53!
May  7 15:51:07 drs1p001 kernel: [99859.583960] invalid opcode: 0000 [#1] SMP PTI
May  7 15:51:07 drs1p001 kernel: [99859.583964] CPU: 1 PID: 38 Comm: khugepaged Tainted: G            E     4.19.0-0.bpo.4-amd64 #1 Debian 4.19.28-2~bpo9+1
May  7 15:51:07 drs1p001 kernel: [99859.583966] Hardware name: Dell Inc. OptiPlex 5040/0R790T, BIOS 1.2.7 01/15/2016
May  7 15:51:07 drs1p001 kernel: [99859.583974] RIP: 0010:__list_del_entry_valid+0x70/0x90
May  7 15:51:07 drs1p001 kernel: [99859.583976] Code: 7d e8 94 e8 90 c6 d0 ff 0f 0b 48 89 fe 48 c7 c7 30 7e e8 94 e8 7f c6 d0 ff 0f 0b 48 89 fe 48 c7 c7 68 7e e8 94 e8 6e c6 d0 ff <0f> 0b 48 89 fe 48 c7 c7 a8 7e e8 94 e8 5d c6 d0 ff 0f 0b 90 90 90
May  7 15:51:07 drs1p001 kernel: [99859.583980] RSP: 0018:ffffb96d00de7848 EFLAGS: 00010286
May  7 15:51:07 drs1p001 kernel: [99859.583982] RAX: 0000000000000054 RBX: ffff94bb01bda618 RCX: 0000000000000000
May  7 15:51:07 drs1p001 kernel: [99859.583984] RDX: 0000000000000000 RSI: ffff94bbd1a966a8 RDI: ffff94bbd1a966a8
May  7 15:51:07 drs1p001 kernel: [99859.583986] RBP: ffff94bb01bda710 R08: 0000000000000000 R09: 0000000000000471
May  7 15:51:07 drs1p001 kernel: [99859.583988] R10: ffffb96d00de7838 R11: ffffffff955ea6ed R12: ffff94bb01bda618
May  7 15:51:07 drs1p001 kernel: [99859.583990] R13: ffff94bb3e8327d8 R14: 0000000000000000 R15: ffff94bb730056c0
May  7 15:51:07 drs1p001 kernel: [99859.583992] FS:  0000000000000000(0000) GS:ffff94bbd1a80000(0000) knlGS:0000000000000000
May  7 15:51:07 drs1p001 kernel: [99859.583995] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
May  7 15:51:07 drs1p001 kernel: [99859.583997] CR2: 00007f5acb700e00 CR3: 0000000016c0a002 CR4: 00000000003606e0
May  7 15:51:07 drs1p001 kernel: [99859.583999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
May  7 15:51:07 drs1p001 kernel: [99859.584001] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
May  7 15:51:07 drs1p001 kernel: [99859.584003] Call Trace:
May  7 15:51:07 drs1p001 kernel: [99859.584065]  ocfs2_remove_lockres_tracking+0x31/0x70 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584081]  ocfs2_lock_res_free+0x44/0x260 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584093]  ocfs2_dentry_lock_put+0x5f/0x70 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584106]  ocfs2_dentry_iput+0x49/0x110 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584110]  __dentry_kill+0xd3/0x180
May  7 15:51:07 drs1p001 kernel: [99859.584113]  dentry_kill+0x4d/0x1b0
May  7 15:51:07 drs1p001 kernel: [99859.584115]  shrink_dentry_list+0xc3/0x1d0
May  7 15:51:07 drs1p001 kernel: [99859.584117]  prune_dcache_sb+0x52/0x70
May  7 15:51:07 drs1p001 kernel: [99859.584121]  super_cache_scan+0x10c/0x190
May  7 15:51:07 drs1p001 kernel: [99859.584124]  do_shrink_slab+0x12a/0x2b0
May  7 15:51:07 drs1p001 kernel: [99859.584127]  shrink_slab+0x214/0x2b0
May  7 15:51:07 drs1p001 kernel: [99859.584130]  shrink_node+0xf0/0x470
May  7 15:51:07 drs1p001 kernel: [99859.584133]  do_try_to_free_pages+0xeb/0x370
May  7 15:51:07 drs1p001 kernel: [99859.584135]  try_to_free_pages+0xf4/0x1b0
May  7 15:51:07 drs1p001 kernel: [99859.584139]  __alloc_pages_slowpath+0x2ee/0xdb0
May  7 15:51:07 drs1p001 kernel: [99859.584144]  ? __switch_to_asm+0x40/0x70
May  7 15:51:07 drs1p001 kernel: [99859.584147]  __alloc_pages_nodemask+0x258/0x2a0
May  7 15:51:07 drs1p001 kernel: [99859.584150]  khugepaged+0xc9b/0x21d0
May  7 15:51:07 drs1p001 kernel: [99859.584156]  ? remove_wait_queue+0x60/0x60
May  7 15:51:07 drs1p001 kernel: [99859.584159]  kthread+0xf8/0x130
May  7 15:51:07 drs1p001 kernel: [99859.584162]  ? collapse_shmem+0xc50/0xc50
May  7 15:51:07 drs1p001 kernel: [99859.584164]  ? kthread_create_worker_on_cpu+0x70/0x70
May  7 15:51:07 drs1p001 kernel: [99859.584166]  ret_from_fork+0x35/0x40
May  7 15:51:07 drs1p001 kernel: [99859.584169] Modules linked in: tcp_diag(E) inet_diag(E) unix_diag(E) veth(E) ocfs2(E) quota_tree(E) ocfs2_dlmfs(E) ocfs2_stack_o2cb(E) ocfs2_dlm(E) ocfs2_nodemanager(E) ocfs2_stackglue(E) configfs(E) bridge(E) stp(E) llc(E) iptable_filter(E) overlay(E) snd_hda_codec_hdmi(E) rfkill(E) snd_hda_codec_realtek(E) snd_hda_codec_generic(E) fuse(E) intel_rapl(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) dell_wmi(E) mei_wdt(E) wmi_bmof(E) dell_smbios(E) sparse_keymap(E) i915(E) kvm_intel(E) dell_wmi_descriptor(E) snd_hda_intel(E) evdev(E) kvm(E) irqbypass(E) drm_kms_helper(E) crct10dif_pclmul(E) snd_hda_codec(E) crc32_pclmul(E) snd_hda_core(E) dcdbas(E) snd_hwdep(E) ghash_clmulni_intel(E) drm(E) intel_cstate(E) i2c_algo_bit(E) snd_pcm(E) intel_pch_thermal(E) intel_uncore(E) mei_me(E) pcspkr(E)
May  7 15:51:07 drs1p001 kernel: [99859.584208]  intel_rapl_perf(E) mei(E) snd_timer(E) iTCO_wdt(E) iTCO_vendor_support(E) snd(E) soundcore(E) sg(E) serio_raw(E) wmi(E) video(E) pcc_cpufreq(E) button(E) acpi_pad(E) drbd(E) lru_cache(E) libcrc32c(E) ip_tables(E) x_tables(E) autofs4(E) ext4(E) crc16(E) mbcache(E) jbd2(E) crc32c_generic(E) fscrypto(E) ecb(E) dm_mod(E) sr_mod(E) cdrom(E) sd_mod(E) crc32c_intel(E) aesni_intel(E) aes_x86_64(E) crypto_simd(E) cryptd(E) glue_helper(E) psmouse(E) xhci_pci(E) e1000(E) ahci(E) libahci(E) xhci_hcd(E) e1000e(E) libata(E) i2c_i801(E) usbcore(E) scsi_mod(E) usb_common(E) fan(E) thermal(E)
May  7 15:51:07 drs1p001 kernel: [99859.584243] ---[ end trace 2baf8e44ef57535a ]---



diff mbox series

Patch

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 290373024d9d..980e571c163c 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -227,12 +227,13 @@  int ocfs2_dentry_attach_lock(struct dentry *dentry,
 			     struct inode *inode,
 			     u64 parent_blkno)
 {
-	int ret;
+	int ret, new_dl = 0;
 	struct dentry *alias;
-	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+	struct ocfs2_dentry_lock *dl, *dl_free_on_race = NULL;
 
 	trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
-				       (unsigned long long)parent_blkno, dl);
+				       (unsigned long long)parent_blkno,
+				       dentry->d_fsdata);
 
 	/*
 	 * Negative dentry. We ignore these for now.
@@ -243,11 +244,15 @@  int ocfs2_dentry_attach_lock(struct dentry *dentry,
 	if (!inode)
 		return 0;
 
+	spin_lock(&dentry_attach_lock);
 	if (d_really_is_negative(dentry) && dentry->d_fsdata) {
 		/* Converting a negative dentry to positive
 		   Clear dentry->d_fsdata */
 		dentry->d_fsdata = dl = NULL;
+	} else {
+		dl = dentry->d_fsdata;
 	}
+	spin_unlock(&dentry_attach_lock);
 
 	if (dl) {
 		mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
@@ -310,10 +315,20 @@  int ocfs2_dentry_attach_lock(struct dentry *dentry,
 
 out_attach:
 	spin_lock(&dentry_attach_lock);
-	dentry->d_fsdata = dl;
-	dl->dl_count++;
+	/* d_fsdata could be set by parallel thread */
+	if (unlikely(dentry->d_fsdata)) {
+		pr_err("race found in ocfs2_dentry_attach_lock dentry=%p\n", dentry);
+		if (!alias)
+			dl_free_on_race = dl;
+	} else {
+		dentry->d_fsdata = dl;
+		dl->dl_count++;
+		if (!alias)
+			new_dl = 1;
+	}
 	spin_unlock(&dentry_attach_lock);
 
+	kfree(dl_free_on_race);
 	/*
 	 * This actually gets us our PRMODE level lock. From now on,
 	 * we'll have a notification if one of these names is
@@ -330,7 +345,7 @@  int ocfs2_dentry_attach_lock(struct dentry *dentry,
 	 * We need to do this because error here means no d_instantiate(),
 	 * which means iput() will not be called during dput(dentry).
 	 */
-	if (ret < 0 && !alias) {
+	if (ret < 0 && new_dl) {
 		ocfs2_lock_res_free(&dl->dl_lockres);
 		BUG_ON(dl->dl_count != 1);
 		spin_lock(&dentry_attach_lock);