Message ID | 20190503170102.14775-1-wen.gang.wang@oracle.com (mailing list archive)
State      | New, archived
Series     | fs/ocfs2: fix race in ocfs2_dentry_attach_lock
Daniel,

You could try this patch. I haven't yet confirmed whether the patch
helped, or whether the problem simply hasn't recurred where it was seen
(though I am sure the race is there).

Anyone who is interested could take a pre-review on this patch.

Thanks,
Wengang

On 2019/5/3 10:01, Wengang Wang wrote:
> ocfs2_dentry_attach_lock() can be executed in parallel threads against the
> same dentry. Make that race safe.
>
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
>  fs/ocfs2/dcache.c | 27 +++++++++++++++++++++------
>  1 file changed, 21 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
> index 290373024d9d..980e571c163c 100644
> --- a/fs/ocfs2/dcache.c
> +++ b/fs/ocfs2/dcache.c
> @@ -227,12 +227,13 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
>                               struct inode *inode,
>                               u64 parent_blkno)
>  {
> -        int ret;
> +        int ret, new_dl = 0;
>          struct dentry *alias;
> -        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
> +        struct ocfs2_dentry_lock *dl, *dl_free_on_race = NULL;
>
>          trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
> -                                       (unsigned long long)parent_blkno, dl);
> +                                       (unsigned long long)parent_blkno,
> +                                       dentry->d_fsdata);
>
>          /*
>           * Negative dentry. We ignore these for now.
> @@ -243,11 +244,15 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
>          if (!inode)
>                  return 0;
>
> +        spin_lock(&dentry_attach_lock);
>          if (d_really_is_negative(dentry) && dentry->d_fsdata) {
>                  /* Converting a negative dentry to positive
>                     Clear dentry->d_fsdata */
>                  dentry->d_fsdata = dl = NULL;
> +        } else {
> +                dl = dentry->d_fsdata;
>          }
> +        spin_unlock(&dentry_attach_lock);
>
>          if (dl) {
>                  mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
> @@ -310,10 +315,20 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
>
>  out_attach:
>          spin_lock(&dentry_attach_lock);
> -        dentry->d_fsdata = dl;
> -        dl->dl_count++;
> +        /* d_fsdata could be set by parallel thread */
> +        if (unlikely(dentry->d_fsdata)) {
> +                pr_err("race found in ocfs2_dentry_attach_lock dentry=%p\n", dentry);
> +                if (!alias)
> +                        dl_free_on_race = dl;
> +        } else {
> +                dentry->d_fsdata = dl;
> +                dl->dl_count++;
> +                if (!alias)
> +                        new_dl = 1;
> +        }
>          spin_unlock(&dentry_attach_lock);
>
> +        kfree(dl_free_on_race);
>          /*
>           * This actually gets us our PRMODE level lock. From now on,
>           * we'll have a notification if one of these names is
> @@ -330,7 +345,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
>           * We need to do this because error here means no d_instantiate(),
>           * which means iput() will not be called during dput(dentry).
>           */
> -        if (ret < 0 && !alias) {
> +        if (ret < 0 && new_dl) {
>                  ocfs2_lock_res_free(&dl->dl_lockres);
>                  BUG_ON(dl->dl_count != 1);
>                  spin_lock(&dentry_attach_lock);
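The shape of the fix is a classic optimistic-allocate/publish-under-lock
pattern. Below is a minimal, self-contained userspace sketch of that
pattern using pthreads; all names in it (attach_lock, shared_slot,
attach) are illustrative stand-ins, not the kernel's, and it only models
the d_fsdata publication, not the cluster locking:

/* Userspace analogue of the patch's scheme: each racer allocates its
 * own object, re-checks the shared slot under a lock, and whichever
 * thread loses frees its allocation outside the critical section. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t attach_lock = PTHREAD_MUTEX_INITIALIZER;
static void *shared_slot;              /* plays the role of dentry->d_fsdata */

static void *attach(void *arg)
{
        void *dl = malloc(64);         /* optimistic allocation, like the dl */
        void *free_on_race = NULL;

        pthread_mutex_lock(&attach_lock);
        if (shared_slot)               /* another thread already published */
                free_on_race = dl;
        else
                shared_slot = dl;      /* we won: publish our allocation */
        pthread_mutex_unlock(&attach_lock);

        free(free_on_race);            /* free(NULL) is a no-op */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, attach, NULL);
        pthread_create(&b, NULL, attach, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("slot=%p\n", shared_slot);  /* exactly one allocation survives */
        return 0;
}

The loser freeing its own allocation after dropping the lock is exactly
what the patch's dl_free_on_race/kfree() pairing does.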
The pr_err() line would help verify whether this patch helped; it is
there for debugging purposes only and will be removed in the final patch
version.

thanks,
wengang

On 2019/5/3 10:03, Wengang Wang wrote:
> Daniel,
>
> You could try this patch. I haven't yet confirmed whether the patch
> helped, or whether the problem simply hasn't recurred where it was seen
> (though I am sure the race is there).
>
> Anyone who is interested could take a pre-review on this patch.
>
> Thanks,
> Wengang
> [...]
Hi Wengang,

We actually met some strange problems around dentry cluster lock
attachment before, so I suppose solidifying the logic around it makes
sense.

Can you make your change log richer with a little more analysis?

If you can paste a dlm lock resource and an ocfs2 lock resource snapshot
from the coredump, that would be great, and I can help figure out what
went wrong.

Thanks,
Changwei

On 2019/5/4 1:01 AM, Wengang Wang wrote:
> ocfs2_dentry_attach_lock() can be executed in parallel threads against the
> same dentry. Make that race safe.
>
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> [...]
Hi Changwei,

Actually I'd like to post the details after I have confirmed that the
patch fixes the problem (I don't have that confirmation yet). But since
you are interested, I am putting it here.

The in-core dlm_lock_resource and the dlm_locks on its granted list look
like this:

<struct dlm_lock_resource 0xffff9c7e9ad57cc0> ------------------------------>
lockname N000000000064e69f009a7bb5 hex: 4e303030303030303030303634653639660000009a7bb500000
owner 2
dlm 34F9A5BE6F2A481B8DE25ABAE8BBC68D
state NOSTATE
*dlm_locks in granted list:
<dlm_lock 0xffff9c7e0429d700> ------------------------------>
ml.node 1
lockres 0xffff9c7e9ad57cc0
refs 2
lksb ffff9c7e05093720/ containing ocfs2_lock_res ffff9c7e050936a0
convert_pending 0
ml.list 0
ml.flags 0
type 3 convert_type -1
ml.highest_blocked -1
<------------------------------
<dlm_lock 0xffff9c7e02f3ee80> ------------------------------>
ml.node 0
lockres 0xffff9c7e9ad57cc0
refs 2
lksb ffff9c7e30658a20/ containing ocfs2_lock_res ffff9c7e306589a0
convert_pending 0
ml.list 0
ml.flags 0
type 3 convert_type -1
ml.highest_blocked -1
<------------------------------
<dlm_lock 0xffff9c7d3ef3fd80> ------------------------------>
ml.node 2
lockres 0xffff9c7e9ad57cc0
refs 2
lksb ffff9c7dbffe2898/ containing ocfs2_lock_res ffff9c7dbffe2818
convert_pending 0
ml.list 0
ml.flags 0
type 3 convert_type -1
ml.highest_blocked -1
<------------------------------
<dlm_lock 0xffff9c7e41c97880> ------------------------------>
ml.node 2
lockres 0xffff9c7e9ad57cc0
refs 2
lksb ffff9c7dbffad298/ containing ocfs2_lock_res ffff9c7dbffad218
convert_pending 0
ml.list 0
ml.flags 0
type 3 convert_type -1
ml.highest_blocked -1
<------------------------------
4 dlm_locks in the granted list

You see there are two granted dlm_locks for node 2 (0xffff9c7d3ef3fd80
and 0xffff9c7e41c97880).
Normally the ocfs2 FS layer wouldn't request a duplicate new lock against
the same lock resource; after the first new-lock request it would request
converts instead.
So the existence of two dlm_locks for the same lock resource on one node
is a hint that the ocfs2 FS layer is racing.

From the lksb we know the containing ocfs2_lock_res objects
(ffff9c7dbffe2818 and ffff9c7dbffad218):

crash> ocfs2 struct ocfs2_lock_res ffff9c7dbffe2818 |egrep "holder|flag|name"
l_name N000000000064e69f009a7bb5 hex: 4e303030303030303030303634653639660000009a7bb500000
l_flags |OCFS2_LOCK_ATTACHED|OCFS2_LOCK_INITIALIZED
l_ro_holders 1
l_ex_holders 0
crash> ocfs2 struct ocfs2_lock_res ffff9c7dbffad218 |egrep "holder|flag|name"
l_name N000000000064e69f009a7bb5 hex: 4e303030303030303030303634653639660000009a7bb500000
l_flags |OCFS2_LOCK_ATTACHED|OCFS2_LOCK_INITIALIZED
l_ro_holders 0
l_ex_holders 0

The one @ffff9c7dbffad218 is the victim, with l_ro_holders being 0.

I think the race goes like this:

thread A                                    thread B

(A1) enter ocfs2_dentry_attach_lock,
     seeing dentry->d_fsdata is NULL,
     and no alias found by
     ocfs2_find_local_alias, kmalloc
     a new ocfs2_dentry_lock structure
     to local variable "dl", dl1
     .....
                                            (B1) enter ocfs2_dentry_attach_lock,
                                                 seeing dentry->d_fsdata is NULL,
                                                 and no alias found by
                                                 ocfs2_find_local_alias, so kmalloc
                                                 a new ocfs2_dentry_lock structure
                                                 to local variable "dl", dl2
     ......
(A2) set dentry->d_fsdata with dl1
     ......
(A3) ocfs2_dentry_lock() succeeded
     and dl1->dl_lockres.l_ro_holders
     increased to 1.
     ......
                                            (B2) set dentry->d_fsdata with dl2
     .......
                                            (B3) ocfs2_dentry_lock() succeeded
                                                 and dl2->dl_lockres.l_ro_holders
                                                 increased to 1.
     ......
(A4) ocfs2_dentry_unlock() called
     and dl2->dl_lockres.l_ro_holders
     decreased to 0.
     ....
                                            (B4) ocfs2_dentry_unlock() called,
                                                 decreasing dl2->dl_lockres.l_ro_holders,
                                                 but sees it's zero now: panic

The point is that for thread A, ro_holders was increased on dl1 at step
A3 but decreased on dl2 at step A4. That leaves dl1 with ro_holders stuck
at 1, and causes an extra, unexpected decrease on dl2, dropping its
ro_holders to 0. Step B2 also caused a memory leak (but that's not the
topic here). Step B4 hit the panic on seeing ro_holders on dl2 already
zero.

Going back to the vmcore, let's find dl1 and dl2:

crash> kmem ffff9c7dbffe2818
CACHE             OBJSIZE  ALLOCATED  TOTAL  SLABS  SSIZE  NAME
ffff9c7e97c03080      512      36007  36128   1129    16k  kmalloc-512
  SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
  ffffeeaec2fff800  ffff9c7dbffe0000     0     32         25     7
  FREE / [ALLOCATED]
  [ffff9c7dbffe2800]

      PAGE       PHYSICAL       MAPPING       INDEX  CNT  FLAGS
ffffeeaec2fff880  bffe2000  dead000000000400      0    0  fffffc0000000

crash> kmem ffff9c7dbffad218
CACHE             OBJSIZE  ALLOCATED  TOTAL  SLABS  SSIZE  NAME
ffff9c7e97c03080      512      36007  36128   1129    16k  kmalloc-512
  SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
  ffffeeaec2ffeb00  ffff9c7dbffac000     0     32         23     9
  FREE / [ALLOCATED]
  [ffff9c7dbffad200]

      PAGE       PHYSICAL       MAPPING       INDEX  CNT  FLAGS
ffffeeaec2ffeb40  bffad000  dead0000ffffffff      0    0  fffffc0000000

crash> struct ocfs2_dentry_lock -x ffff9c7dbffe2800 |egrep "parent|inode"
  dl_parent_blkno = 0x64e69f,
  dl_inode = 0xffff9c7d13c624c0,
crash> struct ocfs2_dentry_lock -x ffff9c7dbffad200 |egrep "parent|inode"
  dl_parent_blkno = 0x64e69f,
  dl_inode = 0xffff9c7d13c624c0,

ffff9c7dbffe2800 is dl1 and ffff9c7dbffad200 is dl2; they match well in
their dl_parent_blkno and dl_inode fields.

Verifying the dentry:

crash> struct dentry ffff9c7d8b659780 |grep 24c0
  d_inode = 0xffff9c7d13c624c0,
crash> struct dentry ffff9c7d8b659780 |grep fsdata
  d_fsdata = 0xffff9c7dbffad200,

d_fsdata matches dl2 well.

Above is the analysis (hoping the format is still good after the email is
sent). The fix is self-explanatory.

Thanks,
Wengang

On 2019/5/5 0:15, Changwei Ge wrote:
> Hi Wengang,
>
> We actually met some strange problems around dentry cluster lock
> attachment before, so I suppose solidifying the logic around it makes
> sense.
>
> Can you make your change log richer with a little more analysis?
>
> If you can paste a dlm lock resource and an ocfs2 lock resource snapshot
> from the coredump, that would be great, and I can help figure out what
> went wrong.
>
> Thanks,
> Changwei
>
> On 2019/5/4 1:01 AM, Wengang Wang wrote:
>> ocfs2_dentry_attach_lock() can be executed in parallel threads against the
>> same dentry. Make that race safe.
>>
>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
>> [...]
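The A1-B4 interleaving above can be replayed sequentially in plain C to
show how thread A ends up releasing a hold it never took on dl2. The
types and names below (struct dl, slot) are illustrative stand-ins for
ocfs2_dentry_lock and dentry->d_fsdata, not the kernel code:

/* Sequential replay of the race; step comments follow the A1..B4 table. */
#include <stdio.h>

struct dl { int ro_holders; };

int main(void)
{
        struct dl dl1 = {0}, dl2 = {0};
        struct dl *slot = NULL;      /* plays dentry->d_fsdata, unprotected */

        /* A1, B1: both threads see slot == NULL, each allocates its own dl */
        slot = &dl1;                 /* A2: A publishes dl1                  */
        dl1.ro_holders++;            /* A3: A locks, dl1 -> 1                */
        slot = &dl2;                 /* B2: B overwrites the slot with dl2   */
        dl2.ro_holders++;            /* B3: B locks, dl2 -> 1                */
        slot->ro_holders--;          /* A4: A unlocks via the slot: dl2 -> 0 */
        /* B4: B unlocks via the slot, finds dl2.ro_holders already 0: panic */
        printf("dl1=%d (leaked holder), dl2=%d (underflows next)\n",
               dl1.ro_holders, dl2.ro_holders);
        return 0;
}

Running it prints dl1=1, dl2=0 — the same end state the crash dump shows
for the two ocfs2_lock_res objects.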
The formatting of the race description got messed up; please check the
attached .txt file for that part.

thanks,
wengang

On 2019/5/6 10:55, Wengang Wang wrote:
> Hi Changwei,
>
> Actually I'd like to post the details after I have confirmed that the
> patch fixes the problem (I don't have that confirmation yet). But since
> you are interested, I am putting it here.
> [...]
> Above is the analysis (hoping the format is still good after the email
> is sent). The fix is self-explanatory.
>
> Thanks,
> Wengang
> [...]

thread A                                    thread B

(A1) enter ocfs2_dentry_attach_lock,
     seeing dentry->d_fsdata is NULL,
     and no alias found by
     ocfs2_find_local_alias, kmalloc
     a new ocfs2_dentry_lock structure
     to local variable "dl", dl1
     .....
                                            (B1) enter ocfs2_dentry_attach_lock,
                                                 seeing dentry->d_fsdata is NULL,
                                                 and no alias found by
                                                 ocfs2_find_local_alias, so kmalloc
                                                 a new ocfs2_dentry_lock structure
                                                 to local variable "dl", dl2
     ......
(A2) set dentry->d_fsdata with dl1
     ......
(A3) ocfs2_dentry_lock() succeeded
     and dl1->dl_lockres.l_ro_holders
     increased to 1.
     ......
                                            (B2) set dentry->d_fsdata with dl2
     .......
                                            (B3) ocfs2_dentry_lock() succeeded
                                                 and dl2->dl_lockres.l_ro_holders
                                                 increased to 1.
     ......
(A4) ocfs2_dentry_unlock() called
     and dl2->dl_lockres.l_ro_holders
     decreased to 0.
     ....
                                            (B4) ocfs2_dentry_unlock() called,
                                                 decreasing dl2->dl_lockres.l_ro_holders,
                                                 but sees it's zero now: panic
Hi Wengang,

You indeed shared a wonderful analysis with us :) I understand the race
now, but still have some thoughts on your draft patch. Please check them
inline.

On 2019/5/7 1:55 AM, Wengang Wang wrote:
> Hi Changwei,
>
> Actually I'd like to post the details after I have confirmed that the
> patch fixes the problem (I don't have that confirmation yet). But since
> you are interested, I am putting it here.
> [...]
> You see there are two granted dlm_locks for node 2 (0xffff9c7d3ef3fd80
> and 0xffff9c7e41c97880).
> Normally the ocfs2 FS layer wouldn't request a duplicate new lock
> against the same lock resource; after the first new-lock request it
> would request converts instead.
> So the existence of two dlm_locks for the same lock resource on one
> node is a hint that the ocfs2 FS layer is racing.

I agree with your opinion that locking against the same lock resource
(shared inode, etc.) should only ever involve a unique ocfs2 lock
resource object. So do you think we should add some status checks in the
dlm layer to assert on insane cluster locking logic? For instance, just
check whether a node asks to be granted twice, which is obviously
unacceptable and unnecessary.

> [...]
> On 2019/5/5 0:15, Changwei Ge wrote:
>> [...]
>> On 2019/5/4 1:01 AM, Wengang Wang wrote:
>>> ocfs2_dentry_attach_lock() can be executed in parallel threads
>>> against the same dentry. Make that race safe.
>>> [...]
>>> @@ -243,11 +244,15 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
>>>          if (!inode)
>>>                  return 0;
>>>
>>> +        spin_lock(&dentry_attach_lock);
>>>          if (d_really_is_negative(dentry) && dentry->d_fsdata) {
>>>                  /* Converting a negative dentry to positive
>>>                     Clear dentry->d_fsdata */
>>>                  dentry->d_fsdata = dl = NULL;
>>> +        } else {
>>> +                dl = dentry->d_fsdata;
>>>          }
>>> +        spin_unlock(&dentry_attach_lock);

I don't understand why we are acquiring dentry_attach_lock here. Is there
any race you want to close here?

>>> [...]
>>>  out_attach:
>>>          spin_lock(&dentry_attach_lock);
>>> -        dentry->d_fsdata = dl;
>>> -        dl->dl_count++;
>>> +        /* d_fsdata could be set by parallel thread */
>>> +        if (unlikely(dentry->d_fsdata)) {
>>> +                pr_err("race found in ocfs2_dentry_attach_lock dentry=%p\n", dentry);
>>> +                if (!alias)
>>> +                        dl_free_on_race = dl;
>>> +        } else {
>>> +                dentry->d_fsdata = dl;
>>> +                dl->dl_count++;
>>> +                if (!alias)
>>> +                        new_dl = 1;
>>> +        }
>>>          spin_unlock(&dentry_attach_lock);
>>>
>>> +        kfree(dl_free_on_race);

Do we also have to do something for 'dl->dl_inode = igrab(inode)'?

And if we didn't have to allocate a new lock resource for the racing
thread at all, that would be better. How about just making the racing one
wait somehow?

Thanks,
Changwei
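One hedged sketch of the "don't keep a separate lock resource for the
racing thread" idea: the loser could adopt the dl already published in
d_fsdata instead of carrying on with its own. This is an untested variant
of the posted out_attach hunk, not the posted patch, and it glosses over
the details raised above (the loser's igrab() reference would need an
iput(), and the holder accounting on the adopted lockres needs care):

/* Hypothetical variant (untested sketch, illustrative only): on a race,
 * discard our allocation and continue with the winner's dl, so only one
 * ocfs2_lock_res ever exists for this dentry. Releasing the loser's
 * igrab() reference is deliberately omitted here. */
spin_lock(&dentry_attach_lock);
if (unlikely(dentry->d_fsdata)) {
        if (!alias)
                dl_free_on_race = dl;   /* loser: drop our allocation...   */
        dl = dentry->d_fsdata;          /* ...and adopt the winner's dl    */
} else {
        dentry->d_fsdata = dl;
        dl->dl_count++;
        if (!alias)
                new_dl = 1;
}
spin_unlock(&dentry_attach_lock);
kfree(dl_free_on_race);                 /* kfree(NULL) is a no-op */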
Hi Wengang,

I have applied your patch to the latest Debian kernel in
"stretch-backports" and have been using it for one or two days. Today is
the first time that I saw the pr_err() message:

May 07 15:23:19 drs1p001 kernel: race found in ocfs2_dentry_attach_lock dentry=0000000016c2f15c

I don't know whether it helps, but I thought I'd report it anyway. I
don't remember any special activity at that time, just the same stuff I
had been doing all day.

Regards,
Daniel

-----Original Message-----
From: Wengang Wang <wen.gang.wang@oracle.com>
Sent: Friday, 3 May 2019 19:03
To: 'ocfs2-devel@oss.oracle.com' <ocfs2-devel@oss.oracle.com>
Cc: Daniel Sobe <daniel.sobe@nxp.com>
Subject: [EXT] Re: [PATCH] fs/ocfs2: fix race in ocfs2_dentry_attach_lock

Daniel,

You could try this patch. I haven't yet confirmed whether the patch
helped, or whether the problem simply hasn't recurred where it was seen
(though I am sure the race is there).

Anyone who is interested could take a pre-review on this patch.

Thanks,
Wengang

[...]
About 30 minutes later this happened; I don't know whether it is related.
Could it be related to not running the latest 4.19.x kernel?

May  7 15:51:07 drs1p001 kernel: [99859.583930] list_del corruption. prev->next should be ffff94bb01bda710, but was ffff94bacceda7c0
May  7 15:51:07 drs1p001 kernel: [99859.583951] ------------[ cut here ]------------
May  7 15:51:07 drs1p001 kernel: [99859.583951] kernel BUG at /kernelbuild/linux-4.19.28/lib/list_debug.c:53!
May  7 15:51:07 drs1p001 kernel: [99859.583960] invalid opcode: 0000 [#1] SMP PTI
May  7 15:51:07 drs1p001 kernel: [99859.583964] CPU: 1 PID: 38 Comm: khugepaged Tainted: G            E     4.19.0-0.bpo.4-amd64 #1 Debian 4.19.28-2~bpo9+1
May  7 15:51:07 drs1p001 kernel: [99859.583966] Hardware name: Dell Inc. OptiPlex 5040/0R790T, BIOS 1.2.7 01/15/2016
May  7 15:51:07 drs1p001 kernel: [99859.583974] RIP: 0010:__list_del_entry_valid+0x70/0x90
May  7 15:51:07 drs1p001 kernel: [99859.583976] Code: 7d e8 94 e8 90 c6 d0 ff 0f 0b 48 89 fe 48 c7 c7 30 7e e8 94 e8 7f c6 d0 ff 0f 0b 48 89 fe 48 c7 c7 68 7e e8 94 e8 6e c6 d0 ff <0f> 0b 48 89 fe 48 c7 c7 a8 7e e8 94 e8 5d c6 d0 ff 0f 0b 90 90 90
May  7 15:51:07 drs1p001 kernel: [99859.583980] RSP: 0018:ffffb96d00de7848 EFLAGS: 00010286
May  7 15:51:07 drs1p001 kernel: [99859.583982] RAX: 0000000000000054 RBX: ffff94bb01bda618 RCX: 0000000000000000
May  7 15:51:07 drs1p001 kernel: [99859.583984] RDX: 0000000000000000 RSI: ffff94bbd1a966a8 RDI: ffff94bbd1a966a8
May  7 15:51:07 drs1p001 kernel: [99859.583986] RBP: ffff94bb01bda710 R08: 0000000000000000 R09: 0000000000000471
May  7 15:51:07 drs1p001 kernel: [99859.583988] R10: ffffb96d00de7838 R11: ffffffff955ea6ed R12: ffff94bb01bda618
May  7 15:51:07 drs1p001 kernel: [99859.583990] R13: ffff94bb3e8327d8 R14: 0000000000000000 R15: ffff94bb730056c0
May  7 15:51:07 drs1p001 kernel: [99859.583992] FS:  0000000000000000(0000) GS:ffff94bbd1a80000(0000) knlGS:0000000000000000
May  7 15:51:07 drs1p001 kernel: [99859.583995] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
May  7 15:51:07 drs1p001 kernel: [99859.583997] CR2: 00007f5acb700e00 CR3: 0000000016c0a002 CR4: 00000000003606e0
May  7 15:51:07 drs1p001 kernel: [99859.583999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
May  7 15:51:07 drs1p001 kernel: [99859.584001] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
May  7 15:51:07 drs1p001 kernel: [99859.584003] Call Trace:
May  7 15:51:07 drs1p001 kernel: [99859.584065]  ocfs2_remove_lockres_tracking+0x31/0x70 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584081]  ocfs2_lock_res_free+0x44/0x260 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584093]  ocfs2_dentry_lock_put+0x5f/0x70 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584106]  ocfs2_dentry_iput+0x49/0x110 [ocfs2]
May  7 15:51:07 drs1p001 kernel: [99859.584110]  __dentry_kill+0xd3/0x180
May  7 15:51:07 drs1p001 kernel: [99859.584113]  dentry_kill+0x4d/0x1b0
May  7 15:51:07 drs1p001 kernel: [99859.584115]  shrink_dentry_list+0xc3/0x1d0
May  7 15:51:07 drs1p001 kernel: [99859.584117]  prune_dcache_sb+0x52/0x70
May  7 15:51:07 drs1p001 kernel: [99859.584121]  super_cache_scan+0x10c/0x190
May  7 15:51:07 drs1p001 kernel: [99859.584124]  do_shrink_slab+0x12a/0x2b0
May  7 15:51:07 drs1p001 kernel: [99859.584127]  shrink_slab+0x214/0x2b0
May  7 15:51:07 drs1p001 kernel: [99859.584130]  shrink_node+0xf0/0x470
May  7 15:51:07 drs1p001 kernel: [99859.584133]  do_try_to_free_pages+0xeb/0x370
May  7 15:51:07 drs1p001 kernel: [99859.584135]  try_to_free_pages+0xf4/0x1b0
May  7 15:51:07 drs1p001 kernel: [99859.584139]  __alloc_pages_slowpath+0x2ee/0xdb0
May  7 15:51:07 drs1p001 kernel: [99859.584144]  ? __switch_to_asm+0x40/0x70
May  7 15:51:07 drs1p001 kernel: [99859.584147]  __alloc_pages_nodemask+0x258/0x2a0
May  7 15:51:07 drs1p001 kernel: [99859.584150]  khugepaged+0xc9b/0x21d0
May  7 15:51:07 drs1p001 kernel: [99859.584156]  ? remove_wait_queue+0x60/0x60
May  7 15:51:07 drs1p001 kernel: [99859.584159]  kthread+0xf8/0x130
May  7 15:51:07 drs1p001 kernel: [99859.584162]  ? collapse_shmem+0xc50/0xc50
May  7 15:51:07 drs1p001 kernel: [99859.584164]  ? kthread_create_worker_on_cpu+0x70/0x70
May  7 15:51:07 drs1p001 kernel: [99859.584166]  ret_from_fork+0x35/0x40
May  7 15:51:07 drs1p001 kernel: [99859.584169] Modules linked in: tcp_diag(E) inet_diag(E) unix_diag(E) veth(E) ocfs2(E) quota_tree(E) ocfs2_dlmfs(E) ocfs2_stack_o2cb(E) ocfs2_dlm(E) ocfs2_nodemanager(E) ocfs2_stackglue(E) configfs(E) bridge(E) stp(E) llc(E) iptable_filter(E) overlay(E) snd_hda_codec_hdmi(E) rfkill(E) snd_hda_codec_realtek(E) snd_hda_codec_generic(E) fuse(E) intel_rapl(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) dell_wmi(E) mei_wdt(E) wmi_bmof(E) dell_smbios(E) sparse_keymap(E) i915(E) kvm_intel(E) dell_wmi_descriptor(E) snd_hda_intel(E) evdev(E) kvm(E) irqbypass(E) drm_kms_helper(E) crct10dif_pclmul(E) snd_hda_codec(E) crc32_pclmul(E) snd_hda_core(E) dcdbas(E) snd_hwdep(E) ghash_clmulni_intel(E) drm(E) intel_cstate(E) i2c_algo_bit(E) snd_pcm(E) intel_pch_thermal(E) intel_uncore(E) mei_me(E) pcspkr(E)
May  7 15:51:07 drs1p001 kernel: [99859.584208]  intel_rapl_perf(E) mei(E) snd_timer(E) iTCO_wdt(E) iTCO_vendor_support(E) snd(E) soundcore(E) sg(E) serio_raw(E) wmi(E) video(E) pcc_cpufreq(E) button(E) acpi_pad(E) drbd(E) lru_cache(E) libcrc32c(E) ip_tables(E) x_tables(E) autofs4(E) ext4(E) crc16(E) mbcache(E) jbd2(E) crc32c_generic(E) fscrypto(E) ecb(E) dm_mod(E) sr_mod(E) cdrom(E) sd_mod(E) crc32c_intel(E) aesni_intel(E) aes_x86_64(E) crypto_simd(E) cryptd(E) glue_helper(E) psmouse(E) xhci_pci(E) e1000(E) ahci(E) libahci(E) xhci_hcd(E) e1000e(E) libata(E) i2c_i801(E) usbcore(E) scsi_mod(E) usb_common(E) fan(E) thermal(E)
May  7 15:51:07 drs1p001 kernel: [99859.584243] ---[ end trace 2baf8e44ef57535a ]---

-----Original Message-----
From: ocfs2-devel-bounces@oss.oracle.com <ocfs2-devel-bounces@oss.oracle.com> On Behalf Of Daniel Sobe
Sent: Tuesday, 7 May 2019 15:53
To: Wengang Wang <wen.gang.wang@oracle.com>; 'ocfs2-devel@oss.oracle.com' <ocfs2-devel@oss.oracle.com>
Subject: Re: [Ocfs2-devel] [EXT] Re: [PATCH] fs/ocfs2: fix race in ocfs2_dentry_attach_lock

Hi Wengang,

I have applied your patch to the latest Debian kernel in
"stretch-backports" and have been using it for one or two days. Today is
the first time that I saw the pr_err() message:

May 07 15:23:19 drs1p001 kernel: race found in ocfs2_dentry_attach_lock dentry=0000000016c2f15c

[...]
ocfs2_dentry_attach_lock() can be executed in parallel threads against the
same dentry. Make that race safe.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
---
 fs/ocfs2/dcache.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 290373024d9d..980e571c163c 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -227,12 +227,13 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
                              struct inode *inode,
                              u64 parent_blkno)
 {
-        int ret;
+        int ret, new_dl = 0;
         struct dentry *alias;
-        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+        struct ocfs2_dentry_lock *dl, *dl_free_on_race = NULL;

         trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
-                                       (unsigned long long)parent_blkno, dl);
+                                       (unsigned long long)parent_blkno,
+                                       dentry->d_fsdata);

         /*
          * Negative dentry. We ignore these for now.
@@ -243,11 +244,15 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
         if (!inode)
                 return 0;

+        spin_lock(&dentry_attach_lock);
         if (d_really_is_negative(dentry) && dentry->d_fsdata) {
                 /* Converting a negative dentry to positive
                    Clear dentry->d_fsdata */
                 dentry->d_fsdata = dl = NULL;
+        } else {
+                dl = dentry->d_fsdata;
         }
+        spin_unlock(&dentry_attach_lock);

         if (dl) {
                 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
@@ -310,10 +315,20 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,

 out_attach:
         spin_lock(&dentry_attach_lock);
-        dentry->d_fsdata = dl;
-        dl->dl_count++;
+        /* d_fsdata could be set by parallel thread */
+        if (unlikely(dentry->d_fsdata)) {
+                pr_err("race found in ocfs2_dentry_attach_lock dentry=%p\n", dentry);
+                if (!alias)
+                        dl_free_on_race = dl;
+        } else {
+                dentry->d_fsdata = dl;
+                dl->dl_count++;
+                if (!alias)
+                        new_dl = 1;
+        }
         spin_unlock(&dentry_attach_lock);

+        kfree(dl_free_on_race);
         /*
          * This actually gets us our PRMODE level lock. From now on,
          * we'll have a notification if one of these names is
@@ -330,7 +345,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
          * We need to do this because error here means no d_instantiate(),
          * which means iput() will not be called during dput(dentry).
          */
-        if (ret < 0 && !alias) {
+        if (ret < 0 && new_dl) {
                 ocfs2_lock_res_free(&dl->dl_lockres);
                 BUG_ON(dl->dl_count != 1);
                 spin_lock(&dentry_attach_lock);