Message ID | 20210110020140.141727-1-xiubli@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v3] ceph: defer flushing the capsnap if the Fb is used | expand |
On Sun, 2021-01-10 at 10:01 +0800, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > If the Fb cap is used it means the current inode is flushing the > dirty data to OSD, just defer flushing the capsnap. > > URL: https://tracker.ceph.com/issues/48679 > URL: https://tracker.ceph.com/issues/48640 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > > V3: > - Add more comments about putting the inode ref > - A small change about the code style > > V2: > - Fix inode reference leak bug > > fs/ceph/caps.c | 32 +++++++++++++++++++------------- > fs/ceph/snap.c | 6 +++--- > 2 files changed, 22 insertions(+), 16 deletions(-) > Hi Xiubo, This patch seems to cause hangs in some xfstests (generic/013, in particular). I'll take a closer look when I have a chance, but I'm dropping this for now. -- Jeff > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > index abbf48fc6230..b00234cf3b04 100644 > --- a/fs/ceph/caps.c > +++ b/fs/ceph/caps.c > @@ -3047,6 +3047,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, > { > struct inode *inode = &ci->vfs_inode; > int last = 0, put = 0, flushsnaps = 0, wake = 0; > + bool check_flushsnaps = false; > > > > > spin_lock(&ci->i_ceph_lock); > if (had & CEPH_CAP_PIN) > @@ -3063,26 +3064,17 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, > if (had & CEPH_CAP_FILE_BUFFER) { > if (--ci->i_wb_ref == 0) { > last++; > + /* put the ref held by ceph_take_cap_refs() */ > put++; > + check_flushsnaps = true; > } > dout("put_cap_refs %p wb %d -> %d (?)\n", > inode, ci->i_wb_ref+1, ci->i_wb_ref); > } > - if (had & CEPH_CAP_FILE_WR) > + if (had & CEPH_CAP_FILE_WR) { > if (--ci->i_wr_ref == 0) { > last++; > - if (__ceph_have_pending_cap_snap(ci)) { > - struct ceph_cap_snap *capsnap = > - list_last_entry(&ci->i_cap_snaps, > - struct ceph_cap_snap, > - ci_item); > - capsnap->writing = 0; > - if (ceph_try_drop_cap_snap(ci, capsnap)) > - put++; > - else if (__ceph_finish_cap_snap(ci, capsnap)) > - 
flushsnaps = 1; > - wake = 1; > - } > + check_flushsnaps = true; > if (ci->i_wrbuffer_ref_head == 0 && > ci->i_dirty_caps == 0 && > ci->i_flushing_caps == 0) { > @@ -3094,6 +3086,20 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, > if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) > drop_inode_snap_realm(ci); > } > + } > + if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { > + struct ceph_cap_snap *capsnap = > + list_last_entry(&ci->i_cap_snaps, > + struct ceph_cap_snap, > + ci_item); > + capsnap->writing = 0; > + if (ceph_try_drop_cap_snap(ci, capsnap)) > + /* put the ref held by ceph_queue_cap_snap() */ > + put++; > + else if (__ceph_finish_cap_snap(ci, capsnap)) > + flushsnaps = 1; > + wake = 1; > + } > spin_unlock(&ci->i_ceph_lock); > > > > > dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > index b611f829cb61..639fb91cc9db 100644 > --- a/fs/ceph/snap.c > +++ b/fs/ceph/snap.c > @@ -561,10 +561,10 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) > capsnap->context = old_snapc; > list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); > > > > > - if (used & CEPH_CAP_FILE_WR) { > + if (used & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) { > dout("queue_cap_snap %p cap_snap %p snapc %p" > - " seq %llu used WR, now pending\n", inode, > - capsnap, old_snapc, old_snapc->seq); > + " seq %llu used WR | BUFFFER, now pending\n", > + inode, capsnap, old_snapc, old_snapc->seq); > capsnap->writing = 1; > } else { > /* note mtime, size NOW. */
On 2021/1/13 5:48, Jeff Layton wrote: > On Sun, 2021-01-10 at 10:01 +0800, xiubli@redhat.com wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> If the Fb cap is used it means the current inode is flushing the >> dirty data to OSD, just defer flushing the capsnap. >> >> URL: https://tracker.ceph.com/issues/48679 >> URL: https://tracker.ceph.com/issues/48640 >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> >> V3: >> - Add more comments about putting the inode ref >> - A small change about the code style >> >> V2: >> - Fix inode reference leak bug >> >> fs/ceph/caps.c | 32 +++++++++++++++++++------------- >> fs/ceph/snap.c | 6 +++--- >> 2 files changed, 22 insertions(+), 16 deletions(-) >> > Hi Xiubo, > > This patch seems to cause hangs in some xfstests (generic/013, in > particular). I'll take a closer look when I have a chance, but I'm > dropping this for now. Okay. BTW, what's your test commands to reproduce it ? I will take a look when I am free these days or later. BRs > > -- Jeff > > >> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c >> index abbf48fc6230..b00234cf3b04 100644 >> --- a/fs/ceph/caps.c >> +++ b/fs/ceph/caps.c >> @@ -3047,6 +3047,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, >> { >> struct inode *inode = &ci->vfs_inode; >> int last = 0, put = 0, flushsnaps = 0, wake = 0; >> + bool check_flushsnaps = false; >> >> >> >> >> spin_lock(&ci->i_ceph_lock); >> if (had & CEPH_CAP_PIN) >> @@ -3063,26 +3064,17 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, >> if (had & CEPH_CAP_FILE_BUFFER) { >> if (--ci->i_wb_ref == 0) { >> last++; >> + /* put the ref held by ceph_take_cap_refs() */ >> put++; >> + check_flushsnaps = true; >> } >> dout("put_cap_refs %p wb %d -> %d (?)\n", >> inode, ci->i_wb_ref+1, ci->i_wb_ref); >> } >> - if (had & CEPH_CAP_FILE_WR) >> + if (had & CEPH_CAP_FILE_WR) { >> if (--ci->i_wr_ref == 0) { >> last++; >> - if (__ceph_have_pending_cap_snap(ci)) { >> - struct ceph_cap_snap *capsnap 
= >> - list_last_entry(&ci->i_cap_snaps, >> - struct ceph_cap_snap, >> - ci_item); >> - capsnap->writing = 0; >> - if (ceph_try_drop_cap_snap(ci, capsnap)) >> - put++; >> - else if (__ceph_finish_cap_snap(ci, capsnap)) >> - flushsnaps = 1; >> - wake = 1; >> - } >> + check_flushsnaps = true; >> if (ci->i_wrbuffer_ref_head == 0 && >> ci->i_dirty_caps == 0 && >> ci->i_flushing_caps == 0) { >> @@ -3094,6 +3086,20 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, >> if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) >> drop_inode_snap_realm(ci); >> } >> + } >> + if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { >> + struct ceph_cap_snap *capsnap = >> + list_last_entry(&ci->i_cap_snaps, >> + struct ceph_cap_snap, >> + ci_item); >> + capsnap->writing = 0; >> + if (ceph_try_drop_cap_snap(ci, capsnap)) >> + /* put the ref held by ceph_queue_cap_snap() */ >> + put++; >> + else if (__ceph_finish_cap_snap(ci, capsnap)) >> + flushsnaps = 1; >> + wake = 1; >> + } >> spin_unlock(&ci->i_ceph_lock); >> >> >> >> >> dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), >> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c >> index b611f829cb61..639fb91cc9db 100644 >> --- a/fs/ceph/snap.c >> +++ b/fs/ceph/snap.c >> @@ -561,10 +561,10 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) >> capsnap->context = old_snapc; >> list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); >> >> >> >> >> - if (used & CEPH_CAP_FILE_WR) { >> + if (used & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) { >> dout("queue_cap_snap %p cap_snap %p snapc %p" >> - " seq %llu used WR, now pending\n", inode, >> - capsnap, old_snapc, old_snapc->seq); >> + " seq %llu used WR | BUFFFER, now pending\n", >> + inode, capsnap, old_snapc, old_snapc->seq); >> capsnap->writing = 1; >> } else { >> /* note mtime, size NOW. */
On Mon, 2021-01-18 at 17:10 +0800, Xiubo Li wrote: > On 2021/1/13 5:48, Jeff Layton wrote: > > On Sun, 2021-01-10 at 10:01 +0800, xiubli@redhat.com wrote: > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > If the Fb cap is used it means the current inode is flushing the > > > dirty data to OSD, just defer flushing the capsnap. > > > > > > URL: https://tracker.ceph.com/issues/48679 > > > URL: https://tracker.ceph.com/issues/48640 > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > --- > > > > > > V3: > > > - Add more comments about putting the inode ref > > > - A small change about the code style > > > > > > V2: > > > - Fix inode reference leak bug > > > > > > fs/ceph/caps.c | 32 +++++++++++++++++++------------- > > > fs/ceph/snap.c | 6 +++--- > > > 2 files changed, 22 insertions(+), 16 deletions(-) > > > > > Hi Xiubo, > > > > This patch seems to cause hangs in some xfstests (generic/013, in > > particular). I'll take a closer look when I have a chance, but I'm > > dropping this for now. > > Okay. > > BTW, what's your test commands to reproduce it ? I will take a look when > I am free these days or later. > > BRs > I set up xfstests to run on cephfs, and then just run: $ sudo ./check generic/013 It wouldn't reliably complete with this patch in place. Setting up xfstests is the "hard part". I'll plan to roll up a wiki page on how to do that soon (that's good info to have out there anyway). > Cheers,
On 2021/1/18 19:08, Jeff Layton wrote: > On Mon, 2021-01-18 at 17:10 +0800, Xiubo Li wrote: >> On 2021/1/13 5:48, Jeff Layton wrote: >>> On Sun, 2021-01-10 at 10:01 +0800, xiubli@redhat.com wrote: >>>> From: Xiubo Li <xiubli@redhat.com> >>>> >>>> If the Fb cap is used it means the current inode is flushing the >>>> dirty data to OSD, just defer flushing the capsnap. >>>> >>>> URL: https://tracker.ceph.com/issues/48679 >>>> URL: https://tracker.ceph.com/issues/48640 >>>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>>> --- >>>> >>>> V3: >>>> - Add more comments about putting the inode ref >>>> - A small change about the code style >>>> >>>> V2: >>>> - Fix inode reference leak bug >>>> >>>>   fs/ceph/caps.c | 32 +++++++++++++++++++------------- >>>>   fs/ceph/snap.c | 6 +++--- >>>>   2 files changed, 22 insertions(+), 16 deletions(-) >>>> >>> Hi Xiubo, >>> >>> This patch seems to cause hangs in some xfstests (generic/013, in >>> particular). I'll take a closer look when I have a chance, but I'm >>> dropping this for now. >> Okay. >> >> BTW, what's your test commands to reproduce it ? I will take a look when >> I am free these days or later. >> >> BRs >> > I set up xfstests to run on cephfs, and then just run: > > $ sudo ./check generic/013 > > It wouldn't reliably complete with this patch in place. Setting up > xfstests is the "hard part". I'll plan to roll up a wiki page on how to > do that soon (that's good info to have out there anyway). Okay, sure. Thanks. > Cheers,
On Wed, 2021-01-20 at 08:56 +0800, Xiubo Li wrote: > On 2021/1/18 19:08, Jeff Layton wrote: > > On Mon, 2021-01-18 at 17:10 +0800, Xiubo Li wrote: > > > On 2021/1/13 5:48, Jeff Layton wrote: > > > > On Sun, 2021-01-10 at 10:01 +0800, xiubli@redhat.com wrote: > > > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > > > > > If the Fb cap is used it means the current inode is flushing the > > > > > dirty data to OSD, just defer flushing the capsnap. > > > > > > > > > > URL: https://tracker.ceph.com/issues/48679 > > > > > URL: https://tracker.ceph.com/issues/48640 > > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > > > --- > > > > > > > > > > V3: > > > > > - Add more comments about putting the inode ref > > > > > - A small change about the code style > > > > > > > > > > V2: > > > > > - Fix inode reference leak bug > > > > > > > > > >   fs/ceph/caps.c | 32 +++++++++++++++++++------------- > > > > >   fs/ceph/snap.c | 6 +++--- > > > > >   2 files changed, 22 insertions(+), 16 deletions(-) > > > > > > > > > Hi Xiubo, > > > > > > > > This patch seems to cause hangs in some xfstests (generic/013, in > > > > particular). I'll take a closer look when I have a chance, but I'm > > > > dropping this for now. > > > Okay. > > > > > > BTW, what's your test commands to reproduce it ? I will take a look when > > > I am free these days or later. > > > > > > BRs > > > > > I set up xfstests to run on cephfs, and then just run: > > > > $ sudo ./check generic/013 > > > > It wouldn't reliably complete with this patch in place. Setting up > > xfstests is the "hard part". I'll plan to roll up a wiki page on how to > > do that soon (that's good info to have out there anyway). > > Okay, sure. > I'm not sure where this should be documented. Still, here's a local.config that I'm using now (with comments). I'm happy to merge this somewhere for posterity, but not sure where it should go.
On Mon, 2021-01-18 at 17:10 +0800, Xiubo Li wrote: > On 2021/1/13 5:48, Jeff Layton wrote: > > On Sun, 2021-01-10 at 10:01 +0800, xiubli@redhat.com wrote: > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > If the Fb cap is used it means the current inode is flushing the > > > dirty data to OSD, just defer flushing the capsnap. > > > > > > URL: https://tracker.ceph.com/issues/48679 > > > URL: https://tracker.ceph.com/issues/48640 > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > --- > > > > > > V3: > > > - Add more comments about putting the inode ref > > > - A small change about the code style > > > > > > V2: > > > - Fix inode reference leak bug > > > > > > fs/ceph/caps.c | 32 +++++++++++++++++++------------- > > > fs/ceph/snap.c | 6 +++--- > > > 2 files changed, 22 insertions(+), 16 deletions(-) > > > > > Hi Xiubo, > > > > This patch seems to cause hangs in some xfstests (generic/013, in > > particular). I'll take a closer look when I have a chance, but I'm > > dropping this for now. > > Okay. > > BTW, what's your test commands to reproduce it ? I will take a look when > I am free these days or later. > FWIW, I was able to trigger a hang with this patch by running one of the tests that this patch was intended to fix (snaptest-git-ceph.sh). Here's the stack trace of the hung task: # cat /proc/1166/stack [<0>] wait_woken+0x87/0xb0 [<0>] ceph_get_caps+0x405/0x6a0 [ceph] [<0>] ceph_write_iter+0x2ca/0xd20 [ceph] [<0>] new_sync_write+0x10b/0x190 [<0>] vfs_write+0x240/0x390 [<0>] ksys_write+0x58/0xd0 [<0>] do_syscall_64+0x33/0x40 [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Without this patch I could run that test in a loop without issue. This bug mentions that the original issue occurred during mds thrashing though, and I haven't tried reproducing that scenario yet: https://tracker.ceph.com/issues/48640 Cheers,
On 2021/1/21 22:28, Jeff Layton wrote: > On Mon, 2021-01-18 at 17:10 +0800, Xiubo Li wrote: >> On 2021/1/13 5:48, Jeff Layton wrote: >>> On Sun, 2021-01-10 at 10:01 +0800, xiubli@redhat.com wrote: >>>> From: Xiubo Li <xiubli@redhat.com> >>>> >>>> If the Fb cap is used it means the current inode is flushing the >>>> dirty data to OSD, just defer flushing the capsnap. >>>> >>>> URL: https://tracker.ceph.com/issues/48679 >>>> URL: https://tracker.ceph.com/issues/48640 >>>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>>> --- >>>> >>>> V3: >>>> - Add more comments about putting the inode ref >>>> - A small change about the code style >>>> >>>> V2: >>>> - Fix inode reference leak bug >>>> >>>> fs/ceph/caps.c | 32 +++++++++++++++++++------------- >>>> fs/ceph/snap.c | 6 +++--- >>>> 2 files changed, 22 insertions(+), 16 deletions(-) >>>> >>> Hi Xiubo, >>> >>> This patch seems to cause hangs in some xfstests (generic/013, in >>> particular). I'll take a closer look when I have a chance, but I'm >>> dropping this for now. >> Okay. >> >> BTW, what's your test commands to reproduce it ? I will take a look when >> I am free these days or later. >> > > FWIW, I was able to trigger a hang with this patch by running one of the > tests that this patch was intended to fix (snaptest-git-ceph.sh). Here's > the stack trace of the hung task: > > # cat /proc/1166/stack > [<0>] wait_woken+0x87/0xb0 > [<0>] ceph_get_caps+0x405/0x6a0 [ceph] > [<0>] ceph_write_iter+0x2ca/0xd20 [ceph] > [<0>] new_sync_write+0x10b/0x190 > [<0>] vfs_write+0x240/0x390 > [<0>] ksys_write+0x58/0xd0 > [<0>] do_syscall_64+0x33/0x40 > [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Hi Jeff, I have reproduced it, and also tried the libcephfs, which have the same logic for this issue, and it worked well. I will take a look at it later. > Without this patch I could run that test in a loop without issue. 
This > bug mentions that the original issue occurred during mds thrashing > though, and I haven't tried reproducing that scenario yet: > > https://tracker.ceph.com/issues/48640 From the logs this issue seems not related to the thrashing operation; during this test the MDS has already been successfully thrashed. BRs Xiubo > Cheers,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index abbf48fc6230..b00234cf3b04 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3047,6 +3047,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; + bool check_flushsnaps = false; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) @@ -3063,26 +3064,17 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; + /* put the ref held by ceph_take_cap_refs() */ put++; + check_flushsnaps = true; } dout("put_cap_refs %p wb %d -> %d (?)\n", inode, ci->i_wb_ref+1, ci->i_wb_ref); } - if (had & CEPH_CAP_FILE_WR) + if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { last++; - if (__ceph_have_pending_cap_snap(ci)) { - struct ceph_cap_snap *capsnap = - list_last_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - capsnap->writing = 0; - if (ceph_try_drop_cap_snap(ci, capsnap)) - put++; - else if (__ceph_finish_cap_snap(ci, capsnap)) - flushsnaps = 1; - wake = 1; - } + check_flushsnaps = true; if (ci->i_wrbuffer_ref_head == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { @@ -3094,6 +3086,20 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) drop_inode_snap_realm(ci); } + } + if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(ci, capsnap)) + /* put the ref held by ceph_queue_cap_snap() */ + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } spin_unlock(&ci->i_ceph_lock); dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b611f829cb61..639fb91cc9db 100644 --- a/fs/ceph/snap.c 
+++ b/fs/ceph/snap.c @@ -561,10 +561,10 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->context = old_snapc; list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - if (used & CEPH_CAP_FILE_WR) { + if (used & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) { dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, - capsnap, old_snapc, old_snapc->seq); + " seq %llu used WR | BUFFFER, now pending\n", + inode, capsnap, old_snapc, old_snapc->seq); capsnap->writing = 1; } else { /* note mtime, size NOW. */