[RFC,10/11] ceph: perform asynchronous unlink if we have sufficient caps

Message ID 20190409194229.8247-11-jlayton@kernel.org (mailing list archive)
State New, archived
Series [RFC,01/11] ceph: after an MDS request, do callback and completions

Commit Message

Jeff Layton April 9, 2019, 7:42 p.m. UTC
When performing an unlink, if we have Fx on the parent directory and
Lx on the inode of the dentry being unlinked then we don't need to
wait on the reply before returning.

In that situation, just fix up the dcache and link count and return
immediately after issuing the call to the MDS. This does mean that we
need to hold an extra reference to the inode being unlinked, and extra
references to the caps to avoid races. Put those references in the
r_callback routine.

If the operation ends up failing, then set a writeback error on the
directory inode that can be fetched later by an fsync on the dir.
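
For illustration only (not part of this patch): a minimal userspace sketch of
how an application could observe such a failure, assuming the error is latched
on the directory's mapping and returned by a later fsync of the directory. The
mount path and file name are placeholders.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* open the directory so it can be fsync'ed later */
	int dfd = open("/mnt/cephfs/dir", O_RDONLY | O_DIRECTORY);

	if (dfd < 0) {
		perror("open dir");
		return 1;
	}

	/* with this patch, the unlink may return before the MDS reply */
	if (unlinkat(dfd, "file", 0) < 0)
		perror("unlinkat");

	/*
	 * If the asynchronous unlink later fails on the MDS, the error
	 * should surface here, on fsync of the directory.
	 */
	if (fsync(dfd) < 0)
		fprintf(stderr, "async unlink failed: %s\n", strerror(errno));

	close(dfd);
	return 0;
}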

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c  | 22 ++++++++++++++++++++++
 fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
 fs/ceph/super.h |  1 +
 3 files changed, 58 insertions(+), 3 deletions(-)

Comments

Yan, Zheng April 10, 2019, 1:06 p.m. UTC | #1
On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> When performing an unlink, if we have Fx on the parent directory and
> Lx on the inode of the dentry being unlinked then we don't need to
> wait on the reply before returning.
>
> In that situation, just fix up the dcache and link count and return
> immediately after issuing the call to the MDS. This does mean that we
> need to hold an extra reference to the inode being unlinked, and extra
> references to the caps to avoid races. Put those references in the
> r_callback routine.
>
> If the operation ends up failing, then set a writeback error on the
> directory inode that can be fetched later by an fsync on the dir.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
>  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
>  fs/ceph/super.h |  1 +
>  3 files changed, 58 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index d022e15c8378..8fbb09c761a7 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
>         return ret;
>  }
>
> +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> +{
> +       int err, got;
> +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> +
> +       /* Ensure we have Lx on the inode being unlinked */
> +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> +               return false;
> +
> +       /* Do we have Fx on the dir ? */
> +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> +                               true, &got);
> +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> +               return false;
> +       }
> +       return true;
> +}

Holding caps for a request may cause deadlock. For example:

- client holds Fx caps and sends an unlink request
- mds processes a request from another client; it changes the filelock's
state to EXCL_FOO and revokes the Fx caps
- mds receives the unlink request, but can't process it because it
can't acquire the wrlock on the filelock

The filelock state stays in EXCL_FOO because the client does not release
the Fx caps.

I checked the MDS code; supporting async operations is more complex than I
thought. For the unlink case, the mds needs to acquire locks other than
the filelock and linklock.

- Request A from client1 has acquired lock X and is waiting for the wrlock
on the filelock.
- Client2 sends async unlink request 'B' to the mds.
- MDS processes request 'B'. It needs lock X and the filelock, but it
can't get lock X because request A holds it. Request A can't get the
filelock because Client2 holds the Fx caps.

I think a way to handle this is delegating the locks held by an unsafe
request to the client. For example:

- mds processes an unlink request from client A, sends an unsafe reply and
delegates the acquired locks to the client
- once the client has the delegation, it can do async unlinks in the same
directory as long as it holds the delegation

Regards
Yan, Zheng

> +
>  /*
>   * Check the offset we are writing up to against our current
>   * max_size.  If necessary, tell the MDS we want to write to
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 3eb9bc226b77..386c9439a020 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -1029,6 +1029,18 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>         return err;
>  }
>
> +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> +                                struct ceph_mds_request *req)
> +{
> +       struct ceph_inode_info *ci = ceph_inode(req->r_old_inode);
> +
> +       /* If op failed, set error on parent directory */
> +       mapping_set_error(req->r_parent->i_mapping, req->r_err);
> +       ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> +       ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_FILE_EXCL);
> +       iput(req->r_old_inode);
> +}
> +
>  /*
>   * rmdir and unlink are differ only by the metadata op code
>   */
> @@ -1065,9 +1077,29 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
>         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
>         req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
> -       err = ceph_mdsc_do_request(mdsc, dir, req);
> -       if (!err && !req->r_reply_info.head->is_dentry)
> -               d_delete(dentry);
> +
> +       if (op == CEPH_MDS_OP_UNLINK &&
> +           ceph_get_caps_for_unlink(dir, dentry)) {
> +               /* Keep LINK caps to ensure continuity over async call */
> +               req->r_inode_drop &= ~(CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL);
> +               req->r_callback = ceph_async_unlink_cb;
> +               req->r_old_inode = d_inode(dentry);
> +               ihold(req->r_old_inode);
> +               err = ceph_mdsc_submit_request(mdsc, dir, req);
> +               if (!err) {
> +                       /*
> +                        * We have enough caps, so we assume that the unlink
> +                        * will succeed. Fix up the target inode and dcache.
> +                        */
> +                       drop_nlink(inode);
> +                       d_delete(dentry);
> +               }
> +       } else {
> +               err = ceph_mdsc_do_request(mdsc, dir, req);
> +               if (!err && !req->r_reply_info.head->is_dentry)
> +                       d_delete(dentry);
> +       }
> +
>         ceph_mdsc_put_request(req);
>  out:
>         return err;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 3c5608f2108a..5c361dc1f47f 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1033,6 +1033,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
>                          loff_t endoff, int *got, struct page **pinned_page);
>  extern int ceph_try_get_caps(struct ceph_inode_info *ci,
>                              int need, int want, bool nonblock, int *got);
> +extern bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry);
>
>  /* for counting open files by mode */
>  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> --
> 2.20.1
>
Jeff Layton April 10, 2019, 1:22 p.m. UTC | #2
On Wed, Apr 10, 2019 at 9:06 AM Yan, Zheng <ukernel@gmail.com> wrote:
>
> On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > When performing an unlink, if we have Fx on the parent directory and
> > Lx on the inode of the dentry being unlinked then we don't need to
> > wait on the reply before returning.
> >
> > In that situation, just fix up the dcache and link count and return
> > immediately after issuing the call to the MDS. This does mean that we
> > need to hold an extra reference to the inode being unlinked, and extra
> > references to the caps to avoid races. Put those references in the
> > r_callback routine.
> >
> > If the operation ends up failing, then set a writeback error on the
> > directory inode that can be fetched later by an fsync on the dir.
> >
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
> >  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
> >  fs/ceph/super.h |  1 +
> >  3 files changed, 58 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index d022e15c8378..8fbb09c761a7 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
> >         return ret;
> >  }
> >
> > +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> > +{
> > +       int err, got;
> > +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> > +
> > +       /* Ensure we have Lx on the inode being unlinked */
> > +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> > +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> > +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> > +               return false;
> > +
> > +       /* Do we have Fx on the dir ? */
> > +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> > +                               true, &got);
> > +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> > +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> > +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > +               return false;
> > +       }
> > +       return true;
> > +}
>
> holding caps for request may cause deadlock.  For example
>
> - client hold Fx caps and send unlink request
> - mds process request from other client, it change filelock's state to
> EXCL_FOO and revoke Fx caps
> - mds receives the unlink request, it can't process it because it
> can't acquire wrlock on filelock
>
> filelock state stays in EXCL_FOO because client does not release Fx caps.
>

The client doing the unlink may have received a revoke for Fx on the
dir at that point, but it won't have returned it yet. Shouldn't it
still be considered to hold Fx on the dir until that happens?

> I checked MDS code, supporting async operation is more complex than I
> thought.  For the unlink case, mds needs to acquire locks other than
> filelock and linklock.
>
> - Request A from client1 has acquired lock X, waiting for wrlock of
> filelock.
> - Client2 sends async unlink  request 'B' to mds
> - MDS processes request 'B'. It needs to lock X and filelock.  But it
> can't get lock X because request A holds it. Request A can get
> filelock because Client2 hold Fx caps.
>

This sounds like a bog-standard ABBA ordering issue.

If any request can't get any of the locks it needs, shouldn't it just
unwind everything at that point and be requeued to try again later? In fact, it
sort of looks like that's what the MDS does, but the Locker code in
the MDS is really hard to understand.
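
As a rough analogy only (this is plain pthreads, not the MDS Locker code),
the unwind-and-retry idea looks like this: if the second lock can't be
taken, drop the first one and start over rather than blocking while still
holding it.

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t lock_x   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t filelock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Take both locks without risking ABBA: if the second lock is busy,
 * release the first and retry instead of waiting while holding it.
 */
static void lock_both(void)
{
	for (;;) {
		pthread_mutex_lock(&lock_x);
		if (pthread_mutex_trylock(&filelock) == 0)
			return;			/* got both */
		pthread_mutex_unlock(&lock_x);
		sched_yield();			/* crude livelock avoidance */
	}
}

static void unlock_both(void)
{
	pthread_mutex_unlock(&filelock);
	pthread_mutex_unlock(&lock_x);
}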

> I think a way to handle this is delegating locks hold by unsafe
> request to client. For example:
>
> - mds processes unlink request from client A,  sends unsafe reply and
> delegates acquired locks to client
> - client got the delegation, it can do async unlink on the same
> directory as  long as it hold the delegation
>

So this would be a proposed new object that we'd be handing to
clients? I'd have to understand what this means in practice.

>
>
> > +
> >  /*
> >   * Check the offset we are writing up to against our current
> >   * max_size.  If necessary, tell the MDS we want to write to
> > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > index 3eb9bc226b77..386c9439a020 100644
> > --- a/fs/ceph/dir.c
> > +++ b/fs/ceph/dir.c
> > @@ -1029,6 +1029,18 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> >         return err;
> >  }
> >
> > +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > +                                struct ceph_mds_request *req)
> > +{
> > +       struct ceph_inode_info *ci = ceph_inode(req->r_old_inode);
> > +
> > +       /* If op failed, set error on parent directory */
> > +       mapping_set_error(req->r_parent->i_mapping, req->r_err);
> > +       ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > +       ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_FILE_EXCL);
> > +       iput(req->r_old_inode);
> > +}
> > +
> >  /*
> >   * rmdir and unlink are differ only by the metadata op code
> >   */
> > @@ -1065,9 +1077,29 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> >         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
> >         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
> >         req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
> > -       err = ceph_mdsc_do_request(mdsc, dir, req);
> > -       if (!err && !req->r_reply_info.head->is_dentry)
> > -               d_delete(dentry);
> > +
> > +       if (op == CEPH_MDS_OP_UNLINK &&
> > +           ceph_get_caps_for_unlink(dir, dentry)) {
> > +               /* Keep LINK caps to ensure continuity over async call */
> > +               req->r_inode_drop &= ~(CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL);
> > +               req->r_callback = ceph_async_unlink_cb;
> > +               req->r_old_inode = d_inode(dentry);
> > +               ihold(req->r_old_inode);
> > +               err = ceph_mdsc_submit_request(mdsc, dir, req);
> > +               if (!err) {
> > +                       /*
> > +                        * We have enough caps, so we assume that the unlink
> > +                        * will succeed. Fix up the target inode and dcache.
> > +                        */
> > +                       drop_nlink(inode);
> > +                       d_delete(dentry);
> > +               }
> > +       } else {
> > +               err = ceph_mdsc_do_request(mdsc, dir, req);
> > +               if (!err && !req->r_reply_info.head->is_dentry)
> > +                       d_delete(dentry);
> > +       }
> > +
> >         ceph_mdsc_put_request(req);
> >  out:
> >         return err;
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 3c5608f2108a..5c361dc1f47f 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -1033,6 +1033,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
> >                          loff_t endoff, int *got, struct page **pinned_page);
> >  extern int ceph_try_get_caps(struct ceph_inode_info *ci,
> >                              int need, int want, bool nonblock, int *got);
> > +extern bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry);
> >
> >  /* for counting open files by mode */
> >  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > --
> > 2.20.1
> >
Yan, Zheng April 10, 2019, 2:11 p.m. UTC | #3
On Wed, Apr 10, 2019 at 9:22 PM Jeff Layton <jlayton@poochiereds.net> wrote:
>
> On Wed, Apr 10, 2019 at 9:06 AM Yan, Zheng <ukernel@gmail.com> wrote:
> >
> > On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > When performing an unlink, if we have Fx on the parent directory and
> > > Lx on the inode of the dentry being unlinked then we don't need to
> > > wait on the reply before returning.
> > >
> > > In that situation, just fix up the dcache and link count and return
> > > immediately after issuing the call to the MDS. This does mean that we
> > > need to hold an extra reference to the inode being unlinked, and extra
> > > references to the caps to avoid races. Put those references in the
> > > r_callback routine.
> > >
> > > If the operation ends up failing, then set a writeback error on the
> > > directory inode that can be fetched later by an fsync on the dir.
> > >
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
> > >  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
> > >  fs/ceph/super.h |  1 +
> > >  3 files changed, 58 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index d022e15c8378..8fbb09c761a7 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
> > >         return ret;
> > >  }
> > >
> > > +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> > > +{
> > > +       int err, got;
> > > +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> > > +
> > > +       /* Ensure we have Lx on the inode being unlinked */
> > > +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> > > +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> > > +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> > > +               return false;
> > > +
> > > +       /* Do we have Fx on the dir ? */
> > > +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> > > +                               true, &got);
> > > +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> > > +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> > > +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > +               return false;
> > > +       }
> > > +       return true;
> > > +}
> >
> > holding caps for request may cause deadlock.  For example
> >
> > - client hold Fx caps and send unlink request
> > - mds process request from other client, it change filelock's state to
> > EXCL_FOO and revoke Fx caps
> > - mds receives the unlink request, it can't process it because it
> > can't acquire wrlock on filelock
> >
> > filelock state stays in EXCL_FOO because client does not release Fx caps.
> >
>
> The client doing the unlink may have received a revoke for Fx on the
> dir at that point, but it won't have returned it yet. Shouldn't it
> still be considered to hold Fx on the dir until that happens?
>

The client should release the Fx. But there is a problem: the mds may
process other requests first after it gets the release of Fx.


> > I checked MDS code, supporting async operation is more complex than I
> > thought.  For the unlink case, mds needs to acquire locks other than
> > filelock and linklock.
> >
> > - Request A from client1 has acquired lock X, waiting for wrlock of
> > filelock.
> > - Client2 sends async unlink  request 'B' to mds
> > - MDS processes request 'B'. It needs to lock X and filelock.  But it
> > can't get lock X because request A holds it. Request A can get
> > filelock because Client2 hold Fx caps.
> >
>
> This sounds like a bog-standard ABBA ordering issue.
>
> If any request can't get any of the locks it needs, shouldn't it just
> unwind everything at that point and be to try again later? In fact, it
> sort of looks like that's what the MDS does, but the Locker code in
> the MDS is really hard to understand.
>

In some special cases, the mds does that. For normal cases, the mds just
acquires locks in a fixed order. See Locker::acquire_locks and
SimpleLock::ptr_lt.

> > I think a way to handle this is delegating locks hold by unsafe
> > request to client. For example:
> >
> > - mds processes unlink request from client A,  sends unsafe reply and
> > delegates acquired locks to client
> > - client got the delegation, it can do async unlink on the same
> > directory as  long as it hold the delegation
> >
>
> So this would be a proposed new object that we'd be handing to
> clients? I'd have to understand what this means in practice.
>

Requests that do an unlink in the same directory should acquire
similar locks (the only difference is the xlock on the dentry). When sending
the unsafe reply for an unlink request, the mds keeps the acquired locks for
later unlinks in the same directory. The unsafe reply includes a
delegation for unlink operations in that directory. The client can do
unlinks locally while it holds the delegation (the mds keeps the locks for
the client as long as the client holds the delegation). This is my rough idea;
I'm still thinking about how to handle the xlock on the dentry.
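
To make that a bit more concrete, here is a purely hypothetical sketch of
what such a delegation could look like on the kernel client side. None of
these structures or helpers exist today; the names, fields, and the expiry
scheme are invented for illustration, and the open question of the dentry
xlock is not addressed.

#include <linux/fs.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <linux/types.h>

/* hypothetical record for an unlink delegation granted in an unsafe reply */
struct ceph_unlink_delegation {
	struct inode	*dir;		/* directory the delegation covers */
	u64		deleg_id;	/* id assigned by the MDS */
	unsigned long	expires;	/* client-side lifetime, if any */
	refcount_t	ref;		/* in-flight async unlinks using it */
};

/*
 * Hypothetical check: an unlink may be done asynchronously only while the
 * client still holds a valid delegation for the parent directory.
 */
static bool ceph_may_async_unlink(struct ceph_unlink_delegation *deleg,
				  struct inode *dir)
{
	if (!deleg || deleg->dir != dir)
		return false;
	if (time_after(jiffies, deleg->expires))
		return false;
	refcount_inc(&deleg->ref);
	return true;
}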

> >
> >
> > > +
> > >  /*
> > >   * Check the offset we are writing up to against our current
> > >   * max_size.  If necessary, tell the MDS we want to write to
> > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > index 3eb9bc226b77..386c9439a020 100644
> > > --- a/fs/ceph/dir.c
> > > +++ b/fs/ceph/dir.c
> > > @@ -1029,6 +1029,18 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> > >         return err;
> > >  }
> > >
> > > +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > > +                                struct ceph_mds_request *req)
> > > +{
> > > +       struct ceph_inode_info *ci = ceph_inode(req->r_old_inode);
> > > +
> > > +       /* If op failed, set error on parent directory */
> > > +       mapping_set_error(req->r_parent->i_mapping, req->r_err);
> > > +       ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > +       ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_FILE_EXCL);
> > > +       iput(req->r_old_inode);
> > > +}
> > > +
> > >  /*
> > >   * rmdir and unlink are differ only by the metadata op code
> > >   */
> > > @@ -1065,9 +1077,29 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > >         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
> > >         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
> > >         req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
> > > -       err = ceph_mdsc_do_request(mdsc, dir, req);
> > > -       if (!err && !req->r_reply_info.head->is_dentry)
> > > -               d_delete(dentry);
> > > +
> > > +       if (op == CEPH_MDS_OP_UNLINK &&
> > > +           ceph_get_caps_for_unlink(dir, dentry)) {
> > > +               /* Keep LINK caps to ensure continuity over async call */
> > > +               req->r_inode_drop &= ~(CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL);
> > > +               req->r_callback = ceph_async_unlink_cb;
> > > +               req->r_old_inode = d_inode(dentry);
> > > +               ihold(req->r_old_inode);
> > > +               err = ceph_mdsc_submit_request(mdsc, dir, req);
> > > +               if (!err) {
> > > +                       /*
> > > +                        * We have enough caps, so we assume that the unlink
> > > +                        * will succeed. Fix up the target inode and dcache.
> > > +                        */
> > > +                       drop_nlink(inode);
> > > +                       d_delete(dentry);
> > > +               }
> > > +       } else {
> > > +               err = ceph_mdsc_do_request(mdsc, dir, req);
> > > +               if (!err && !req->r_reply_info.head->is_dentry)
> > > +                       d_delete(dentry);
> > > +       }
> > > +
> > >         ceph_mdsc_put_request(req);
> > >  out:
> > >         return err;
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 3c5608f2108a..5c361dc1f47f 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -1033,6 +1033,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
> > >                          loff_t endoff, int *got, struct page **pinned_page);
> > >  extern int ceph_try_get_caps(struct ceph_inode_info *ci,
> > >                              int need, int want, bool nonblock, int *got);
> > > +extern bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry);
> > >
> > >  /* for counting open files by mode */
> > >  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > > --
> > > 2.20.1
> > >
>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>
Jeff Layton April 10, 2019, 2:21 p.m. UTC | #4
On Wed, Apr 10, 2019 at 10:11 AM Yan, Zheng <ukernel@gmail.com> wrote:
>
> On Wed, Apr 10, 2019 at 9:22 PM Jeff Layton <jlayton@poochiereds.net> wrote:
> >
> > On Wed, Apr 10, 2019 at 9:06 AM Yan, Zheng <ukernel@gmail.com> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > When performing an unlink, if we have Fx on the parent directory and
> > > > Lx on the inode of the dentry being unlinked then we don't need to
> > > > wait on the reply before returning.
> > > >
> > > > In that situation, just fix up the dcache and link count and return
> > > > immediately after issuing the call to the MDS. This does mean that we
> > > > need to hold an extra reference to the inode being unlinked, and extra
> > > > references to the caps to avoid races. Put those references in the
> > > > r_callback routine.
> > > >
> > > > If the operation ends up failing, then set a writeback error on the
> > > > directory inode that can be fetched later by an fsync on the dir.
> > > >
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
> > > >  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
> > > >  fs/ceph/super.h |  1 +
> > > >  3 files changed, 58 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > index d022e15c8378..8fbb09c761a7 100644
> > > > --- a/fs/ceph/caps.c
> > > > +++ b/fs/ceph/caps.c
> > > > @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
> > > >         return ret;
> > > >  }
> > > >
> > > > +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> > > > +{
> > > > +       int err, got;
> > > > +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> > > > +
> > > > +       /* Ensure we have Lx on the inode being unlinked */
> > > > +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> > > > +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> > > > +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> > > > +               return false;
> > > > +
> > > > +       /* Do we have Fx on the dir ? */
> > > > +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> > > > +                               true, &got);
> > > > +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> > > > +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> > > > +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > > +               return false;
> > > > +       }
> > > > +       return true;
> > > > +}
> > >
> > > holding caps for request may cause deadlock.  For example
> > >
> > > - client hold Fx caps and send unlink request
> > > - mds process request from other client, it change filelock's state to
> > > EXCL_FOO and revoke Fx caps
> > > - mds receives the unlink request, it can't process it because it
> > > can't acquire wrlock on filelock
> > >
> > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > >
> >
> > The client doing the unlink may have received a revoke for Fx on the
> > dir at that point, but it won't have returned it yet. Shouldn't it
> > still be considered to hold Fx on the dir until that happens?
> >
>
> Client should release the Fx. But there is a problem, mds process
> other request first after it get the release of Fx
>

As I envisioned it, the client would hold a reference to Fx while the
unlink is in flight, so it would not return Fx until after the unlink
has gotten an unsafe reply.

It seems like when the MDS tries to get the locks for the request from
the second client and fails, it should then drop any locks it did get
and requeue the request to be run when the caps are returned. If we're
blocking for these locks, then we're tying up a thread while we wait
on a cap release, which seems really inefficient.

That said, I don't have a great understanding of the MDS code so maybe
I'm missing something.

>
> > > I checked MDS code, supporting async operation is more complex than I
> > > thought.  For the unlink case, mds needs to acquire locks other than
> > > filelock and linklock.
> > >
> > > - Request A from client1 has acquired lock X, waiting for wrlock of
> > > filelock.
> > > - Client2 sends async unlink  request 'B' to mds
> > > - MDS processes request 'B'. It needs to lock X and filelock.  But it
> > > can't get lock X because request A holds it. Request A can get
> > > filelock because Client2 hold Fx caps.
> > >
> >
> > This sounds like a bog-standard ABBA ordering issue.
> >
> > If any request can't get any of the locks it needs, shouldn't it just
> > unwind everything at that point and be to try again later? In fact, it
> > sort of looks like that's what the MDS does, but the Locker code in
> > the MDS is really hard to understand.
> >
>
> In some special cases, mds does that.  For normal cases, mds just
> acquires locks in fixed order. See Locker::acquire_locks and
> SimpleLock::ptr_lt
>

I sort of see. Ok.


> > > I think a way to handle this is delegating locks hold by unsafe
> > > request to client. For example:
> > >
> > > - mds processes unlink request from client A,  sends unsafe reply and
> > > delegates acquired locks to client
> > > - client got the delegation, it can do async unlink on the same
> > > directory as  long as it hold the delegation
> > >
> >
> > So this would be a proposed new object that we'd be handing to
> > clients? I'd have to understand what this means in practice.
> >
>
> For requests that do unlink in the same directory, they should acquire
> similar locks (the only difference is xlock on dentry). When sending
> unsafe reply for a unlink request, mds keeps the acquired locks for
> later unlink in the same directory. The unsafe reply includes a
> delegation for unlink operation in the directory.  Client can do
> unlink locally while it holds the delegation. (mds keeps locks for
> client as long as it holds the delegation)  This is my rough idea,
> I'm still thinking how to handle xlock on dentry.
>

I think we might be better off reworking cap handling semantics rather
than layering a new mechanism on top for this. That said, I keep
finding ways that the cap subsystem doesn't work like I'd think it
would, so maybe I'm wrong here.

> > >
> > >
> > > > +
> > > >  /*
> > > >   * Check the offset we are writing up to against our current
> > > >   * max_size.  If necessary, tell the MDS we want to write to
> > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > index 3eb9bc226b77..386c9439a020 100644
> > > > --- a/fs/ceph/dir.c
> > > > +++ b/fs/ceph/dir.c
> > > > @@ -1029,6 +1029,18 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> > > >         return err;
> > > >  }
> > > >
> > > > +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > > > +                                struct ceph_mds_request *req)
> > > > +{
> > > > +       struct ceph_inode_info *ci = ceph_inode(req->r_old_inode);
> > > > +
> > > > +       /* If op failed, set error on parent directory */
> > > > +       mapping_set_error(req->r_parent->i_mapping, req->r_err);
> > > > +       ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > > +       ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_FILE_EXCL);
> > > > +       iput(req->r_old_inode);
> > > > +}
> > > > +
> > > >  /*
> > > >   * rmdir and unlink are differ only by the metadata op code
> > > >   */
> > > > @@ -1065,9 +1077,29 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > > >         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
> > > >         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
> > > >         req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
> > > > -       err = ceph_mdsc_do_request(mdsc, dir, req);
> > > > -       if (!err && !req->r_reply_info.head->is_dentry)
> > > > -               d_delete(dentry);
> > > > +
> > > > +       if (op == CEPH_MDS_OP_UNLINK &&
> > > > +           ceph_get_caps_for_unlink(dir, dentry)) {
> > > > +               /* Keep LINK caps to ensure continuity over async call */
> > > > +               req->r_inode_drop &= ~(CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL);
> > > > +               req->r_callback = ceph_async_unlink_cb;
> > > > +               req->r_old_inode = d_inode(dentry);
> > > > +               ihold(req->r_old_inode);
> > > > +               err = ceph_mdsc_submit_request(mdsc, dir, req);
> > > > +               if (!err) {
> > > > +                       /*
> > > > +                        * We have enough caps, so we assume that the unlink
> > > > +                        * will succeed. Fix up the target inode and dcache.
> > > > +                        */
> > > > +                       drop_nlink(inode);
> > > > +                       d_delete(dentry);
> > > > +               }
> > > > +       } else {
> > > > +               err = ceph_mdsc_do_request(mdsc, dir, req);
> > > > +               if (!err && !req->r_reply_info.head->is_dentry)
> > > > +                       d_delete(dentry);
> > > > +       }
> > > > +
> > > >         ceph_mdsc_put_request(req);
> > > >  out:
> > > >         return err;
> > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > index 3c5608f2108a..5c361dc1f47f 100644
> > > > --- a/fs/ceph/super.h
> > > > +++ b/fs/ceph/super.h
> > > > @@ -1033,6 +1033,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
> > > >                          loff_t endoff, int *got, struct page **pinned_page);
> > > >  extern int ceph_try_get_caps(struct ceph_inode_info *ci,
> > > >                              int need, int want, bool nonblock, int *got);
> > > > +extern bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry);
> > > >
> > > >  /* for counting open files by mode */
> > > >  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > > > --
> > > > 2.20.1
> > > >
> >
> >
> >
> > --
> > Jeff Layton <jlayton@poochiereds.net>
Patrick Donnelly April 10, 2019, 8:21 p.m. UTC | #5
On Wed, Apr 10, 2019 at 7:21 AM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > >
> > > > holding caps for request may cause deadlock.  For example
> > > >
> > > > - client hold Fx caps and send unlink request
> > > > - mds process request from other client, it change filelock's state to
> > > > EXCL_FOO and revoke Fx caps
> > > > - mds receives the unlink request, it can't process it because it
> > > > can't acquire wrlock on filelock
> > > >
> > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > >
> > >
> > > The client doing the unlink may have received a revoke for Fx on the
> > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > still be considered to hold Fx on the dir until that happens?
> > >
> >
> > Client should release the Fx. But there is a problem, mds process
> > other request first after it get the release of Fx
> >
>
> As I envisioned it, the client would hold a reference to Fx while the
> unlink is in flight, so it would not return Fx until after the unlink
> has gotten an unsafe reply.

This was my understanding as well. It seems to me that the correct
thing to do is to move forward with the understanding that the client
has a write lock on the filelock state for the directory inode (for Fx
cap) and a write lock on the linklock for the file inode (for the Lx
cap). Obtaining those locks should require cap revocation which would
cause the client to flush its buffered async unlinks. Importantly --
and what actually needs to change (?): the MDS should skip acquiring
those locks because the client already has the appropriate caps.

Does that work Zheng?
Jeff Layton April 10, 2019, 9:11 p.m. UTC | #6
On Wed, Apr 10, 2019 at 4:21 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
>
> On Wed, Apr 10, 2019 at 7:21 AM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > > >
> > > > > holding caps for request may cause deadlock.  For example
> > > > >
> > > > > - client hold Fx caps and send unlink request
> > > > > - mds process request from other client, it change filelock's state to
> > > > > EXCL_FOO and revoke Fx caps
> > > > > - mds receives the unlink request, it can't process it because it
> > > > > can't acquire wrlock on filelock
> > > > >
> > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > >
> > > >
> > > > The client doing the unlink may have received a revoke for Fx on the
> > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > still be considered to hold Fx on the dir until that happens?
> > > >
> > >
> > > Client should release the Fx. But there is a problem, mds process
> > > other request first after it get the release of Fx
> > >
> >
> > As I envisioned it, the client would hold a reference to Fx while the
> > unlink is in flight, so it would not return Fx until after the unlink
> > has gotten an unsafe reply.
>
> This was my understanding as well. It seems to me that the correct
> thing to do is to move forward with the understanding that the client
> has a write lock on the filelock state for the directory inode (for Fx
> cap) and a write lock on the linklock for the file inode (for the Lx
> cap). Obtaining those locks should require cap revocation which would
> cause the client to flush its buffered async unlinks. Importantly --
> and what actually needs to change (?): the MDS should skip acquiring
> those locks because the client already has the appropriate caps.
>
> Does that work Zheng?
>

I'm not sure it will. IIUC...

I think part of what Zheng is pointing out is that when we assume that
the client already holds certain locks, then we are effectively
changing the order in which they can be acquired. That can leave us
subject to ABBA style deadlocks (though with all of the complexity
that class Locker provides).

That in and of itself wouldn't be a problem if the MDS code didn't
wait synchronously on cap revokes in some cases (which Zheng pointed
out). Fixing that latter bit seems like it might be a big win for
parallelism, in addition to making async calls more possible.
Gregory Farnum April 10, 2019, 9:54 p.m. UTC | #7
On Wed, Apr 10, 2019 at 2:11 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, Apr 10, 2019 at 4:21 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> >
> > On Wed, Apr 10, 2019 at 7:21 AM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > > > >
> > > > > > holding caps for request may cause deadlock.  For example
> > > > > >
> > > > > > - client hold Fx caps and send unlink request
> > > > > > - mds process request from other client, it change filelock's state to
> > > > > > EXCL_FOO and revoke Fx caps
> > > > > > - mds receives the unlink request, it can't process it because it
> > > > > > can't acquire wrlock on filelock
> > > > > >
> > > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > > >
> > > > >
> > > > > The client doing the unlink may have received a revoke for Fx on the
> > > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > > still be considered to hold Fx on the dir until that happens?
> > > > >
> > > >
> > > > Client should release the Fx. But there is a problem, mds process
> > > > other request first after it get the release of Fx
> > > >
> > >
> > > As I envisioned it, the client would hold a reference to Fx while the
> > > unlink is in flight, so it would not return Fx until after the unlink
> > > has gotten an unsafe reply.
> >
> > This was my understanding as well. It seems to me that the correct
> > thing to do is to move forward with the understanding that the client
> > has a write lock on the filelock state for the directory inode (for Fx
> > cap) and a write lock on the linklock for the file inode (for the Lx
> > cap). Obtaining those locks should require cap revocation which would
> > cause the client to flush its buffered async unlinks. Importantly --
> > and what actually needs to change (?): the MDS should skip acquiring
> > those locks because the client already has the appropriate caps.
> >
> > Does that work Zheng?
> >
>
> I'm not sure it will. IIUC...
>
> I think part of what Zheng is pointing out is that when we assume that
> the client already holds certain locks, then we are effectively
> changing the order in which they can be acquired. That can leave us
> subject to ABBA style deadlocks (though with all of the complexity
> that class Locker provides).
>
> That in and of itself wouldn't be a problem if the MDS code didn't
> wait synchronously on cap revokes in some cases (which Zheng pointed
> out). Fixing that latter bit seems like it might be a big win for
> parallelism, in addition to making async calls more possible.

Well it's not literally synchronous; it's just that the MDS holds on
to the locks it's already taken. That's why you can see failure cases
where the MDS is still running and resolving requests but there's a
particular client which is stuck with one operation that never moves
forward.

In the fusty cobwebs of my mind it suggests the way we'd deal with
this is by creating a NEW unstable lock state that the MDS transitions
into that revokes the exclusive cap but lets the MDS take all the same
local locks as it could before, then proceeds on to the stable state
when the client's cap revoke is actually completed. Although I thought
the FileLock state would be the one that could handle this case
already since it's by far the most involved of the options....?
-Greg
Jeff Layton April 10, 2019, 10:45 p.m. UTC | #8
On Wed, Apr 10, 2019 at 5:54 PM Gregory Farnum <gfarnum@redhat.com> wrote:
>
> On Wed, Apr 10, 2019 at 2:11 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Wed, Apr 10, 2019 at 4:21 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 7:21 AM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > > > > >
> > > > > > > holding caps for request may cause deadlock.  For example
> > > > > > >
> > > > > > > - client hold Fx caps and send unlink request
> > > > > > > - mds process request from other client, it change filelock's state to
> > > > > > > EXCL_FOO and revoke Fx caps
> > > > > > > - mds receives the unlink request, it can't process it because it
> > > > > > > can't acquire wrlock on filelock
> > > > > > >
> > > > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > > > >
> > > > > >
> > > > > > The client doing the unlink may have received a revoke for Fx on the
> > > > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > > > still be considered to hold Fx on the dir until that happens?
> > > > > >
> > > > >
> > > > > Client should release the Fx. But there is a problem, mds process
> > > > > other request first after it get the release of Fx
> > > > >
> > > >
> > > > As I envisioned it, the client would hold a reference to Fx while the
> > > > unlink is in flight, so it would not return Fx until after the unlink
> > > > has gotten an unsafe reply.
> > >
> > > This was my understanding as well. It seems to me that the correct
> > > thing to do is to move forward with the understanding that the client
> > > has a write lock on the filelock state for the directory inode (for Fx
> > > cap) and a write lock on the linklock for the file inode (for the Lx
> > > cap). Obtaining those locks should require cap revocation which would
> > > cause the client to flush its buffered async unlinks. Importantly --
> > > and what actually needs to change (?): the MDS should skip acquiring
> > > those locks because the client already has the appropriate caps.
> > >
> > > Does that work Zheng?
> > >
> >
> > I'm not sure it will. IIUC...
> >
> > I think part of what Zheng is pointing out is that when we assume that
> > the client already holds certain locks, then we are effectively
> > changing the order in which they can be acquired. That can leave us
> > subject to ABBA style deadlocks (though with all of the complexity
> > that class Locker provides).
> >
> > That in and of itself wouldn't be a problem if the MDS code didn't
> > wait synchronously on cap revokes in some cases (which Zheng pointed
> > out). Fixing that latter bit seems like it might be a big win for
> > parallelism, in addition to making async calls more possible.
>
> Well it's not literally synchronous; it's just that the MDS holds on
> to the locks it's already taken. That's why you can see failure cases
> where the MDS is still running and resolving requests but there's a
> particular client which is stuck with one operation that never moves
> forward.
>

Got it, thanks.

Could we resolve this by just unwinding the held locks in that case
before requeueing the request? Then just reacquire them when we
reattempt it. We might need a livelock avoidance mechanism but that
doesn't sound too conceptually difficult.

> In the fusty cobwebs of my mind it suggests the way we'd deal with
> this is by creating a NEW unstable lock state that the MDS transitions
> into that revokes the exclusive cap but lets the MDS take all the same
> local locks as it could before, then proceeds on to the stable state
> when the client's cap revoke is actually completed. Although I thought
> the FileLock state would be the one that could handle this case
> already since it's by far the most involved of the options....?

-- Jeff
Gregory Farnum April 11, 2019, 12:16 a.m. UTC | #9
On Wed, Apr 10, 2019 at 3:45 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, Apr 10, 2019 at 5:54 PM Gregory Farnum <gfarnum@redhat.com> wrote:
> >
> > On Wed, Apr 10, 2019 at 2:11 PM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 4:21 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> > > >
> > > > On Wed, Apr 10, 2019 at 7:21 AM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > > > > > >
> > > > > > > > holding caps for request may cause deadlock.  For example
> > > > > > > >
> > > > > > > > - client hold Fx caps and send unlink request
> > > > > > > > - mds process request from other client, it change filelock's state to
> > > > > > > > EXCL_FOO and revoke Fx caps
> > > > > > > > - mds receives the unlink request, it can't process it because it
> > > > > > > > can't acquire wrlock on filelock
> > > > > > > >
> > > > > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > > > > >
> > > > > > >
> > > > > > > The client doing the unlink may have received a revoke for Fx on the
> > > > > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > > > > still be considered to hold Fx on the dir until that happens?
> > > > > > >
> > > > > >
> > > > > > Client should release the Fx. But there is a problem, mds process
> > > > > > other request first after it get the release of Fx
> > > > > >
> > > > >
> > > > > As I envisioned it, the client would hold a reference to Fx while the
> > > > > unlink is in flight, so it would not return Fx until after the unlink
> > > > > has gotten an unsafe reply.
> > > >
> > > > This was my understanding as well. It seems to me that the correct
> > > > thing to do is to move forward with the understanding that the client
> > > > has a write lock on the filelock state for the directory inode (for Fx
> > > > cap) and a write lock on the linklock for the file inode (for the Lx
> > > > cap). Obtaining those locks should require cap revocation which would
> > > > cause the client to flush its buffered async unlinks. Importantly --
> > > > and what actually needs to change (?): the MDS should skip acquiring
> > > > those locks because the client already has the appropriate caps.
> > > >
> > > > Does that work Zheng?
> > > >
> > >
> > > I'm not sure it will. IIUC...
> > >
> > > I think part of what Zheng is pointing out is that when we assume that
> > > the client already holds certain locks, then we are effectively
> > > changing the order in which they can be acquired. That can leave us
> > > subject to ABBA style deadlocks (though with all of the complexity
> > > that class Locker provides).
> > >
> > > That in and of itself wouldn't be a problem if the MDS code didn't
> > > wait synchronously on cap revokes in some cases (which Zheng pointed
> > > out). Fixing that latter bit seems like it might be a big win for
> > > parallelism, in addition to making async calls more possible.
> >
> > Well it's not literally synchronous; it's just that the MDS holds on
> > to the locks it's already taken. That's why you can see failure cases
> > where the MDS is still running and resolving requests but there's a
> > particular client which is stuck with one operation that never moves
> > forward.
> >
>
> Got it, thanks.
>
> Could we resolve this by just unwinding the held locks in that case
> before requeueing the request? Then just reacquire them when we
> reattempt it. We might need a livelock avoidance mechanism but that
> doesn't sound too conceptually difficult.

I'm hesitant to speak too authoritatively this far into the thread, having
missed that it would apparently be trivially subject to ABBA, but the things
that concern me about it are:
1) the freezing process for snapshots and exports relies on waiting
until all acquired locks have been dropped. We could try to reference
count "requests-that-will-ask-for-these-again" but it gets difficult
2) Livelock avoidance mechanisms are always difficult?
3) The locking and caps infrastructures are supposed to let us handle
this. I grant you they're annoyingly difficult and undocumented; that
needs to be fixed (and much of it CAN be!). I don't think designing
new systems to avoid interacting with them is the right approach to
take.

I think I mentioned before that we really need to draw out the caps
needed and how they interact. Why does it need more than the FxLx as
we've discussed, Zheng? Is it unlikely the client can hold the
necessary pieces upfront?
-Greg
Yan, Zheng April 11, 2019, 2:32 a.m. UTC | #10
On Thu, Apr 11, 2019 at 4:21 AM Patrick Donnelly <pdonnell@redhat.com> wrote:
>
> On Wed, Apr 10, 2019 at 7:21 AM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > > >
> > > > > holding caps for request may cause deadlock.  For example
> > > > >
> > > > > - client hold Fx caps and send unlink request
> > > > > - mds process request from other client, it change filelock's state to
> > > > > EXCL_FOO and revoke Fx caps
> > > > > - mds receives the unlink request, it can't process it because it
> > > > > can't acquire wrlock on filelock
> > > > >
> > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > >
> > > >
> > > > The client doing the unlink may have received a revoke for Fx on the
> > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > still be considered to hold Fx on the dir until that happens?
> > > >
> > >
> > > Client should release the Fx. But there is a problem, mds process
> > > other request first after it get the release of Fx
> > >
> >
> > As I envisioned it, the client would hold a reference to Fx while the
> > unlink is in flight, so it would not return Fx until after the unlink
> > has gotten an unsafe reply.
>
> This was my understanding as well. It seems to me that the correct
> thing to do is to move forward with the understanding that the client
> has a write lock on the filelock state for the directory inode (for Fx
> cap) and a write lock on the linklock for the file inode (for the Lx
> cap). Obtaining those locks should require cap revocation which would
> cause the client to flush its buffered async unlinks. Importantly --
> and what actually needs to change (?): the MDS should skip acquiring
> those locks because the client already has the appropriate caps.
>
> Does that work Zheng?
>

No. When handling an unlink request, the mds needs to acquire several other
locks. Treating Fx/Lx caps as 'already locked' breaks the order of
acquiring locks, which will cause deadlock.

> --
> Patrick Donnelly
Yan, Zheng April 11, 2019, 2:50 a.m. UTC | #11
On Thu, Apr 11, 2019 at 8:16 AM Gregory Farnum <gfarnum@redhat.com> wrote:
>
> On Wed, Apr 10, 2019 at 3:45 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Wed, Apr 10, 2019 at 5:54 PM Gregory Farnum <gfarnum@redhat.com> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 2:11 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > On Wed, Apr 10, 2019 at 4:21 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> > > > >
> > > > > On Wed, Apr 10, 2019 at 7:21 AM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > > > > > > >
> > > > > > > > > holding caps for request may cause deadlock.  For example
> > > > > > > > >
> > > > > > > > > - client hold Fx caps and send unlink request
> > > > > > > > > - mds process request from other client, it change filelock's state to
> > > > > > > > > EXCL_FOO and revoke Fx caps
> > > > > > > > > - mds receives the unlink request, it can't process it because it
> > > > > > > > > can't acquire wrlock on filelock
> > > > > > > > >
> > > > > > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > > > > > >
> > > > > > > >
> > > > > > > > The client doing the unlink may have received a revoke for Fx on the
> > > > > > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > > > > > still be considered to hold Fx on the dir until that happens?
> > > > > > > >
> > > > > > >
> > > > > > > Client should release the Fx. But there is a problem, mds process
> > > > > > > other request first after it get the release of Fx
> > > > > > >
> > > > > >
> > > > > > As I envisioned it, the client would hold a reference to Fx while the
> > > > > > unlink is in flight, so it would not return Fx until after the unlink
> > > > > > has gotten an unsafe reply.
> > > > >
> > > > > This was my understanding as well. It seems to me that the correct
> > > > > thing to do is to move forward with the understanding that the client
> > > > > has a write lock on the filelock state for the directory inode (for Fx
> > > > > cap) and a write lock on the linklock for the file inode (for the Lx
> > > > > cap). Obtaining those locks should require cap revocation which would
> > > > > cause the client to flush its buffered async unlinks. Importantly --
> > > > > and what actually needs to change (?): the MDS should skip acquiring
> > > > > those locks because the client already has the appropriate caps.
> > > > >
> > > > > Does that work Zheng?
> > > > >
> > > >
> > > > I'm not sure it will. IIUC...
> > > >
> > > > I think part of what Zheng is pointing out is that when we assume that
> > > > the client already holds certain locks, then we are effectively
> > > > changing the order in which they can be acquired. That can leave us
> > > > subject to ABBA style deadlocks (though with all of the complexity
> > > > that class Locker provides).
> > > >
> > > > That in and of itself wouldn't be a problem if the MDS code didn't
> > > > wait synchronously on cap revokes in some cases (which Zheng pointed
> > > > out). Fixing that latter bit seems like it might be a big win for
> > > > parallelism, in addition to making async calls more possible.
> > >
> > > Well it's not literally synchronous; it's just that the MDS holds on
> > > to the locks it's already taken. That's why you can see failure cases
> > > where the MDS is still running and resolving requests but there's a
> > > particular client which is stuck with one operation that never moves
> > > forward.
> > >
> >
> > Got it, thanks.
> >
> > Could we resolve this by just unwinding the held locks in that case
> > before requeueing the request? Then just reacquire them when we
> > reattempt it. We might need a livelock avoidance mechanism but that
> > doesn't sound too conceptually difficult.
>
> I'm hesitant to speak too authoritatively after this long and missing
> that it would apparently be trivially subject to ABBA, but the things
> that concern me about it are:
> 1) the freezing process for snapshots and exports rely on waiting
> until all acquired locks have been dropped. We could try and reference
> count "requests-that-will-ask-for-these-again" but it gets difficult
> 2) Livelock avoidance mechanisms are always difficult?
> 3) The locking and caps infrastructures are supposed to let us handle
> this. I grant you they're annoyingly difficult and undocumented; that
> needs to be fixed (and much of it CAN be!). I don't think designing
> new systems to avoid interacting with them is the right approach to
> take.
>
> I think I mentioned before that we really need to draw out the caps
> needed and how they interact. Why does it need more than the FxLx as
> we've discussed, Zheng? Is it unlikely the client can hold the
> necessary pieces upfront?

For the unlink case, the mds also needs to wrlock the nestlock and rdlock
the snaplock. These locks are not covered by the cap mechanism.

> -Greg
Yan, Zheng April 11, 2019, 3:10 a.m. UTC | #12
On Wed, Apr 10, 2019 at 10:21 PM Jeff Layton <jlayton@poochiereds.net> wrote:
>
> On Wed, Apr 10, 2019 at 10:11 AM Yan, Zheng <ukernel@gmail.com> wrote:
> >
> > On Wed, Apr 10, 2019 at 9:22 PM Jeff Layton <jlayton@poochiereds.net> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 9:06 AM Yan, Zheng <ukernel@gmail.com> wrote:
> > > >
> > > > On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > >
> > > > > When performing an unlink, if we have Fx on the parent directory and
> > > > > Lx on the inode of the dentry being unlinked then we don't need to
> > > > > wait on the reply before returning.
> > > > >
> > > > > In that situation, just fix up the dcache and link count and return
> > > > > immediately after issuing the call to the MDS. This does mean that we
> > > > > need to hold an extra reference to the inode being unlinked, and extra
> > > > > references to the caps to avoid races. Put those references in the
> > > > > r_callback routine.
> > > > >
> > > > > If the operation ends up failing, then set a writeback error on the
> > > > > directory inode that can be fetched later by an fsync on the dir.
> > > > >
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > > >  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
> > > > >  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
> > > > >  fs/ceph/super.h |  1 +
> > > > >  3 files changed, 58 insertions(+), 3 deletions(-)
> > > > >
> > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > index d022e15c8378..8fbb09c761a7 100644
> > > > > --- a/fs/ceph/caps.c
> > > > > +++ b/fs/ceph/caps.c
> > > > > @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
> > > > >         return ret;
> > > > >  }
> > > > >
> > > > > +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> > > > > +{
> > > > > +       int err, got;
> > > > > +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> > > > > +
> > > > > +       /* Ensure we have Lx on the inode being unlinked */
> > > > > +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> > > > > +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> > > > > +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> > > > > +               return false;
> > > > > +
> > > > > +       /* Do we have Fx on the dir ? */
> > > > > +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> > > > > +                               true, &got);
> > > > > +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> > > > > +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> > > > > +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > > > +               return false;
> > > > > +       }
> > > > > +       return true;
> > > > > +}
> > > >
> > > > holding caps for request may cause deadlock.  For example
> > > >
> > > > - client hold Fx caps and send unlink request
> > > > - mds process request from other client, it change filelock's state to
> > > > EXCL_FOO and revoke Fx caps
> > > > - mds receives the unlink request, it can't process it because it
> > > > can't acquire wrlock on filelock
> > > >
> > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > >
> > >
> > > The client doing the unlink may have received a revoke for Fx on the
> > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > still be considered to hold Fx on the dir until that happens?
> > >
> >
> > Client should release the Fx. But there is a problem, mds process
> > other request first after it get the release of Fx
> >
>
> As I envisioned it, the client would hold a reference to Fx while the
> unlink is in flight, so it would not return Fx until after the unlink
> has gotten an unsafe reply.
>
> It seems like when the MDS tries to get the locks for the request from
> the second client and fails, it should then drop any locks it did get
> and requeue the request to be run when the caps are returned. If we're
> blocking for these locks, then we're tying up a thread while we wait
> on a cap release, which seems really inefficient.
>
> That said, I don't have a great understanding of the MDS code so maybe
> I'm missing something.
>
> >
> > > > I checked MDS code, supporting async operation is more complex than I
> > > > thought.  For the unlink case, mds needs to acquire locks other than
> > > > filelock and linklock.
> > > >
> > > > - Request A from client1 has acquired lock X, waiting for wrlock of
> > > > filelock.
> > > > - Client2 sends async unlink  request 'B' to mds
> > > > - MDS processes request 'B'. It needs to lock X and filelock.  But it
> > > > can't get lock X because request A holds it. Request A can get
> > > > filelock because Client2 hold Fx caps.
> > > >
> > >
> > > This sounds like a bog-standard ABBA ordering issue.
> > >
> > > If any request can't get any of the locks it needs, shouldn't it just
> > > unwind everything at that point and be to try again later? In fact, it
> > > sort of looks like that's what the MDS does, but the Locker code in
> > > the MDS is really hard to understand.
> > >
> >
> > In some special cases, mds does that.  For normal cases, mds just
> > acquires locks in fixed order. See Locker::acquire_locks and
> > SimpleLock::ptr_lt
> >
>
> I sort of see. Ok.
>
>
> > > > I think a way to handle this is delegating locks hold by unsafe
> > > > request to client. For example:
> > > >
> > > > - mds processes unlink request from client A,  sends unsafe reply and
> > > > delegates acquired locks to client
> > > > - client got the delegation, it can do async unlink on the same
> > > > directory as  long as it hold the delegation
> > > >
> > >
> > > So this would be a proposed new object that we'd be handing to
> > > clients? I'd have to understand what this means in practice.
> > >
> >
> > For requests that do unlink in the same directory, they should acquire
> > similar locks (the only difference is xlock on dentry). When sending
> > unsafe reply for a unlink request, mds keeps the acquired locks for
> > later unlink in the same directory. The unsafe reply includes a
> > delegation for unlink operation in the directory.  Client can do
> > unlink locally while it holds the delegation. (mds keeps locks for
> > client as long as it holds the delegation)  This is my rough idea,
> > I'm still thinking how to handle xlock on dentry.
> >
>
> I think we might be better off reworking cap handling semantics rather
> than layering a new mechanism on top for this. That said, I keep
> finding ways that the cap subsystem doesn't work like I'd think it
> would, so maybe I'm wrong here.
>

Please be more specific. What semantics do you want to have?

Regards
Yan, Zheng


> > > >
> > > >
> > > > > +
> > > > >  /*
> > > > >   * Check the offset we are writing up to against our current
> > > > >   * max_size.  If necessary, tell the MDS we want to write to
> > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > index 3eb9bc226b77..386c9439a020 100644
> > > > > --- a/fs/ceph/dir.c
> > > > > +++ b/fs/ceph/dir.c
> > > > > @@ -1029,6 +1029,18 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> > > > >         return err;
> > > > >  }
> > > > >
> > > > > +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > > > > +                                struct ceph_mds_request *req)
> > > > > +{
> > > > > +       struct ceph_inode_info *ci = ceph_inode(req->r_old_inode);
> > > > > +
> > > > > +       /* If op failed, set error on parent directory */
> > > > > +       mapping_set_error(req->r_parent->i_mapping, req->r_err);
> > > > > +       ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > > > +       ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_FILE_EXCL);
> > > > > +       iput(req->r_old_inode);
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * rmdir and unlink are differ only by the metadata op code
> > > > >   */
> > > > > @@ -1065,9 +1077,29 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > > > >         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
> > > > >         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
> > > > >         req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
> > > > > -       err = ceph_mdsc_do_request(mdsc, dir, req);
> > > > > -       if (!err && !req->r_reply_info.head->is_dentry)
> > > > > -               d_delete(dentry);
> > > > > +
> > > > > +       if (op == CEPH_MDS_OP_UNLINK &&
> > > > > +           ceph_get_caps_for_unlink(dir, dentry)) {
> > > > > +               /* Keep LINK caps to ensure continuity over async call */
> > > > > +               req->r_inode_drop &= ~(CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL);
> > > > > +               req->r_callback = ceph_async_unlink_cb;
> > > > > +               req->r_old_inode = d_inode(dentry);
> > > > > +               ihold(req->r_old_inode);
> > > > > +               err = ceph_mdsc_submit_request(mdsc, dir, req);
> > > > > +               if (!err) {
> > > > > +                       /*
> > > > > +                        * We have enough caps, so we assume that the unlink
> > > > > +                        * will succeed. Fix up the target inode and dcache.
> > > > > +                        */
> > > > > +                       drop_nlink(inode);
> > > > > +                       d_delete(dentry);
> > > > > +               }
> > > > > +       } else {
> > > > > +               err = ceph_mdsc_do_request(mdsc, dir, req);
> > > > > +               if (!err && !req->r_reply_info.head->is_dentry)
> > > > > +                       d_delete(dentry);
> > > > > +       }
> > > > > +
> > > > >         ceph_mdsc_put_request(req);
> > > > >  out:
> > > > >         return err;
> > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > index 3c5608f2108a..5c361dc1f47f 100644
> > > > > --- a/fs/ceph/super.h
> > > > > +++ b/fs/ceph/super.h
> > > > > @@ -1033,6 +1033,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
> > > > >                          loff_t endoff, int *got, struct page **pinned_page);
> > > > >  extern int ceph_try_get_caps(struct ceph_inode_info *ci,
> > > > >                              int need, int want, bool nonblock, int *got);
> > > > > +extern bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry);
> > > > >
> > > > >  /* for counting open files by mode */
> > > > >  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > > > > --
> > > > > 2.20.1
> > > > >
> > >
> > >
> > >
> > > --
> > > Jeff Layton <jlayton@poochiereds.net>
>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>
Jeff Layton April 11, 2019, 11:31 a.m. UTC | #13
On Wed, Apr 10, 2019 at 11:10 PM Yan, Zheng <ukernel@gmail.com> wrote:
>
> On Wed, Apr 10, 2019 at 10:21 PM Jeff Layton <jlayton@poochiereds.net> wrote:
> >
> > On Wed, Apr 10, 2019 at 10:11 AM Yan, Zheng <ukernel@gmail.com> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 9:22 PM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > >
> > > > On Wed, Apr 10, 2019 at 9:06 AM Yan, Zheng <ukernel@gmail.com> wrote:
> > > > >
> > > > > On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > >
> > > > > > When performing an unlink, if we have Fx on the parent directory and
> > > > > > Lx on the inode of the dentry being unlinked then we don't need to
> > > > > > wait on the reply before returning.
> > > > > >
> > > > > > In that situation, just fix up the dcache and link count and return
> > > > > > immediately after issuing the call to the MDS. This does mean that we
> > > > > > need to hold an extra reference to the inode being unlinked, and extra
> > > > > > references to the caps to avoid races. Put those references in the
> > > > > > r_callback routine.
> > > > > >
> > > > > > If the operation ends up failing, then set a writeback error on the
> > > > > > directory inode that can be fetched later by an fsync on the dir.
> > > > > >
> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > ---
> > > > > >  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
> > > > > >  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
> > > > > >  fs/ceph/super.h |  1 +
> > > > > >  3 files changed, 58 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > > index d022e15c8378..8fbb09c761a7 100644
> > > > > > --- a/fs/ceph/caps.c
> > > > > > +++ b/fs/ceph/caps.c
> > > > > > @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
> > > > > >         return ret;
> > > > > >  }
> > > > > >
> > > > > > +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> > > > > > +{
> > > > > > +       int err, got;
> > > > > > +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> > > > > > +
> > > > > > +       /* Ensure we have Lx on the inode being unlinked */
> > > > > > +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> > > > > > +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> > > > > > +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> > > > > > +               return false;
> > > > > > +
> > > > > > +       /* Do we have Fx on the dir ? */
> > > > > > +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> > > > > > +                               true, &got);
> > > > > > +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> > > > > > +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> > > > > > +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > > > > +               return false;
> > > > > > +       }
> > > > > > +       return true;
> > > > > > +}
> > > > >
> > > > > holding caps for request may cause deadlock.  For example
> > > > >
> > > > > - client hold Fx caps and send unlink request
> > > > > - mds process request from other client, it change filelock's state to
> > > > > EXCL_FOO and revoke Fx caps
> > > > > - mds receives the unlink request, it can't process it because it
> > > > > can't acquire wrlock on filelock
> > > > >
> > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > >
> > > >
> > > > The client doing the unlink may have received a revoke for Fx on the
> > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > still be considered to hold Fx on the dir until that happens?
> > > >
> > >
> > > Client should release the Fx. But there is a problem, mds process
> > > other request first after it get the release of Fx
> > >
> >
> > As I envisioned it, the client would hold a reference to Fx while the
> > unlink is in flight, so it would not return Fx until after the unlink
> > has gotten an unsafe reply.
> >
> > It seems like when the MDS tries to get the locks for the request from
> > the second client and fails, it should then drop any locks it did get
> > and requeue the request to be run when the caps are returned. If we're
> > blocking for these locks, then we're tying up a thread while we wait
> > on a cap release, which seems really inefficient.
> >
> > That said, I don't have a great understanding of the MDS code so maybe
> > I'm missing something.
> >
> > >
> > > > > I checked MDS code, supporting async operation is more complex than I
> > > > > thought.  For the unlink case, mds needs to acquire locks other than
> > > > > filelock and linklock.
> > > > >
> > > > > - Request A from client1 has acquired lock X, waiting for wrlock of
> > > > > filelock.
> > > > > - Client2 sends async unlink  request 'B' to mds
> > > > > - MDS processes request 'B'. It needs to lock X and filelock.  But it
> > > > > can't get lock X because request A holds it. Request A can get
> > > > > filelock because Client2 hold Fx caps.
> > > > >
> > > >
> > > > This sounds like a bog-standard ABBA ordering issue.
> > > >
> > > > If any request can't get any of the locks it needs, shouldn't it just
> > > > unwind everything at that point and be to try again later? In fact, it
> > > > sort of looks like that's what the MDS does, but the Locker code in
> > > > the MDS is really hard to understand.
> > > >
> > >
> > > In some special cases, mds does that.  For normal cases, mds just
> > > acquires locks in fixed order. See Locker::acquire_locks and
> > > SimpleLock::ptr_lt
> > >
> >
> > I sort of see. Ok.
> >
> >
> > > > > I think a way to handle this is delegating locks hold by unsafe
> > > > > request to client. For example:
> > > > >
> > > > > - mds processes unlink request from client A,  sends unsafe reply and
> > > > > delegates acquired locks to client
> > > > > - client got the delegation, it can do async unlink on the same
> > > > > directory as  long as it hold the delegation
> > > > >
> > > >
> > > > So this would be a proposed new object that we'd be handing to
> > > > clients? I'd have to understand what this means in practice.
> > > >
> > >
> > > For requests that do unlink in the same directory, they should acquire
> > > similar locks (the only difference is xlock on dentry). When sending
> > > unsafe reply for a unlink request, mds keeps the acquired locks for
> > > later unlink in the same directory. The unsafe reply includes a
> > > delegation for unlink operation in the directory.  Client can do
> > > unlink locally while it holds the delegation. (mds keeps locks for
> > > client as long as it holds the delegation)  This is my rough idea,
> > > I'm still thinking how to handle xlock on dentry.
> > >
> >
> > I think we might be better off reworking cap handling semantics rather
> > than layering a new mechanism on top for this. That said, I keep
> > finding ways that the cap subsystem doesn't work like I'd think it
> > would, so maybe I'm wrong here.
> >
>
> please be more specific. what  semantics do you want to have?
>

Basically, at a high level:

1) have the MDS avoid taking a lock when processing a request from a
client that holds corresponding caps. We may need to take steps to
prevent races here, particularly to deal with clients that may not
hold references to caps when issuing a request. For instance, a client
could return Fx caps on the dir while it has an unlink request
outstanding. In that event, we'd need to ensure that nothing else can
take the filelock until that request is done.

2) when the MDS is processing a request and can't get a lock because
it's waiting on a client to return a corresponding cap, then it should
release any locks taken so far before requeueing the request (a rough
sketch of this unwind follows below). It seems
like that should be enough to prevent the sort of deadlock you
outlined yesterday, but I don't know how difficult that would be. We
may also need to take steps to avoid livelocks. For instance, ensure
that we don't hand out caps corresponding to locks that a requeued
call will need until it has been completed.
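
A minimal, generic sketch of the unwind-and-requeue idea in point 2, using
plain pthread mutexes rather than the MDS Locker, so the names and types
here are illustrative and not MDS API:

#include <pthread.h>
#include <stdbool.h>

/*
 * Try to take every lock in the ordered set. If any of them is
 * unavailable, drop everything acquired so far and report failure so
 * the caller can requeue the request instead of blocking a worker
 * thread while it waits for a cap release.
 */
static bool try_lock_all(pthread_mutex_t **locks, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (pthread_mutex_trylock(locks[i]) != 0) {
			/* Blocked on lock i: unwind locks i-1 .. 0. */
			while (--i >= 0)
				pthread_mutex_unlock(locks[i]);
			return false;	/* caller requeues and retries later */
		}
	}
	return true;
}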
Yan, Zheng April 17, 2019, 1:20 p.m. UTC | #14
On Thu, Apr 11, 2019 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, Apr 10, 2019 at 11:10 PM Yan, Zheng <ukernel@gmail.com> wrote:
> >
> > On Wed, Apr 10, 2019 at 10:21 PM Jeff Layton <jlayton@poochiereds.net> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 10:11 AM Yan, Zheng <ukernel@gmail.com> wrote:
> > > >
> > > > On Wed, Apr 10, 2019 at 9:22 PM Jeff Layton <jlayton@poochiereds.net> wrote:
> > > > >
> > > > > On Wed, Apr 10, 2019 at 9:06 AM Yan, Zheng <ukernel@gmail.com> wrote:
> > > > > >
> > > > > > On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > >
> > > > > > > When performing an unlink, if we have Fx on the parent directory and
> > > > > > > Lx on the inode of the dentry being unlinked then we don't need to
> > > > > > > wait on the reply before returning.
> > > > > > >
> > > > > > > In that situation, just fix up the dcache and link count and return
> > > > > > > immediately after issuing the call to the MDS. This does mean that we
> > > > > > > need to hold an extra reference to the inode being unlinked, and extra
> > > > > > > references to the caps to avoid races. Put those references in the
> > > > > > > r_callback routine.
> > > > > > >
> > > > > > > If the operation ends up failing, then set a writeback error on the
> > > > > > > directory inode that can be fetched later by an fsync on the dir.
> > > > > > >
> > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > ---
> > > > > > >  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
> > > > > > >  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
> > > > > > >  fs/ceph/super.h |  1 +
> > > > > > >  3 files changed, 58 insertions(+), 3 deletions(-)
> > > > > > >
> > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > > > index d022e15c8378..8fbb09c761a7 100644
> > > > > > > --- a/fs/ceph/caps.c
> > > > > > > +++ b/fs/ceph/caps.c
> > > > > > > @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
> > > > > > >         return ret;
> > > > > > >  }
> > > > > > >
> > > > > > > +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> > > > > > > +{
> > > > > > > +       int err, got;
> > > > > > > +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> > > > > > > +
> > > > > > > +       /* Ensure we have Lx on the inode being unlinked */
> > > > > > > +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> > > > > > > +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> > > > > > > +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> > > > > > > +               return false;
> > > > > > > +
> > > > > > > +       /* Do we have Fx on the dir ? */
> > > > > > > +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> > > > > > > +                               true, &got);
> > > > > > > +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> > > > > > > +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> > > > > > > +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> > > > > > > +               return false;
> > > > > > > +       }
> > > > > > > +       return true;
> > > > > > > +}
> > > > > >
> > > > > > holding caps for request may cause deadlock.  For example
> > > > > >
> > > > > > - client hold Fx caps and send unlink request
> > > > > > - mds process request from other client, it change filelock's state to
> > > > > > EXCL_FOO and revoke Fx caps
> > > > > > - mds receives the unlink request, it can't process it because it
> > > > > > can't acquire wrlock on filelock
> > > > > >
> > > > > > filelock state stays in EXCL_FOO because client does not release Fx caps.
> > > > > >
> > > > >
> > > > > The client doing the unlink may have received a revoke for Fx on the
> > > > > dir at that point, but it won't have returned it yet. Shouldn't it
> > > > > still be considered to hold Fx on the dir until that happens?
> > > > >
> > > >
> > > > Client should release the Fx. But there is a problem, mds process
> > > > other request first after it get the release of Fx
> > > >
> > >
> > > As I envisioned it, the client would hold a reference to Fx while the
> > > unlink is in flight, so it would not return Fx until after the unlink
> > > has gotten an unsafe reply.
> > >
> > > It seems like when the MDS tries to get the locks for the request from
> > > the second client and fails, it should then drop any locks it did get
> > > and requeue the request to be run when the caps are returned. If we're
> > > blocking for these locks, then we're tying up a thread while we wait
> > > on a cap release, which seems really inefficient.
> > >
> > > That said, I don't have a great understanding of the MDS code so maybe
> > > I'm missing something.
> > >
> > > >
> > > > > > I checked MDS code, supporting async operation is more complex than I
> > > > > > thought.  For the unlink case, mds needs to acquire locks other than
> > > > > > filelock and linklock.
> > > > > >
> > > > > > - Request A from client1 has acquired lock X, waiting for wrlock of
> > > > > > filelock.
> > > > > > - Client2 sends async unlink  request 'B' to mds
> > > > > > - MDS processes request 'B'. It needs to lock X and filelock.  But it
> > > > > > can't get lock X because request A holds it. Request A can get
> > > > > > filelock because Client2 hold Fx caps.
> > > > > >
> > > > >
> > > > > This sounds like a bog-standard ABBA ordering issue.
> > > > >
> > > > > If any request can't get any of the locks it needs, shouldn't it just
> > > > > unwind everything at that point and be to try again later? In fact, it
> > > > > sort of looks like that's what the MDS does, but the Locker code in
> > > > > the MDS is really hard to understand.
> > > > >
> > > >
> > > > In some special cases, mds does that.  For normal cases, mds just
> > > > acquires locks in fixed order. See Locker::acquire_locks and
> > > > SimpleLock::ptr_lt
> > > >
> > >
> > > I sort of see. Ok.
> > >
> > >
> > > > > > I think a way to handle this is delegating locks hold by unsafe
> > > > > > request to client. For example:
> > > > > >
> > > > > > - mds processes unlink request from client A,  sends unsafe reply and
> > > > > > delegates acquired locks to client
> > > > > > - client got the delegation, it can do async unlink on the same
> > > > > > directory as  long as it hold the delegation
> > > > > >
> > > > >
> > > > > So this would be a proposed new object that we'd be handing to
> > > > > clients? I'd have to understand what this means in practice.
> > > > >
> > > >
> > > > For requests that do unlink in the same directory, they should acquire
> > > > similar locks (the only difference is xlock on dentry). When sending
> > > > unsafe reply for a unlink request, mds keeps the acquired locks for
> > > > later unlink in the same directory. The unsafe reply includes a
> > > > delegation for unlink operation in the directory.  Client can do
> > > > unlink locally while it holds the delegation. (mds keeps locks for
> > > > client as long as it holds the delegation)  This is my rough idea,
> > > > I'm still thinking how to handle xlock on dentry.
> > > >
> > >
> > > I think we might be better off reworking cap handling semantics rather
> > > than layering a new mechanism on top for this. That said, I keep
> > > finding ways that the cap subsystem doesn't work like I'd think it
> > > would, so maybe I'm wrong here.
> > >
> >
> > please be more specific. what  semantics do you want to have?
> >
>
> Basically, at a high level:
>
> 1) have the MDS avoid taking a lock when processing a request from a
> client that holds corresponding caps. We may need to take steps to
> prevent races here, particularly to deal with clients that may not
> hold references to caps when issuing a request. For instance, a client
> could return Fx caps on the dir while it has an unlink request
> outstanding. In that event, we'd need to ensure that nothing else can
> take the filelock until that request is done.
>
> 2) when the MDS is processing a request and can't get a lock because
> it's waiting on a client to return a corresponding cap, then it should
> release any locks taken so far before requeueing the request. It seems
> like that should be enough to prevent the sort of deadlock you
> outlined yesterday, but I don't know how difficult that would be. We
> may also need to take steps to avoid livelocks. For instance, ensure
> that we don't hand out caps corresponding to locks that a requeued
> call will need until it has been completed.
>

FYI: https://github.com/ceph/ceph/pull/27648

> --
> Jeff Layton <jlayton@kernel.org>
diff mbox series

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d022e15c8378..8fbb09c761a7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2682,6 +2682,28 @@  static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 	return ret;
 }
 
+bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int err, got;
+	struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
+
+	/* Ensure we have Lx on the inode being unlinked */
+	err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
+	dout("Lx on %p err=%d got=%d\n", dentry, err, got);
+	if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
+		return false;
+
+	/* Do we have Fx on the dir ? */
+	err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
+				true, &got);
+	dout("Fx on %p err=%d got=%d\n", dir, err, got);
+	if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
+		ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
+		return false;
+	}
+	return true;
+}
+
 /*
  * Check the offset we are writing up to against our current
  * max_size.  If necessary, tell the MDS we want to write to
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3eb9bc226b77..386c9439a020 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1029,6 +1029,18 @@  static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	return err;
 }
 
+static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_request *req)
+{
+	struct ceph_inode_info *ci = ceph_inode(req->r_old_inode);
+
+	/* If op failed, set error on parent directory */
+	mapping_set_error(req->r_parent->i_mapping, req->r_err);
+	ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
+	ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_FILE_EXCL);
+	iput(req->r_old_inode);
+}
+
 /*
  * rmdir and unlink are differ only by the metadata op code
  */
@@ -1065,9 +1077,29 @@  static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 	req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
-	err = ceph_mdsc_do_request(mdsc, dir, req);
-	if (!err && !req->r_reply_info.head->is_dentry)
-		d_delete(dentry);
+
+	if (op == CEPH_MDS_OP_UNLINK &&
+	    ceph_get_caps_for_unlink(dir, dentry)) {
+		/* Keep LINK caps to ensure continuity over async call */
+		req->r_inode_drop &= ~(CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL);
+		req->r_callback = ceph_async_unlink_cb;
+		req->r_old_inode = d_inode(dentry);
+		ihold(req->r_old_inode);
+		err = ceph_mdsc_submit_request(mdsc, dir, req);
+		if (!err) {
+			/*
+			 * We have enough caps, so we assume that the unlink
+			 * will succeed. Fix up the target inode and dcache.
+			 */
+			drop_nlink(inode);
+			d_delete(dentry);
+		}
+	} else {
+		err = ceph_mdsc_do_request(mdsc, dir, req);
+		if (!err && !req->r_reply_info.head->is_dentry)
+			d_delete(dentry);
+	}
+
 	ceph_mdsc_put_request(req);
 out:
 	return err;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3c5608f2108a..5c361dc1f47f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1033,6 +1033,7 @@  extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 			 loff_t endoff, int *got, struct page **pinned_page);
 extern int ceph_try_get_caps(struct ceph_inode_info *ci,
 			     int need, int want, bool nonblock, int *got);
+extern bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry);
 
 /* for counting open files by mode */
 extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
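
For illustration, a minimal userspace sketch (the mount point and file names
are placeholders) of how an application would pick up a deferred unlink
failure via an fsync on the parent directory, per the mapping_set_error()
call in ceph_async_unlink_cb() above:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* placeholder path on a cephfs mount */
	int dfd = open("/mnt/cephfs/somedir", O_RDONLY | O_DIRECTORY);

	if (dfd < 0) {
		perror("open");
		return 1;
	}
	/* may return before the MDS replies when the async path is taken */
	if (unlinkat(dfd, "victim", 0) < 0)
		perror("unlinkat");
	/* a failed async unlink surfaces here as a writeback error on the dir */
	if (fsync(dfd) < 0)
		perror("fsync(dir)");
	close(dfd);
	return 0;
}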