diff mbox series

ceph: retransmit REQUEST_CLOSE every second if we don't get a response

Message ID 20200928220349.584709-1-jlayton@kernel.org
State New, archived
Headers show
Series ceph: retransmit REQUEST_CLOSE every second if we don't get a response | expand

Commit Message

Jeff Layton Sept. 28, 2020, 10:03 p.m. UTC
Patrick reported a case where the MDS and client client had racing
session messages to one anothe. The MDS was sending caps to the client
and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
to unmount.

Because they were sending at the same time, the REQUEST_CLOSE had too
old a sequence number, and the MDS dropped it on the floor. On the
client, this would have probably manifested as a 60s hang during umount.
The MDS ended up blocklisting the client.

Once we've decided to issue a REQUEST_CLOSE, we're finished with the
session, so just keep sending them until the MDS acknowledges that.

Change the code to retransmit a REQUEST_CLOSE every second if the
session hasn't changed state yet. Give up and throw a warning after
mount_timeout elapses if we haven't gotten a response.

URL: https://tracker.ceph.com/issues/47563
Reported-by: Patrick Donnelly <pdonnell@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

Comments

Ilya Dryomov Oct. 8, 2020, 5:27 p.m. UTC | #1
On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> Patrick reported a case where the MDS and client client had racing
> session messages to one anothe. The MDS was sending caps to the client
> and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
> to unmount.
>
> Because they were sending at the same time, the REQUEST_CLOSE had too
> old a sequence number, and the MDS dropped it on the floor. On the
> client, this would have probably manifested as a 60s hang during umount.
> The MDS ended up blocklisting the client.
>
> Once we've decided to issue a REQUEST_CLOSE, we're finished with the
> session, so just keep sending them until the MDS acknowledges that.
>
> Change the code to retransmit a REQUEST_CLOSE every second if the
> session hasn't changed state yet. Give up and throw a warning after
> mount_timeout elapses if we haven't gotten a response.
>
> URL: https://tracker.ceph.com/issues/47563
> Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
>  1 file changed, 32 insertions(+), 21 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index b07e7adf146f..d9cb74e3d5e3 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
>  static int __close_session(struct ceph_mds_client *mdsc,
>                          struct ceph_mds_session *session)
>  {
> -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
> +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
>                 return 0;
>         session->s_state = CEPH_MDS_SESSION_CLOSING;
>         return request_close_session(session);
> @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
>         return atomic_read(&mdsc->num_sessions) <= skipped;
>  }
>
> +static bool umount_timed_out(unsigned long timeo)
> +{
> +       if (time_before(jiffies, timeo))
> +               return false;
> +       pr_warn("ceph: unable to close all sessions\n");
> +       return true;
> +}
> +
>  /*
>   * called after sb is ro.
>   */
>  void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>  {
> -       struct ceph_options *opts = mdsc->fsc->client->options;
>         struct ceph_mds_session *session;
> -       int i;
> -       int skipped = 0;
> +       int i, ret;
> +       int skipped;
> +       unsigned long timeo = jiffies +
> +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
>
>         dout("close_sessions\n");
>
>         /* close sessions */
> -       mutex_lock(&mdsc->mutex);
> -       for (i = 0; i < mdsc->max_sessions; i++) {
> -               session = __ceph_lookup_mds_session(mdsc, i);
> -               if (!session)
> -                       continue;
> -               mutex_unlock(&mdsc->mutex);
> -               mutex_lock(&session->s_mutex);
> -               if (__close_session(mdsc, session) <= 0)
> -                       skipped++;
> -               mutex_unlock(&session->s_mutex);
> -               ceph_put_mds_session(session);
> +       do {
> +               skipped = 0;
>                 mutex_lock(&mdsc->mutex);
> -       }
> -       mutex_unlock(&mdsc->mutex);
> +               for (i = 0; i < mdsc->max_sessions; i++) {
> +                       session = __ceph_lookup_mds_session(mdsc, i);
> +                       if (!session)
> +                               continue;
> +                       mutex_unlock(&mdsc->mutex);
> +                       mutex_lock(&session->s_mutex);
> +                       if (__close_session(mdsc, session) <= 0)
> +                               skipped++;
> +                       mutex_unlock(&session->s_mutex);
> +                       ceph_put_mds_session(session);
> +                       mutex_lock(&mdsc->mutex);
> +               }
> +               mutex_unlock(&mdsc->mutex);
>
> -       dout("waiting for sessions to close\n");
> -       wait_event_timeout(mdsc->session_close_wq,
> -                          done_closing_sessions(mdsc, skipped),
> -                          ceph_timeout_jiffies(opts->mount_timeout));
> +               dout("waiting for sessions to close\n");
> +               ret = wait_event_timeout(mdsc->session_close_wq,
> +                                        done_closing_sessions(mdsc, skipped), HZ);
> +       } while (!ret && !umount_timed_out(timeo));
>
>         /* tear down remaining sessions */
>         mutex_lock(&mdsc->mutex);
> --
> 2.26.2
>

Hi Jeff,

This seems wrong to me, at least conceptually.  Is the same patch
getting applied to ceph-fuse?

Pretending to not know anything about the client <-> MDS protocol,
two questions immediately come to mind.  Why is MDS allowed to drop
REQUEST_CLOSE?  If the client is really done with the session, why
does it block on the acknowledgement from the MDS?

Thanks,

                Ilya
Jeff Layton Oct. 8, 2020, 6:14 p.m. UTC | #2
On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
> On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
> > Patrick reported a case where the MDS and client client had racing
> > session messages to one anothe. The MDS was sending caps to the client
> > and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
> > to unmount.
> > 
> > Because they were sending at the same time, the REQUEST_CLOSE had too
> > old a sequence number, and the MDS dropped it on the floor. On the
> > client, this would have probably manifested as a 60s hang during umount.
> > The MDS ended up blocklisting the client.
> > 
> > Once we've decided to issue a REQUEST_CLOSE, we're finished with the
> > session, so just keep sending them until the MDS acknowledges that.
> > 
> > Change the code to retransmit a REQUEST_CLOSE every second if the
> > session hasn't changed state yet. Give up and throw a warning after
> > mount_timeout elapses if we haven't gotten a response.
> > 
> > URL: https://tracker.ceph.com/issues/47563
> > Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
> >  1 file changed, 32 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index b07e7adf146f..d9cb74e3d5e3 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
> >  static int __close_session(struct ceph_mds_client *mdsc,
> >                          struct ceph_mds_session *session)
> >  {
> > -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
> > +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
> >                 return 0;
> >         session->s_state = CEPH_MDS_SESSION_CLOSING;
> >         return request_close_session(session);
> > @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
> >         return atomic_read(&mdsc->num_sessions) <= skipped;
> >  }
> > 
> > +static bool umount_timed_out(unsigned long timeo)
> > +{
> > +       if (time_before(jiffies, timeo))
> > +               return false;
> > +       pr_warn("ceph: unable to close all sessions\n");
> > +       return true;
> > +}
> > +
> >  /*
> >   * called after sb is ro.
> >   */
> >  void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> >  {
> > -       struct ceph_options *opts = mdsc->fsc->client->options;
> >         struct ceph_mds_session *session;
> > -       int i;
> > -       int skipped = 0;
> > +       int i, ret;
> > +       int skipped;
> > +       unsigned long timeo = jiffies +
> > +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
> > 
> >         dout("close_sessions\n");
> > 
> >         /* close sessions */
> > -       mutex_lock(&mdsc->mutex);
> > -       for (i = 0; i < mdsc->max_sessions; i++) {
> > -               session = __ceph_lookup_mds_session(mdsc, i);
> > -               if (!session)
> > -                       continue;
> > -               mutex_unlock(&mdsc->mutex);
> > -               mutex_lock(&session->s_mutex);
> > -               if (__close_session(mdsc, session) <= 0)
> > -                       skipped++;
> > -               mutex_unlock(&session->s_mutex);
> > -               ceph_put_mds_session(session);
> > +       do {
> > +               skipped = 0;
> >                 mutex_lock(&mdsc->mutex);
> > -       }
> > -       mutex_unlock(&mdsc->mutex);
> > +               for (i = 0; i < mdsc->max_sessions; i++) {
> > +                       session = __ceph_lookup_mds_session(mdsc, i);
> > +                       if (!session)
> > +                               continue;
> > +                       mutex_unlock(&mdsc->mutex);
> > +                       mutex_lock(&session->s_mutex);
> > +                       if (__close_session(mdsc, session) <= 0)
> > +                               skipped++;
> > +                       mutex_unlock(&session->s_mutex);
> > +                       ceph_put_mds_session(session);
> > +                       mutex_lock(&mdsc->mutex);
> > +               }
> > +               mutex_unlock(&mdsc->mutex);
> > 
> > -       dout("waiting for sessions to close\n");
> > -       wait_event_timeout(mdsc->session_close_wq,
> > -                          done_closing_sessions(mdsc, skipped),
> > -                          ceph_timeout_jiffies(opts->mount_timeout));
> > +               dout("waiting for sessions to close\n");
> > +               ret = wait_event_timeout(mdsc->session_close_wq,
> > +                                        done_closing_sessions(mdsc, skipped), HZ);
> > +       } while (!ret && !umount_timed_out(timeo));
> > 
> >         /* tear down remaining sessions */
> >         mutex_lock(&mdsc->mutex);
> > --
> > 2.26.2
> > 
> 
> Hi Jeff,
> 
> This seems wrong to me, at least conceptually.  Is the same patch
> getting applied to ceph-fuse?
> 

It's a grotesque workaround, I will grant you. I'm not sure what we want
to do for ceph-fuse yet but it does seem to have the same issue.
Probably, we should plan to do a similar fix there once we settle on the
right approach.

> Pretending to not know anything about the client <-> MDS protocol,
> two questions immediately come to mind.  Why is MDS allowed to drop
> REQUEST_CLOSE? 

It really seems like a protocol design flaw.

IIUC, the idea overall with the low-level ceph protocol seems to be that
the client should retransmit (or reevaluate, in the case of caps) calls
that were in flight when the seq number changes.

The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
but it doesn't really make a lot of sense for that, IMO.

> If the client is really done with the session, why
> does it block on the acknowledgement from the MDS?
> 

That's certainly one thing we could do instead. That would likely
prevent the unmount hangs, but would do nothing for the MDS deciding to
blacklist it. Maybe that's good enough though and we should fix that
problem on the MDS side by having it ignore the seq number on
REQUEST_CLOSE?
Ilya Dryomov Oct. 10, 2020, 6:49 p.m. UTC | #3
On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
> > On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > Patrick reported a case where the MDS and client client had racing
> > > session messages to one anothe. The MDS was sending caps to the client
> > > and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
> > > to unmount.
> > >
> > > Because they were sending at the same time, the REQUEST_CLOSE had too
> > > old a sequence number, and the MDS dropped it on the floor. On the
> > > client, this would have probably manifested as a 60s hang during umount.
> > > The MDS ended up blocklisting the client.
> > >
> > > Once we've decided to issue a REQUEST_CLOSE, we're finished with the
> > > session, so just keep sending them until the MDS acknowledges that.
> > >
> > > Change the code to retransmit a REQUEST_CLOSE every second if the
> > > session hasn't changed state yet. Give up and throw a warning after
> > > mount_timeout elapses if we haven't gotten a response.
> > >
> > > URL: https://tracker.ceph.com/issues/47563
> > > Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
> > >  1 file changed, 32 insertions(+), 21 deletions(-)
> > >
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index b07e7adf146f..d9cb74e3d5e3 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
> > >  static int __close_session(struct ceph_mds_client *mdsc,
> > >                          struct ceph_mds_session *session)
> > >  {
> > > -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
> > > +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
> > >                 return 0;
> > >         session->s_state = CEPH_MDS_SESSION_CLOSING;
> > >         return request_close_session(session);
> > > @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
> > >         return atomic_read(&mdsc->num_sessions) <= skipped;
> > >  }
> > >
> > > +static bool umount_timed_out(unsigned long timeo)
> > > +{
> > > +       if (time_before(jiffies, timeo))
> > > +               return false;
> > > +       pr_warn("ceph: unable to close all sessions\n");
> > > +       return true;
> > > +}
> > > +
> > >  /*
> > >   * called after sb is ro.
> > >   */
> > >  void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> > >  {
> > > -       struct ceph_options *opts = mdsc->fsc->client->options;
> > >         struct ceph_mds_session *session;
> > > -       int i;
> > > -       int skipped = 0;
> > > +       int i, ret;
> > > +       int skipped;
> > > +       unsigned long timeo = jiffies +
> > > +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
> > >
> > >         dout("close_sessions\n");
> > >
> > >         /* close sessions */
> > > -       mutex_lock(&mdsc->mutex);
> > > -       for (i = 0; i < mdsc->max_sessions; i++) {
> > > -               session = __ceph_lookup_mds_session(mdsc, i);
> > > -               if (!session)
> > > -                       continue;
> > > -               mutex_unlock(&mdsc->mutex);
> > > -               mutex_lock(&session->s_mutex);
> > > -               if (__close_session(mdsc, session) <= 0)
> > > -                       skipped++;
> > > -               mutex_unlock(&session->s_mutex);
> > > -               ceph_put_mds_session(session);
> > > +       do {
> > > +               skipped = 0;
> > >                 mutex_lock(&mdsc->mutex);
> > > -       }
> > > -       mutex_unlock(&mdsc->mutex);
> > > +               for (i = 0; i < mdsc->max_sessions; i++) {
> > > +                       session = __ceph_lookup_mds_session(mdsc, i);
> > > +                       if (!session)
> > > +                               continue;
> > > +                       mutex_unlock(&mdsc->mutex);
> > > +                       mutex_lock(&session->s_mutex);
> > > +                       if (__close_session(mdsc, session) <= 0)
> > > +                               skipped++;
> > > +                       mutex_unlock(&session->s_mutex);
> > > +                       ceph_put_mds_session(session);
> > > +                       mutex_lock(&mdsc->mutex);
> > > +               }
> > > +               mutex_unlock(&mdsc->mutex);
> > >
> > > -       dout("waiting for sessions to close\n");
> > > -       wait_event_timeout(mdsc->session_close_wq,
> > > -                          done_closing_sessions(mdsc, skipped),
> > > -                          ceph_timeout_jiffies(opts->mount_timeout));
> > > +               dout("waiting for sessions to close\n");
> > > +               ret = wait_event_timeout(mdsc->session_close_wq,
> > > +                                        done_closing_sessions(mdsc, skipped), HZ);
> > > +       } while (!ret && !umount_timed_out(timeo));
> > >
> > >         /* tear down remaining sessions */
> > >         mutex_lock(&mdsc->mutex);
> > > --
> > > 2.26.2
> > >
> >
> > Hi Jeff,
> >
> > This seems wrong to me, at least conceptually.  Is the same patch
> > getting applied to ceph-fuse?
> >
>
> It's a grotesque workaround, I will grant you. I'm not sure what we want
> to do for ceph-fuse yet but it does seem to have the same issue.
> Probably, we should plan to do a similar fix there once we settle on the
> right approach.
>
> > Pretending to not know anything about the client <-> MDS protocol,
> > two questions immediately come to mind.  Why is MDS allowed to drop
> > REQUEST_CLOSE?
>
> It really seems like a protocol design flaw.
>
> IIUC, the idea overall with the low-level ceph protocol seems to be that
> the client should retransmit (or reevaluate, in the case of caps) calls
> that were in flight when the seq number changes.
>
> The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
> but it doesn't really make a lot of sense for that, IMO.

(edit of my reply to https://github.com/ceph/ceph/pull/37619)

After taking a look at the MDS code, it really seemed like it
had been written with the expectation that REQUEST_CLOSE would be
resent, so I dug around.  I don't fully understand these "push"
sequence numbers yet, but there is probably some race that requires
the client to confirm that it saw the sequence number, even if the
session is about to go.  Sage is probably the only one who might
remember at this point.

The kernel client already has the code to retry REQUEST_CLOSE, only
every five seconds instead every second.  See check_session_state()
which is called from delayed_work() in mds_client.c.  It looks like
it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
mdsc use-after-free crash") which conditioned delayed_work() on
mdsc->stopping -- hence the misbehaviour.

Thanks,

                Ilya
Xiubo Li Oct. 12, 2020, 6:52 a.m. UTC | #4
On 2020/10/11 2:49, Ilya Dryomov wrote:
> On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
>> On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
>>> On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
>>>> Patrick reported a case where the MDS and client client had racing
>>>> session messages to one anothe. The MDS was sending caps to the client
>>>> and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
>>>> to unmount.
>>>>
>>>> Because they were sending at the same time, the REQUEST_CLOSE had too
>>>> old a sequence number, and the MDS dropped it on the floor. On the
>>>> client, this would have probably manifested as a 60s hang during umount.
>>>> The MDS ended up blocklisting the client.
>>>>
>>>> Once we've decided to issue a REQUEST_CLOSE, we're finished with the
>>>> session, so just keep sending them until the MDS acknowledges that.
>>>>
>>>> Change the code to retransmit a REQUEST_CLOSE every second if the
>>>> session hasn't changed state yet. Give up and throw a warning after
>>>> mount_timeout elapses if we haven't gotten a response.
>>>>
>>>> URL: https://tracker.ceph.com/issues/47563
>>>> Reported-by: Patrick Donnelly <pdonnell@redhat.com>
>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>> ---
>>>>   fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
>>>>   1 file changed, 32 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index b07e7adf146f..d9cb74e3d5e3 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
>>>>   static int __close_session(struct ceph_mds_client *mdsc,
>>>>                           struct ceph_mds_session *session)
>>>>   {
>>>> -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
>>>> +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
>>>>                  return 0;
>>>>          session->s_state = CEPH_MDS_SESSION_CLOSING;
>>>>          return request_close_session(session);
>>>> @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
>>>>          return atomic_read(&mdsc->num_sessions) <= skipped;
>>>>   }
>>>>
>>>> +static bool umount_timed_out(unsigned long timeo)
>>>> +{
>>>> +       if (time_before(jiffies, timeo))
>>>> +               return false;
>>>> +       pr_warn("ceph: unable to close all sessions\n");
>>>> +       return true;
>>>> +}
>>>> +
>>>>   /*
>>>>    * called after sb is ro.
>>>>    */
>>>>   void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>>>>   {
>>>> -       struct ceph_options *opts = mdsc->fsc->client->options;
>>>>          struct ceph_mds_session *session;
>>>> -       int i;
>>>> -       int skipped = 0;
>>>> +       int i, ret;
>>>> +       int skipped;
>>>> +       unsigned long timeo = jiffies +
>>>> +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
>>>>
>>>>          dout("close_sessions\n");
>>>>
>>>>          /* close sessions */
>>>> -       mutex_lock(&mdsc->mutex);
>>>> -       for (i = 0; i < mdsc->max_sessions; i++) {
>>>> -               session = __ceph_lookup_mds_session(mdsc, i);
>>>> -               if (!session)
>>>> -                       continue;
>>>> -               mutex_unlock(&mdsc->mutex);
>>>> -               mutex_lock(&session->s_mutex);
>>>> -               if (__close_session(mdsc, session) <= 0)
>>>> -                       skipped++;
>>>> -               mutex_unlock(&session->s_mutex);
>>>> -               ceph_put_mds_session(session);
>>>> +       do {
>>>> +               skipped = 0;
>>>>                  mutex_lock(&mdsc->mutex);
>>>> -       }
>>>> -       mutex_unlock(&mdsc->mutex);
>>>> +               for (i = 0; i < mdsc->max_sessions; i++) {
>>>> +                       session = __ceph_lookup_mds_session(mdsc, i);
>>>> +                       if (!session)
>>>> +                               continue;
>>>> +                       mutex_unlock(&mdsc->mutex);
>>>> +                       mutex_lock(&session->s_mutex);
>>>> +                       if (__close_session(mdsc, session) <= 0)
>>>> +                               skipped++;
>>>> +                       mutex_unlock(&session->s_mutex);
>>>> +                       ceph_put_mds_session(session);
>>>> +                       mutex_lock(&mdsc->mutex);
>>>> +               }
>>>> +               mutex_unlock(&mdsc->mutex);
>>>>
>>>> -       dout("waiting for sessions to close\n");
>>>> -       wait_event_timeout(mdsc->session_close_wq,
>>>> -                          done_closing_sessions(mdsc, skipped),
>>>> -                          ceph_timeout_jiffies(opts->mount_timeout));
>>>> +               dout("waiting for sessions to close\n");
>>>> +               ret = wait_event_timeout(mdsc->session_close_wq,
>>>> +                                        done_closing_sessions(mdsc, skipped), HZ);
>>>> +       } while (!ret && !umount_timed_out(timeo));
>>>>
>>>>          /* tear down remaining sessions */
>>>>          mutex_lock(&mdsc->mutex);
>>>> --
>>>> 2.26.2
>>>>
>>> Hi Jeff,
>>>
>>> This seems wrong to me, at least conceptually.  Is the same patch
>>> getting applied to ceph-fuse?
>>>
>> It's a grotesque workaround, I will grant you. I'm not sure what we want
>> to do for ceph-fuse yet but it does seem to have the same issue.
>> Probably, we should plan to do a similar fix there once we settle on the
>> right approach.
>>
>>> Pretending to not know anything about the client <-> MDS protocol,
>>> two questions immediately come to mind.  Why is MDS allowed to drop
>>> REQUEST_CLOSE?
>> It really seems like a protocol design flaw.
>>
>> IIUC, the idea overall with the low-level ceph protocol seems to be that
>> the client should retransmit (or reevaluate, in the case of caps) calls
>> that were in flight when the seq number changes.
>>
>> The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
>> but it doesn't really make a lot of sense for that, IMO.
> (edit of my reply to https://github.com/ceph/ceph/pull/37619)
>
> After taking a look at the MDS code, it really seemed like it
> had been written with the expectation that REQUEST_CLOSE would be
> resent, so I dug around.  I don't fully understand these "push"
> sequence numbers yet, but there is probably some race that requires
> the client to confirm that it saw the sequence number, even if the
> session is about to go.  Sage is probably the only one who might
> remember at this point.
>
> The kernel client already has the code to retry REQUEST_CLOSE, only
> every five seconds instead every second.  See check_session_state()
> which is called from delayed_work() in mds_client.c.  It looks like
> it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
> mdsc use-after-free crash") which conditioned delayed_work() on
> mdsc->stopping -- hence the misbehaviour.

Without this commit it will hit this issue too. The umount old code will 
try to close sessions asynchronously, and then tries to cancel the 
delayed work, during which the last queued delayed_work() timer might be 
fired. This commit makes it easier to be reproduced.


Thanks,
Xiubo
> Thanks,
>
>                  Ilya
>
Jeff Layton Oct. 12, 2020, 11:52 a.m. UTC | #5
On Mon, 2020-10-12 at 14:52 +0800, Xiubo Li wrote:
> On 2020/10/11 2:49, Ilya Dryomov wrote:
> > On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
> > > > On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > Patrick reported a case where the MDS and client client had racing
> > > > > session messages to one anothe. The MDS was sending caps to the client
> > > > > and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
> > > > > to unmount.
> > > > > 
> > > > > Because they were sending at the same time, the REQUEST_CLOSE had too
> > > > > old a sequence number, and the MDS dropped it on the floor. On the
> > > > > client, this would have probably manifested as a 60s hang during umount.
> > > > > The MDS ended up blocklisting the client.
> > > > > 
> > > > > Once we've decided to issue a REQUEST_CLOSE, we're finished with the
> > > > > session, so just keep sending them until the MDS acknowledges that.
> > > > > 
> > > > > Change the code to retransmit a REQUEST_CLOSE every second if the
> > > > > session hasn't changed state yet. Give up and throw a warning after
> > > > > mount_timeout elapses if we haven't gotten a response.
> > > > > 
> > > > > URL: https://tracker.ceph.com/issues/47563
> > > > > Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > > >   fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
> > > > >   1 file changed, 32 insertions(+), 21 deletions(-)
> > > > > 
> > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > index b07e7adf146f..d9cb74e3d5e3 100644
> > > > > --- a/fs/ceph/mds_client.c
> > > > > +++ b/fs/ceph/mds_client.c
> > > > > @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
> > > > >   static int __close_session(struct ceph_mds_client *mdsc,
> > > > >                           struct ceph_mds_session *session)
> > > > >   {
> > > > > -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
> > > > > +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
> > > > >                  return 0;
> > > > >          session->s_state = CEPH_MDS_SESSION_CLOSING;
> > > > >          return request_close_session(session);
> > > > > @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
> > > > >          return atomic_read(&mdsc->num_sessions) <= skipped;
> > > > >   }
> > > > > 
> > > > > +static bool umount_timed_out(unsigned long timeo)
> > > > > +{
> > > > > +       if (time_before(jiffies, timeo))
> > > > > +               return false;
> > > > > +       pr_warn("ceph: unable to close all sessions\n");
> > > > > +       return true;
> > > > > +}
> > > > > +
> > > > >   /*
> > > > >    * called after sb is ro.
> > > > >    */
> > > > >   void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> > > > >   {
> > > > > -       struct ceph_options *opts = mdsc->fsc->client->options;
> > > > >          struct ceph_mds_session *session;
> > > > > -       int i;
> > > > > -       int skipped = 0;
> > > > > +       int i, ret;
> > > > > +       int skipped;
> > > > > +       unsigned long timeo = jiffies +
> > > > > +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
> > > > > 
> > > > >          dout("close_sessions\n");
> > > > > 
> > > > >          /* close sessions */
> > > > > -       mutex_lock(&mdsc->mutex);
> > > > > -       for (i = 0; i < mdsc->max_sessions; i++) {
> > > > > -               session = __ceph_lookup_mds_session(mdsc, i);
> > > > > -               if (!session)
> > > > > -                       continue;
> > > > > -               mutex_unlock(&mdsc->mutex);
> > > > > -               mutex_lock(&session->s_mutex);
> > > > > -               if (__close_session(mdsc, session) <= 0)
> > > > > -                       skipped++;
> > > > > -               mutex_unlock(&session->s_mutex);
> > > > > -               ceph_put_mds_session(session);
> > > > > +       do {
> > > > > +               skipped = 0;
> > > > >                  mutex_lock(&mdsc->mutex);
> > > > > -       }
> > > > > -       mutex_unlock(&mdsc->mutex);
> > > > > +               for (i = 0; i < mdsc->max_sessions; i++) {
> > > > > +                       session = __ceph_lookup_mds_session(mdsc, i);
> > > > > +                       if (!session)
> > > > > +                               continue;
> > > > > +                       mutex_unlock(&mdsc->mutex);
> > > > > +                       mutex_lock(&session->s_mutex);
> > > > > +                       if (__close_session(mdsc, session) <= 0)
> > > > > +                               skipped++;
> > > > > +                       mutex_unlock(&session->s_mutex);
> > > > > +                       ceph_put_mds_session(session);
> > > > > +                       mutex_lock(&mdsc->mutex);
> > > > > +               }
> > > > > +               mutex_unlock(&mdsc->mutex);
> > > > > 
> > > > > -       dout("waiting for sessions to close\n");
> > > > > -       wait_event_timeout(mdsc->session_close_wq,
> > > > > -                          done_closing_sessions(mdsc, skipped),
> > > > > -                          ceph_timeout_jiffies(opts->mount_timeout));
> > > > > +               dout("waiting for sessions to close\n");
> > > > > +               ret = wait_event_timeout(mdsc->session_close_wq,
> > > > > +                                        done_closing_sessions(mdsc, skipped), HZ);
> > > > > +       } while (!ret && !umount_timed_out(timeo));
> > > > > 
> > > > >          /* tear down remaining sessions */
> > > > >          mutex_lock(&mdsc->mutex);
> > > > > --
> > > > > 2.26.2
> > > > > 
> > > > Hi Jeff,
> > > > 
> > > > This seems wrong to me, at least conceptually.  Is the same patch
> > > > getting applied to ceph-fuse?
> > > > 
> > > It's a grotesque workaround, I will grant you. I'm not sure what we want
> > > to do for ceph-fuse yet but it does seem to have the same issue.
> > > Probably, we should plan to do a similar fix there once we settle on the
> > > right approach.
> > > 
> > > > Pretending to not know anything about the client <-> MDS protocol,
> > > > two questions immediately come to mind.  Why is MDS allowed to drop
> > > > REQUEST_CLOSE?
> > > It really seems like a protocol design flaw.
> > > 
> > > IIUC, the idea overall with the low-level ceph protocol seems to be that
> > > the client should retransmit (or reevaluate, in the case of caps) calls
> > > that were in flight when the seq number changes.
> > > 
> > > The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
> > > but it doesn't really make a lot of sense for that, IMO.
> > (edit of my reply to https://github.com/ceph/ceph/pull/37619)
> > 
> > After taking a look at the MDS code, it really seemed like it
> > had been written with the expectation that REQUEST_CLOSE would be
> > resent, so I dug around.  I don't fully understand these "push"
> > sequence numbers yet, but there is probably some race that requires
> > the client to confirm that it saw the sequence number, even if the
> > session is about to go.  Sage is probably the only one who might
> > remember at this point.
> > 
> > The kernel client already has the code to retry REQUEST_CLOSE, only
> > every five seconds instead every second.  See check_session_state()
> > which is called from delayed_work() in mds_client.c.  It looks like
> > it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
> > mdsc use-after-free crash") which conditioned delayed_work() on
> > mdsc->stopping -- hence the misbehaviour.
> 
> Without this commit it will hit this issue too. The umount old code will 
> try to close sessions asynchronously, and then tries to cancel the 
> delayed work, during which the last queued delayed_work() timer might be 
> fired. This commit makes it easier to be reproduced.
> 

Fixing the potential races to ensure that this is retransmitted is an
option, but I'm not sure it's the best one. Here's what I think we
probably ought to do:

1/ fix the MDS to just ignore the sequence number on REQUEST_CLOSE. I
don't see that the sequence number has any value on that call, as it's
an indicator that the client is finished with the session, and it's
never going to change its mind and do something different if the
sequence is wrong. I have a PR for that here:

    https://github.com/ceph/ceph/pull/37619

2/ fix the clients to not wait on the REQUEST_CLOSE reply. As soon as
the call is sent, tear down the session and proceed with unmounting. The
client doesn't really care what the MDS has to say after that point, so
we may as well not wait on it before proceeding. 

Thoughts?
Xiubo Li Oct. 12, 2020, 12:41 p.m. UTC | #6
On 2020/10/12 19:52, Jeff Layton wrote:
> On Mon, 2020-10-12 at 14:52 +0800, Xiubo Li wrote:
>> On 2020/10/11 2:49, Ilya Dryomov wrote:
>>> On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>> On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
>>>>> On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
>>>>>> Patrick reported a case where the MDS and client client had racing
>>>>>> session messages to one anothe. The MDS was sending caps to the client
>>>>>> and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
>>>>>> to unmount.
>>>>>>
>>>>>> Because they were sending at the same time, the REQUEST_CLOSE had too
>>>>>> old a sequence number, and the MDS dropped it on the floor. On the
>>>>>> client, this would have probably manifested as a 60s hang during umount.
>>>>>> The MDS ended up blocklisting the client.
>>>>>>
>>>>>> Once we've decided to issue a REQUEST_CLOSE, we're finished with the
>>>>>> session, so just keep sending them until the MDS acknowledges that.
>>>>>>
>>>>>> Change the code to retransmit a REQUEST_CLOSE every second if the
>>>>>> session hasn't changed state yet. Give up and throw a warning after
>>>>>> mount_timeout elapses if we haven't gotten a response.
>>>>>>
>>>>>> URL: https://tracker.ceph.com/issues/47563
>>>>>> Reported-by: Patrick Donnelly <pdonnell@redhat.com>
>>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>>>> ---
>>>>>>    fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
>>>>>>    1 file changed, 32 insertions(+), 21 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>> index b07e7adf146f..d9cb74e3d5e3 100644
>>>>>> --- a/fs/ceph/mds_client.c
>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>> @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
>>>>>>    static int __close_session(struct ceph_mds_client *mdsc,
>>>>>>                            struct ceph_mds_session *session)
>>>>>>    {
>>>>>> -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
>>>>>> +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
>>>>>>                   return 0;
>>>>>>           session->s_state = CEPH_MDS_SESSION_CLOSING;
>>>>>>           return request_close_session(session);
>>>>>> @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
>>>>>>           return atomic_read(&mdsc->num_sessions) <= skipped;
>>>>>>    }
>>>>>>
>>>>>> +static bool umount_timed_out(unsigned long timeo)
>>>>>> +{
>>>>>> +       if (time_before(jiffies, timeo))
>>>>>> +               return false;
>>>>>> +       pr_warn("ceph: unable to close all sessions\n");
>>>>>> +       return true;
>>>>>> +}
>>>>>> +
>>>>>>    /*
>>>>>>     * called after sb is ro.
>>>>>>     */
>>>>>>    void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>>>>>>    {
>>>>>> -       struct ceph_options *opts = mdsc->fsc->client->options;
>>>>>>           struct ceph_mds_session *session;
>>>>>> -       int i;
>>>>>> -       int skipped = 0;
>>>>>> +       int i, ret;
>>>>>> +       int skipped;
>>>>>> +       unsigned long timeo = jiffies +
>>>>>> +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
>>>>>>
>>>>>>           dout("close_sessions\n");
>>>>>>
>>>>>>           /* close sessions */
>>>>>> -       mutex_lock(&mdsc->mutex);
>>>>>> -       for (i = 0; i < mdsc->max_sessions; i++) {
>>>>>> -               session = __ceph_lookup_mds_session(mdsc, i);
>>>>>> -               if (!session)
>>>>>> -                       continue;
>>>>>> -               mutex_unlock(&mdsc->mutex);
>>>>>> -               mutex_lock(&session->s_mutex);
>>>>>> -               if (__close_session(mdsc, session) <= 0)
>>>>>> -                       skipped++;
>>>>>> -               mutex_unlock(&session->s_mutex);
>>>>>> -               ceph_put_mds_session(session);
>>>>>> +       do {
>>>>>> +               skipped = 0;
>>>>>>                   mutex_lock(&mdsc->mutex);
>>>>>> -       }
>>>>>> -       mutex_unlock(&mdsc->mutex);
>>>>>> +               for (i = 0; i < mdsc->max_sessions; i++) {
>>>>>> +                       session = __ceph_lookup_mds_session(mdsc, i);
>>>>>> +                       if (!session)
>>>>>> +                               continue;
>>>>>> +                       mutex_unlock(&mdsc->mutex);
>>>>>> +                       mutex_lock(&session->s_mutex);
>>>>>> +                       if (__close_session(mdsc, session) <= 0)
>>>>>> +                               skipped++;
>>>>>> +                       mutex_unlock(&session->s_mutex);
>>>>>> +                       ceph_put_mds_session(session);
>>>>>> +                       mutex_lock(&mdsc->mutex);
>>>>>> +               }
>>>>>> +               mutex_unlock(&mdsc->mutex);
>>>>>>
>>>>>> -       dout("waiting for sessions to close\n");
>>>>>> -       wait_event_timeout(mdsc->session_close_wq,
>>>>>> -                          done_closing_sessions(mdsc, skipped),
>>>>>> -                          ceph_timeout_jiffies(opts->mount_timeout));
>>>>>> +               dout("waiting for sessions to close\n");
>>>>>> +               ret = wait_event_timeout(mdsc->session_close_wq,
>>>>>> +                                        done_closing_sessions(mdsc, skipped), HZ);
>>>>>> +       } while (!ret && !umount_timed_out(timeo));
>>>>>>
>>>>>>           /* tear down remaining sessions */
>>>>>>           mutex_lock(&mdsc->mutex);
>>>>>> --
>>>>>> 2.26.2
>>>>>>
>>>>> Hi Jeff,
>>>>>
>>>>> This seems wrong to me, at least conceptually.  Is the same patch
>>>>> getting applied to ceph-fuse?
>>>>>
>>>> It's a grotesque workaround, I will grant you. I'm not sure what we want
>>>> to do for ceph-fuse yet but it does seem to have the same issue.
>>>> Probably, we should plan to do a similar fix there once we settle on the
>>>> right approach.
>>>>
>>>>> Pretending to not know anything about the client <-> MDS protocol,
>>>>> two questions immediately come to mind.  Why is MDS allowed to drop
>>>>> REQUEST_CLOSE?
>>>> It really seems like a protocol design flaw.
>>>>
>>>> IIUC, the idea overall with the low-level ceph protocol seems to be that
>>>> the client should retransmit (or reevaluate, in the case of caps) calls
>>>> that were in flight when the seq number changes.
>>>>
>>>> The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
>>>> but it doesn't really make a lot of sense for that, IMO.
>>> (edit of my reply to https://github.com/ceph/ceph/pull/37619)
>>>
>>> After taking a look at the MDS code, it really seemed like it
>>> had been written with the expectation that REQUEST_CLOSE would be
>>> resent, so I dug around.  I don't fully understand these "push"
>>> sequence numbers yet, but there is probably some race that requires
>>> the client to confirm that it saw the sequence number, even if the
>>> session is about to go.  Sage is probably the only one who might
>>> remember at this point.
>>>
>>> The kernel client already has the code to retry REQUEST_CLOSE, only
>>> every five seconds instead every second.  See check_session_state()
>>> which is called from delayed_work() in mds_client.c.  It looks like
>>> it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
>>> mdsc use-after-free crash") which conditioned delayed_work() on
>>> mdsc->stopping -- hence the misbehaviour.
>> Without this commit it will hit this issue too. The umount old code will
>> try to close sessions asynchronously, and then tries to cancel the
>> delayed work, during which the last queued delayed_work() timer might be
>> fired. This commit makes it easier to be reproduced.
>>
> Fixing the potential races to ensure that this is retransmitted is an
> option, but I'm not sure it's the best one. Here's what I think we
> probably ought to do:
>
> 1/ fix the MDS to just ignore the sequence number on REQUEST_CLOSE. I
> don't see that the sequence number has any value on that call, as it's
> an indicator that the client is finished with the session, and it's
> never going to change its mind and do something different if the
> sequence is wrong. I have a PR for that here:
>
>      https://github.com/ceph/ceph/pull/37619
>
> 2/ fix the clients to not wait on the REQUEST_CLOSE reply. As soon as
> the call is sent, tear down the session and proceed with unmounting. The
> client doesn't really care what the MDS has to say after that point, so
> we may as well not wait on it before proceeding.
>
> Thoughts?

I am thinking possibly we can just check the session's state when the 
client receives a request from MDS which will increase the s_seq number, 
if the session is in CLOSING state, the client needs to resend the 
REQUEST_CLOSE request again.

Thanks.

Xiubo
Ilya Dryomov Oct. 12, 2020, 1:16 p.m. UTC | #7
On Mon, Oct 12, 2020 at 2:41 PM Xiubo Li <xiubli@redhat.com> wrote:
>
> On 2020/10/12 19:52, Jeff Layton wrote:
> > On Mon, 2020-10-12 at 14:52 +0800, Xiubo Li wrote:
> >> On 2020/10/11 2:49, Ilya Dryomov wrote:
> >>> On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
> >>>> On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
> >>>>> On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
> >>>>>> Patrick reported a case where the MDS and client client had racing
> >>>>>> session messages to one anothe. The MDS was sending caps to the client
> >>>>>> and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
> >>>>>> to unmount.
> >>>>>>
> >>>>>> Because they were sending at the same time, the REQUEST_CLOSE had too
> >>>>>> old a sequence number, and the MDS dropped it on the floor. On the
> >>>>>> client, this would have probably manifested as a 60s hang during umount.
> >>>>>> The MDS ended up blocklisting the client.
> >>>>>>
> >>>>>> Once we've decided to issue a REQUEST_CLOSE, we're finished with the
> >>>>>> session, so just keep sending them until the MDS acknowledges that.
> >>>>>>
> >>>>>> Change the code to retransmit a REQUEST_CLOSE every second if the
> >>>>>> session hasn't changed state yet. Give up and throw a warning after
> >>>>>> mount_timeout elapses if we haven't gotten a response.
> >>>>>>
> >>>>>> URL: https://tracker.ceph.com/issues/47563
> >>>>>> Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> >>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> >>>>>> ---
> >>>>>>    fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
> >>>>>>    1 file changed, 32 insertions(+), 21 deletions(-)
> >>>>>>
> >>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >>>>>> index b07e7adf146f..d9cb74e3d5e3 100644
> >>>>>> --- a/fs/ceph/mds_client.c
> >>>>>> +++ b/fs/ceph/mds_client.c
> >>>>>> @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
> >>>>>>    static int __close_session(struct ceph_mds_client *mdsc,
> >>>>>>                            struct ceph_mds_session *session)
> >>>>>>    {
> >>>>>> -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
> >>>>>> +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
> >>>>>>                   return 0;
> >>>>>>           session->s_state = CEPH_MDS_SESSION_CLOSING;
> >>>>>>           return request_close_session(session);
> >>>>>> @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
> >>>>>>           return atomic_read(&mdsc->num_sessions) <= skipped;
> >>>>>>    }
> >>>>>>
> >>>>>> +static bool umount_timed_out(unsigned long timeo)
> >>>>>> +{
> >>>>>> +       if (time_before(jiffies, timeo))
> >>>>>> +               return false;
> >>>>>> +       pr_warn("ceph: unable to close all sessions\n");
> >>>>>> +       return true;
> >>>>>> +}
> >>>>>> +
> >>>>>>    /*
> >>>>>>     * called after sb is ro.
> >>>>>>     */
> >>>>>>    void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> >>>>>>    {
> >>>>>> -       struct ceph_options *opts = mdsc->fsc->client->options;
> >>>>>>           struct ceph_mds_session *session;
> >>>>>> -       int i;
> >>>>>> -       int skipped = 0;
> >>>>>> +       int i, ret;
> >>>>>> +       int skipped;
> >>>>>> +       unsigned long timeo = jiffies +
> >>>>>> +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
> >>>>>>
> >>>>>>           dout("close_sessions\n");
> >>>>>>
> >>>>>>           /* close sessions */
> >>>>>> -       mutex_lock(&mdsc->mutex);
> >>>>>> -       for (i = 0; i < mdsc->max_sessions; i++) {
> >>>>>> -               session = __ceph_lookup_mds_session(mdsc, i);
> >>>>>> -               if (!session)
> >>>>>> -                       continue;
> >>>>>> -               mutex_unlock(&mdsc->mutex);
> >>>>>> -               mutex_lock(&session->s_mutex);
> >>>>>> -               if (__close_session(mdsc, session) <= 0)
> >>>>>> -                       skipped++;
> >>>>>> -               mutex_unlock(&session->s_mutex);
> >>>>>> -               ceph_put_mds_session(session);
> >>>>>> +       do {
> >>>>>> +               skipped = 0;
> >>>>>>                   mutex_lock(&mdsc->mutex);
> >>>>>> -       }
> >>>>>> -       mutex_unlock(&mdsc->mutex);
> >>>>>> +               for (i = 0; i < mdsc->max_sessions; i++) {
> >>>>>> +                       session = __ceph_lookup_mds_session(mdsc, i);
> >>>>>> +                       if (!session)
> >>>>>> +                               continue;
> >>>>>> +                       mutex_unlock(&mdsc->mutex);
> >>>>>> +                       mutex_lock(&session->s_mutex);
> >>>>>> +                       if (__close_session(mdsc, session) <= 0)
> >>>>>> +                               skipped++;
> >>>>>> +                       mutex_unlock(&session->s_mutex);
> >>>>>> +                       ceph_put_mds_session(session);
> >>>>>> +                       mutex_lock(&mdsc->mutex);
> >>>>>> +               }
> >>>>>> +               mutex_unlock(&mdsc->mutex);
> >>>>>>
> >>>>>> -       dout("waiting for sessions to close\n");
> >>>>>> -       wait_event_timeout(mdsc->session_close_wq,
> >>>>>> -                          done_closing_sessions(mdsc, skipped),
> >>>>>> -                          ceph_timeout_jiffies(opts->mount_timeout));
> >>>>>> +               dout("waiting for sessions to close\n");
> >>>>>> +               ret = wait_event_timeout(mdsc->session_close_wq,
> >>>>>> +                                        done_closing_sessions(mdsc, skipped), HZ);
> >>>>>> +       } while (!ret && !umount_timed_out(timeo));
> >>>>>>
> >>>>>>           /* tear down remaining sessions */
> >>>>>>           mutex_lock(&mdsc->mutex);
> >>>>>> --
> >>>>>> 2.26.2
> >>>>>>
> >>>>> Hi Jeff,
> >>>>>
> >>>>> This seems wrong to me, at least conceptually.  Is the same patch
> >>>>> getting applied to ceph-fuse?
> >>>>>
> >>>> It's a grotesque workaround, I will grant you. I'm not sure what we want
> >>>> to do for ceph-fuse yet but it does seem to have the same issue.
> >>>> Probably, we should plan to do a similar fix there once we settle on the
> >>>> right approach.
> >>>>
> >>>>> Pretending to not know anything about the client <-> MDS protocol,
> >>>>> two questions immediately come to mind.  Why is MDS allowed to drop
> >>>>> REQUEST_CLOSE?
> >>>> It really seems like a protocol design flaw.
> >>>>
> >>>> IIUC, the idea overall with the low-level ceph protocol seems to be that
> >>>> the client should retransmit (or reevaluate, in the case of caps) calls
> >>>> that were in flight when the seq number changes.
> >>>>
> >>>> The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
> >>>> but it doesn't really make a lot of sense for that, IMO.
> >>> (edit of my reply to https://github.com/ceph/ceph/pull/37619)
> >>>
> >>> After taking a look at the MDS code, it really seemed like it
> >>> had been written with the expectation that REQUEST_CLOSE would be
> >>> resent, so I dug around.  I don't fully understand these "push"
> >>> sequence numbers yet, but there is probably some race that requires
> >>> the client to confirm that it saw the sequence number, even if the
> >>> session is about to go.  Sage is probably the only one who might
> >>> remember at this point.
> >>>
> >>> The kernel client already has the code to retry REQUEST_CLOSE, only
> >>> every five seconds instead every second.  See check_session_state()
> >>> which is called from delayed_work() in mds_client.c.  It looks like
> >>> it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
> >>> mdsc use-after-free crash") which conditioned delayed_work() on
> >>> mdsc->stopping -- hence the misbehaviour.
> >> Without this commit it will hit this issue too. The umount old code will
> >> try to close sessions asynchronously, and then tries to cancel the
> >> delayed work, during which the last queued delayed_work() timer might be
> >> fired. This commit makes it easier to be reproduced.
> >>
> > Fixing the potential races to ensure that this is retransmitted is an
> > option, but I'm not sure it's the best one. Here's what I think we
> > probably ought to do:
> >
> > 1/ fix the MDS to just ignore the sequence number on REQUEST_CLOSE. I
> > don't see that the sequence number has any value on that call, as it's
> > an indicator that the client is finished with the session, and it's
> > never going to change its mind and do something different if the
> > sequence is wrong. I have a PR for that here:
> >
> >      https://github.com/ceph/ceph/pull/37619
> >
> > 2/ fix the clients to not wait on the REQUEST_CLOSE reply. As soon as
> > the call is sent, tear down the session and proceed with unmounting. The
> > client doesn't really care what the MDS has to say after that point, so
> > we may as well not wait on it before proceeding.
> >
> > Thoughts?
>
> I am thinking possibly we can just check the session's state when the
> client receives a request from MDS which will increase the s_seq number,
> if the session is in CLOSING state, the client needs to resend the
> REQUEST_CLOSE request again.

This is what ceph-fuse does, so I believe it is a viable approach.

As for changing the MDS to ignore the sequence number and changing
the clients to not wait for the reply, I don't think so.  It would
be a protocol change (more or less) and I don't think it should be
done until everyone has a clear understanding of the races that
the sequence number is supposed to protect against.  That might
mean waiting for Sage to chime in...

Both clients have the code to resend REQUEST_CLOSE, added over
ten years ago specifically to handle the case of MDS dropping the
original message.  Perhaps it's bogus at this point, but I'd be
wary of ignoring it without having the full history.

Thanks,

                Ilya
Jeff Layton Oct. 12, 2020, 1:17 p.m. UTC | #8
On Mon, 2020-10-12 at 20:41 +0800, Xiubo Li wrote:
> On 2020/10/12 19:52, Jeff Layton wrote:
> > On Mon, 2020-10-12 at 14:52 +0800, Xiubo Li wrote:
> > > On 2020/10/11 2:49, Ilya Dryomov wrote:
> > > > On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
> > > > > > On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > Patrick reported a case where the MDS and client client had racing
> > > > > > > session messages to one anothe. The MDS was sending caps to the client
> > > > > > > and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
> > > > > > > to unmount.
> > > > > > > 
> > > > > > > Because they were sending at the same time, the REQUEST_CLOSE had too
> > > > > > > old a sequence number, and the MDS dropped it on the floor. On the
> > > > > > > client, this would have probably manifested as a 60s hang during umount.
> > > > > > > The MDS ended up blocklisting the client.
> > > > > > > 
> > > > > > > Once we've decided to issue a REQUEST_CLOSE, we're finished with the
> > > > > > > session, so just keep sending them until the MDS acknowledges that.
> > > > > > > 
> > > > > > > Change the code to retransmit a REQUEST_CLOSE every second if the
> > > > > > > session hasn't changed state yet. Give up and throw a warning after
> > > > > > > mount_timeout elapses if we haven't gotten a response.
> > > > > > > 
> > > > > > > URL: https://tracker.ceph.com/issues/47563
> > > > > > > Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > ---
> > > > > > >    fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
> > > > > > >    1 file changed, 32 insertions(+), 21 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > index b07e7adf146f..d9cb74e3d5e3 100644
> > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
> > > > > > >    static int __close_session(struct ceph_mds_client *mdsc,
> > > > > > >                            struct ceph_mds_session *session)
> > > > > > >    {
> > > > > > > -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
> > > > > > > +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
> > > > > > >                   return 0;
> > > > > > >           session->s_state = CEPH_MDS_SESSION_CLOSING;
> > > > > > >           return request_close_session(session);
> > > > > > > @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
> > > > > > >           return atomic_read(&mdsc->num_sessions) <= skipped;
> > > > > > >    }
> > > > > > > 
> > > > > > > +static bool umount_timed_out(unsigned long timeo)
> > > > > > > +{
> > > > > > > +       if (time_before(jiffies, timeo))
> > > > > > > +               return false;
> > > > > > > +       pr_warn("ceph: unable to close all sessions\n");
> > > > > > > +       return true;
> > > > > > > +}
> > > > > > > +
> > > > > > >    /*
> > > > > > >     * called after sb is ro.
> > > > > > >     */
> > > > > > >    void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> > > > > > >    {
> > > > > > > -       struct ceph_options *opts = mdsc->fsc->client->options;
> > > > > > >           struct ceph_mds_session *session;
> > > > > > > -       int i;
> > > > > > > -       int skipped = 0;
> > > > > > > +       int i, ret;
> > > > > > > +       int skipped;
> > > > > > > +       unsigned long timeo = jiffies +
> > > > > > > +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
> > > > > > > 
> > > > > > >           dout("close_sessions\n");
> > > > > > > 
> > > > > > >           /* close sessions */
> > > > > > > -       mutex_lock(&mdsc->mutex);
> > > > > > > -       for (i = 0; i < mdsc->max_sessions; i++) {
> > > > > > > -               session = __ceph_lookup_mds_session(mdsc, i);
> > > > > > > -               if (!session)
> > > > > > > -                       continue;
> > > > > > > -               mutex_unlock(&mdsc->mutex);
> > > > > > > -               mutex_lock(&session->s_mutex);
> > > > > > > -               if (__close_session(mdsc, session) <= 0)
> > > > > > > -                       skipped++;
> > > > > > > -               mutex_unlock(&session->s_mutex);
> > > > > > > -               ceph_put_mds_session(session);
> > > > > > > +       do {
> > > > > > > +               skipped = 0;
> > > > > > >                   mutex_lock(&mdsc->mutex);
> > > > > > > -       }
> > > > > > > -       mutex_unlock(&mdsc->mutex);
> > > > > > > +               for (i = 0; i < mdsc->max_sessions; i++) {
> > > > > > > +                       session = __ceph_lookup_mds_session(mdsc, i);
> > > > > > > +                       if (!session)
> > > > > > > +                               continue;
> > > > > > > +                       mutex_unlock(&mdsc->mutex);
> > > > > > > +                       mutex_lock(&session->s_mutex);
> > > > > > > +                       if (__close_session(mdsc, session) <= 0)
> > > > > > > +                               skipped++;
> > > > > > > +                       mutex_unlock(&session->s_mutex);
> > > > > > > +                       ceph_put_mds_session(session);
> > > > > > > +                       mutex_lock(&mdsc->mutex);
> > > > > > > +               }
> > > > > > > +               mutex_unlock(&mdsc->mutex);
> > > > > > > 
> > > > > > > -       dout("waiting for sessions to close\n");
> > > > > > > -       wait_event_timeout(mdsc->session_close_wq,
> > > > > > > -                          done_closing_sessions(mdsc, skipped),
> > > > > > > -                          ceph_timeout_jiffies(opts->mount_timeout));
> > > > > > > +               dout("waiting for sessions to close\n");
> > > > > > > +               ret = wait_event_timeout(mdsc->session_close_wq,
> > > > > > > +                                        done_closing_sessions(mdsc, skipped), HZ);
> > > > > > > +       } while (!ret && !umount_timed_out(timeo));
> > > > > > > 
> > > > > > >           /* tear down remaining sessions */
> > > > > > >           mutex_lock(&mdsc->mutex);
> > > > > > > --
> > > > > > > 2.26.2
> > > > > > > 
> > > > > > Hi Jeff,
> > > > > > 
> > > > > > This seems wrong to me, at least conceptually.  Is the same patch
> > > > > > getting applied to ceph-fuse?
> > > > > > 
> > > > > It's a grotesque workaround, I will grant you. I'm not sure what we want
> > > > > to do for ceph-fuse yet but it does seem to have the same issue.
> > > > > Probably, we should plan to do a similar fix there once we settle on the
> > > > > right approach.
> > > > > 
> > > > > > Pretending to not know anything about the client <-> MDS protocol,
> > > > > > two questions immediately come to mind.  Why is MDS allowed to drop
> > > > > > REQUEST_CLOSE?
> > > > > It really seems like a protocol design flaw.
> > > > > 
> > > > > IIUC, the idea overall with the low-level ceph protocol seems to be that
> > > > > the client should retransmit (or reevaluate, in the case of caps) calls
> > > > > that were in flight when the seq number changes.
> > > > > 
> > > > > The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
> > > > > but it doesn't really make a lot of sense for that, IMO.
> > > > (edit of my reply to https://github.com/ceph/ceph/pull/37619)
> > > > 
> > > > After taking a look at the MDS code, it really seemed like it
> > > > had been written with the expectation that REQUEST_CLOSE would be
> > > > resent, so I dug around.  I don't fully understand these "push"
> > > > sequence numbers yet, but there is probably some race that requires
> > > > the client to confirm that it saw the sequence number, even if the
> > > > session is about to go.  Sage is probably the only one who might
> > > > remember at this point.
> > > > 
> > > > The kernel client already has the code to retry REQUEST_CLOSE, only
> > > > every five seconds instead every second.  See check_session_state()
> > > > which is called from delayed_work() in mds_client.c.  It looks like
> > > > it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
> > > > mdsc use-after-free crash") which conditioned delayed_work() on
> > > > mdsc->stopping -- hence the misbehaviour.
> > > Without this commit it will hit this issue too. The umount old code will
> > > try to close sessions asynchronously, and then tries to cancel the
> > > delayed work, during which the last queued delayed_work() timer might be
> > > fired. This commit makes it easier to be reproduced.
> > > 
> > Fixing the potential races to ensure that this is retransmitted is an
> > option, but I'm not sure it's the best one. Here's what I think we
> > probably ought to do:
> > 
> > 1/ fix the MDS to just ignore the sequence number on REQUEST_CLOSE. I
> > don't see that the sequence number has any value on that call, as it's
> > an indicator that the client is finished with the session, and it's
> > never going to change its mind and do something different if the
> > sequence is wrong. I have a PR for that here:
> > 
> >      https://github.com/ceph/ceph/pull/37619
> > 
> > 2/ fix the clients to not wait on the REQUEST_CLOSE reply. As soon as
> > the call is sent, tear down the session and proceed with unmounting. The
> > client doesn't really care what the MDS has to say after that point, so
> > we may as well not wait on it before proceeding.
> > 
> > Thoughts?
> 
> I am thinking possibly we can just check the session's state when the 
> client receives a request from MDS which will increase the s_seq number, 
> if the session is in CLOSING state, the client needs to resend the 
> REQUEST_CLOSE request again.
> 

That could be done, but that means adding extra complexity to the
session handling code, which could really stand to be simplified
instead.

mdsc->stopping and session->s_state seem to be protected by the
mdsc->mutex, but session->s_seq is protected by the session->s_mutex.

There are 4 types of messages that increment the s_seq -- caps, leases,
quotas and snaps. All of those would need to be changed to check for and
retransmit REQUEST_CLOSE if one is outstanding.

So yeah, that could be done on the client side. If we were to do that,
should we couple it with the MDS side fix to make it ignore the seq on
REQUEST_CLOSE?
Xiubo Li Oct. 12, 2020, 1:31 p.m. UTC | #9
On 2020/10/12 21:17, Jeff Layton wrote:
> On Mon, 2020-10-12 at 20:41 +0800, Xiubo Li wrote:
>> On 2020/10/12 19:52, Jeff Layton wrote:
>>> On Mon, 2020-10-12 at 14:52 +0800, Xiubo Li wrote:
>>>> On 2020/10/11 2:49, Ilya Dryomov wrote:
>>>>> On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>>>> On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
>>>>>>> On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
>>>>>>>> Patrick reported a case where the MDS and client client had racing
>>>>>>>> session messages to one anothe. The MDS was sending caps to the client
>>>>>>>> and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
>>>>>>>> to unmount.
>>>>>>>>
>>>>>>>> Because they were sending at the same time, the REQUEST_CLOSE had too
>>>>>>>> old a sequence number, and the MDS dropped it on the floor. On the
>>>>>>>> client, this would have probably manifested as a 60s hang during umount.
>>>>>>>> The MDS ended up blocklisting the client.
>>>>>>>>
>>>>>>>> Once we've decided to issue a REQUEST_CLOSE, we're finished with the
>>>>>>>> session, so just keep sending them until the MDS acknowledges that.
>>>>>>>>
>>>>>>>> Change the code to retransmit a REQUEST_CLOSE every second if the
>>>>>>>> session hasn't changed state yet. Give up and throw a warning after
>>>>>>>> mount_timeout elapses if we haven't gotten a response.
>>>>>>>>
>>>>>>>> URL: https://tracker.ceph.com/issues/47563
>>>>>>>> Reported-by: Patrick Donnelly <pdonnell@redhat.com>
>>>>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>>>>>> ---
>>>>>>>>     fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
>>>>>>>>     1 file changed, 32 insertions(+), 21 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>>>> index b07e7adf146f..d9cb74e3d5e3 100644
>>>>>>>> --- a/fs/ceph/mds_client.c
>>>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>>>> @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
>>>>>>>>     static int __close_session(struct ceph_mds_client *mdsc,
>>>>>>>>                             struct ceph_mds_session *session)
>>>>>>>>     {
>>>>>>>> -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
>>>>>>>> +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
>>>>>>>>                    return 0;
>>>>>>>>            session->s_state = CEPH_MDS_SESSION_CLOSING;
>>>>>>>>            return request_close_session(session);
>>>>>>>> @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
>>>>>>>>            return atomic_read(&mdsc->num_sessions) <= skipped;
>>>>>>>>     }
>>>>>>>>
>>>>>>>> +static bool umount_timed_out(unsigned long timeo)
>>>>>>>> +{
>>>>>>>> +       if (time_before(jiffies, timeo))
>>>>>>>> +               return false;
>>>>>>>> +       pr_warn("ceph: unable to close all sessions\n");
>>>>>>>> +       return true;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>     /*
>>>>>>>>      * called after sb is ro.
>>>>>>>>      */
>>>>>>>>     void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>>>>>>>>     {
>>>>>>>> -       struct ceph_options *opts = mdsc->fsc->client->options;
>>>>>>>>            struct ceph_mds_session *session;
>>>>>>>> -       int i;
>>>>>>>> -       int skipped = 0;
>>>>>>>> +       int i, ret;
>>>>>>>> +       int skipped;
>>>>>>>> +       unsigned long timeo = jiffies +
>>>>>>>> +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
>>>>>>>>
>>>>>>>>            dout("close_sessions\n");
>>>>>>>>
>>>>>>>>            /* close sessions */
>>>>>>>> -       mutex_lock(&mdsc->mutex);
>>>>>>>> -       for (i = 0; i < mdsc->max_sessions; i++) {
>>>>>>>> -               session = __ceph_lookup_mds_session(mdsc, i);
>>>>>>>> -               if (!session)
>>>>>>>> -                       continue;
>>>>>>>> -               mutex_unlock(&mdsc->mutex);
>>>>>>>> -               mutex_lock(&session->s_mutex);
>>>>>>>> -               if (__close_session(mdsc, session) <= 0)
>>>>>>>> -                       skipped++;
>>>>>>>> -               mutex_unlock(&session->s_mutex);
>>>>>>>> -               ceph_put_mds_session(session);
>>>>>>>> +       do {
>>>>>>>> +               skipped = 0;
>>>>>>>>                    mutex_lock(&mdsc->mutex);
>>>>>>>> -       }
>>>>>>>> -       mutex_unlock(&mdsc->mutex);
>>>>>>>> +               for (i = 0; i < mdsc->max_sessions; i++) {
>>>>>>>> +                       session = __ceph_lookup_mds_session(mdsc, i);
>>>>>>>> +                       if (!session)
>>>>>>>> +                               continue;
>>>>>>>> +                       mutex_unlock(&mdsc->mutex);
>>>>>>>> +                       mutex_lock(&session->s_mutex);
>>>>>>>> +                       if (__close_session(mdsc, session) <= 0)
>>>>>>>> +                               skipped++;
>>>>>>>> +                       mutex_unlock(&session->s_mutex);
>>>>>>>> +                       ceph_put_mds_session(session);
>>>>>>>> +                       mutex_lock(&mdsc->mutex);
>>>>>>>> +               }
>>>>>>>> +               mutex_unlock(&mdsc->mutex);
>>>>>>>>
>>>>>>>> -       dout("waiting for sessions to close\n");
>>>>>>>> -       wait_event_timeout(mdsc->session_close_wq,
>>>>>>>> -                          done_closing_sessions(mdsc, skipped),
>>>>>>>> -                          ceph_timeout_jiffies(opts->mount_timeout));
>>>>>>>> +               dout("waiting for sessions to close\n");
>>>>>>>> +               ret = wait_event_timeout(mdsc->session_close_wq,
>>>>>>>> +                                        done_closing_sessions(mdsc, skipped), HZ);
>>>>>>>> +       } while (!ret && !umount_timed_out(timeo));
>>>>>>>>
>>>>>>>>            /* tear down remaining sessions */
>>>>>>>>            mutex_lock(&mdsc->mutex);
>>>>>>>> --
>>>>>>>> 2.26.2
>>>>>>>>
>>>>>>> Hi Jeff,
>>>>>>>
>>>>>>> This seems wrong to me, at least conceptually.  Is the same patch
>>>>>>> getting applied to ceph-fuse?
>>>>>>>
>>>>>> It's a grotesque workaround, I will grant you. I'm not sure what we want
>>>>>> to do for ceph-fuse yet but it does seem to have the same issue.
>>>>>> Probably, we should plan to do a similar fix there once we settle on the
>>>>>> right approach.
>>>>>>
>>>>>>> Pretending to not know anything about the client <-> MDS protocol,
>>>>>>> two questions immediately come to mind.  Why is MDS allowed to drop
>>>>>>> REQUEST_CLOSE?
>>>>>> It really seems like a protocol design flaw.
>>>>>>
>>>>>> IIUC, the idea overall with the low-level ceph protocol seems to be that
>>>>>> the client should retransmit (or reevaluate, in the case of caps) calls
>>>>>> that were in flight when the seq number changes.
>>>>>>
>>>>>> The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
>>>>>> but it doesn't really make a lot of sense for that, IMO.
>>>>> (edit of my reply to https://github.com/ceph/ceph/pull/37619)
>>>>>
>>>>> After taking a look at the MDS code, it really seemed like it
>>>>> had been written with the expectation that REQUEST_CLOSE would be
>>>>> resent, so I dug around.  I don't fully understand these "push"
>>>>> sequence numbers yet, but there is probably some race that requires
>>>>> the client to confirm that it saw the sequence number, even if the
>>>>> session is about to go.  Sage is probably the only one who might
>>>>> remember at this point.
>>>>>
>>>>> The kernel client already has the code to retry REQUEST_CLOSE, only
>>>>> every five seconds instead every second.  See check_session_state()
>>>>> which is called from delayed_work() in mds_client.c.  It looks like
>>>>> it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
>>>>> mdsc use-after-free crash") which conditioned delayed_work() on
>>>>> mdsc->stopping -- hence the misbehaviour.
>>>> Without this commit it will hit this issue too. The umount old code will
>>>> try to close sessions asynchronously, and then tries to cancel the
>>>> delayed work, during which the last queued delayed_work() timer might be
>>>> fired. This commit makes it easier to be reproduced.
>>>>
>>> Fixing the potential races to ensure that this is retransmitted is an
>>> option, but I'm not sure it's the best one. Here's what I think we
>>> probably ought to do:
>>>
>>> 1/ fix the MDS to just ignore the sequence number on REQUEST_CLOSE. I
>>> don't see that the sequence number has any value on that call, as it's
>>> an indicator that the client is finished with the session, and it's
>>> never going to change its mind and do something different if the
>>> sequence is wrong. I have a PR for that here:
>>>
>>>       https://github.com/ceph/ceph/pull/37619
>>>
>>> 2/ fix the clients to not wait on the REQUEST_CLOSE reply. As soon as
>>> the call is sent, tear down the session and proceed with unmounting. The
>>> client doesn't really care what the MDS has to say after that point, so
>>> we may as well not wait on it before proceeding.
>>>
>>> Thoughts?
>> I am thinking possibly we can just check the session's state when the
>> client receives a request from MDS which will increase the s_seq number,
>> if the session is in CLOSING state, the client needs to resend the
>> REQUEST_CLOSE request again.
>>
> That could be done, but that means adding extra complexity to the
> session handling code, which could really stand to be simplified
> instead.
>
> mdsc->stopping and session->s_state seem to be protected by the
> mdsc->mutex, but session->s_seq is protected by the session->s_mutex.
>
> There are 4 types of messages that increment the s_seq -- caps, leases,
> quotas and snaps. All of those would need to be changed to check for and
> retransmit REQUEST_CLOSE if one is outstanding.

How about deferring resending the CLOSE request in the above case ?


> So yeah, that could be done on the client side. If we were to do that,
> should we couple it with the MDS side fix to make it ignore the seq on
> REQUEST_CLOSE?
Jeff Layton Oct. 12, 2020, 1:49 p.m. UTC | #10
On Mon, 2020-10-12 at 21:31 +0800, Xiubo Li wrote:
> On 2020/10/12 21:17, Jeff Layton wrote:
> > On Mon, 2020-10-12 at 20:41 +0800, Xiubo Li wrote:
> > > On 2020/10/12 19:52, Jeff Layton wrote:
> > > > On Mon, 2020-10-12 at 14:52 +0800, Xiubo Li wrote:
> > > > > On 2020/10/11 2:49, Ilya Dryomov wrote:
> > > > > > On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
> > > > > > > > On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > Patrick reported a case where the MDS and client client had racing
> > > > > > > > > session messages to one anothe. The MDS was sending caps to the client
> > > > > > > > > and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
> > > > > > > > > to unmount.
> > > > > > > > > 
> > > > > > > > > Because they were sending at the same time, the REQUEST_CLOSE had too
> > > > > > > > > old a sequence number, and the MDS dropped it on the floor. On the
> > > > > > > > > client, this would have probably manifested as a 60s hang during umount.
> > > > > > > > > The MDS ended up blocklisting the client.
> > > > > > > > > 
> > > > > > > > > Once we've decided to issue a REQUEST_CLOSE, we're finished with the
> > > > > > > > > session, so just keep sending them until the MDS acknowledges that.
> > > > > > > > > 
> > > > > > > > > Change the code to retransmit a REQUEST_CLOSE every second if the
> > > > > > > > > session hasn't changed state yet. Give up and throw a warning after
> > > > > > > > > mount_timeout elapses if we haven't gotten a response.
> > > > > > > > > 
> > > > > > > > > URL: https://tracker.ceph.com/issues/47563
> > > > > > > > > Reported-by: Patrick Donnelly <pdonnell@redhat.com>
> > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > ---
> > > > > > > > >     fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
> > > > > > > > >     1 file changed, 32 insertions(+), 21 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > > > index b07e7adf146f..d9cb74e3d5e3 100644
> > > > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > > > @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
> > > > > > > > >     static int __close_session(struct ceph_mds_client *mdsc,
> > > > > > > > >                             struct ceph_mds_session *session)
> > > > > > > > >     {
> > > > > > > > > -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
> > > > > > > > > +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
> > > > > > > > >                    return 0;
> > > > > > > > >            session->s_state = CEPH_MDS_SESSION_CLOSING;
> > > > > > > > >            return request_close_session(session);
> > > > > > > > > @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
> > > > > > > > >            return atomic_read(&mdsc->num_sessions) <= skipped;
> > > > > > > > >     }
> > > > > > > > > 
> > > > > > > > > +static bool umount_timed_out(unsigned long timeo)
> > > > > > > > > +{
> > > > > > > > > +       if (time_before(jiffies, timeo))
> > > > > > > > > +               return false;
> > > > > > > > > +       pr_warn("ceph: unable to close all sessions\n");
> > > > > > > > > +       return true;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >     /*
> > > > > > > > >      * called after sb is ro.
> > > > > > > > >      */
> > > > > > > > >     void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> > > > > > > > >     {
> > > > > > > > > -       struct ceph_options *opts = mdsc->fsc->client->options;
> > > > > > > > >            struct ceph_mds_session *session;
> > > > > > > > > -       int i;
> > > > > > > > > -       int skipped = 0;
> > > > > > > > > +       int i, ret;
> > > > > > > > > +       int skipped;
> > > > > > > > > +       unsigned long timeo = jiffies +
> > > > > > > > > +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
> > > > > > > > > 
> > > > > > > > >            dout("close_sessions\n");
> > > > > > > > > 
> > > > > > > > >            /* close sessions */
> > > > > > > > > -       mutex_lock(&mdsc->mutex);
> > > > > > > > > -       for (i = 0; i < mdsc->max_sessions; i++) {
> > > > > > > > > -               session = __ceph_lookup_mds_session(mdsc, i);
> > > > > > > > > -               if (!session)
> > > > > > > > > -                       continue;
> > > > > > > > > -               mutex_unlock(&mdsc->mutex);
> > > > > > > > > -               mutex_lock(&session->s_mutex);
> > > > > > > > > -               if (__close_session(mdsc, session) <= 0)
> > > > > > > > > -                       skipped++;
> > > > > > > > > -               mutex_unlock(&session->s_mutex);
> > > > > > > > > -               ceph_put_mds_session(session);
> > > > > > > > > +       do {
> > > > > > > > > +               skipped = 0;
> > > > > > > > >                    mutex_lock(&mdsc->mutex);
> > > > > > > > > -       }
> > > > > > > > > -       mutex_unlock(&mdsc->mutex);
> > > > > > > > > +               for (i = 0; i < mdsc->max_sessions; i++) {
> > > > > > > > > +                       session = __ceph_lookup_mds_session(mdsc, i);
> > > > > > > > > +                       if (!session)
> > > > > > > > > +                               continue;
> > > > > > > > > +                       mutex_unlock(&mdsc->mutex);
> > > > > > > > > +                       mutex_lock(&session->s_mutex);
> > > > > > > > > +                       if (__close_session(mdsc, session) <= 0)
> > > > > > > > > +                               skipped++;
> > > > > > > > > +                       mutex_unlock(&session->s_mutex);
> > > > > > > > > +                       ceph_put_mds_session(session);
> > > > > > > > > +                       mutex_lock(&mdsc->mutex);
> > > > > > > > > +               }
> > > > > > > > > +               mutex_unlock(&mdsc->mutex);
> > > > > > > > > 
> > > > > > > > > -       dout("waiting for sessions to close\n");
> > > > > > > > > -       wait_event_timeout(mdsc->session_close_wq,
> > > > > > > > > -                          done_closing_sessions(mdsc, skipped),
> > > > > > > > > -                          ceph_timeout_jiffies(opts->mount_timeout));
> > > > > > > > > +               dout("waiting for sessions to close\n");
> > > > > > > > > +               ret = wait_event_timeout(mdsc->session_close_wq,
> > > > > > > > > +                                        done_closing_sessions(mdsc, skipped), HZ);
> > > > > > > > > +       } while (!ret && !umount_timed_out(timeo));
> > > > > > > > > 
> > > > > > > > >            /* tear down remaining sessions */
> > > > > > > > >            mutex_lock(&mdsc->mutex);
> > > > > > > > > --
> > > > > > > > > 2.26.2
> > > > > > > > > 
> > > > > > > > Hi Jeff,
> > > > > > > > 
> > > > > > > > This seems wrong to me, at least conceptually.  Is the same patch
> > > > > > > > getting applied to ceph-fuse?
> > > > > > > > 
> > > > > > > It's a grotesque workaround, I will grant you. I'm not sure what we want
> > > > > > > to do for ceph-fuse yet but it does seem to have the same issue.
> > > > > > > Probably, we should plan to do a similar fix there once we settle on the
> > > > > > > right approach.
> > > > > > > 
> > > > > > > > Pretending to not know anything about the client <-> MDS protocol,
> > > > > > > > two questions immediately come to mind.  Why is MDS allowed to drop
> > > > > > > > REQUEST_CLOSE?
> > > > > > > It really seems like a protocol design flaw.
> > > > > > > 
> > > > > > > IIUC, the idea overall with the low-level ceph protocol seems to be that
> > > > > > > the client should retransmit (or reevaluate, in the case of caps) calls
> > > > > > > that were in flight when the seq number changes.
> > > > > > > 
> > > > > > > The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
> > > > > > > but it doesn't really make a lot of sense for that, IMO.
> > > > > > (edit of my reply to https://github.com/ceph/ceph/pull/37619)
> > > > > > 
> > > > > > After taking a look at the MDS code, it really seemed like it
> > > > > > had been written with the expectation that REQUEST_CLOSE would be
> > > > > > resent, so I dug around.  I don't fully understand these "push"
> > > > > > sequence numbers yet, but there is probably some race that requires
> > > > > > the client to confirm that it saw the sequence number, even if the
> > > > > > session is about to go.  Sage is probably the only one who might
> > > > > > remember at this point.
> > > > > > 
> > > > > > The kernel client already has the code to retry REQUEST_CLOSE, only
> > > > > > every five seconds instead every second.  See check_session_state()
> > > > > > which is called from delayed_work() in mds_client.c.  It looks like
> > > > > > it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
> > > > > > mdsc use-after-free crash") which conditioned delayed_work() on
> > > > > > mdsc->stopping -- hence the misbehaviour.
> > > > > Without this commit it will hit this issue too. The umount old code will
> > > > > try to close sessions asynchronously, and then tries to cancel the
> > > > > delayed work, during which the last queued delayed_work() timer might be
> > > > > fired. This commit makes it easier to be reproduced.
> > > > > 
> > > > Fixing the potential races to ensure that this is retransmitted is an
> > > > option, but I'm not sure it's the best one. Here's what I think we
> > > > probably ought to do:
> > > > 
> > > > 1/ fix the MDS to just ignore the sequence number on REQUEST_CLOSE. I
> > > > don't see that the sequence number has any value on that call, as it's
> > > > an indicator that the client is finished with the session, and it's
> > > > never going to change its mind and do something different if the
> > > > sequence is wrong. I have a PR for that here:
> > > > 
> > > >       https://github.com/ceph/ceph/pull/37619
> > > > 
> > > > 2/ fix the clients to not wait on the REQUEST_CLOSE reply. As soon as
> > > > the call is sent, tear down the session and proceed with unmounting. The
> > > > client doesn't really care what the MDS has to say after that point, so
> > > > we may as well not wait on it before proceeding.
> > > > 
> > > > Thoughts?
> > > I am thinking possibly we can just check the session's state when the
> > > client receives a request from MDS which will increase the s_seq number,
> > > if the session is in CLOSING state, the client needs to resend the
> > > REQUEST_CLOSE request again.
> > > 
> > That could be done, but that means adding extra complexity to the
> > session handling code, which could really stand to be simplified
> > instead.
> > 
> > mdsc->stopping and session->s_state seem to be protected by the
> > mdsc->mutex, but session->s_seq is protected by the session->s_mutex.
> > 
> > There are 4 types of messages that increment the s_seq -- caps, leases,
> > quotas and snaps. All of those would need to be changed to check for and
> > retransmit REQUEST_CLOSE if one is outstanding.
> 
> How about deferring resending the CLOSE request in the above case ?
> 

I'm not sure I follow you here. What would deferring the resend help
with here? Bear in mind that the client is unmounting at this point so
we really do want to get on with the business of tearing things down and
not wait around any longer than we need.


> 
> > So yeah, that could be done on the client side. If we were to do that,
> > should we couple it with the MDS side fix to make it ignore the seq on
> > REQUEST_CLOSE?
> 
>
Xiubo Li Oct. 12, 2020, 1:52 p.m. UTC | #11
On 2020/10/12 21:49, Jeff Layton wrote:
> On Mon, 2020-10-12 at 21:31 +0800, Xiubo Li wrote:
>> On 2020/10/12 21:17, Jeff Layton wrote:
>>> On Mon, 2020-10-12 at 20:41 +0800, Xiubo Li wrote:
>>>> On 2020/10/12 19:52, Jeff Layton wrote:
>>>>> On Mon, 2020-10-12 at 14:52 +0800, Xiubo Li wrote:
>>>>>> On 2020/10/11 2:49, Ilya Dryomov wrote:
>>>>>>> On Thu, Oct 8, 2020 at 8:14 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>>>>>> On Thu, 2020-10-08 at 19:27 +0200, Ilya Dryomov wrote:
>>>>>>>>> On Tue, Sep 29, 2020 at 12:03 AM Jeff Layton <jlayton@kernel.org> wrote:
>>>>>>>>>> Patrick reported a case where the MDS and client client had racing
>>>>>>>>>> session messages to one anothe. The MDS was sending caps to the client
>>>>>>>>>> and the client was sending a CEPH_SESSION_REQUEST_CLOSE message in order
>>>>>>>>>> to unmount.
>>>>>>>>>>
>>>>>>>>>> Because they were sending at the same time, the REQUEST_CLOSE had too
>>>>>>>>>> old a sequence number, and the MDS dropped it on the floor. On the
>>>>>>>>>> client, this would have probably manifested as a 60s hang during umount.
>>>>>>>>>> The MDS ended up blocklisting the client.
>>>>>>>>>>
>>>>>>>>>> Once we've decided to issue a REQUEST_CLOSE, we're finished with the
>>>>>>>>>> session, so just keep sending them until the MDS acknowledges that.
>>>>>>>>>>
>>>>>>>>>> Change the code to retransmit a REQUEST_CLOSE every second if the
>>>>>>>>>> session hasn't changed state yet. Give up and throw a warning after
>>>>>>>>>> mount_timeout elapses if we haven't gotten a response.
>>>>>>>>>>
>>>>>>>>>> URL: https://tracker.ceph.com/issues/47563
>>>>>>>>>> Reported-by: Patrick Donnelly <pdonnell@redhat.com>
>>>>>>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>>>>>>>> ---
>>>>>>>>>>      fs/ceph/mds_client.c | 53 ++++++++++++++++++++++++++------------------
>>>>>>>>>>      1 file changed, 32 insertions(+), 21 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>>>>>> index b07e7adf146f..d9cb74e3d5e3 100644
>>>>>>>>>> --- a/fs/ceph/mds_client.c
>>>>>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>>>>>> @@ -1878,7 +1878,7 @@ static int request_close_session(struct ceph_mds_session *session)
>>>>>>>>>>      static int __close_session(struct ceph_mds_client *mdsc,
>>>>>>>>>>                              struct ceph_mds_session *session)
>>>>>>>>>>      {
>>>>>>>>>> -       if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
>>>>>>>>>> +       if (session->s_state > CEPH_MDS_SESSION_CLOSING)
>>>>>>>>>>                     return 0;
>>>>>>>>>>             session->s_state = CEPH_MDS_SESSION_CLOSING;
>>>>>>>>>>             return request_close_session(session);
>>>>>>>>>> @@ -4692,38 +4692,49 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
>>>>>>>>>>             return atomic_read(&mdsc->num_sessions) <= skipped;
>>>>>>>>>>      }
>>>>>>>>>>
>>>>>>>>>> +static bool umount_timed_out(unsigned long timeo)
>>>>>>>>>> +{
>>>>>>>>>> +       if (time_before(jiffies, timeo))
>>>>>>>>>> +               return false;
>>>>>>>>>> +       pr_warn("ceph: unable to close all sessions\n");
>>>>>>>>>> +       return true;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>      /*
>>>>>>>>>>       * called after sb is ro.
>>>>>>>>>>       */
>>>>>>>>>>      void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>>>>>>>>>>      {
>>>>>>>>>> -       struct ceph_options *opts = mdsc->fsc->client->options;
>>>>>>>>>>             struct ceph_mds_session *session;
>>>>>>>>>> -       int i;
>>>>>>>>>> -       int skipped = 0;
>>>>>>>>>> +       int i, ret;
>>>>>>>>>> +       int skipped;
>>>>>>>>>> +       unsigned long timeo = jiffies +
>>>>>>>>>> +                             ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
>>>>>>>>>>
>>>>>>>>>>             dout("close_sessions\n");
>>>>>>>>>>
>>>>>>>>>>             /* close sessions */
>>>>>>>>>> -       mutex_lock(&mdsc->mutex);
>>>>>>>>>> -       for (i = 0; i < mdsc->max_sessions; i++) {
>>>>>>>>>> -               session = __ceph_lookup_mds_session(mdsc, i);
>>>>>>>>>> -               if (!session)
>>>>>>>>>> -                       continue;
>>>>>>>>>> -               mutex_unlock(&mdsc->mutex);
>>>>>>>>>> -               mutex_lock(&session->s_mutex);
>>>>>>>>>> -               if (__close_session(mdsc, session) <= 0)
>>>>>>>>>> -                       skipped++;
>>>>>>>>>> -               mutex_unlock(&session->s_mutex);
>>>>>>>>>> -               ceph_put_mds_session(session);
>>>>>>>>>> +       do {
>>>>>>>>>> +               skipped = 0;
>>>>>>>>>>                     mutex_lock(&mdsc->mutex);
>>>>>>>>>> -       }
>>>>>>>>>> -       mutex_unlock(&mdsc->mutex);
>>>>>>>>>> +               for (i = 0; i < mdsc->max_sessions; i++) {
>>>>>>>>>> +                       session = __ceph_lookup_mds_session(mdsc, i);
>>>>>>>>>> +                       if (!session)
>>>>>>>>>> +                               continue;
>>>>>>>>>> +                       mutex_unlock(&mdsc->mutex);
>>>>>>>>>> +                       mutex_lock(&session->s_mutex);
>>>>>>>>>> +                       if (__close_session(mdsc, session) <= 0)
>>>>>>>>>> +                               skipped++;
>>>>>>>>>> +                       mutex_unlock(&session->s_mutex);
>>>>>>>>>> +                       ceph_put_mds_session(session);
>>>>>>>>>> +                       mutex_lock(&mdsc->mutex);
>>>>>>>>>> +               }
>>>>>>>>>> +               mutex_unlock(&mdsc->mutex);
>>>>>>>>>>
>>>>>>>>>> -       dout("waiting for sessions to close\n");
>>>>>>>>>> -       wait_event_timeout(mdsc->session_close_wq,
>>>>>>>>>> -                          done_closing_sessions(mdsc, skipped),
>>>>>>>>>> -                          ceph_timeout_jiffies(opts->mount_timeout));
>>>>>>>>>> +               dout("waiting for sessions to close\n");
>>>>>>>>>> +               ret = wait_event_timeout(mdsc->session_close_wq,
>>>>>>>>>> +                                        done_closing_sessions(mdsc, skipped), HZ);
>>>>>>>>>> +       } while (!ret && !umount_timed_out(timeo));
>>>>>>>>>>
>>>>>>>>>>             /* tear down remaining sessions */
>>>>>>>>>>             mutex_lock(&mdsc->mutex);
>>>>>>>>>> --
>>>>>>>>>> 2.26.2
>>>>>>>>>>
>>>>>>>>> Hi Jeff,
>>>>>>>>>
>>>>>>>>> This seems wrong to me, at least conceptually.  Is the same patch
>>>>>>>>> getting applied to ceph-fuse?
>>>>>>>>>
>>>>>>>> It's a grotesque workaround, I will grant you. I'm not sure what we want
>>>>>>>> to do for ceph-fuse yet but it does seem to have the same issue.
>>>>>>>> Probably, we should plan to do a similar fix there once we settle on the
>>>>>>>> right approach.
>>>>>>>>
>>>>>>>>> Pretending to not know anything about the client <-> MDS protocol,
>>>>>>>>> two questions immediately come to mind.  Why is MDS allowed to drop
>>>>>>>>> REQUEST_CLOSE?
>>>>>>>> It really seems like a protocol design flaw.
>>>>>>>>
>>>>>>>> IIUC, the idea overall with the low-level ceph protocol seems to be that
>>>>>>>> the client should retransmit (or reevaluate, in the case of caps) calls
>>>>>>>> that were in flight when the seq number changes.
>>>>>>>>
>>>>>>>> The REQUEST_CLOSE handling seems to have followed suit on the MDS side,
>>>>>>>> but it doesn't really make a lot of sense for that, IMO.
>>>>>>> (edit of my reply to https://github.com/ceph/ceph/pull/37619)
>>>>>>>
>>>>>>> After taking a look at the MDS code, it really seemed like it
>>>>>>> had been written with the expectation that REQUEST_CLOSE would be
>>>>>>> resent, so I dug around.  I don't fully understand these "push"
>>>>>>> sequence numbers yet, but there is probably some race that requires
>>>>>>> the client to confirm that it saw the sequence number, even if the
>>>>>>> session is about to go.  Sage is probably the only one who might
>>>>>>> remember at this point.
>>>>>>>
>>>>>>> The kernel client already has the code to retry REQUEST_CLOSE, only
>>>>>>> every five seconds instead every second.  See check_session_state()
>>>>>>> which is called from delayed_work() in mds_client.c.  It looks like
>>>>>>> it got broken by Xiubo's commit fa9967734227 ("ceph: fix potential
>>>>>>> mdsc use-after-free crash") which conditioned delayed_work() on
>>>>>>> mdsc->stopping -- hence the misbehaviour.
>>>>>> Without this commit it will hit this issue too. The umount old code will
>>>>>> try to close sessions asynchronously, and then tries to cancel the
>>>>>> delayed work, during which the last queued delayed_work() timer might be
>>>>>> fired. This commit makes it easier to be reproduced.
>>>>>>
>>>>> Fixing the potential races to ensure that this is retransmitted is an
>>>>> option, but I'm not sure it's the best one. Here's what I think we
>>>>> probably ought to do:
>>>>>
>>>>> 1/ fix the MDS to just ignore the sequence number on REQUEST_CLOSE. I
>>>>> don't see that the sequence number has any value on that call, as it's
>>>>> an indicator that the client is finished with the session, and it's
>>>>> never going to change its mind and do something different if the
>>>>> sequence is wrong. I have a PR for that here:
>>>>>
>>>>>        https://github.com/ceph/ceph/pull/37619
>>>>>
>>>>> 2/ fix the clients to not wait on the REQUEST_CLOSE reply. As soon as
>>>>> the call is sent, tear down the session and proceed with unmounting. The
>>>>> client doesn't really care what the MDS has to say after that point, so
>>>>> we may as well not wait on it before proceeding.
>>>>>
>>>>> Thoughts?
>>>> I am thinking possibly we can just check the session's state when the
>>>> client receives a request from MDS which will increase the s_seq number,
>>>> if the session is in CLOSING state, the client needs to resend the
>>>> REQUEST_CLOSE request again.
>>>>
>>> That could be done, but that means adding extra complexity to the
>>> session handling code, which could really stand to be simplified
>>> instead.
>>>
>>> mdsc->stopping and session->s_state seem to be protected by the
>>> mdsc->mutex, but session->s_seq is protected by the session->s_mutex.
>>>
>>> There are 4 types of messages that increment the s_seq -- caps, leases,
>>> quotas and snaps. All of those would need to be changed to check for and
>>> retransmit REQUEST_CLOSE if one is outstanding.
>> How about deferring resending the CLOSE request in the above case ?
>>
> I'm not sure I follow you here. What would deferring the resend help
> with here? Bear in mind that the client is unmounting at this point so
> we really do want to get on with the business of tearing things down and
> not wait around any longer than we need.

Okay, please just ignore it.

BRs


>
>>> So yeah, that could be done on the client side. If we were to do that,
>>> should we couple it with the MDS side fix to make it ignore the seq on
>>> REQUEST_CLOSE?
>>
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b07e7adf146f..d9cb74e3d5e3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1878,7 +1878,7 @@  static int request_close_session(struct ceph_mds_session *session)
 static int __close_session(struct ceph_mds_client *mdsc,
 			 struct ceph_mds_session *session)
 {
-	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+	if (session->s_state > CEPH_MDS_SESSION_CLOSING)
 		return 0;
 	session->s_state = CEPH_MDS_SESSION_CLOSING;
 	return request_close_session(session);
@@ -4692,38 +4692,49 @@  static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 	return atomic_read(&mdsc->num_sessions) <= skipped;
 }
 
+static bool umount_timed_out(unsigned long timeo)
+{
+	if (time_before(jiffies, timeo))
+		return false;
+	pr_warn("ceph: unable to close all sessions\n");
+	return true;
+}
+
 /*
  * called after sb is ro.
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
-	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_session *session;
-	int i;
-	int skipped = 0;
+	int i, ret;
+	int skipped;
+	unsigned long timeo = jiffies +
+			      ceph_timeout_jiffies(mdsc->fsc->client->options->mount_timeout);
 
 	dout("close_sessions\n");
 
 	/* close sessions */
-	mutex_lock(&mdsc->mutex);
-	for (i = 0; i < mdsc->max_sessions; i++) {
-		session = __ceph_lookup_mds_session(mdsc, i);
-		if (!session)
-			continue;
-		mutex_unlock(&mdsc->mutex);
-		mutex_lock(&session->s_mutex);
-		if (__close_session(mdsc, session) <= 0)
-			skipped++;
-		mutex_unlock(&session->s_mutex);
-		ceph_put_mds_session(session);
+	do {
+		skipped = 0;
 		mutex_lock(&mdsc->mutex);
-	}
-	mutex_unlock(&mdsc->mutex);
+		for (i = 0; i < mdsc->max_sessions; i++) {
+			session = __ceph_lookup_mds_session(mdsc, i);
+			if (!session)
+				continue;
+			mutex_unlock(&mdsc->mutex);
+			mutex_lock(&session->s_mutex);
+			if (__close_session(mdsc, session) <= 0)
+				skipped++;
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			mutex_lock(&mdsc->mutex);
+		}
+		mutex_unlock(&mdsc->mutex);
 
-	dout("waiting for sessions to close\n");
-	wait_event_timeout(mdsc->session_close_wq,
-			   done_closing_sessions(mdsc, skipped),
-			   ceph_timeout_jiffies(opts->mount_timeout));
+		dout("waiting for sessions to close\n");
+		ret = wait_event_timeout(mdsc->session_close_wq,
+					 done_closing_sessions(mdsc, skipped), HZ);
+	} while (!ret && !umount_timed_out(timeo));
 
 	/* tear down remaining sessions */
 	mutex_lock(&mdsc->mutex);