diff mbox series

[v2] ceph: fix NULL pointer dereference for req->r_session

Message ID 20221108054954.307554-1-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v2] ceph: fix NULL pointer dereference for req->r_session | expand

Commit Message

Xiubo Li Nov. 8, 2022, 5:49 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

The request's r_session maybe changed when it was forwarded or
resent.

Cc: stable@vger.kernel.org
URL: https://bugzilla.redhat.com/show_bug.cgi?id=2137955
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c | 88 +++++++++++++++++++-------------------------------
 1 file changed, 33 insertions(+), 55 deletions(-)

Comments

Ilya Dryomov Nov. 8, 2022, 10:50 a.m. UTC | #1
On Tue, Nov 8, 2022 at 6:50 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> The request's r_session maybe changed when it was forwarded or
> resent.
>
> Cc: stable@vger.kernel.org
> URL: https://bugzilla.redhat.com/show_bug.cgi?id=2137955
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/caps.c | 88 +++++++++++++++++++-------------------------------
>  1 file changed, 33 insertions(+), 55 deletions(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 894adfb4a092..172f18f7459d 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2297,8 +2297,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>         struct ceph_inode_info *ci = ceph_inode(inode);
>         struct ceph_mds_request *req1 = NULL, *req2 = NULL;
> +       struct ceph_mds_session *s, **sessions = NULL;

Hi Xiubo,

Nit: mixing pointers and double pointers coupled with differing
initialization is generally frowned upon.  Keep it on two lines as
before:

    struct ceph_mds_session **sessions = NULL;
    struct ceph_mds_session *s;

>         unsigned int max_sessions;
> -       int ret, err = 0;
> +       int i, ret, err = 0;
>
>         spin_lock(&ci->i_unsafe_lock);
>         if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
> @@ -2315,31 +2316,22 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>         }
>         spin_unlock(&ci->i_unsafe_lock);
>
> -       /*
> -        * The mdsc->max_sessions is unlikely to be changed
> -        * mostly, here we will retry it by reallocating the
> -        * sessions array memory to get rid of the mdsc->mutex
> -        * lock.
> -        */
> -retry:
> -       max_sessions = mdsc->max_sessions;
> -
>         /*
>          * Trigger to flush the journal logs in all the relevant MDSes
>          * manually, or in the worst case we must wait at most 5 seconds
>          * to wait the journal logs to be flushed by the MDSes periodically.
>          */
> +       mutex_lock(&mdsc->mutex);
> +       max_sessions = mdsc->max_sessions;
> +       sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
> +       if (!sessions) {
> +               mutex_unlock(&mdsc->mutex);
> +               err = -ENOMEM;
> +               goto out;
> +       }
> +
>         if ((req1 || req2) && likely(max_sessions)) {

Just curious, when can max_sessions be zero here?

> -               struct ceph_mds_session **sessions = NULL;
> -               struct ceph_mds_session *s;
>                 struct ceph_mds_request *req;
> -               int i;
> -
> -               sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
> -               if (!sessions) {
> -                       err = -ENOMEM;
> -                       goto out;
> -               }
>
>                 spin_lock(&ci->i_unsafe_lock);
>                 if (req1) {
> @@ -2348,16 +2340,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>                                 s = req->r_session;
>                                 if (!s)
>                                         continue;
> -                               if (unlikely(s->s_mds >= max_sessions)) {
> -                                       spin_unlock(&ci->i_unsafe_lock);
> -                                       for (i = 0; i < max_sessions; i++) {
> -                                               s = sessions[i];
> -                                               if (s)
> -                                                       ceph_put_mds_session(s);
> -                                       }
> -                                       kfree(sessions);
> -                                       goto retry;
> -                               }
> +                               if (unlikely(s->s_mds >= max_sessions))
> +                                       continue;

Nit: this could be combined with the previous condition:

    if (!s || unlikely(s->s_mds >= max_sessions))
            continue;

>                                 if (!sessions[s->s_mds]) {
>                                         s = ceph_get_mds_session(s);
>                                         sessions[s->s_mds] = s;
> @@ -2370,16 +2354,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>                                 s = req->r_session;
>                                 if (!s)
>                                         continue;
> -                               if (unlikely(s->s_mds >= max_sessions)) {
> -                                       spin_unlock(&ci->i_unsafe_lock);
> -                                       for (i = 0; i < max_sessions; i++) {
> -                                               s = sessions[i];
> -                                               if (s)
> -                                                       ceph_put_mds_session(s);
> -                                       }
> -                                       kfree(sessions);
> -                                       goto retry;
> -                               }
> +                               if (unlikely(s->s_mds >= max_sessions))
> +                                       continue;

ditto

>                                 if (!sessions[s->s_mds]) {
>                                         s = ceph_get_mds_session(s);
>                                         sessions[s->s_mds] = s;
> @@ -2387,25 +2363,26 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>                         }
>                 }
>                 spin_unlock(&ci->i_unsafe_lock);
> +       }
> +       mutex_unlock(&mdsc->mutex);
>
> -               /* the auth MDS */
> -               spin_lock(&ci->i_ceph_lock);
> -               if (ci->i_auth_cap) {
> -                     s = ci->i_auth_cap->session;
> -                     if (!sessions[s->s_mds])
> -                             sessions[s->s_mds] = ceph_get_mds_session(s);
> -               }
> -               spin_unlock(&ci->i_ceph_lock);
> +       /* the auth MDS */
> +       spin_lock(&ci->i_ceph_lock);

Why was this "auth MDS" block moved outside of max_sessions > 0
branch?  Logically, it very much belongs there.  Is there a problem
with taking ci->i_ceph_lock under mdsc->mutex?

> +       if (ci->i_auth_cap) {
> +               s = ci->i_auth_cap->session;
> +               if (!sessions[s->s_mds] &&
> +                   likely(s->s_mds < max_sessions))

This is wrong: s->s_mds must be checked against max_sessions before
indexing into sessions array.  Also, the entire condition should fit on
a single line.

> +                       sessions[s->s_mds] = ceph_get_mds_session(s);
> +       }
> +       spin_unlock(&ci->i_ceph_lock);
>
> -               /* send flush mdlog request to MDSes */
> -               for (i = 0; i < max_sessions; i++) {
> -                       s = sessions[i];
> -                       if (s) {
> -                               send_flush_mdlog(s);
> -                               ceph_put_mds_session(s);
> -                       }
> +       /* send flush mdlog request to MDSes */
> +       for (i = 0; i < max_sessions; i++) {
> +               s = sessions[i];
> +               if (s) {
> +                       send_flush_mdlog(s);
> +                       ceph_put_mds_session(s);
>                 }
> -               kfree(sessions);
>         }
>
>         dout("%s %p wait on tid %llu %llu\n", __func__,
> @@ -2428,6 +2405,7 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>                 ceph_mdsc_put_request(req1);
>         if (req2)
>                 ceph_mdsc_put_request(req2);
> +       kfree(sessions);

Nit: since sessions array is allocated after references to req1 and
req2 are grabbed, I would free it before these references are put.

Thanks,

                Ilya
Xiubo Li Nov. 8, 2022, 12:38 p.m. UTC | #2
On 08/11/2022 18:50, Ilya Dryomov wrote:
> On Tue, Nov 8, 2022 at 6:50 AM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> The request's r_session maybe changed when it was forwarded or
>> resent.
>>
>> Cc: stable@vger.kernel.org
>> URL: https://bugzilla.redhat.com/show_bug.cgi?id=2137955
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c | 88 +++++++++++++++++++-------------------------------
>>   1 file changed, 33 insertions(+), 55 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 894adfb4a092..172f18f7459d 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -2297,8 +2297,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>          struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>>          struct ceph_inode_info *ci = ceph_inode(inode);
>>          struct ceph_mds_request *req1 = NULL, *req2 = NULL;
>> +       struct ceph_mds_session *s, **sessions = NULL;
> Hi Xiubo,
>
> Nit: mixing pointers and double pointers coupled with differing
> initialization is generally frowned upon.  Keep it on two lines as
> before:
>
>      struct ceph_mds_session **sessions = NULL;
>      struct ceph_mds_session *s;

Sure, will fix it.

>>          unsigned int max_sessions;
>> -       int ret, err = 0;
>> +       int i, ret, err = 0;
>>
>>          spin_lock(&ci->i_unsafe_lock);
>>          if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
>> @@ -2315,31 +2316,22 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>          }
>>          spin_unlock(&ci->i_unsafe_lock);
>>
>> -       /*
>> -        * The mdsc->max_sessions is unlikely to be changed
>> -        * mostly, here we will retry it by reallocating the
>> -        * sessions array memory to get rid of the mdsc->mutex
>> -        * lock.
>> -        */
>> -retry:
>> -       max_sessions = mdsc->max_sessions;
>> -
>>          /*
>>           * Trigger to flush the journal logs in all the relevant MDSes
>>           * manually, or in the worst case we must wait at most 5 seconds
>>           * to wait the journal logs to be flushed by the MDSes periodically.
>>           */
>> +       mutex_lock(&mdsc->mutex);
>> +       max_sessions = mdsc->max_sessions;
>> +       sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
>> +       if (!sessions) {
>> +               mutex_unlock(&mdsc->mutex);
>> +               err = -ENOMEM;
>> +               goto out;
>> +       }
>> +
>>          if ((req1 || req2) && likely(max_sessions)) {
> Just curious, when can max_sessions be zero here?

Checked the code again, just before registering the first session, and 
this is monotone increasing. It should be safe to remove this here.


>
>> -               struct ceph_mds_session **sessions = NULL;
>> -               struct ceph_mds_session *s;
>>                  struct ceph_mds_request *req;
>> -               int i;
>> -
>> -               sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
>> -               if (!sessions) {
>> -                       err = -ENOMEM;
>> -                       goto out;
>> -               }
>>
>>                  spin_lock(&ci->i_unsafe_lock);
>>                  if (req1) {
>> @@ -2348,16 +2340,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>                                  s = req->r_session;
>>                                  if (!s)
>>                                          continue;
>> -                               if (unlikely(s->s_mds >= max_sessions)) {
>> -                                       spin_unlock(&ci->i_unsafe_lock);
>> -                                       for (i = 0; i < max_sessions; i++) {
>> -                                               s = sessions[i];
>> -                                               if (s)
>> -                                                       ceph_put_mds_session(s);
>> -                                       }
>> -                                       kfree(sessions);
>> -                                       goto retry;
>> -                               }
>> +                               if (unlikely(s->s_mds >= max_sessions))
>> +                                       continue;
> Nit: this could be combined with the previous condition:
>
>      if (!s || unlikely(s->s_mds >= max_sessions))
>              continue;

Sure.


>>                                  if (!sessions[s->s_mds]) {
>>                                          s = ceph_get_mds_session(s);
>>                                          sessions[s->s_mds] = s;
>> @@ -2370,16 +2354,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>                                  s = req->r_session;
>>                                  if (!s)
>>                                          continue;
>> -                               if (unlikely(s->s_mds >= max_sessions)) {
>> -                                       spin_unlock(&ci->i_unsafe_lock);
>> -                                       for (i = 0; i < max_sessions; i++) {
>> -                                               s = sessions[i];
>> -                                               if (s)
>> -                                                       ceph_put_mds_session(s);
>> -                                       }
>> -                                       kfree(sessions);
>> -                                       goto retry;
>> -                               }
>> +                               if (unlikely(s->s_mds >= max_sessions))
>> +                                       continue;
> ditto
>
>>                                  if (!sessions[s->s_mds]) {
>>                                          s = ceph_get_mds_session(s);
>>                                          sessions[s->s_mds] = s;
>> @@ -2387,25 +2363,26 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>                          }
>>                  }
>>                  spin_unlock(&ci->i_unsafe_lock);
>> +       }
>> +       mutex_unlock(&mdsc->mutex);
>>
>> -               /* the auth MDS */
>> -               spin_lock(&ci->i_ceph_lock);
>> -               if (ci->i_auth_cap) {
>> -                     s = ci->i_auth_cap->session;
>> -                     if (!sessions[s->s_mds])
>> -                             sessions[s->s_mds] = ceph_get_mds_session(s);
>> -               }
>> -               spin_unlock(&ci->i_ceph_lock);
>> +       /* the auth MDS */
>> +       spin_lock(&ci->i_ceph_lock);
> Why was this "auth MDS" block moved outside of max_sessions > 0
> branch?  Logically, it very much belongs there.  Is there a problem
> with taking ci->i_ceph_lock under mdsc->mutex?

I will remove the 'likely(max_session)' and there is no any problem for 
this.

>
>> +       if (ci->i_auth_cap) {
>> +               s = ci->i_auth_cap->session;
>> +               if (!sessions[s->s_mds] &&
>> +                   likely(s->s_mds < max_sessions))
> This is wrong: s->s_mds must be checked against max_sessions before
> indexing into sessions array.  Also, the entire condition should fit on
> a single line.
I am moving it to the if(req1 || req2) {} scope and it will exceed 80 
chars. And will keep it in two lines.
>> +                       sessions[s->s_mds] = ceph_get_mds_session(s);
>> +       }
>> +       spin_unlock(&ci->i_ceph_lock);
>>
>> -               /* send flush mdlog request to MDSes */
>> -               for (i = 0; i < max_sessions; i++) {
>> -                       s = sessions[i];
>> -                       if (s) {
>> -                               send_flush_mdlog(s);
>> -                               ceph_put_mds_session(s);
>> -                       }
>> +       /* send flush mdlog request to MDSes */
>> +       for (i = 0; i < max_sessions; i++) {
>> +               s = sessions[i];
>> +               if (s) {
>> +                       send_flush_mdlog(s);
>> +                       ceph_put_mds_session(s);
>>                  }
>> -               kfree(sessions);
>>          }
>>
>>          dout("%s %p wait on tid %llu %llu\n", __func__,
>> @@ -2428,6 +2405,7 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>                  ceph_mdsc_put_request(req1);
>>          if (req2)
>>                  ceph_mdsc_put_request(req2);
>> +       kfree(sessions);
> Nit: since sessions array is allocated after references to req1 and
> req2 are grabbed, I would free it before these references are put.

Sure!

Thanks!

- Xiubo

> Thanks,
>
>                  Ilya
>
Ilya Dryomov Nov. 8, 2022, 12:44 p.m. UTC | #3
On Tue, Nov 8, 2022 at 1:38 PM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 08/11/2022 18:50, Ilya Dryomov wrote:
> > On Tue, Nov 8, 2022 at 6:50 AM <xiubli@redhat.com> wrote:
> >> From: Xiubo Li <xiubli@redhat.com>
> >>
> >> The request's r_session maybe changed when it was forwarded or
> >> resent.
> >>
> >> Cc: stable@vger.kernel.org
> >> URL: https://bugzilla.redhat.com/show_bug.cgi?id=2137955
> >> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >> ---
> >>   fs/ceph/caps.c | 88 +++++++++++++++++++-------------------------------
> >>   1 file changed, 33 insertions(+), 55 deletions(-)
> >>
> >> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> >> index 894adfb4a092..172f18f7459d 100644
> >> --- a/fs/ceph/caps.c
> >> +++ b/fs/ceph/caps.c
> >> @@ -2297,8 +2297,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
> >>          struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> >>          struct ceph_inode_info *ci = ceph_inode(inode);
> >>          struct ceph_mds_request *req1 = NULL, *req2 = NULL;
> >> +       struct ceph_mds_session *s, **sessions = NULL;
> > Hi Xiubo,
> >
> > Nit: mixing pointers and double pointers coupled with differing
> > initialization is generally frowned upon.  Keep it on two lines as
> > before:
> >
> >      struct ceph_mds_session **sessions = NULL;
> >      struct ceph_mds_session *s;
>
> Sure, will fix it.
>
> >>          unsigned int max_sessions;
> >> -       int ret, err = 0;
> >> +       int i, ret, err = 0;
> >>
> >>          spin_lock(&ci->i_unsafe_lock);
> >>          if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
> >> @@ -2315,31 +2316,22 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
> >>          }
> >>          spin_unlock(&ci->i_unsafe_lock);
> >>
> >> -       /*
> >> -        * The mdsc->max_sessions is unlikely to be changed
> >> -        * mostly, here we will retry it by reallocating the
> >> -        * sessions array memory to get rid of the mdsc->mutex
> >> -        * lock.
> >> -        */
> >> -retry:
> >> -       max_sessions = mdsc->max_sessions;
> >> -
> >>          /*
> >>           * Trigger to flush the journal logs in all the relevant MDSes
> >>           * manually, or in the worst case we must wait at most 5 seconds
> >>           * to wait the journal logs to be flushed by the MDSes periodically.
> >>           */
> >> +       mutex_lock(&mdsc->mutex);
> >> +       max_sessions = mdsc->max_sessions;
> >> +       sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
> >> +       if (!sessions) {
> >> +               mutex_unlock(&mdsc->mutex);
> >> +               err = -ENOMEM;
> >> +               goto out;
> >> +       }
> >> +
> >>          if ((req1 || req2) && likely(max_sessions)) {
> > Just curious, when can max_sessions be zero here?
>
> Checked the code again, just before registering the first session, and
> this is monotone increasing. It should be safe to remove this here.
>
>
> >
> >> -               struct ceph_mds_session **sessions = NULL;
> >> -               struct ceph_mds_session *s;
> >>                  struct ceph_mds_request *req;
> >> -               int i;
> >> -
> >> -               sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
> >> -               if (!sessions) {
> >> -                       err = -ENOMEM;
> >> -                       goto out;
> >> -               }
> >>
> >>                  spin_lock(&ci->i_unsafe_lock);
> >>                  if (req1) {
> >> @@ -2348,16 +2340,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
> >>                                  s = req->r_session;
> >>                                  if (!s)
> >>                                          continue;
> >> -                               if (unlikely(s->s_mds >= max_sessions)) {
> >> -                                       spin_unlock(&ci->i_unsafe_lock);
> >> -                                       for (i = 0; i < max_sessions; i++) {
> >> -                                               s = sessions[i];
> >> -                                               if (s)
> >> -                                                       ceph_put_mds_session(s);
> >> -                                       }
> >> -                                       kfree(sessions);
> >> -                                       goto retry;
> >> -                               }
> >> +                               if (unlikely(s->s_mds >= max_sessions))
> >> +                                       continue;
> > Nit: this could be combined with the previous condition:
> >
> >      if (!s || unlikely(s->s_mds >= max_sessions))
> >              continue;
>
> Sure.
>
>
> >>                                  if (!sessions[s->s_mds]) {
> >>                                          s = ceph_get_mds_session(s);
> >>                                          sessions[s->s_mds] = s;
> >> @@ -2370,16 +2354,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
> >>                                  s = req->r_session;
> >>                                  if (!s)
> >>                                          continue;
> >> -                               if (unlikely(s->s_mds >= max_sessions)) {
> >> -                                       spin_unlock(&ci->i_unsafe_lock);
> >> -                                       for (i = 0; i < max_sessions; i++) {
> >> -                                               s = sessions[i];
> >> -                                               if (s)
> >> -                                                       ceph_put_mds_session(s);
> >> -                                       }
> >> -                                       kfree(sessions);
> >> -                                       goto retry;
> >> -                               }
> >> +                               if (unlikely(s->s_mds >= max_sessions))
> >> +                                       continue;
> > ditto
> >
> >>                                  if (!sessions[s->s_mds]) {
> >>                                          s = ceph_get_mds_session(s);
> >>                                          sessions[s->s_mds] = s;
> >> @@ -2387,25 +2363,26 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
> >>                          }
> >>                  }
> >>                  spin_unlock(&ci->i_unsafe_lock);
> >> +       }
> >> +       mutex_unlock(&mdsc->mutex);
> >>
> >> -               /* the auth MDS */
> >> -               spin_lock(&ci->i_ceph_lock);
> >> -               if (ci->i_auth_cap) {
> >> -                     s = ci->i_auth_cap->session;
> >> -                     if (!sessions[s->s_mds])
> >> -                             sessions[s->s_mds] = ceph_get_mds_session(s);
> >> -               }
> >> -               spin_unlock(&ci->i_ceph_lock);
> >> +       /* the auth MDS */
> >> +       spin_lock(&ci->i_ceph_lock);
> > Why was this "auth MDS" block moved outside of max_sessions > 0
> > branch?  Logically, it very much belongs there.  Is there a problem
> > with taking ci->i_ceph_lock under mdsc->mutex?
>
> I will remove the 'likely(max_session)' and there is no any problem for
> this.
>
> >
> >> +       if (ci->i_auth_cap) {
> >> +               s = ci->i_auth_cap->session;
> >> +               if (!sessions[s->s_mds] &&
> >> +                   likely(s->s_mds < max_sessions))
> > This is wrong: s->s_mds must be checked against max_sessions before
> > indexing into sessions array.  Also, the entire condition should fit on
> > a single line.
> I am moving it to the if(req1 || req2) {} scope and it will exceed 80
> chars. And will keep it in two lines.

If you are removing max_session > 0 condition, I don't see a need to
move this to "if (req1 || req2)" scope.  I suggested that only because
existing code was explicitly guarding against max_session == 0.

Thanks,

                Ilya
Xiubo Li Nov. 8, 2022, 12:49 p.m. UTC | #4
On 08/11/2022 20:44, Ilya Dryomov wrote:
> On Tue, Nov 8, 2022 at 1:38 PM Xiubo Li <xiubli@redhat.com> wrote:
>>
>> On 08/11/2022 18:50, Ilya Dryomov wrote:
>>> On Tue, Nov 8, 2022 at 6:50 AM <xiubli@redhat.com> wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> The request's r_session maybe changed when it was forwarded or
>>>> resent.
>>>>
>>>> Cc: stable@vger.kernel.org
>>>> URL: https://bugzilla.redhat.com/show_bug.cgi?id=2137955
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/caps.c | 88 +++++++++++++++++++-------------------------------
>>>>    1 file changed, 33 insertions(+), 55 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>> index 894adfb4a092..172f18f7459d 100644
>>>> --- a/fs/ceph/caps.c
>>>> +++ b/fs/ceph/caps.c
>>>> @@ -2297,8 +2297,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>>>           struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>>>>           struct ceph_inode_info *ci = ceph_inode(inode);
>>>>           struct ceph_mds_request *req1 = NULL, *req2 = NULL;
>>>> +       struct ceph_mds_session *s, **sessions = NULL;
>>> Hi Xiubo,
>>>
>>> Nit: mixing pointers and double pointers coupled with differing
>>> initialization is generally frowned upon.  Keep it on two lines as
>>> before:
>>>
>>>       struct ceph_mds_session **sessions = NULL;
>>>       struct ceph_mds_session *s;
>> Sure, will fix it.
>>
>>>>           unsigned int max_sessions;
>>>> -       int ret, err = 0;
>>>> +       int i, ret, err = 0;
>>>>
>>>>           spin_lock(&ci->i_unsafe_lock);
>>>>           if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
>>>> @@ -2315,31 +2316,22 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>>>           }
>>>>           spin_unlock(&ci->i_unsafe_lock);
>>>>
>>>> -       /*
>>>> -        * The mdsc->max_sessions is unlikely to be changed
>>>> -        * mostly, here we will retry it by reallocating the
>>>> -        * sessions array memory to get rid of the mdsc->mutex
>>>> -        * lock.
>>>> -        */
>>>> -retry:
>>>> -       max_sessions = mdsc->max_sessions;
>>>> -
>>>>           /*
>>>>            * Trigger to flush the journal logs in all the relevant MDSes
>>>>            * manually, or in the worst case we must wait at most 5 seconds
>>>>            * to wait the journal logs to be flushed by the MDSes periodically.
>>>>            */
>>>> +       mutex_lock(&mdsc->mutex);
>>>> +       max_sessions = mdsc->max_sessions;
>>>> +       sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
>>>> +       if (!sessions) {
>>>> +               mutex_unlock(&mdsc->mutex);
>>>> +               err = -ENOMEM;
>>>> +               goto out;
>>>> +       }
>>>> +
>>>>           if ((req1 || req2) && likely(max_sessions)) {
>>> Just curious, when can max_sessions be zero here?
>> Checked the code again, just before registering the first session, and
>> this is monotone increasing. It should be safe to remove this here.
>>
>>
>>>> -               struct ceph_mds_session **sessions = NULL;
>>>> -               struct ceph_mds_session *s;
>>>>                   struct ceph_mds_request *req;
>>>> -               int i;
>>>> -
>>>> -               sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
>>>> -               if (!sessions) {
>>>> -                       err = -ENOMEM;
>>>> -                       goto out;
>>>> -               }
>>>>
>>>>                   spin_lock(&ci->i_unsafe_lock);
>>>>                   if (req1) {
>>>> @@ -2348,16 +2340,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>>>                                   s = req->r_session;
>>>>                                   if (!s)
>>>>                                           continue;
>>>> -                               if (unlikely(s->s_mds >= max_sessions)) {
>>>> -                                       spin_unlock(&ci->i_unsafe_lock);
>>>> -                                       for (i = 0; i < max_sessions; i++) {
>>>> -                                               s = sessions[i];
>>>> -                                               if (s)
>>>> -                                                       ceph_put_mds_session(s);
>>>> -                                       }
>>>> -                                       kfree(sessions);
>>>> -                                       goto retry;
>>>> -                               }
>>>> +                               if (unlikely(s->s_mds >= max_sessions))
>>>> +                                       continue;
>>> Nit: this could be combined with the previous condition:
>>>
>>>       if (!s || unlikely(s->s_mds >= max_sessions))
>>>               continue;
>> Sure.
>>
>>
>>>>                                   if (!sessions[s->s_mds]) {
>>>>                                           s = ceph_get_mds_session(s);
>>>>                                           sessions[s->s_mds] = s;
>>>> @@ -2370,16 +2354,8 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>>>                                   s = req->r_session;
>>>>                                   if (!s)
>>>>                                           continue;
>>>> -                               if (unlikely(s->s_mds >= max_sessions)) {
>>>> -                                       spin_unlock(&ci->i_unsafe_lock);
>>>> -                                       for (i = 0; i < max_sessions; i++) {
>>>> -                                               s = sessions[i];
>>>> -                                               if (s)
>>>> -                                                       ceph_put_mds_session(s);
>>>> -                                       }
>>>> -                                       kfree(sessions);
>>>> -                                       goto retry;
>>>> -                               }
>>>> +                               if (unlikely(s->s_mds >= max_sessions))
>>>> +                                       continue;
>>> ditto
>>>
>>>>                                   if (!sessions[s->s_mds]) {
>>>>                                           s = ceph_get_mds_session(s);
>>>>                                           sessions[s->s_mds] = s;
>>>> @@ -2387,25 +2363,26 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
>>>>                           }
>>>>                   }
>>>>                   spin_unlock(&ci->i_unsafe_lock);
>>>> +       }
>>>> +       mutex_unlock(&mdsc->mutex);
>>>>
>>>> -               /* the auth MDS */
>>>> -               spin_lock(&ci->i_ceph_lock);
>>>> -               if (ci->i_auth_cap) {
>>>> -                     s = ci->i_auth_cap->session;
>>>> -                     if (!sessions[s->s_mds])
>>>> -                             sessions[s->s_mds] = ceph_get_mds_session(s);
>>>> -               }
>>>> -               spin_unlock(&ci->i_ceph_lock);
>>>> +       /* the auth MDS */
>>>> +       spin_lock(&ci->i_ceph_lock);
>>> Why was this "auth MDS" block moved outside of max_sessions > 0
>>> branch?  Logically, it very much belongs there.  Is there a problem
>>> with taking ci->i_ceph_lock under mdsc->mutex?
>> I will remove the 'likely(max_session)' and there is no any problem for
>> this.
>>
>>>> +       if (ci->i_auth_cap) {
>>>> +               s = ci->i_auth_cap->session;
>>>> +               if (!sessions[s->s_mds] &&
>>>> +                   likely(s->s_mds < max_sessions))
>>> This is wrong: s->s_mds must be checked against max_sessions before
>>> indexing into sessions array.  Also, the entire condition should fit on
>>> a single line.
>> I am moving it to the if(req1 || req2) {} scope and it will exceed 80
>> chars. And will keep it in two lines.
> If you are removing max_session > 0 condition, I don't see a need to
> move this to "if (req1 || req2)" scope.  I suggested that only because
> existing code was explicitly guarding against max_session == 0.

I checked old code again, I think we should move it to this scope. 
Because only when both req1 and req2 are not NULL will it make sense to 
send the mdlog flushing request to MDS.

Thanks!

- Xiubo

> Thanks,
>
>                  Ilya
>
diff mbox series

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 894adfb4a092..172f18f7459d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2297,8 +2297,9 @@  static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+	struct ceph_mds_session *s, **sessions = NULL;
 	unsigned int max_sessions;
-	int ret, err = 0;
+	int i, ret, err = 0;
 
 	spin_lock(&ci->i_unsafe_lock);
 	if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
@@ -2315,31 +2316,22 @@  static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 	}
 	spin_unlock(&ci->i_unsafe_lock);
 
-	/*
-	 * The mdsc->max_sessions is unlikely to be changed
-	 * mostly, here we will retry it by reallocating the
-	 * sessions array memory to get rid of the mdsc->mutex
-	 * lock.
-	 */
-retry:
-	max_sessions = mdsc->max_sessions;
-
 	/*
 	 * Trigger to flush the journal logs in all the relevant MDSes
 	 * manually, or in the worst case we must wait at most 5 seconds
 	 * to wait the journal logs to be flushed by the MDSes periodically.
 	 */
+	mutex_lock(&mdsc->mutex);
+	max_sessions = mdsc->max_sessions;
+	sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
+	if (!sessions) {
+		mutex_unlock(&mdsc->mutex);
+		err = -ENOMEM;
+		goto out;
+	}
+
 	if ((req1 || req2) && likely(max_sessions)) {
-		struct ceph_mds_session **sessions = NULL;
-		struct ceph_mds_session *s;
 		struct ceph_mds_request *req;
-		int i;
-
-		sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
-		if (!sessions) {
-			err = -ENOMEM;
-			goto out;
-		}
 
 		spin_lock(&ci->i_unsafe_lock);
 		if (req1) {
@@ -2348,16 +2340,8 @@  static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 				s = req->r_session;
 				if (!s)
 					continue;
-				if (unlikely(s->s_mds >= max_sessions)) {
-					spin_unlock(&ci->i_unsafe_lock);
-					for (i = 0; i < max_sessions; i++) {
-						s = sessions[i];
-						if (s)
-							ceph_put_mds_session(s);
-					}
-					kfree(sessions);
-					goto retry;
-				}
+				if (unlikely(s->s_mds >= max_sessions))
+					continue;
 				if (!sessions[s->s_mds]) {
 					s = ceph_get_mds_session(s);
 					sessions[s->s_mds] = s;
@@ -2370,16 +2354,8 @@  static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 				s = req->r_session;
 				if (!s)
 					continue;
-				if (unlikely(s->s_mds >= max_sessions)) {
-					spin_unlock(&ci->i_unsafe_lock);
-					for (i = 0; i < max_sessions; i++) {
-						s = sessions[i];
-						if (s)
-							ceph_put_mds_session(s);
-					}
-					kfree(sessions);
-					goto retry;
-				}
+				if (unlikely(s->s_mds >= max_sessions))
+					continue;
 				if (!sessions[s->s_mds]) {
 					s = ceph_get_mds_session(s);
 					sessions[s->s_mds] = s;
@@ -2387,25 +2363,26 @@  static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 			}
 		}
 		spin_unlock(&ci->i_unsafe_lock);
+	}
+	mutex_unlock(&mdsc->mutex);
 
-		/* the auth MDS */
-		spin_lock(&ci->i_ceph_lock);
-		if (ci->i_auth_cap) {
-		      s = ci->i_auth_cap->session;
-		      if (!sessions[s->s_mds])
-			      sessions[s->s_mds] = ceph_get_mds_session(s);
-		}
-		spin_unlock(&ci->i_ceph_lock);
+	/* the auth MDS */
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_auth_cap) {
+		s = ci->i_auth_cap->session;
+		if (!sessions[s->s_mds] &&
+		    likely(s->s_mds < max_sessions))
+			sessions[s->s_mds] = ceph_get_mds_session(s);
+	}
+	spin_unlock(&ci->i_ceph_lock);
 
-		/* send flush mdlog request to MDSes */
-		for (i = 0; i < max_sessions; i++) {
-			s = sessions[i];
-			if (s) {
-				send_flush_mdlog(s);
-				ceph_put_mds_session(s);
-			}
+	/* send flush mdlog request to MDSes */
+	for (i = 0; i < max_sessions; i++) {
+		s = sessions[i];
+		if (s) {
+			send_flush_mdlog(s);
+			ceph_put_mds_session(s);
 		}
-		kfree(sessions);
 	}
 
 	dout("%s %p wait on tid %llu %llu\n", __func__,
@@ -2428,6 +2405,7 @@  static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 		ceph_mdsc_put_request(req1);
 	if (req2)
 		ceph_mdsc_put_request(req2);
+	kfree(sessions);
 	return err;
 }