ceph: always clone dentry name when building MDS request
diff mbox series

Message ID 20190415164515.25767-1-jlayton@kernel.org
State New
Headers show
Series
  • ceph: always clone dentry name when building MDS request
Related show

Commit Message

Jeff Layton April 15, 2019, 4:45 p.m. UTC
Ben reported tripping the BUG_ON in create_request_message during some
performance testing. Analysis of the vmcore showed that the length of
the r_dentry->d_name string changed after we allocated the buffer, but
before we encoded it.

build_dentry_path returns pointers to d_name in the common case of
non-snapped dentries, but this optimization isn't safe. Instead, make a
copy of the string in this case. That is less efficient, but safe.

Reported-by: Ben England <bengland@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

Comments

Jeff Layton April 15, 2019, 6:12 p.m. UTC | #1
On Mon, Apr 15, 2019 at 12:45 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> Ben reported tripping the BUG_ON in create_request_message during some
> performance testing. Analysis of the vmcore showed that the length of
> the r_dentry->d_name string changed after we allocated the buffer, but
> before we encoded it.
>
> build_dentry_path returns pointers to d_name in the common case of
> non-snapped dentries, but this optimization isn't safe. Instead, make a
> copy of the string in this case. That is less efficient, but safe.
>
> Reported-by: Ben England <bengland@redhat.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
>  1 file changed, 32 insertions(+), 8 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index e09feadd2ae1..4cfefe118128 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
>         return path;
>  }
>
> +/* Duplicate the dentry->d_name.name safely */
> +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> +                            int *ppathlen)
> +{
> +       u32 len;
> +       char *name;
> +retry:
> +       len = READ_ONCE(dentry->d_name.len);
> +       name = kmalloc(len + 1, GFP_NOFS);
> +       if (!name)
> +               return -ENOMEM;
> +
> +       spin_lock(&dentry->d_lock);
> +       if (dentry->d_name.len != len) {
> +               spin_unlock(&dentry->d_lock);
> +               kfree(name);
> +               goto retry;
> +       }
> +       memcpy(name, dentry->d_name.name, len);
> +       spin_unlock(&dentry->d_lock);
> +
> +       name[len] = '\0';
> +       *ppath = name;
> +       *ppathlen = len;
> +       return 0;
> +}
> +
>  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> -                            const char **ppath, int *ppathlen, u64 *pino,
> -                            int *pfreepath)
> +                            const char **ppath, int *ppathlen, u64 *pino)
>  {
>         char *path;
>
> @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
>         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
>                 *pino = ceph_ino(dir);
>                 rcu_read_unlock();
> -               *ppath = dentry->d_name.name;
> -               *ppathlen = dentry->d_name.len;
> -               return 0;
> +               return clone_dentry_name(dentry, ppath, ppathlen);
>         }
>         rcu_read_unlock();
>         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
>         if (IS_ERR(path))
>                 return PTR_ERR(path);
>         *ppath = path;
> -       *pfreepath = 1;
>         return 0;
>  }
>
> @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
>                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
>                      ceph_snap(rinode));
>         } else if (rdentry) {
> -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> -                                       freepath);
> +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> +               if (!r)
> +                       *freepath = 1;
>                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
>                      *ppath);
>         } else if (rpath || rino) {
> --
> 2.20.1
>

We should probably get this into v5.1 and mark it for stable as well?
It's a longstanding bug.
Patrick Donnelly April 15, 2019, 8:41 p.m. UTC | #2
On Mon, Apr 15, 2019 at 9:45 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> Ben reported tripping the BUG_ON in create_request_message during some
> performance testing. Analysis of the vmcore showed that the length of
> the r_dentry->d_name string changed after we allocated the buffer, but
> before we encoded it.
>
> build_dentry_path returns pointers to d_name in the common case of
> non-snapped dentries, but this optimization isn't safe. Instead, make a
> copy of the string in this case. That is less efficient, but safe.
>
> Reported-by: Ben England <bengland@redhat.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
>  1 file changed, 32 insertions(+), 8 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index e09feadd2ae1..4cfefe118128 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
>         return path;
>  }
>
> +/* Duplicate the dentry->d_name.name safely */
> +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> +                            int *ppathlen)
> +{
> +       u32 len;
> +       char *name;
> +retry:
> +       len = READ_ONCE(dentry->d_name.len);
> +       name = kmalloc(len + 1, GFP_NOFS);
> +       if (!name)
> +               return -ENOMEM;
> +
> +       spin_lock(&dentry->d_lock);
> +       if (dentry->d_name.len != len) {
> +               spin_unlock(&dentry->d_lock);
> +               kfree(name);
> +               goto retry;
> +       }
> +       memcpy(name, dentry->d_name.name, len);
> +       spin_unlock(&dentry->d_lock);
> +
> +       name[len] = '\0';
> +       *ppath = name;
> +       *ppathlen = len;
> +       return 0;
> +}

I would think passing a char array of PATH_MAX would be preferable to
a heap allocation. Is that not typical?
Jeff Layton April 15, 2019, 9:14 p.m. UTC | #3
On Mon, Apr 15, 2019 at 4:41 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
>
> On Mon, Apr 15, 2019 at 9:45 AM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > Ben reported tripping the BUG_ON in create_request_message during some
> > performance testing. Analysis of the vmcore showed that the length of
> > the r_dentry->d_name string changed after we allocated the buffer, but
> > before we encoded it.
> >
> > build_dentry_path returns pointers to d_name in the common case of
> > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > copy of the string in this case. That is less efficient, but safe.
> >
> > Reported-by: Ben England <bengland@redhat.com>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> >  1 file changed, 32 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index e09feadd2ae1..4cfefe118128 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> >         return path;
> >  }
> >
> > +/* Duplicate the dentry->d_name.name safely */
> > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > +                            int *ppathlen)
> > +{
> > +       u32 len;
> > +       char *name;
> > +retry:
> > +       len = READ_ONCE(dentry->d_name.len);
> > +       name = kmalloc(len + 1, GFP_NOFS);
> > +       if (!name)
> > +               return -ENOMEM;
> > +
> > +       spin_lock(&dentry->d_lock);
> > +       if (dentry->d_name.len != len) {
> > +               spin_unlock(&dentry->d_lock);
> > +               kfree(name);
> > +               goto retry;
> > +       }
> > +       memcpy(name, dentry->d_name.name, len);
> > +       spin_unlock(&dentry->d_lock);
> > +
> > +       name[len] = '\0';
> > +       *ppath = name;
> > +       *ppathlen = len;
> > +       return 0;
> > +}
>
> I would think passing a char array of PATH_MAX would be preferable to
> a heap allocation. Is that not typical?
>

In general I don't like doing big allocations on the stack in the
kernel, as it is a finite resource.

That said, we'd technically only need a NAME_MAX+1 (256 byte) array
here, which is not _too_ bad. If that's what the maintainers prefer,
we could do that instead.
--
Jeff Layton <jlayton@kernel.org>
Yan, Zheng April 16, 2019, 2:24 a.m. UTC | #4
On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> Ben reported tripping the BUG_ON in create_request_message during some
> performance testing. Analysis of the vmcore showed that the length of
> the r_dentry->d_name string changed after we allocated the buffer, but
> before we encoded it.
>
> build_dentry_path returns pointers to d_name in the common case of
> non-snapped dentries, but this optimization isn't safe. Instead, make a
> copy of the string in this case. That is less efficient, but safe.
>
> Reported-by: Ben England <bengland@redhat.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
>  1 file changed, 32 insertions(+), 8 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index e09feadd2ae1..4cfefe118128 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
>         return path;
>  }
>
> +/* Duplicate the dentry->d_name.name safely */
> +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> +                            int *ppathlen)
> +{
> +       u32 len;
> +       char *name;
> +retry:
> +       len = READ_ONCE(dentry->d_name.len);
> +       name = kmalloc(len + 1, GFP_NOFS);
> +       if (!name)
> +               return -ENOMEM;
> +
> +       spin_lock(&dentry->d_lock);
> +       if (dentry->d_name.len != len) {
> +               spin_unlock(&dentry->d_lock);
> +               kfree(name);
> +               goto retry;
> +       }
> +       memcpy(name, dentry->d_name.name, len);
> +       spin_unlock(&dentry->d_lock);
> +
> +       name[len] = '\0';
> +       *ppath = name;
> +       *ppathlen = len;
> +       return 0;
> +}
> +
>  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> -                            const char **ppath, int *ppathlen, u64 *pino,
> -                            int *pfreepath)
> +                            const char **ppath, int *ppathlen, u64 *pino)
>  {
>         char *path;
>
> @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
>         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
>                 *pino = ceph_ino(dir);
>                 rcu_read_unlock();
> -               *ppath = dentry->d_name.name;
> -               *ppathlen = dentry->d_name.len;
> -               return 0;
> +               return clone_dentry_name(dentry, ppath, ppathlen);
>         }
>         rcu_read_unlock();
>         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
>         if (IS_ERR(path))
>                 return PTR_ERR(path);
>         *ppath = path;
> -       *pfreepath = 1;
>         return 0;
>  }
>
> @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
>                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
>                      ceph_snap(rinode));
>         } else if (rdentry) {
> -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> -                                       freepath);
> +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> +               if (!r)
> +                       *freepath = 1;
>                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
>                      *ppath);
>         } else if (rpath || rino) {
> --
> 2.20.1
>

Looks good. I think we also need to update ceph_fill_trace(), make it
handle that case rinfo->dname != req->r_dentry->d_name.

Regards
Yan, Zheng
Jeff Layton April 16, 2019, 11:10 a.m. UTC | #5
On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote:
>
> On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > Ben reported tripping the BUG_ON in create_request_message during some
> > performance testing. Analysis of the vmcore showed that the length of
> > the r_dentry->d_name string changed after we allocated the buffer, but
> > before we encoded it.
> >
> > build_dentry_path returns pointers to d_name in the common case of
> > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > copy of the string in this case. That is less efficient, but safe.
> >
> > Reported-by: Ben England <bengland@redhat.com>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> >  1 file changed, 32 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index e09feadd2ae1..4cfefe118128 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> >         return path;
> >  }
> >
> > +/* Duplicate the dentry->d_name.name safely */
> > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > +                            int *ppathlen)
> > +{
> > +       u32 len;
> > +       char *name;
> > +retry:
> > +       len = READ_ONCE(dentry->d_name.len);
> > +       name = kmalloc(len + 1, GFP_NOFS);
> > +       if (!name)
> > +               return -ENOMEM;
> > +
> > +       spin_lock(&dentry->d_lock);
> > +       if (dentry->d_name.len != len) {
> > +               spin_unlock(&dentry->d_lock);
> > +               kfree(name);
> > +               goto retry;
> > +       }
> > +       memcpy(name, dentry->d_name.name, len);
> > +       spin_unlock(&dentry->d_lock);
> > +
> > +       name[len] = '\0';
> > +       *ppath = name;
> > +       *ppathlen = len;
> > +       return 0;
> > +}
> > +
> >  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > -                            const char **ppath, int *ppathlen, u64 *pino,
> > -                            int *pfreepath)
> > +                            const char **ppath, int *ppathlen, u64 *pino)
> >  {
> >         char *path;
> >
> > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> >         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
> >                 *pino = ceph_ino(dir);
> >                 rcu_read_unlock();
> > -               *ppath = dentry->d_name.name;
> > -               *ppathlen = dentry->d_name.len;
> > -               return 0;
> > +               return clone_dentry_name(dentry, ppath, ppathlen);
> >         }
> >         rcu_read_unlock();
> >         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
> >         if (IS_ERR(path))
> >                 return PTR_ERR(path);
> >         *ppath = path;
> > -       *pfreepath = 1;
> >         return 0;
> >  }
> >
> > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
> >                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> >                      ceph_snap(rinode));
> >         } else if (rdentry) {
> > -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> > -                                       freepath);
> > +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> > +               if (!r)
> > +                       *freepath = 1;
> >                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
> >                      *ppath);
> >         } else if (rpath || rino) {
> > --
> > 2.20.1
> >
>
> Looks good. I think we also need to update ceph_fill_trace(), make it
> handle that case rinfo->dname != req->r_dentry->d_name.
>

Good catch. I'll send a separate patch for that one.
Jeff Layton April 16, 2019, 2:18 p.m. UTC | #6
On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote:
> >
> > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > Ben reported tripping the BUG_ON in create_request_message during some
> > > performance testing. Analysis of the vmcore showed that the length of
> > > the r_dentry->d_name string changed after we allocated the buffer, but
> > > before we encoded it.
> > >
> > > build_dentry_path returns pointers to d_name in the common case of
> > > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > > copy of the string in this case. That is less efficient, but safe.
> > >
> > > Reported-by: Ben England <bengland@redhat.com>
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> > >  1 file changed, 32 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index e09feadd2ae1..4cfefe118128 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> > >         return path;
> > >  }
> > >
> > > +/* Duplicate the dentry->d_name.name safely */
> > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > > +                            int *ppathlen)
> > > +{
> > > +       u32 len;
> > > +       char *name;
> > > +retry:
> > > +       len = READ_ONCE(dentry->d_name.len);
> > > +       name = kmalloc(len + 1, GFP_NOFS);
> > > +       if (!name)
> > > +               return -ENOMEM;
> > > +
> > > +       spin_lock(&dentry->d_lock);
> > > +       if (dentry->d_name.len != len) {
> > > +               spin_unlock(&dentry->d_lock);
> > > +               kfree(name);
> > > +               goto retry;
> > > +       }
> > > +       memcpy(name, dentry->d_name.name, len);
> > > +       spin_unlock(&dentry->d_lock);
> > > +
> > > +       name[len] = '\0';
> > > +       *ppath = name;
> > > +       *ppathlen = len;
> > > +       return 0;
> > > +}
> > > +
> > >  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > -                            const char **ppath, int *ppathlen, u64 *pino,
> > > -                            int *pfreepath)
> > > +                            const char **ppath, int *ppathlen, u64 *pino)
> > >  {
> > >         char *path;
> > >
> > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > >         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
> > >                 *pino = ceph_ino(dir);
> > >                 rcu_read_unlock();
> > > -               *ppath = dentry->d_name.name;
> > > -               *ppathlen = dentry->d_name.len;
> > > -               return 0;
> > > +               return clone_dentry_name(dentry, ppath, ppathlen);
> > >         }
> > >         rcu_read_unlock();
> > >         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
> > >         if (IS_ERR(path))
> > >                 return PTR_ERR(path);
> > >         *ppath = path;
> > > -       *pfreepath = 1;
> > >         return 0;
> > >  }
> > >
> > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
> > >                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> > >                      ceph_snap(rinode));
> > >         } else if (rdentry) {
> > > -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> > > -                                       freepath);
> > > +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> > > +               if (!r)
> > > +                       *freepath = 1;
> > >                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
> > >                      *ppath);
> > >         } else if (rpath || rino) {
> > > --
> > > 2.20.1
> > >
> >
> > Looks good. I think we also need to update ceph_fill_trace(), make it
> > handle that case rinfo->dname != req->r_dentry->d_name.
> >
>
> Good catch. I'll send a separate patch for that one.
> --
> Jeff Layton <jlayton@kernel.org>


So to make sure I understand, the worry here is that we can end up
getting back an rinfo->dname that doesn't match what we have in
r_dentry from the initial call?

That is quite a different problem from what this patch fixes if so.
What should we do in the case where that occurs?
--
Jeff Layton <jlayton@kernel.org>
Yan, Zheng April 17, 2019, 2:14 a.m. UTC | #7
On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > >
> > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > Ben reported tripping the BUG_ON in create_request_message during some
> > > > performance testing. Analysis of the vmcore showed that the length of
> > > > the r_dentry->d_name string changed after we allocated the buffer, but
> > > > before we encoded it.
> > > >
> > > > build_dentry_path returns pointers to d_name in the common case of
> > > > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > > > copy of the string in this case. That is less efficient, but safe.
> > > >
> > > > Reported-by: Ben England <bengland@redhat.com>
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> > > >  1 file changed, 32 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > index e09feadd2ae1..4cfefe118128 100644
> > > > --- a/fs/ceph/mds_client.c
> > > > +++ b/fs/ceph/mds_client.c
> > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> > > >         return path;
> > > >  }
> > > >
> > > > +/* Duplicate the dentry->d_name.name safely */
> > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > > > +                            int *ppathlen)
> > > > +{
> > > > +       u32 len;
> > > > +       char *name;
> > > > +retry:
> > > > +       len = READ_ONCE(dentry->d_name.len);
> > > > +       name = kmalloc(len + 1, GFP_NOFS);
> > > > +       if (!name)
> > > > +               return -ENOMEM;
> > > > +
> > > > +       spin_lock(&dentry->d_lock);
> > > > +       if (dentry->d_name.len != len) {
> > > > +               spin_unlock(&dentry->d_lock);
> > > > +               kfree(name);
> > > > +               goto retry;
> > > > +       }
> > > > +       memcpy(name, dentry->d_name.name, len);
> > > > +       spin_unlock(&dentry->d_lock);
> > > > +
> > > > +       name[len] = '\0';
> > > > +       *ppath = name;
> > > > +       *ppathlen = len;
> > > > +       return 0;
> > > > +}
> > > > +
> > > >  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > -                            const char **ppath, int *ppathlen, u64 *pino,
> > > > -                            int *pfreepath)
> > > > +                            const char **ppath, int *ppathlen, u64 *pino)
> > > >  {
> > > >         char *path;
> > > >
> > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > >         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
> > > >                 *pino = ceph_ino(dir);
> > > >                 rcu_read_unlock();
> > > > -               *ppath = dentry->d_name.name;
> > > > -               *ppathlen = dentry->d_name.len;
> > > > -               return 0;
> > > > +               return clone_dentry_name(dentry, ppath, ppathlen);
> > > >         }
> > > >         rcu_read_unlock();
> > > >         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
> > > >         if (IS_ERR(path))
> > > >                 return PTR_ERR(path);
> > > >         *ppath = path;
> > > > -       *pfreepath = 1;
> > > >         return 0;
> > > >  }
> > > >
> > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
> > > >                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> > > >                      ceph_snap(rinode));
> > > >         } else if (rdentry) {
> > > > -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> > > > -                                       freepath);
> > > > +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> > > > +               if (!r)
> > > > +                       *freepath = 1;
> > > >                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
> > > >                      *ppath);
> > > >         } else if (rpath || rino) {
> > > > --
> > > > 2.20.1
> > > >
> > >
> > > Looks good. I think we also need to update ceph_fill_trace(), make it
> > > handle that case rinfo->dname != req->r_dentry->d_name.
> > >
> >
> > Good catch. I'll send a separate patch for that one.
> > --
> > Jeff Layton <jlayton@kernel.org>
>
>
> So to make sure I understand, the worry here is that we can end up
> getting back an rinfo->dname that doesn't match what we have in
> r_dentry from the initial call?
>
> That is quite a different problem from what this patch fixes if so.
> What should we do in the case where that occurs?

keep r_dentry untouched and return a error?  ceph_d_revalidate()
return -ECHILD after it gets the error.

Regards
Yan, Zheng

> --
> Jeff Layton <jlayton@kernel.org>
Jeff Layton April 17, 2019, 11:48 a.m. UTC | #8
On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote:
>
> On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > > >
> > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > >
> > > > > Ben reported tripping the BUG_ON in create_request_message during some
> > > > > performance testing. Analysis of the vmcore showed that the length of
> > > > > the r_dentry->d_name string changed after we allocated the buffer, but
> > > > > before we encoded it.
> > > > >
> > > > > build_dentry_path returns pointers to d_name in the common case of
> > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > > > > copy of the string in this case. That is less efficient, but safe.
> > > > >
> > > > > Reported-by: Ben England <bengland@redhat.com>
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > > >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> > > > >  1 file changed, 32 insertions(+), 8 deletions(-)
> > > > >
> > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > index e09feadd2ae1..4cfefe118128 100644
> > > > > --- a/fs/ceph/mds_client.c
> > > > > +++ b/fs/ceph/mds_client.c
> > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> > > > >         return path;
> > > > >  }
> > > > >
> > > > > +/* Duplicate the dentry->d_name.name safely */
> > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > > > > +                            int *ppathlen)
> > > > > +{
> > > > > +       u32 len;
> > > > > +       char *name;
> > > > > +retry:
> > > > > +       len = READ_ONCE(dentry->d_name.len);
> > > > > +       name = kmalloc(len + 1, GFP_NOFS);
> > > > > +       if (!name)
> > > > > +               return -ENOMEM;
> > > > > +
> > > > > +       spin_lock(&dentry->d_lock);
> > > > > +       if (dentry->d_name.len != len) {
> > > > > +               spin_unlock(&dentry->d_lock);
> > > > > +               kfree(name);
> > > > > +               goto retry;
> > > > > +       }
> > > > > +       memcpy(name, dentry->d_name.name, len);
> > > > > +       spin_unlock(&dentry->d_lock);
> > > > > +
> > > > > +       name[len] = '\0';
> > > > > +       *ppath = name;
> > > > > +       *ppathlen = len;
> > > > > +       return 0;
> > > > > +}
> > > > > +
> > > > >  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > > -                            const char **ppath, int *ppathlen, u64 *pino,
> > > > > -                            int *pfreepath)
> > > > > +                            const char **ppath, int *ppathlen, u64 *pino)
> > > > >  {
> > > > >         char *path;
> > > > >
> > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > >         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
> > > > >                 *pino = ceph_ino(dir);
> > > > >                 rcu_read_unlock();
> > > > > -               *ppath = dentry->d_name.name;
> > > > > -               *ppathlen = dentry->d_name.len;
> > > > > -               return 0;
> > > > > +               return clone_dentry_name(dentry, ppath, ppathlen);
> > > > >         }
> > > > >         rcu_read_unlock();
> > > > >         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
> > > > >         if (IS_ERR(path))
> > > > >                 return PTR_ERR(path);
> > > > >         *ppath = path;
> > > > > -       *pfreepath = 1;
> > > > >         return 0;
> > > > >  }
> > > > >
> > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
> > > > >                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> > > > >                      ceph_snap(rinode));
> > > > >         } else if (rdentry) {
> > > > > -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> > > > > -                                       freepath);
> > > > > +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> > > > > +               if (!r)
> > > > > +                       *freepath = 1;
> > > > >                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
> > > > >                      *ppath);
> > > > >         } else if (rpath || rino) {
> > > > > --
> > > > > 2.20.1
> > > > >
> > > >
> > > > Looks good. I think we also need to update ceph_fill_trace(), make it
> > > > handle that case rinfo->dname != req->r_dentry->d_name.
> > > >
> > >
> > > Good catch. I'll send a separate patch for that one.
> > > --
> > > Jeff Layton <jlayton@kernel.org>
> >
> >
> > So to make sure I understand, the worry here is that we can end up
> > getting back an rinfo->dname that doesn't match what we have in
> > r_dentry from the initial call?
> >
> > That is quite a different problem from what this patch fixes if so.
> > What should we do in the case where that occurs?
>
> keep r_dentry untouched and return a error?  ceph_d_revalidate()
> return -ECHILD after it gets the error.
>

We wouldn't do a synchronous MDS call in rcuwalk mode, so returning
-ECHILD would probably not be what we'd want to do. We could have this
happen in any reply (not just d_revalidate), in principle, so I think
we need a general solution here.

I think you're probably right that we should not touch r_dentry in
that case though. I'll look at grafting that logic into this.

In the meantime, I'd like to see this patch and the one I sent
yesterday in v5.1 and stable kernels. Did you want to add your
Reviewed-by?
Yan, Zheng April 17, 2019, 1:04 p.m. UTC | #9
On Wed, Apr 17, 2019 at 7:48 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote:
> >
> > On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > > > >
> > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > >
> > > > > > Ben reported tripping the BUG_ON in create_request_message during some
> > > > > > performance testing. Analysis of the vmcore showed that the length of
> > > > > > the r_dentry->d_name string changed after we allocated the buffer, but
> > > > > > before we encoded it.
> > > > > >
> > > > > > build_dentry_path returns pointers to d_name in the common case of
> > > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > > > > > copy of the string in this case. That is less efficient, but safe.
> > > > > >
> > > > > > Reported-by: Ben England <bengland@redhat.com>
> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > ---
> > > > > >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> > > > > >  1 file changed, 32 insertions(+), 8 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > index e09feadd2ae1..4cfefe118128 100644
> > > > > > --- a/fs/ceph/mds_client.c
> > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> > > > > >         return path;
> > > > > >  }
> > > > > >
> > > > > > +/* Duplicate the dentry->d_name.name safely */
> > > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > > > > > +                            int *ppathlen)
> > > > > > +{
> > > > > > +       u32 len;
> > > > > > +       char *name;
> > > > > > +retry:
> > > > > > +       len = READ_ONCE(dentry->d_name.len);
> > > > > > +       name = kmalloc(len + 1, GFP_NOFS);
> > > > > > +       if (!name)
> > > > > > +               return -ENOMEM;
> > > > > > +
> > > > > > +       spin_lock(&dentry->d_lock);
> > > > > > +       if (dentry->d_name.len != len) {
> > > > > > +               spin_unlock(&dentry->d_lock);
> > > > > > +               kfree(name);
> > > > > > +               goto retry;
> > > > > > +       }
> > > > > > +       memcpy(name, dentry->d_name.name, len);
> > > > > > +       spin_unlock(&dentry->d_lock);
> > > > > > +
> > > > > > +       name[len] = '\0';
> > > > > > +       *ppath = name;
> > > > > > +       *ppathlen = len;
> > > > > > +       return 0;
> > > > > > +}
> > > > > > +
> > > > > >  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > > > -                            const char **ppath, int *ppathlen, u64 *pino,
> > > > > > -                            int *pfreepath)
> > > > > > +                            const char **ppath, int *ppathlen, u64 *pino)
> > > > > >  {
> > > > > >         char *path;
> > > > > >
> > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > > >         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
> > > > > >                 *pino = ceph_ino(dir);
> > > > > >                 rcu_read_unlock();
> > > > > > -               *ppath = dentry->d_name.name;
> > > > > > -               *ppathlen = dentry->d_name.len;
> > > > > > -               return 0;
> > > > > > +               return clone_dentry_name(dentry, ppath, ppathlen);
> > > > > >         }
> > > > > >         rcu_read_unlock();
> > > > > >         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
> > > > > >         if (IS_ERR(path))
> > > > > >                 return PTR_ERR(path);
> > > > > >         *ppath = path;
> > > > > > -       *pfreepath = 1;
> > > > > >         return 0;
> > > > > >  }
> > > > > >
> > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
> > > > > >                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> > > > > >                      ceph_snap(rinode));
> > > > > >         } else if (rdentry) {
> > > > > > -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> > > > > > -                                       freepath);
> > > > > > +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> > > > > > +               if (!r)
> > > > > > +                       *freepath = 1;
> > > > > >                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
> > > > > >                      *ppath);
> > > > > >         } else if (rpath || rino) {
> > > > > > --
> > > > > > 2.20.1
> > > > > >
> > > > >
> > > > > Looks good. I think we also need to update ceph_fill_trace(), make it
> > > > > handle that case rinfo->dname != req->r_dentry->d_name.
> > > > >
> > > >
> > > > Good catch. I'll send a separate patch for that one.
> > > > --
> > > > Jeff Layton <jlayton@kernel.org>
> > >
> > >
> > > So to make sure I understand, the worry here is that we can end up
> > > getting back an rinfo->dname that doesn't match what we have in
> > > r_dentry from the initial call?
> > >
> > > That is quite a different problem from what this patch fixes if so.
> > > What should we do in the case where that occurs?
> >
> > keep r_dentry untouched and return a error?  ceph_d_revalidate()
> > return -ECHILD after it gets the error.
> >
>
> We wouldn't do a synchronous MDS call in rcuwalk mode, so returning
> -ECHILD would probably not be what we'd want to do. We could have this
> happen in any reply (not just d_revalidate), in principle, so I think
> we need a general solution here.
>

why it can happen in any reply? I think it can happen only when
i_mutex of parent dir is not locked.


> I think you're probably right that we should not touch r_dentry in
> that case though. I'll look at grafting that logic into this.
>
> In the meantime, I'd like to see this patch and the one I sent
> yesterday in v5.1 and stable kernels. Did you want to add your
> Reviewed-by?

Sorry for the delay

Yan, Zheng


> --
> Jeff Layton <jlayton@kernel.org>
Jeff Layton April 17, 2019, 1:28 p.m. UTC | #10
On Wed, Apr 17, 2019 at 9:05 AM Yan, Zheng <ukernel@gmail.com> wrote:
>
> On Wed, Apr 17, 2019 at 7:48 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > >
> > > On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > >
> > > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > >
> > > > > > > Ben reported tripping the BUG_ON in create_request_message during some
> > > > > > > performance testing. Analysis of the vmcore showed that the length of
> > > > > > > the r_dentry->d_name string changed after we allocated the buffer, but
> > > > > > > before we encoded it.
> > > > > > >
> > > > > > > build_dentry_path returns pointers to d_name in the common case of
> > > > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > > > > > > copy of the string in this case. That is less efficient, but safe.
> > > > > > >
> > > > > > > Reported-by: Ben England <bengland@redhat.com>
> > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > ---
> > > > > > >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> > > > > > >  1 file changed, 32 insertions(+), 8 deletions(-)
> > > > > > >
> > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > index e09feadd2ae1..4cfefe118128 100644
> > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> > > > > > >         return path;
> > > > > > >  }
> > > > > > >
> > > > > > > +/* Duplicate the dentry->d_name.name safely */
> > > > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > > > > > > +                            int *ppathlen)
> > > > > > > +{
> > > > > > > +       u32 len;
> > > > > > > +       char *name;
> > > > > > > +retry:
> > > > > > > +       len = READ_ONCE(dentry->d_name.len);
> > > > > > > +       name = kmalloc(len + 1, GFP_NOFS);
> > > > > > > +       if (!name)
> > > > > > > +               return -ENOMEM;
> > > > > > > +
> > > > > > > +       spin_lock(&dentry->d_lock);
> > > > > > > +       if (dentry->d_name.len != len) {
> > > > > > > +               spin_unlock(&dentry->d_lock);
> > > > > > > +               kfree(name);
> > > > > > > +               goto retry;
> > > > > > > +       }
> > > > > > > +       memcpy(name, dentry->d_name.name, len);
> > > > > > > +       spin_unlock(&dentry->d_lock);
> > > > > > > +
> > > > > > > +       name[len] = '\0';
> > > > > > > +       *ppath = name;
> > > > > > > +       *ppathlen = len;
> > > > > > > +       return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > >  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > > > > -                            const char **ppath, int *ppathlen, u64 *pino,
> > > > > > > -                            int *pfreepath)
> > > > > > > +                            const char **ppath, int *ppathlen, u64 *pino)
> > > > > > >  {
> > > > > > >         char *path;
> > > > > > >
> > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > > > >         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
> > > > > > >                 *pino = ceph_ino(dir);
> > > > > > >                 rcu_read_unlock();
> > > > > > > -               *ppath = dentry->d_name.name;
> > > > > > > -               *ppathlen = dentry->d_name.len;
> > > > > > > -               return 0;
> > > > > > > +               return clone_dentry_name(dentry, ppath, ppathlen);
> > > > > > >         }
> > > > > > >         rcu_read_unlock();
> > > > > > >         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
> > > > > > >         if (IS_ERR(path))
> > > > > > >                 return PTR_ERR(path);
> > > > > > >         *ppath = path;
> > > > > > > -       *pfreepath = 1;
> > > > > > >         return 0;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
> > > > > > >                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> > > > > > >                      ceph_snap(rinode));
> > > > > > >         } else if (rdentry) {
> > > > > > > -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> > > > > > > -                                       freepath);
> > > > > > > +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> > > > > > > +               if (!r)
> > > > > > > +                       *freepath = 1;
> > > > > > >                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
> > > > > > >                      *ppath);
> > > > > > >         } else if (rpath || rino) {
> > > > > > > --
> > > > > > > 2.20.1
> > > > > > >
> > > > > >
> > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it
> > > > > > handle that case rinfo->dname != req->r_dentry->d_name.
> > > > > >
> > > > >
> > > > > Good catch. I'll send a separate patch for that one.
> > > > > --
> > > > > Jeff Layton <jlayton@kernel.org>
> > > >
> > > >
> > > > So to make sure I understand, the worry here is that we can end up
> > > > getting back an rinfo->dname that doesn't match what we have in
> > > > r_dentry from the initial call?
> > > >
> > > > That is quite a different problem from what this patch fixes if so.
> > > > What should we do in the case where that occurs?
> > >
> > > keep r_dentry untouched and return a error?  ceph_d_revalidate()
> > > return -ECHILD after it gets the error.
> > >
> >
> > We wouldn't do a synchronous MDS call in rcuwalk mode, so returning
> > -ECHILD would probably not be what we'd want to do. We could have this
> > happen in any reply (not just d_revalidate), in principle, so I think
> > we need a general solution here.
> >
>
> why it can happen in any reply? I think it can happen only when
> i_mutex of parent dir is not locked.
>

Could we not race against operations being done by other clients?

AFAIU, an MDS could set a dentry in the trace whenever it likes, and
it doesn't necessarily know anything about what r_dentry we have
stashed (other than the info that gets encoded into the request).

>
> > I think you're probably right that we should not touch r_dentry in
> > that case though. I'll look at grafting that logic into this.
> >
> > In the meantime, I'd like to see this patch and the one I sent
> > yesterday in v5.1 and stable kernels. Did you want to add your
> > Reviewed-by?
>
> Sorry for the delay
>

No problem. More important to get it right than fast.
Yan, Zheng April 17, 2019, 1:42 p.m. UTC | #11
On Wed, Apr 17, 2019 at 9:28 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, Apr 17, 2019 at 9:05 AM Yan, Zheng <ukernel@gmail.com> wrote:
> >
> > On Wed, Apr 17, 2019 at 7:48 PM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > > >
> > > > On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > >
> > > > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > >
> > > > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > > > > > >
> > > > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > >
> > > > > > > > Ben reported tripping the BUG_ON in create_request_message during some
> > > > > > > > performance testing. Analysis of the vmcore showed that the length of
> > > > > > > > the r_dentry->d_name string changed after we allocated the buffer, but
> > > > > > > > before we encoded it.
> > > > > > > >
> > > > > > > > build_dentry_path returns pointers to d_name in the common case of
> > > > > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a
> > > > > > > > copy of the string in this case. That is less efficient, but safe.
> > > > > > > >
> > > > > > > > Reported-by: Ben England <bengland@redhat.com>
> > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > ---
> > > > > > > >  fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++--------
> > > > > > > >  1 file changed, 32 insertions(+), 8 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > > index e09feadd2ae1..4cfefe118128 100644
> > > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
> > > > > > > >         return path;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +/* Duplicate the dentry->d_name.name safely */
> > > > > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath,
> > > > > > > > +                            int *ppathlen)
> > > > > > > > +{
> > > > > > > > +       u32 len;
> > > > > > > > +       char *name;
> > > > > > > > +retry:
> > > > > > > > +       len = READ_ONCE(dentry->d_name.len);
> > > > > > > > +       name = kmalloc(len + 1, GFP_NOFS);
> > > > > > > > +       if (!name)
> > > > > > > > +               return -ENOMEM;
> > > > > > > > +
> > > > > > > > +       spin_lock(&dentry->d_lock);
> > > > > > > > +       if (dentry->d_name.len != len) {
> > > > > > > > +               spin_unlock(&dentry->d_lock);
> > > > > > > > +               kfree(name);
> > > > > > > > +               goto retry;
> > > > > > > > +       }
> > > > > > > > +       memcpy(name, dentry->d_name.name, len);
> > > > > > > > +       spin_unlock(&dentry->d_lock);
> > > > > > > > +
> > > > > > > > +       name[len] = '\0';
> > > > > > > > +       *ppath = name;
> > > > > > > > +       *ppathlen = len;
> > > > > > > > +       return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > > > > > -                            const char **ppath, int *ppathlen, u64 *pino,
> > > > > > > > -                            int *pfreepath)
> > > > > > > > +                            const char **ppath, int *ppathlen, u64 *pino)
> > > > > > > >  {
> > > > > > > >         char *path;
> > > > > > > >
> > > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
> > > > > > > >         if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
> > > > > > > >                 *pino = ceph_ino(dir);
> > > > > > > >                 rcu_read_unlock();
> > > > > > > > -               *ppath = dentry->d_name.name;
> > > > > > > > -               *ppathlen = dentry->d_name.len;
> > > > > > > > -               return 0;
> > > > > > > > +               return clone_dentry_name(dentry, ppath, ppathlen);
> > > > > > > >         }
> > > > > > > >         rcu_read_unlock();
> > > > > > > >         path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
> > > > > > > >         if (IS_ERR(path))
> > > > > > > >                 return PTR_ERR(path);
> > > > > > > >         *ppath = path;
> > > > > > > > -       *pfreepath = 1;
> > > > > > > >         return 0;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
> > > > > > > >                 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> > > > > > > >                      ceph_snap(rinode));
> > > > > > > >         } else if (rdentry) {
> > > > > > > > -               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
> > > > > > > > -                                       freepath);
> > > > > > > > +               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
> > > > > > > > +               if (!r)
> > > > > > > > +                       *freepath = 1;
> > > > > > > >                 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
> > > > > > > >                      *ppath);
> > > > > > > >         } else if (rpath || rino) {
> > > > > > > > --
> > > > > > > > 2.20.1
> > > > > > > >
> > > > > > >
> > > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it
> > > > > > > handle that case rinfo->dname != req->r_dentry->d_name.
> > > > > > >
> > > > > >
> > > > > > Good catch. I'll send a separate patch for that one.
> > > > > > --
> > > > > > Jeff Layton <jlayton@kernel.org>
> > > > >
> > > > >
> > > > > So to make sure I understand, the worry here is that we can end up
> > > > > getting back an rinfo->dname that doesn't match what we have in
> > > > > r_dentry from the initial call?
> > > > >
> > > > > That is quite a different problem from what this patch fixes if so.
> > > > > What should we do in the case where that occurs?
> > > >
> > > > keep r_dentry untouched and return a error?  ceph_d_revalidate()
> > > > return -ECHILD after it gets the error.
> > > >
> > >
> > > We wouldn't do a synchronous MDS call in rcuwalk mode, so returning
> > > -ECHILD would probably not be what we'd want to do. We could have this
> > > happen in any reply (not just d_revalidate), in principle, so I think
> > > we need a general solution here.
> > >
> >
> > why it can happen in any reply? I think it can happen only when
> > i_mutex of parent dir is not locked.
> >
>
> Could we not race against operations being done by other clients?
>

I think no. operation by other clients just make dentry lease invalid.

> AFAIU, an MDS could set a dentry in the trace whenever it likes, and
> it doesn't necessarily know anything about what r_dentry we have
> stashed (other than the info that gets encoded into the request).
>

I think r_dentry is stable when i_mutex of parent dir is locked

> >
> > > I think you're probably right that we should not touch r_dentry in
> > > that case though. I'll look at grafting that logic into this.
> > >
> > > In the meantime, I'd like to see this patch and the one I sent
> > > yesterday in v5.1 and stable kernels. Did you want to add your
> > > Reviewed-by?
> >
> > Sorry for the delay
> >
>
> No problem. More important to get it right than fast.
> --
> Jeff Layton <jlayton@kernel.org>

Patch
diff mbox series

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e09feadd2ae1..4cfefe118128 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2159,9 +2159,35 @@  char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
 	return path;
 }
 
+/* Duplicate the dentry->d_name.name safely */
+static int clone_dentry_name(struct dentry *dentry, const char **ppath,
+			     int *ppathlen)
+{
+	u32 len;
+	char *name;
+retry:
+	len = READ_ONCE(dentry->d_name.len);
+	name = kmalloc(len + 1, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_name.len != len) {
+		spin_unlock(&dentry->d_lock);
+		kfree(name);
+		goto retry;
+	}
+	memcpy(name, dentry->d_name.name, len);
+	spin_unlock(&dentry->d_lock);
+
+	name[len] = '\0';
+	*ppath = name;
+	*ppathlen = len;
+	return 0;
+}
+
 static int build_dentry_path(struct dentry *dentry, struct inode *dir,
-			     const char **ppath, int *ppathlen, u64 *pino,
-			     int *pfreepath)
+			     const char **ppath, int *ppathlen, u64 *pino)
 {
 	char *path;
 
@@ -2171,16 +2197,13 @@  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
 	if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
 		*pino = ceph_ino(dir);
 		rcu_read_unlock();
-		*ppath = dentry->d_name.name;
-		*ppathlen = dentry->d_name.len;
-		return 0;
+		return clone_dentry_name(dentry, ppath, ppathlen);
 	}
 	rcu_read_unlock();
 	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	*ppath = path;
-	*pfreepath = 1;
 	return 0;
 }
 
@@ -2222,8 +2245,9 @@  static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
 		     ceph_snap(rinode));
 	} else if (rdentry) {
-		r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
-					freepath);
+		r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino);
+		if (!r)
+			*freepath = 1;
 		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
 		     *ppath);
 	} else if (rpath || rino) {