Message ID | 20190415164515.25767-1-jlayton@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ceph: always clone dentry name when building MDS request | expand |
On Mon, Apr 15, 2019 at 12:45 PM Jeff Layton <jlayton@kernel.org> wrote: > > Ben reported tripping the BUG_ON in create_request_message during some > performance testing. Analysis of the vmcore showed that the length of > the r_dentry->d_name string changed after we allocated the buffer, but > before we encoded it. > > build_dentry_path returns pointers to d_name in the common case of > non-snapped dentries, but this optimization isn't safe. Instead, make a > copy of the string in this case. That is less efficient, but safe. > > Reported-by: Ben England <bengland@redhat.com> > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > 1 file changed, 32 insertions(+), 8 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index e09feadd2ae1..4cfefe118128 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > return path; > } > > +/* Duplicate the dentry->d_name.name safely */ > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > + int *ppathlen) > +{ > + u32 len; > + char *name; > +retry: > + len = READ_ONCE(dentry->d_name.len); > + name = kmalloc(len + 1, GFP_NOFS); > + if (!name) > + return -ENOMEM; > + > + spin_lock(&dentry->d_lock); > + if (dentry->d_name.len != len) { > + spin_unlock(&dentry->d_lock); > + kfree(name); > + goto retry; > + } > + memcpy(name, dentry->d_name.name, len); > + spin_unlock(&dentry->d_lock); > + > + name[len] = '\0'; > + *ppath = name; > + *ppathlen = len; > + return 0; > +} > + > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > - const char **ppath, int *ppathlen, u64 *pino, > - int *pfreepath) > + const char **ppath, int *ppathlen, u64 *pino) > { > char *path; > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > *pino = ceph_ino(dir); > rcu_read_unlock(); > - *ppath = dentry->d_name.name; > - *ppathlen = dentry->d_name.len; > - return 0; > + return clone_dentry_name(dentry, ppath, ppathlen); > } > rcu_read_unlock(); > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > if (IS_ERR(path)) > return PTR_ERR(path); > *ppath = path; > - *pfreepath = 1; > return 0; > } > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > ceph_snap(rinode)); > } else if (rdentry) { > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > - freepath); > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > + if (!r) > + *freepath = 1; > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > *ppath); > } else if (rpath || rino) { > -- > 2.20.1 > We should probably get this into v5.1 and mark it for stable as well? It's a longstanding bug.
On Mon, Apr 15, 2019 at 9:45 AM Jeff Layton <jlayton@kernel.org> wrote: > > Ben reported tripping the BUG_ON in create_request_message during some > performance testing. Analysis of the vmcore showed that the length of > the r_dentry->d_name string changed after we allocated the buffer, but > before we encoded it. > > build_dentry_path returns pointers to d_name in the common case of > non-snapped dentries, but this optimization isn't safe. Instead, make a > copy of the string in this case. That is less efficient, but safe. > > Reported-by: Ben England <bengland@redhat.com> > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > 1 file changed, 32 insertions(+), 8 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index e09feadd2ae1..4cfefe118128 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > return path; > } > > +/* Duplicate the dentry->d_name.name safely */ > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > + int *ppathlen) > +{ > + u32 len; > + char *name; > +retry: > + len = READ_ONCE(dentry->d_name.len); > + name = kmalloc(len + 1, GFP_NOFS); > + if (!name) > + return -ENOMEM; > + > + spin_lock(&dentry->d_lock); > + if (dentry->d_name.len != len) { > + spin_unlock(&dentry->d_lock); > + kfree(name); > + goto retry; > + } > + memcpy(name, dentry->d_name.name, len); > + spin_unlock(&dentry->d_lock); > + > + name[len] = '\0'; > + *ppath = name; > + *ppathlen = len; > + return 0; > +} I would think passing a char array of PATH_MAX would be preferable to a heap allocation. Is that not typical?
On Mon, Apr 15, 2019 at 4:41 PM Patrick Donnelly <pdonnell@redhat.com> wrote: > > On Mon, Apr 15, 2019 at 9:45 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > Ben reported tripping the BUG_ON in create_request_message during some > > performance testing. Analysis of the vmcore showed that the length of > > the r_dentry->d_name string changed after we allocated the buffer, but > > before we encoded it. > > > > build_dentry_path returns pointers to d_name in the common case of > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > copy of the string in this case. That is less efficient, but safe. > > > > Reported-by: Ben England <bengland@redhat.com> > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > index e09feadd2ae1..4cfefe118128 100644 > > --- a/fs/ceph/mds_client.c > > +++ b/fs/ceph/mds_client.c > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > return path; > > } > > > > +/* Duplicate the dentry->d_name.name safely */ > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > + int *ppathlen) > > +{ > > + u32 len; > > + char *name; > > +retry: > > + len = READ_ONCE(dentry->d_name.len); > > + name = kmalloc(len + 1, GFP_NOFS); > > + if (!name) > > + return -ENOMEM; > > + > > + spin_lock(&dentry->d_lock); > > + if (dentry->d_name.len != len) { > > + spin_unlock(&dentry->d_lock); > > + kfree(name); > > + goto retry; > > + } > > + memcpy(name, dentry->d_name.name, len); > > + spin_unlock(&dentry->d_lock); > > + > > + name[len] = '\0'; > > + *ppath = name; > > + *ppathlen = len; > > + return 0; > > +} > > I would think passing a char array of PATH_MAX would be preferable to > a heap allocation. Is that not typical? > In general I don't like doing big allocations on the stack in the kernel, as it is a finite resource. That said, we'd technically only need a NAME_MAX+1 (256 byte) array here, which is not _too_ bad. If that's what the maintainers prefer, we could do that instead. -- Jeff Layton <jlayton@kernel.org>
On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > Ben reported tripping the BUG_ON in create_request_message during some > performance testing. Analysis of the vmcore showed that the length of > the r_dentry->d_name string changed after we allocated the buffer, but > before we encoded it. > > build_dentry_path returns pointers to d_name in the common case of > non-snapped dentries, but this optimization isn't safe. Instead, make a > copy of the string in this case. That is less efficient, but safe. > > Reported-by: Ben England <bengland@redhat.com> > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > 1 file changed, 32 insertions(+), 8 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index e09feadd2ae1..4cfefe118128 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > return path; > } > > +/* Duplicate the dentry->d_name.name safely */ > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > + int *ppathlen) > +{ > + u32 len; > + char *name; > +retry: > + len = READ_ONCE(dentry->d_name.len); > + name = kmalloc(len + 1, GFP_NOFS); > + if (!name) > + return -ENOMEM; > + > + spin_lock(&dentry->d_lock); > + if (dentry->d_name.len != len) { > + spin_unlock(&dentry->d_lock); > + kfree(name); > + goto retry; > + } > + memcpy(name, dentry->d_name.name, len); > + spin_unlock(&dentry->d_lock); > + > + name[len] = '\0'; > + *ppath = name; > + *ppathlen = len; > + return 0; > +} > + > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > - const char **ppath, int *ppathlen, u64 *pino, > - int *pfreepath) > + const char **ppath, int *ppathlen, u64 *pino) > { > char *path; > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > *pino = ceph_ino(dir); > rcu_read_unlock(); > - *ppath = dentry->d_name.name; > - *ppathlen = dentry->d_name.len; > - return 0; > + return clone_dentry_name(dentry, ppath, ppathlen); > } > rcu_read_unlock(); > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > if (IS_ERR(path)) > return PTR_ERR(path); > *ppath = path; > - *pfreepath = 1; > return 0; > } > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > ceph_snap(rinode)); > } else if (rdentry) { > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > - freepath); > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > + if (!r) > + *freepath = 1; > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > *ppath); > } else if (rpath || rino) { > -- > 2.20.1 > Looks good. I think we also need to update ceph_fill_trace(), make it handle that case rinfo->dname != req->r_dentry->d_name. Regards Yan, Zheng
On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote: > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > Ben reported tripping the BUG_ON in create_request_message during some > > performance testing. Analysis of the vmcore showed that the length of > > the r_dentry->d_name string changed after we allocated the buffer, but > > before we encoded it. > > > > build_dentry_path returns pointers to d_name in the common case of > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > copy of the string in this case. That is less efficient, but safe. > > > > Reported-by: Ben England <bengland@redhat.com> > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > index e09feadd2ae1..4cfefe118128 100644 > > --- a/fs/ceph/mds_client.c > > +++ b/fs/ceph/mds_client.c > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > return path; > > } > > > > +/* Duplicate the dentry->d_name.name safely */ > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > + int *ppathlen) > > +{ > > + u32 len; > > + char *name; > > +retry: > > + len = READ_ONCE(dentry->d_name.len); > > + name = kmalloc(len + 1, GFP_NOFS); > > + if (!name) > > + return -ENOMEM; > > + > > + spin_lock(&dentry->d_lock); > > + if (dentry->d_name.len != len) { > > + spin_unlock(&dentry->d_lock); > > + kfree(name); > > + goto retry; > > + } > > + memcpy(name, dentry->d_name.name, len); > > + spin_unlock(&dentry->d_lock); > > + > > + name[len] = '\0'; > > + *ppath = name; > > + *ppathlen = len; > > + return 0; > > +} > > + > > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > - const char **ppath, int *ppathlen, u64 *pino, > > - int *pfreepath) > > + const char **ppath, int *ppathlen, u64 *pino) > > { > > char *path; > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > > *pino = ceph_ino(dir); > > rcu_read_unlock(); > > - *ppath = dentry->d_name.name; > > - *ppathlen = dentry->d_name.len; > > - return 0; > > + return clone_dentry_name(dentry, ppath, ppathlen); > > } > > rcu_read_unlock(); > > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > > if (IS_ERR(path)) > > return PTR_ERR(path); > > *ppath = path; > > - *pfreepath = 1; > > return 0; > > } > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > > ceph_snap(rinode)); > > } else if (rdentry) { > > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > > - freepath); > > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > > + if (!r) > > + *freepath = 1; > > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > > *ppath); > > } else if (rpath || rino) { > > -- > > 2.20.1 > > > > Looks good. I think we also need to update ceph_fill_trace(), make it > handle that case rinfo->dname != req->r_dentry->d_name. > Good catch. I'll send a separate patch for that one.
On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote: > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > Ben reported tripping the BUG_ON in create_request_message during some > > > performance testing. Analysis of the vmcore showed that the length of > > > the r_dentry->d_name string changed after we allocated the buffer, but > > > before we encoded it. > > > > > > build_dentry_path returns pointers to d_name in the common case of > > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > > copy of the string in this case. That is less efficient, but safe. > > > > > > Reported-by: Ben England <bengland@redhat.com> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > > --- > > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > index e09feadd2ae1..4cfefe118128 100644 > > > --- a/fs/ceph/mds_client.c > > > +++ b/fs/ceph/mds_client.c > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > > return path; > > > } > > > > > > +/* Duplicate the dentry->d_name.name safely */ > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > > + int *ppathlen) > > > +{ > > > + u32 len; > > > + char *name; > > > +retry: > > > + len = READ_ONCE(dentry->d_name.len); > > > + name = kmalloc(len + 1, GFP_NOFS); > > > + if (!name) > > > + return -ENOMEM; > > > + > > > + spin_lock(&dentry->d_lock); > > > + if (dentry->d_name.len != len) { > > > + spin_unlock(&dentry->d_lock); > > > + kfree(name); > > > + goto retry; > > > + } > > > + memcpy(name, dentry->d_name.name, len); > > > + spin_unlock(&dentry->d_lock); > > > + > > > + name[len] = '\0'; > > > + *ppath = name; > > > + *ppathlen = len; > > > + return 0; > > > +} > > > + > > > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > - const char **ppath, int *ppathlen, u64 *pino, > > > - int *pfreepath) > > > + const char **ppath, int *ppathlen, u64 *pino) > > > { > > > char *path; > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > > > *pino = ceph_ino(dir); > > > rcu_read_unlock(); > > > - *ppath = dentry->d_name.name; > > > - *ppathlen = dentry->d_name.len; > > > - return 0; > > > + return clone_dentry_name(dentry, ppath, ppathlen); > > > } > > > rcu_read_unlock(); > > > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > > > if (IS_ERR(path)) > > > return PTR_ERR(path); > > > *ppath = path; > > > - *pfreepath = 1; > > > return 0; > > > } > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > > > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > > > ceph_snap(rinode)); > > > } else if (rdentry) { > > > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > > > - freepath); > > > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > > > + if (!r) > > > + *freepath = 1; > > > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > > > *ppath); > > > } else if (rpath || rino) { > > > -- > > > 2.20.1 > > > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it > > handle that case rinfo->dname != req->r_dentry->d_name. > > > > Good catch. I'll send a separate patch for that one. > -- > Jeff Layton <jlayton@kernel.org> So to make sure I understand, the worry here is that we can end up getting back an rinfo->dname that doesn't match what we have in r_dentry from the initial call? That is quite a different problem from what this patch fixes if so. What should we do in the case where that occurs? -- Jeff Layton <jlayton@kernel.org>
On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > Ben reported tripping the BUG_ON in create_request_message during some > > > > performance testing. Analysis of the vmcore showed that the length of > > > > the r_dentry->d_name string changed after we allocated the buffer, but > > > > before we encoded it. > > > > > > > > build_dentry_path returns pointers to d_name in the common case of > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > > > copy of the string in this case. That is less efficient, but safe. > > > > > > > > Reported-by: Ben England <bengland@redhat.com> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > > > --- > > > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > > index e09feadd2ae1..4cfefe118128 100644 > > > > --- a/fs/ceph/mds_client.c > > > > +++ b/fs/ceph/mds_client.c > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > > > return path; > > > > } > > > > > > > > +/* Duplicate the dentry->d_name.name safely */ > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > > > + int *ppathlen) > > > > +{ > > > > + u32 len; > > > > + char *name; > > > > +retry: > > > > + len = READ_ONCE(dentry->d_name.len); > > > > + name = kmalloc(len + 1, GFP_NOFS); > > > > + if (!name) > > > > + return -ENOMEM; > > > > + > > > > + spin_lock(&dentry->d_lock); > > > > + if (dentry->d_name.len != len) { > > > > + spin_unlock(&dentry->d_lock); > > > > + kfree(name); > > > > + goto retry; > > > > + } > > > > + memcpy(name, dentry->d_name.name, len); > > > > + spin_unlock(&dentry->d_lock); > > > > + > > > > + name[len] = '\0'; > > > > + *ppath = name; > > > > + *ppathlen = len; > > > > + return 0; > > > > +} > > > > + > > > > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > - const char **ppath, int *ppathlen, u64 *pino, > > > > - int *pfreepath) > > > > + const char **ppath, int *ppathlen, u64 *pino) > > > > { > > > > char *path; > > > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > > > > *pino = ceph_ino(dir); > > > > rcu_read_unlock(); > > > > - *ppath = dentry->d_name.name; > > > > - *ppathlen = dentry->d_name.len; > > > > - return 0; > > > > + return clone_dentry_name(dentry, ppath, ppathlen); > > > > } > > > > rcu_read_unlock(); > > > > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > > > > if (IS_ERR(path)) > > > > return PTR_ERR(path); > > > > *ppath = path; > > > > - *pfreepath = 1; > > > > return 0; > > > > } > > > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > > > > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > > > > ceph_snap(rinode)); > > > > } else if (rdentry) { > > > > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > > > > - freepath); > > > > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > > > > + if (!r) > > > > + *freepath = 1; > > > > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > > > > *ppath); > > > > } else if (rpath || rino) { > > > > -- > > > > 2.20.1 > > > > > > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it > > > handle that case rinfo->dname != req->r_dentry->d_name. > > > > > > > Good catch. I'll send a separate patch for that one. > > -- > > Jeff Layton <jlayton@kernel.org> > > > So to make sure I understand, the worry here is that we can end up > getting back an rinfo->dname that doesn't match what we have in > r_dentry from the initial call? > > That is quite a different problem from what this patch fixes if so. > What should we do in the case where that occurs? keep r_dentry untouched and return a error? ceph_d_revalidate() return -ECHILD after it gets the error. Regards Yan, Zheng > -- > Jeff Layton <jlayton@kernel.org>
On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote: > > On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > > > Ben reported tripping the BUG_ON in create_request_message during some > > > > > performance testing. Analysis of the vmcore showed that the length of > > > > > the r_dentry->d_name string changed after we allocated the buffer, but > > > > > before we encoded it. > > > > > > > > > > build_dentry_path returns pointers to d_name in the common case of > > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > > > > copy of the string in this case. That is less efficient, but safe. > > > > > > > > > > Reported-by: Ben England <bengland@redhat.com> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > > > > --- > > > > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > > > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > > > index e09feadd2ae1..4cfefe118128 100644 > > > > > --- a/fs/ceph/mds_client.c > > > > > +++ b/fs/ceph/mds_client.c > > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > > > > return path; > > > > > } > > > > > > > > > > +/* Duplicate the dentry->d_name.name safely */ > > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > > > > + int *ppathlen) > > > > > +{ > > > > > + u32 len; > > > > > + char *name; > > > > > +retry: > > > > > + len = READ_ONCE(dentry->d_name.len); > > > > > + name = kmalloc(len + 1, GFP_NOFS); > > > > > + if (!name) > > > > > + return -ENOMEM; > > > > > + > > > > > + spin_lock(&dentry->d_lock); > > > > > + if (dentry->d_name.len != len) { > > > > > + spin_unlock(&dentry->d_lock); > > > > > + kfree(name); > > > > > + goto retry; > > > > > + } > > > > > + memcpy(name, dentry->d_name.name, len); > > > > > + spin_unlock(&dentry->d_lock); > > > > > + > > > > > + name[len] = '\0'; > > > > > + *ppath = name; > > > > > + *ppathlen = len; > > > > > + return 0; > > > > > +} > > > > > + > > > > > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > - const char **ppath, int *ppathlen, u64 *pino, > > > > > - int *pfreepath) > > > > > + const char **ppath, int *ppathlen, u64 *pino) > > > > > { > > > > > char *path; > > > > > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > > > > > *pino = ceph_ino(dir); > > > > > rcu_read_unlock(); > > > > > - *ppath = dentry->d_name.name; > > > > > - *ppathlen = dentry->d_name.len; > > > > > - return 0; > > > > > + return clone_dentry_name(dentry, ppath, ppathlen); > > > > > } > > > > > rcu_read_unlock(); > > > > > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > > > > > if (IS_ERR(path)) > > > > > return PTR_ERR(path); > > > > > *ppath = path; > > > > > - *pfreepath = 1; > > > > > return 0; > > > > > } > > > > > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > > > > > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > > > > > ceph_snap(rinode)); > > > > > } else if (rdentry) { > > > > > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > > > > > - freepath); > > > > > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > > > > > + if (!r) > > > > > + *freepath = 1; > > > > > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > > > > > *ppath); > > > > > } else if (rpath || rino) { > > > > > -- > > > > > 2.20.1 > > > > > > > > > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it > > > > handle that case rinfo->dname != req->r_dentry->d_name. > > > > > > > > > > Good catch. I'll send a separate patch for that one. > > > -- > > > Jeff Layton <jlayton@kernel.org> > > > > > > So to make sure I understand, the worry here is that we can end up > > getting back an rinfo->dname that doesn't match what we have in > > r_dentry from the initial call? > > > > That is quite a different problem from what this patch fixes if so. > > What should we do in the case where that occurs? > > keep r_dentry untouched and return a error? ceph_d_revalidate() > return -ECHILD after it gets the error. > We wouldn't do a synchronous MDS call in rcuwalk mode, so returning -ECHILD would probably not be what we'd want to do. We could have this happen in any reply (not just d_revalidate), in principle, so I think we need a general solution here. I think you're probably right that we should not touch r_dentry in that case though. I'll look at grafting that logic into this. In the meantime, I'd like to see this patch and the one I sent yesterday in v5.1 and stable kernels. Did you want to add your Reviewed-by?
On Wed, Apr 17, 2019 at 7:48 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > > > > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > > > > > Ben reported tripping the BUG_ON in create_request_message during some > > > > > > performance testing. Analysis of the vmcore showed that the length of > > > > > > the r_dentry->d_name string changed after we allocated the buffer, but > > > > > > before we encoded it. > > > > > > > > > > > > build_dentry_path returns pointers to d_name in the common case of > > > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > > > > > copy of the string in this case. That is less efficient, but safe. > > > > > > > > > > > > Reported-by: Ben England <bengland@redhat.com> > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > > > > > --- > > > > > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > > > > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > > > > index e09feadd2ae1..4cfefe118128 100644 > > > > > > --- a/fs/ceph/mds_client.c > > > > > > +++ b/fs/ceph/mds_client.c > > > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > > > > > return path; > > > > > > } > > > > > > > > > > > > +/* Duplicate the dentry->d_name.name safely */ > > > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > > > > > + int *ppathlen) > > > > > > +{ > > > > > > + u32 len; > > > > > > + char *name; > > > > > > +retry: > > > > > > + len = READ_ONCE(dentry->d_name.len); > > > > > > + name = kmalloc(len + 1, GFP_NOFS); > > > > > > + if (!name) > > > > > > + return -ENOMEM; > > > > > > + > > > > > > + spin_lock(&dentry->d_lock); > > > > > > + if (dentry->d_name.len != len) { > > > > > > + spin_unlock(&dentry->d_lock); > > > > > > + kfree(name); > > > > > > + goto retry; > > > > > > + } > > > > > > + memcpy(name, dentry->d_name.name, len); > > > > > > + spin_unlock(&dentry->d_lock); > > > > > > + > > > > > > + name[len] = '\0'; > > > > > > + *ppath = name; > > > > > > + *ppathlen = len; > > > > > > + return 0; > > > > > > +} > > > > > > + > > > > > > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > > - const char **ppath, int *ppathlen, u64 *pino, > > > > > > - int *pfreepath) > > > > > > + const char **ppath, int *ppathlen, u64 *pino) > > > > > > { > > > > > > char *path; > > > > > > > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > > > > > > *pino = ceph_ino(dir); > > > > > > rcu_read_unlock(); > > > > > > - *ppath = dentry->d_name.name; > > > > > > - *ppathlen = dentry->d_name.len; > > > > > > - return 0; > > > > > > + return clone_dentry_name(dentry, ppath, ppathlen); > > > > > > } > > > > > > rcu_read_unlock(); > > > > > > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > > > > > > if (IS_ERR(path)) > > > > > > return PTR_ERR(path); > > > > > > *ppath = path; > > > > > > - *pfreepath = 1; > > > > > > return 0; > > > > > > } > > > > > > > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > > > > > > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > > > > > > ceph_snap(rinode)); > > > > > > } else if (rdentry) { > > > > > > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > > > > > > - freepath); > > > > > > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > > > > > > + if (!r) > > > > > > + *freepath = 1; > > > > > > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > > > > > > *ppath); > > > > > > } else if (rpath || rino) { > > > > > > -- > > > > > > 2.20.1 > > > > > > > > > > > > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it > > > > > handle that case rinfo->dname != req->r_dentry->d_name. > > > > > > > > > > > > > Good catch. I'll send a separate patch for that one. > > > > -- > > > > Jeff Layton <jlayton@kernel.org> > > > > > > > > > So to make sure I understand, the worry here is that we can end up > > > getting back an rinfo->dname that doesn't match what we have in > > > r_dentry from the initial call? > > > > > > That is quite a different problem from what this patch fixes if so. > > > What should we do in the case where that occurs? > > > > keep r_dentry untouched and return a error? ceph_d_revalidate() > > return -ECHILD after it gets the error. > > > > We wouldn't do a synchronous MDS call in rcuwalk mode, so returning > -ECHILD would probably not be what we'd want to do. We could have this > happen in any reply (not just d_revalidate), in principle, so I think > we need a general solution here. > why it can happen in any reply? I think it can happen only when i_mutex of parent dir is not locked. > I think you're probably right that we should not touch r_dentry in > that case though. I'll look at grafting that logic into this. > > In the meantime, I'd like to see this patch and the one I sent > yesterday in v5.1 and stable kernels. Did you want to add your > Reviewed-by? Sorry for the delay Yan, Zheng > -- > Jeff Layton <jlayton@kernel.org>
On Wed, Apr 17, 2019 at 9:05 AM Yan, Zheng <ukernel@gmail.com> wrote: > > On Wed, Apr 17, 2019 at 7:48 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > > > On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > > > > > > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > > > > > > > Ben reported tripping the BUG_ON in create_request_message during some > > > > > > > performance testing. Analysis of the vmcore showed that the length of > > > > > > > the r_dentry->d_name string changed after we allocated the buffer, but > > > > > > > before we encoded it. > > > > > > > > > > > > > > build_dentry_path returns pointers to d_name in the common case of > > > > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > > > > > > copy of the string in this case. That is less efficient, but safe. > > > > > > > > > > > > > > Reported-by: Ben England <bengland@redhat.com> > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > > > > > > --- > > > > > > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > > > > > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > > > > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > > > > > index e09feadd2ae1..4cfefe118128 100644 > > > > > > > --- a/fs/ceph/mds_client.c > > > > > > > +++ b/fs/ceph/mds_client.c > > > > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > > > > > > return path; > > > > > > > } > > > > > > > > > > > > > > +/* Duplicate the dentry->d_name.name safely */ > > > > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > > > > > > + int *ppathlen) > > > > > > > +{ > > > > > > > + u32 len; > > > > > > > + char *name; > > > > > > > +retry: > > > > > > > + len = READ_ONCE(dentry->d_name.len); > > > > > > > + name = kmalloc(len + 1, GFP_NOFS); > > > > > > > + if (!name) > > > > > > > + return -ENOMEM; > > > > > > > + > > > > > > > + spin_lock(&dentry->d_lock); > > > > > > > + if (dentry->d_name.len != len) { > > > > > > > + spin_unlock(&dentry->d_lock); > > > > > > > + kfree(name); > > > > > > > + goto retry; > > > > > > > + } > > > > > > > + memcpy(name, dentry->d_name.name, len); > > > > > > > + spin_unlock(&dentry->d_lock); > > > > > > > + > > > > > > > + name[len] = '\0'; > > > > > > > + *ppath = name; > > > > > > > + *ppathlen = len; > > > > > > > + return 0; > > > > > > > +} > > > > > > > + > > > > > > > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > > > - const char **ppath, int *ppathlen, u64 *pino, > > > > > > > - int *pfreepath) > > > > > > > + const char **ppath, int *ppathlen, u64 *pino) > > > > > > > { > > > > > > > char *path; > > > > > > > > > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > > > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > > > > > > > *pino = ceph_ino(dir); > > > > > > > rcu_read_unlock(); > > > > > > > - *ppath = dentry->d_name.name; > > > > > > > - *ppathlen = dentry->d_name.len; > > > > > > > - return 0; > > > > > > > + return clone_dentry_name(dentry, ppath, ppathlen); > > > > > > > } > > > > > > > rcu_read_unlock(); > > > > > > > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > > > > > > > if (IS_ERR(path)) > > > > > > > return PTR_ERR(path); > > > > > > > *ppath = path; > > > > > > > - *pfreepath = 1; > > > > > > > return 0; > > > > > > > } > > > > > > > > > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > > > > > > > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > > > > > > > ceph_snap(rinode)); > > > > > > > } else if (rdentry) { > > > > > > > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > > > > > > > - freepath); > > > > > > > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > > > > > > > + if (!r) > > > > > > > + *freepath = 1; > > > > > > > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > > > > > > > *ppath); > > > > > > > } else if (rpath || rino) { > > > > > > > -- > > > > > > > 2.20.1 > > > > > > > > > > > > > > > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it > > > > > > handle that case rinfo->dname != req->r_dentry->d_name. > > > > > > > > > > > > > > > > Good catch. I'll send a separate patch for that one. > > > > > -- > > > > > Jeff Layton <jlayton@kernel.org> > > > > > > > > > > > > So to make sure I understand, the worry here is that we can end up > > > > getting back an rinfo->dname that doesn't match what we have in > > > > r_dentry from the initial call? > > > > > > > > That is quite a different problem from what this patch fixes if so. > > > > What should we do in the case where that occurs? > > > > > > keep r_dentry untouched and return a error? ceph_d_revalidate() > > > return -ECHILD after it gets the error. > > > > > > > We wouldn't do a synchronous MDS call in rcuwalk mode, so returning > > -ECHILD would probably not be what we'd want to do. We could have this > > happen in any reply (not just d_revalidate), in principle, so I think > > we need a general solution here. > > > > why it can happen in any reply? I think it can happen only when > i_mutex of parent dir is not locked. > Could we not race against operations being done by other clients? AFAIU, an MDS could set a dentry in the trace whenever it likes, and it doesn't necessarily know anything about what r_dentry we have stashed (other than the info that gets encoded into the request). > > > I think you're probably right that we should not touch r_dentry in > > that case though. I'll look at grafting that logic into this. > > > > In the meantime, I'd like to see this patch and the one I sent > > yesterday in v5.1 and stable kernels. Did you want to add your > > Reviewed-by? > > Sorry for the delay > No problem. More important to get it right than fast.
On Wed, Apr 17, 2019 at 9:28 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Wed, Apr 17, 2019 at 9:05 AM Yan, Zheng <ukernel@gmail.com> wrote: > > > > On Wed, Apr 17, 2019 at 7:48 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > On Tue, Apr 16, 2019 at 10:14 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > > > > > On Tue, Apr 16, 2019 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > > > On Tue, Apr 16, 2019 at 7:10 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > > > > > On Mon, Apr 15, 2019 at 10:24 PM Yan, Zheng <ukernel@gmail.com> wrote: > > > > > > > > > > > > > > On Tue, Apr 16, 2019 at 12:47 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > > > > > > > > > > Ben reported tripping the BUG_ON in create_request_message during some > > > > > > > > performance testing. Analysis of the vmcore showed that the length of > > > > > > > > the r_dentry->d_name string changed after we allocated the buffer, but > > > > > > > > before we encoded it. > > > > > > > > > > > > > > > > build_dentry_path returns pointers to d_name in the common case of > > > > > > > > non-snapped dentries, but this optimization isn't safe. Instead, make a > > > > > > > > copy of the string in this case. That is less efficient, but safe. > > > > > > > > > > > > > > > > Reported-by: Ben England <bengland@redhat.com> > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > > > > > > > --- > > > > > > > > fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- > > > > > > > > 1 file changed, 32 insertions(+), 8 deletions(-) > > > > > > > > > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > > > > > > index e09feadd2ae1..4cfefe118128 100644 > > > > > > > > --- a/fs/ceph/mds_client.c > > > > > > > > +++ b/fs/ceph/mds_client.c > > > > > > > > @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, > > > > > > > > return path; > > > > > > > > } > > > > > > > > > > > > > > > > +/* Duplicate the dentry->d_name.name safely */ > > > > > > > > +static int clone_dentry_name(struct dentry *dentry, const char **ppath, > > > > > > > > + int *ppathlen) > > > > > > > > +{ > > > > > > > > + u32 len; > > > > > > > > + char *name; > > > > > > > > +retry: > > > > > > > > + len = READ_ONCE(dentry->d_name.len); > > > > > > > > + name = kmalloc(len + 1, GFP_NOFS); > > > > > > > > + if (!name) > > > > > > > > + return -ENOMEM; > > > > > > > > + > > > > > > > > + spin_lock(&dentry->d_lock); > > > > > > > > + if (dentry->d_name.len != len) { > > > > > > > > + spin_unlock(&dentry->d_lock); > > > > > > > > + kfree(name); > > > > > > > > + goto retry; > > > > > > > > + } > > > > > > > > + memcpy(name, dentry->d_name.name, len); > > > > > > > > + spin_unlock(&dentry->d_lock); > > > > > > > > + > > > > > > > > + name[len] = '\0'; > > > > > > > > + *ppath = name; > > > > > > > > + *ppathlen = len; > > > > > > > > + return 0; > > > > > > > > +} > > > > > > > > + > > > > > > > > static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > > > > - const char **ppath, int *ppathlen, u64 *pino, > > > > > > > > - int *pfreepath) > > > > > > > > + const char **ppath, int *ppathlen, u64 *pino) > > > > > > > > { > > > > > > > > char *path; > > > > > > > > > > > > > > > > @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, > > > > > > > > if (dir && ceph_snap(dir) == CEPH_NOSNAP) { > > > > > > > > *pino = ceph_ino(dir); > > > > > > > > rcu_read_unlock(); > > > > > > > > - *ppath = dentry->d_name.name; > > > > > > > > - *ppathlen = dentry->d_name.len; > > > > > > > > - return 0; > > > > > > > > + return clone_dentry_name(dentry, ppath, ppathlen); > > > > > > > > } > > > > > > > > rcu_read_unlock(); > > > > > > > > path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); > > > > > > > > if (IS_ERR(path)) > > > > > > > > return PTR_ERR(path); > > > > > > > > *ppath = path; > > > > > > > > - *pfreepath = 1; > > > > > > > > return 0; > > > > > > > > } > > > > > > > > > > > > > > > > @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, > > > > > > > > dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), > > > > > > > > ceph_snap(rinode)); > > > > > > > > } else if (rdentry) { > > > > > > > > - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, > > > > > > > > - freepath); > > > > > > > > + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); > > > > > > > > + if (!r) > > > > > > > > + *freepath = 1; > > > > > > > > dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, > > > > > > > > *ppath); > > > > > > > > } else if (rpath || rino) { > > > > > > > > -- > > > > > > > > 2.20.1 > > > > > > > > > > > > > > > > > > > > > > Looks good. I think we also need to update ceph_fill_trace(), make it > > > > > > > handle that case rinfo->dname != req->r_dentry->d_name. > > > > > > > > > > > > > > > > > > > Good catch. I'll send a separate patch for that one. > > > > > > -- > > > > > > Jeff Layton <jlayton@kernel.org> > > > > > > > > > > > > > > > So to make sure I understand, the worry here is that we can end up > > > > > getting back an rinfo->dname that doesn't match what we have in > > > > > r_dentry from the initial call? > > > > > > > > > > That is quite a different problem from what this patch fixes if so. > > > > > What should we do in the case where that occurs? > > > > > > > > keep r_dentry untouched and return a error? ceph_d_revalidate() > > > > return -ECHILD after it gets the error. > > > > > > > > > > We wouldn't do a synchronous MDS call in rcuwalk mode, so returning > > > -ECHILD would probably not be what we'd want to do. We could have this > > > happen in any reply (not just d_revalidate), in principle, so I think > > > we need a general solution here. > > > > > > > why it can happen in any reply? I think it can happen only when > > i_mutex of parent dir is not locked. > > > > Could we not race against operations being done by other clients? > I think no. operation by other clients just make dentry lease invalid. > AFAIU, an MDS could set a dentry in the trace whenever it likes, and > it doesn't necessarily know anything about what r_dentry we have > stashed (other than the info that gets encoded into the request). > I think r_dentry is stable when i_mutex of parent dir is locked > > > > > I think you're probably right that we should not touch r_dentry in > > > that case though. I'll look at grafting that logic into this. > > > > > > In the meantime, I'd like to see this patch and the one I sent > > > yesterday in v5.1 and stable kernels. Did you want to add your > > > Reviewed-by? > > > > Sorry for the delay > > > > No problem. More important to get it right than fast. > -- > Jeff Layton <jlayton@kernel.org>
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e09feadd2ae1..4cfefe118128 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2159,9 +2159,35 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, return path; } +/* Duplicate the dentry->d_name.name safely */ +static int clone_dentry_name(struct dentry *dentry, const char **ppath, + int *ppathlen) +{ + u32 len; + char *name; +retry: + len = READ_ONCE(dentry->d_name.len); + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + + spin_lock(&dentry->d_lock); + if (dentry->d_name.len != len) { + spin_unlock(&dentry->d_lock); + kfree(name); + goto retry; + } + memcpy(name, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + + name[len] = '\0'; + *ppath = name; + *ppathlen = len; + return 0; +} + static int build_dentry_path(struct dentry *dentry, struct inode *dir, - const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + const char **ppath, int *ppathlen, u64 *pino) { char *path; @@ -2171,16 +2197,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (dir && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; - return 0; + return clone_dentry_name(dentry, ppath, ppathlen); } rcu_read_unlock(); path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; return 0; } @@ -2222,8 +2245,9 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, - freepath); + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino); + if (!r) + *freepath = 1; dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) {
Ben reported tripping the BUG_ON in create_request_message during some performance testing. Analysis of the vmcore showed that the length of the r_dentry->d_name string changed after we allocated the buffer, but before we encoded it. build_dentry_path returns pointers to d_name in the common case of non-snapped dentries, but this optimization isn't safe. Instead, make a copy of the string in this case. That is less efficient, but safe. Reported-by: Ben England <bengland@redhat.com> Signed-off-by: Jeff Layton <jlayton@kernel.org> --- fs/ceph/mds_client.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-)