ceph: encode inodes' parent/d_name in cap reconnect message
diff mbox series

Message ID 20200811072303.24322-1-zyan@redhat.com
State New
Headers show
Series
  • ceph: encode inodes' parent/d_name in cap reconnect message
Related show

Commit Message

Yan, Zheng Aug. 11, 2020, 7:23 a.m. UTC
Since Nautilus, the MDS tracks dirfrags whose child inodes have caps in
the open file table. When the MDS recovers, it prefetches all of these
dirfrags. This avoids using backtraces to load inodes. But dirfrag
prefetch may load lots of useless inodes into the cache and make the MDS
run out of memory.

Recent MDS versions add an option that disables dirfrag prefetch. When
dirfrag prefetch is disabled, the recovering MDS only prefetches the
corresponding dir inodes. Including each inode's parent/d_name in the
cap reconnect message can help the MDS load inodes into its cache.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/mds_client.c | 89 ++++++++++++++++++++++++++++++--------------
 1 file changed, 61 insertions(+), 28 deletions(-)

Comments

Jeff Layton Aug. 11, 2020, 11:19 a.m. UTC | #1
On Tue, 2020-08-11 at 15:23 +0800, Yan, Zheng wrote:
> Since nautilus, MDS tracks dirfrags whose child inodes have caps in open
> file table. When MDS recovers, it prefetches all of these dirfrags. This
> avoids using backtrace to load inodes. But dirfrags prefetch may load
> lots of useless inodes into cache, and make MDS run out of memory.
> 
> Recent MDS adds an option that disables dirfrags prefetch. When dirfrags
> prefetch is disabled. Recovering MDS only prefetches corresponding dir
> inodes. Including inodes' parent/d_name in cap reconnect message can
> help MDS to load inodes into its cache.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/mds_client.c | 89 ++++++++++++++++++++++++++++++--------------
>  1 file changed, 61 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 9a09d12569bd..4eaed12b4b4c 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -3553,6 +3553,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
>  	return err;
>  }
>  
> +static struct dentry* d_find_primary(struct inode *inode)
> +{
> +	struct dentry *alias, *dn = NULL;
> +
> +	if (hlist_empty(&inode->i_dentry))
> +		return NULL;
> +
> +	spin_lock(&inode->i_lock);
> +	if (hlist_empty(&inode->i_dentry))
> +		goto out_unlock;
> +
> +	if (S_ISDIR(inode->i_mode)) {
> +		alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
> +		if (!IS_ROOT(alias))
> +			dn = dget(alias);
> +		goto out_unlock;
> +	}
> +
> +	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
> +		spin_lock(&alias->d_lock);
> +		if (!d_unhashed(alias) &&
> +		    (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
> +			dn = dget_dlock(alias);
> +		}
> +		spin_unlock(&alias->d_lock);
> +		if (dn)
> +			break;
> +	}
> +out_unlock:
> +	spin_unlock(&inode->i_lock);
> +	return dn;
> +}
> +
>  /*
>   * Encode information about a cap for a reconnect with the MDS.
>   */
> @@ -3566,13 +3599,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  	struct ceph_inode_info *ci = cap->ci;
>  	struct ceph_reconnect_state *recon_state = arg;
>  	struct ceph_pagelist *pagelist = recon_state->pagelist;
> -	int err;
> +	struct dentry *dentry;
> +	char *path;
> +	int pathlen, err;
> +	u64 pathbase;
>  	u64 snap_follows;
>  
>  	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
>  	     inode, ceph_vinop(inode), cap, cap->cap_id,
>  	     ceph_cap_string(cap->issued));
>  
> +	dentry = d_find_primary(inode);
> +	if (dentry) {
> +		/* set pathbase to parent dir when msg_version >= 2 */
> +		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
> +					    recon_state->msg_version >= 2);
> +		dput(dentry);
> +		if (IS_ERR(path)) {
> +			err = PTR_ERR(path);
> +			goto out_err;
> +		}
> +	} else {
> +		path = NULL;
> +		pathlen = 0;
> +		pathbase = 0;
> +	}
> +
>  	spin_lock(&ci->i_ceph_lock);
>  	cap->seq = 0;        /* reset cap seq */
>  	cap->issue_seq = 0;  /* and issue_seq */
> @@ -3593,7 +3645,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
>  		rec.v2.issued = cpu_to_le32(cap->issued);
>  		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> -		rec.v2.pathbase = 0;
> +		rec.v2.pathbase = cpu_to_le64(pathbase);
>  		rec.v2.flock_len = (__force __le32)
>  			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
>  	} else {
> @@ -3604,7 +3656,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
>  		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
>  		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> -		rec.v1.pathbase = 0;
> +		rec.v1.pathbase = cpu_to_le64(pathbase);
>  	}
>  
>  	if (list_empty(&ci->i_cap_snaps)) {
> @@ -3666,7 +3718,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  			    sizeof(struct ceph_filelock);
>  		rec.v2.flock_len = cpu_to_le32(struct_len);
>  
> -		struct_len += sizeof(u32) + sizeof(rec.v2);
> +		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
>  
>  		if (struct_v >= 2)
>  			struct_len += sizeof(u64); /* snap_follows */
> @@ -3690,7 +3742,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  			ceph_pagelist_encode_8(pagelist, 1);
>  			ceph_pagelist_encode_32(pagelist, struct_len);
>  		}
> -		ceph_pagelist_encode_string(pagelist, NULL, 0);
> +		ceph_pagelist_encode_string(pagelist, path, pathlen);
>  		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
>  		ceph_locks_to_pagelist(flocks, pagelist,
>  				       num_fcntl_locks, num_flock_locks);
> @@ -3699,39 +3751,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  out_freeflocks:
>  		kfree(flocks);
>  	} else {
> -		u64 pathbase = 0;
> -		int pathlen = 0;
> -		char *path = NULL;
> -		struct dentry *dentry;
> -
> -		dentry = d_find_alias(inode);
> -		if (dentry) {
> -			path = ceph_mdsc_build_path(dentry,
> -						&pathlen, &pathbase, 0);
> -			dput(dentry);
> -			if (IS_ERR(path)) {
> -				err = PTR_ERR(path);
> -				goto out_err;
> -			}
> -			rec.v1.pathbase = cpu_to_le64(pathbase);
> -		}
> -
>  		err = ceph_pagelist_reserve(pagelist,
>  					    sizeof(u64) + sizeof(u32) +
>  					    pathlen + sizeof(rec.v1));
> -		if (err) {
> -			goto out_freepath;
> -		}
> +		if (err)
> +			goto out_err;
>  
>  		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
>  		ceph_pagelist_encode_string(pagelist, path, pathlen);
>  		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
> -out_freepath:
> -		ceph_mdsc_free_path(path, pathlen);
>  	}
>  
>  out_err:
> -	if (err >= 0)
> +	ceph_mdsc_free_path(path, pathlen);
> +	if (!err)
>  		recon_state->nr_caps++;
>  	return err;
>  }

Looks good. Merged into testing.

Thanks!
Jeff Layton Aug. 11, 2020, 11:27 a.m. UTC | #2
On Tue, 2020-08-11 at 15:23 +0800, Yan, Zheng wrote:
> Since nautilus, MDS tracks dirfrags whose child inodes have caps in open
> file table. When MDS recovers, it prefetches all of these dirfrags. This
> avoids using backtrace to load inodes. But dirfrags prefetch may load
> lots of useless inodes into cache, and make MDS run out of memory.
> 
> Recent MDS adds an option that disables dirfrags prefetch. When dirfrags
> prefetch is disabled. Recovering MDS only prefetches corresponding dir
> inodes. Including inodes' parent/d_name in cap reconnect message can
> help MDS to load inodes into its cache.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/mds_client.c | 89 ++++++++++++++++++++++++++++++--------------
>  1 file changed, 61 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 9a09d12569bd..4eaed12b4b4c 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -3553,6 +3553,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
>  	return err;
>  }
>  
> +static struct dentry* d_find_primary(struct inode *inode)
> +{
> +	struct dentry *alias, *dn = NULL;
> +
> +	if (hlist_empty(&inode->i_dentry))
> +		return NULL;
> +
> +	spin_lock(&inode->i_lock);
> +	if (hlist_empty(&inode->i_dentry))
> +		goto out_unlock;
> +
> +	if (S_ISDIR(inode->i_mode)) {
> +		alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
> +		if (!IS_ROOT(alias))
> +			dn = dget(alias);
> +		goto out_unlock;
> +	}
> +
> +	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
> +		spin_lock(&alias->d_lock);
> +		if (!d_unhashed(alias) &&
> +		    (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
> +			dn = dget_dlock(alias);
> +		}
> +		spin_unlock(&alias->d_lock);
> +		if (dn)
> +			break;
> +	}
> +out_unlock:
> +	spin_unlock(&inode->i_lock);
> +	return dn;
> +}
> +
>  /*
>   * Encode information about a cap for a reconnect with the MDS.
>   */
> @@ -3566,13 +3599,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  	struct ceph_inode_info *ci = cap->ci;
>  	struct ceph_reconnect_state *recon_state = arg;
>  	struct ceph_pagelist *pagelist = recon_state->pagelist;
> -	int err;
> +	struct dentry *dentry;
> +	char *path;
> +	int pathlen, err;
> +	u64 pathbase;
>  	u64 snap_follows;
>  
>  	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
>  	     inode, ceph_vinop(inode), cap, cap->cap_id,
>  	     ceph_cap_string(cap->issued));
>  
> +	dentry = d_find_primary(inode);
> +	if (dentry) {
> +		/* set pathbase to parent dir when msg_version >= 2 */
> +		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
> +					    recon_state->msg_version >= 2);

One question:

Do we really need to build a full path back to the root for the
msg_version == 1 case? I notice that the v1 message has a field for the
pathbase, which would seem to make the full path unnecessary. Is there
some quirk in older MDS versions that requires a full path for this?


> +		dput(dentry);
> +		if (IS_ERR(path)) {
> +			err = PTR_ERR(path);
> +			goto out_err;
> +		}
> +	} else {
> +		path = NULL;
> +		pathlen = 0;
> +		pathbase = 0;
> +	}
> +
>  	spin_lock(&ci->i_ceph_lock);
>  	cap->seq = 0;        /* reset cap seq */
>  	cap->issue_seq = 0;  /* and issue_seq */
> @@ -3593,7 +3645,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
>  		rec.v2.issued = cpu_to_le32(cap->issued);
>  		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> -		rec.v2.pathbase = 0;
> +		rec.v2.pathbase = cpu_to_le64(pathbase);
>  		rec.v2.flock_len = (__force __le32)
>  			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
>  	} else {
> @@ -3604,7 +3656,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
>  		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
>  		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> -		rec.v1.pathbase = 0;
> +		rec.v1.pathbase = cpu_to_le64(pathbase);
>  	}
>  
>  	if (list_empty(&ci->i_cap_snaps)) {
> @@ -3666,7 +3718,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  			    sizeof(struct ceph_filelock);
>  		rec.v2.flock_len = cpu_to_le32(struct_len);
>  
> -		struct_len += sizeof(u32) + sizeof(rec.v2);
> +		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
>  
>  		if (struct_v >= 2)
>  			struct_len += sizeof(u64); /* snap_follows */
> @@ -3690,7 +3742,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  			ceph_pagelist_encode_8(pagelist, 1);
>  			ceph_pagelist_encode_32(pagelist, struct_len);
>  		}
> -		ceph_pagelist_encode_string(pagelist, NULL, 0);
> +		ceph_pagelist_encode_string(pagelist, path, pathlen);
>  		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
>  		ceph_locks_to_pagelist(flocks, pagelist,
>  				       num_fcntl_locks, num_flock_locks);
> @@ -3699,39 +3751,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  out_freeflocks:
>  		kfree(flocks);
>  	} else {
> -		u64 pathbase = 0;
> -		int pathlen = 0;
> -		char *path = NULL;
> -		struct dentry *dentry;
> -
> -		dentry = d_find_alias(inode);
> -		if (dentry) {
> -			path = ceph_mdsc_build_path(dentry,
> -						&pathlen, &pathbase, 0);
> -			dput(dentry);
> -			if (IS_ERR(path)) {
> -				err = PTR_ERR(path);
> -				goto out_err;
> -			}
> -			rec.v1.pathbase = cpu_to_le64(pathbase);
> -		}
> -
>  		err = ceph_pagelist_reserve(pagelist,
>  					    sizeof(u64) + sizeof(u32) +
>  					    pathlen + sizeof(rec.v1));
> -		if (err) {
> -			goto out_freepath;
> -		}
> +		if (err)
> +			goto out_err;
>  
>  		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
>  		ceph_pagelist_encode_string(pagelist, path, pathlen);
>  		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
> -out_freepath:
> -		ceph_mdsc_free_path(path, pathlen);
>  	}
>  
>  out_err:
> -	if (err >= 0)
> +	ceph_mdsc_free_path(path, pathlen);
> +	if (!err)
>  		recon_state->nr_caps++;
>  	return err;
>  }
Yan, Zheng Aug. 11, 2020, 3:12 p.m. UTC | #3
On Tue, Aug 11, 2020 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Tue, 2020-08-11 at 15:23 +0800, Yan, Zheng wrote:
> > Since nautilus, MDS tracks dirfrags whose child inodes have caps in open
> > file table. When MDS recovers, it prefetches all of these dirfrags. This
> > avoids using backtrace to load inodes. But dirfrags prefetch may load
> > lots of useless inodes into cache, and make MDS run out of memory.
> >
> > Recent MDS adds an option that disables dirfrags prefetch. When dirfrags
> > prefetch is disabled. Recovering MDS only prefetches corresponding dir
> > inodes. Including inodes' parent/d_name in cap reconnect message can
> > help MDS to load inodes into its cache.
> >
> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> >  fs/ceph/mds_client.c | 89 ++++++++++++++++++++++++++++++--------------
> >  1 file changed, 61 insertions(+), 28 deletions(-)
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 9a09d12569bd..4eaed12b4b4c 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -3553,6 +3553,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
> >       return err;
> >  }
> >
> > +static struct dentry* d_find_primary(struct inode *inode)
> > +{
> > +     struct dentry *alias, *dn = NULL;
> > +
> > +     if (hlist_empty(&inode->i_dentry))
> > +             return NULL;
> > +
> > +     spin_lock(&inode->i_lock);
> > +     if (hlist_empty(&inode->i_dentry))
> > +             goto out_unlock;
> > +
> > +     if (S_ISDIR(inode->i_mode)) {
> > +             alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
> > +             if (!IS_ROOT(alias))
> > +                     dn = dget(alias);
> > +             goto out_unlock;
> > +     }
> > +
> > +     hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
> > +             spin_lock(&alias->d_lock);
> > +             if (!d_unhashed(alias) &&
> > +                 (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
> > +                     dn = dget_dlock(alias);
> > +             }
> > +             spin_unlock(&alias->d_lock);
> > +             if (dn)
> > +                     break;
> > +     }
> > +out_unlock:
> > +     spin_unlock(&inode->i_lock);
> > +     return dn;
> > +}
> > +
> >  /*
> >   * Encode information about a cap for a reconnect with the MDS.
> >   */
> > @@ -3566,13 +3599,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >       struct ceph_inode_info *ci = cap->ci;
> >       struct ceph_reconnect_state *recon_state = arg;
> >       struct ceph_pagelist *pagelist = recon_state->pagelist;
> > -     int err;
> > +     struct dentry *dentry;
> > +     char *path;
> > +     int pathlen, err;
> > +     u64 pathbase;
> >       u64 snap_follows;
> >
> >       dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
> >            inode, ceph_vinop(inode), cap, cap->cap_id,
> >            ceph_cap_string(cap->issued));
> >
> > +     dentry = d_find_primary(inode);
> > +     if (dentry) {
> > +             /* set pathbase to parent dir when msg_version >= 2 */
> > +             path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
> > +                                         recon_state->msg_version >= 2);
>
> One question:
>
> Do we really need to build a full path back to the root for the
> msg_version == 1 case? I notice that the v1 message has a field for the
> pathbase, which would seem to make the full path unnecessary. Is there
> some quirk in older MDS versions that requires a full path for this?
>

Emperor and older MDS releases require the full path. I guess no one
uses an MDS that old, so it's OK to always build a relative path.

Regards
Yan, Zheng

>
> > +             dput(dentry);
> > +             if (IS_ERR(path)) {
> > +                     err = PTR_ERR(path);
> > +                     goto out_err;
> > +             }
> > +     } else {
> > +             path = NULL;
> > +             pathlen = 0;
> > +             pathbase = 0;
> > +     }
> > +
> >       spin_lock(&ci->i_ceph_lock);
> >       cap->seq = 0;        /* reset cap seq */
> >       cap->issue_seq = 0;  /* and issue_seq */
> > @@ -3593,7 +3645,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >               rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
> >               rec.v2.issued = cpu_to_le32(cap->issued);
> >               rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> > -             rec.v2.pathbase = 0;
> > +             rec.v2.pathbase = cpu_to_le64(pathbase);
> >               rec.v2.flock_len = (__force __le32)
> >                       ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
> >       } else {
> > @@ -3604,7 +3656,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >               ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
> >               ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
> >               rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> > -             rec.v1.pathbase = 0;
> > +             rec.v1.pathbase = cpu_to_le64(pathbase);
> >       }
> >
> >       if (list_empty(&ci->i_cap_snaps)) {
> > @@ -3666,7 +3718,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >                           sizeof(struct ceph_filelock);
> >               rec.v2.flock_len = cpu_to_le32(struct_len);
> >
> > -             struct_len += sizeof(u32) + sizeof(rec.v2);
> > +             struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
> >
> >               if (struct_v >= 2)
> >                       struct_len += sizeof(u64); /* snap_follows */
> > @@ -3690,7 +3742,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >                       ceph_pagelist_encode_8(pagelist, 1);
> >                       ceph_pagelist_encode_32(pagelist, struct_len);
> >               }
> > -             ceph_pagelist_encode_string(pagelist, NULL, 0);
> > +             ceph_pagelist_encode_string(pagelist, path, pathlen);
> >               ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
> >               ceph_locks_to_pagelist(flocks, pagelist,
> >                                      num_fcntl_locks, num_flock_locks);
> > @@ -3699,39 +3751,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >  out_freeflocks:
> >               kfree(flocks);
> >       } else {
> > -             u64 pathbase = 0;
> > -             int pathlen = 0;
> > -             char *path = NULL;
> > -             struct dentry *dentry;
> > -
> > -             dentry = d_find_alias(inode);
> > -             if (dentry) {
> > -                     path = ceph_mdsc_build_path(dentry,
> > -                                             &pathlen, &pathbase, 0);
> > -                     dput(dentry);
> > -                     if (IS_ERR(path)) {
> > -                             err = PTR_ERR(path);
> > -                             goto out_err;
> > -                     }
> > -                     rec.v1.pathbase = cpu_to_le64(pathbase);
> > -             }
> > -
> >               err = ceph_pagelist_reserve(pagelist,
> >                                           sizeof(u64) + sizeof(u32) +
> >                                           pathlen + sizeof(rec.v1));
> > -             if (err) {
> > -                     goto out_freepath;
> > -             }
> > +             if (err)
> > +                     goto out_err;
> >
> >               ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
> >               ceph_pagelist_encode_string(pagelist, path, pathlen);
> >               ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
> > -out_freepath:
> > -             ceph_mdsc_free_path(path, pathlen);
> >       }
> >
> >  out_err:
> > -     if (err >= 0)
> > +     ceph_mdsc_free_path(path, pathlen);
> > +     if (!err)
> >               recon_state->nr_caps++;
> >       return err;
> >  }
>
> --
> Jeff Layton <jlayton@kernel.org>
>

Patch
diff mbox series

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9a09d12569bd..4eaed12b4b4c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3553,6 +3553,39 @@  static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
 	return err;
 }
 
+static struct dentry* d_find_primary(struct inode *inode)
+{
+	struct dentry *alias, *dn = NULL;
+
+	if (hlist_empty(&inode->i_dentry))
+		return NULL;
+
+	spin_lock(&inode->i_lock);
+	if (hlist_empty(&inode->i_dentry))
+		goto out_unlock;
+
+	if (S_ISDIR(inode->i_mode)) {
+		alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+		if (!IS_ROOT(alias))
+			dn = dget(alias);
+		goto out_unlock;
+	}
+
+	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+		spin_lock(&alias->d_lock);
+		if (!d_unhashed(alias) &&
+		    (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
+			dn = dget_dlock(alias);
+		}
+		spin_unlock(&alias->d_lock);
+		if (dn)
+			break;
+	}
+out_unlock:
+	spin_unlock(&inode->i_lock);
+	return dn;
+}
+
 /*
  * Encode information about a cap for a reconnect with the MDS.
  */
@@ -3566,13 +3599,32 @@  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_reconnect_state *recon_state = arg;
 	struct ceph_pagelist *pagelist = recon_state->pagelist;
-	int err;
+	struct dentry *dentry;
+	char *path;
+	int pathlen, err;
+	u64 pathbase;
 	u64 snap_follows;
 
 	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
 	     inode, ceph_vinop(inode), cap, cap->cap_id,
 	     ceph_cap_string(cap->issued));
 
+	dentry = d_find_primary(inode);
+	if (dentry) {
+		/* set pathbase to parent dir when msg_version >= 2 */
+		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+					    recon_state->msg_version >= 2);
+		dput(dentry);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out_err;
+		}
+	} else {
+		path = NULL;
+		pathlen = 0;
+		pathbase = 0;
+	}
+
 	spin_lock(&ci->i_ceph_lock);
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
@@ -3593,7 +3645,7 @@  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
 		rec.v2.issued = cpu_to_le32(cap->issued);
 		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-		rec.v2.pathbase = 0;
+		rec.v2.pathbase = cpu_to_le64(pathbase);
 		rec.v2.flock_len = (__force __le32)
 			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
 	} else {
@@ -3604,7 +3656,7 @@  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
 		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
 		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-		rec.v1.pathbase = 0;
+		rec.v1.pathbase = cpu_to_le64(pathbase);
 	}
 
 	if (list_empty(&ci->i_cap_snaps)) {
@@ -3666,7 +3718,7 @@  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			    sizeof(struct ceph_filelock);
 		rec.v2.flock_len = cpu_to_le32(struct_len);
 
-		struct_len += sizeof(u32) + sizeof(rec.v2);
+		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
 
 		if (struct_v >= 2)
 			struct_len += sizeof(u64); /* snap_follows */
@@ -3690,7 +3742,7 @@  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			ceph_pagelist_encode_8(pagelist, 1);
 			ceph_pagelist_encode_32(pagelist, struct_len);
 		}
-		ceph_pagelist_encode_string(pagelist, NULL, 0);
+		ceph_pagelist_encode_string(pagelist, path, pathlen);
 		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
 		ceph_locks_to_pagelist(flocks, pagelist,
 				       num_fcntl_locks, num_flock_locks);
@@ -3699,39 +3751,20 @@  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 out_freeflocks:
 		kfree(flocks);
 	} else {
-		u64 pathbase = 0;
-		int pathlen = 0;
-		char *path = NULL;
-		struct dentry *dentry;
-
-		dentry = d_find_alias(inode);
-		if (dentry) {
-			path = ceph_mdsc_build_path(dentry,
-						&pathlen, &pathbase, 0);
-			dput(dentry);
-			if (IS_ERR(path)) {
-				err = PTR_ERR(path);
-				goto out_err;
-			}
-			rec.v1.pathbase = cpu_to_le64(pathbase);
-		}
-
 		err = ceph_pagelist_reserve(pagelist,
 					    sizeof(u64) + sizeof(u32) +
 					    pathlen + sizeof(rec.v1));
-		if (err) {
-			goto out_freepath;
-		}
+		if (err)
+			goto out_err;
 
 		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
 		ceph_pagelist_encode_string(pagelist, path, pathlen);
 		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-out_freepath:
-		ceph_mdsc_free_path(path, pathlen);
 	}
 
 out_err:
-	if (err >= 0)
+	ceph_mdsc_free_path(path, pathlen);
+	if (!err)
 		recon_state->nr_caps++;
 	return err;
 }