diff mbox

[v3,1/5] ceph: clean up unsafe d_parent access in __choose_mds

Message ID C469B5CD-1854-4826-8E99-B401A751B50B@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yan, Zheng Dec. 15, 2016, 9:08 a.m. UTC
> On 14 Dec 2016, at 22:54, Jeff Layton <jlayton@redhat.com> wrote:
> 
> __choose_mds exists to pick an MDS to use when issuing a call. Doing
> that typically involves picking an inode and using the authoritative
> MDS for it. In most cases, that's pretty straightforward, as we are
> using an inode to which we hold a reference (usually represented by
> r_dentry or r_inode in the request).
> 
> In the case of a snapshotted directory however, we need to fetch
> the non-snapped parent, which involves walking back up the parents
> in the tree. The dentries in the snapshot dir are effectively frozen
> but the overall parent is _not_, and could vanish if a concurrent
> rename were to occur.
> 
> Clean this code up and take special care to ensure the validity of
> the entries we're working with. First, try to use the inode in
> r_locked_dir if one exists. If not and all we have is r_dentry,
> then we have to walk back up the tree. Use the rcu_read_lock for
> this so we can ensure that any d_parent we find won't go away, and
> take extra care to deal with the possibility that the dentries could
> go negative.
> 
> Change get_nonsnap_parent to return an inode, and take a reference to
> that inode before returning (if any). Change all of the other places
> where we set "inode" in __choose_mds to also take a reference, and then
> call iput on that inode before exiting the function.
> 
> Link: http://tracker.ceph.com/issues/18148
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
> fs/ceph/mds_client.c | 67 +++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 45 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 815acd1a56d4..e62b566d3817 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
> }
> 
> /*
> + * Walk back up the dentry tree until we hit a dentry representing a
> + * non-snapshot inode. We do this using the rcu_read_lock (which must be held
> + * when calling this) to ensure that the objects won't disappear while we're
> + * working with them. Once we hit a candidate dentry, we attempt to take a
> + * reference to it, and return that as the result.
> + */
> +static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
> +	*inode = NULL;
> +
> +	while (dentry && !IS_ROOT(dentry)) {
> +		inode = d_inode_rcu(dentry);
> +		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
> +			break;
> +		dentry = dentry->d_parent;
> +	}
> +	if (inode)
> +		inode = igrab(inode);
> +	return inode;
> +}
> +
> +/*
>  * Choose mds to send request to next.  If there is a hint set in the
>  * request (e.g., due to a prior forward hint from the mds), use that.
>  * Otherwise, consult frag tree and/or caps to identify the
> @@ -674,19 +695,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
>  *
>  * Called under mdsc->mutex.
>  */
> -static struct dentry *get_nonsnap_parent(struct dentry *dentry)
> -{
> -	/*
> -	 * we don't need to worry about protecting the d_parent access
> -	 * here because we never renaming inside the snapped namespace
> -	 * except to resplice to another snapdir, and either the old or new
> -	 * result is a valid result.
> -	 */
> -	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
> -		dentry = dentry->d_parent;
> -	return dentry;
> -}
> -
> static int __choose_mds(struct ceph_mds_client *mdsc,
> 			struct ceph_mds_request *req)
> {
> @@ -716,30 +724,42 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 	inode = NULL;
> 	if (req->r_inode) {
> 		inode = req->r_inode;
> +		ihold(inode);
> +	} else if (req->r_locked_dir) {
> +		inode = req->r_locked_dir;
> +		ihold(inode);


this seems incorrect. how about following incremental patch



Regards
Yan, Zheng

> 	} else if (req->r_dentry) {
> 		/* ignore race with rename; old or new d_parent is okay */
> -		struct dentry *parent = req->r_dentry->d_parent;
> -		struct inode *dir = d_inode(parent);
> +		struct dentry *parent;
> +		struct inode *dir;
> +
> +		rcu_read_lock();
> +		parent = req->r_dentry->d_parent;
> +		dir = d_inode_rcu(parent);
> 
> -		if (dir->i_sb != mdsc->fsc->sb) {
> -			/* not this fs! */
> +		if (!dir || dir->i_sb != mdsc->fsc->sb) {
> +			/*  not this fs or parent went negative */
> 			inode = d_inode(req->r_dentry);
> +			if (inode)
> +				ihold(inode);
> 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
> 			/* direct snapped/virtual snapdir requests
> 			 * based on parent dir inode */
> -			struct dentry *dn = get_nonsnap_parent(parent);
> -			inode = d_inode(dn);
> +			inode = get_nonsnap_parent(parent);
> 			dout("__choose_mds using nonsnap parent %p\n", inode);
> 		} else {
> 			/* dentry target */
> 			inode = d_inode(req->r_dentry);
> 			if (!inode || mode == USE_AUTH_MDS) {
> 				/* dir + name */
> -				inode = dir;
> +				inode = igrab(dir);
> 				hash = ceph_dentry_hash(dir, req->r_dentry);
> 				is_hash = true;
> +			} else {
> +				ihold(inode);
> 			}
> 		}
> +		rcu_read_unlock();
> 	}
> 
> 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
> @@ -768,7 +788,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 				     (int)r, frag.ndist);
> 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> 				    CEPH_MDS_STATE_ACTIVE)
> -					return mds;
> +					goto out;
> 			}
> 
> 			/* since this file/dir wasn't known to be
> @@ -783,7 +803,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 				     inode, ceph_vinop(inode), frag.frag, mds);
> 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> 				    CEPH_MDS_STATE_ACTIVE)
> -					return mds;
> +					goto out;
> 			}
> 		}
> 	}
> @@ -796,6 +816,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
> 	if (!cap) {
> 		spin_unlock(&ci->i_ceph_lock);
> +		iput(inode);
> 		goto random;
> 	}
> 	mds = cap->session->s_mds;
> @@ -803,6 +824,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 	     inode, ceph_vinop(inode), mds,
> 	     cap == ci->i_auth_cap ? "auth " : "", cap);
> 	spin_unlock(&ci->i_ceph_lock);
> +out:
> +	iput(inode);
> 	return mds;
> 
> random:
> -- 
> 2.7.4
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jeff Layton Dec. 15, 2016, 11:30 a.m. UTC | #1
On Thu, 2016-12-15 at 17:08 +0800, Yan, Zheng wrote:
> > 
> > On 14 Dec 2016, at 22:54, Jeff Layton <jlayton@redhat.com> wrote:
> > 
> > __choose_mds exists to pick an MDS to use when issuing a call. Doing
> > that typically involves picking an inode and using the authoritative
> > MDS for it. In most cases, that's pretty straightforward, as we are
> > using an inode to which we hold a reference (usually represented by
> > r_dentry or r_inode in the request).
> > 
> > In the case of a snapshotted directory however, we need to fetch
> > the non-snapped parent, which involves walking back up the parents
> > in the tree. The dentries in the snapshot dir are effectively frozen
> > but the overall parent is _not_, and could vanish if a concurrent
> > rename were to occur.
> > 
> > Clean this code up and take special care to ensure the validity of
> > the entries we're working with. First, try to use the inode in
> > r_locked_dir if one exists. If not and all we have is r_dentry,
> > then we have to walk back up the tree. Use the rcu_read_lock for
> > this so we can ensure that any d_parent we find won't go away, and
> > take extra care to deal with the possibility that the dentries could
> > go negative.
> > 
> > Change get_nonsnap_parent to return an inode, and take a reference to
> > that inode before returning (if any). Change all of the other places
> > where we set "inode" in __choose_mds to also take a reference, and then
> > call iput on that inode before exiting the function.
> > 
> > Link: http://tracker.ceph.com/issues/18148
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> > fs/ceph/mds_client.c | 67 +++++++++++++++++++++++++++++++++++-----------------
> > 1 file changed, 45 insertions(+), 22 deletions(-)
> > 
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 815acd1a56d4..e62b566d3817 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
> > }
> > 
> > /*
> > + * Walk back up the dentry tree until we hit a dentry representing a
> > + * non-snapshot inode. We do this using the rcu_read_lock (which must be held
> > + * when calling this) to ensure that the objects won't disappear while we're
> > + * working with them. Once we hit a candidate dentry, we attempt to take a
> > + * reference to it, and return that as the result.
> > + */
> > +static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
> > +	*inode = NULL;
> > +
> > +	while (dentry && !IS_ROOT(dentry)) {
> > +		inode = d_inode_rcu(dentry);
> > +		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
> > +			break;
> > +		dentry = dentry->d_parent;
> > +	}
> > +	if (inode)
> > +		inode = igrab(inode);
> > +	return inode;
> > +}
> > +
> > +/*
> >  * Choose mds to send request to next.  If there is a hint set in the
> >  * request (e.g., due to a prior forward hint from the mds), use that.
> >  * Otherwise, consult frag tree and/or caps to identify the
> > @@ -674,19 +695,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
> >  *
> >  * Called under mdsc->mutex.
> >  */
> > -static struct dentry *get_nonsnap_parent(struct dentry *dentry)
> > -{
> > -	/*
> > -	 * we don't need to worry about protecting the d_parent access
> > -	 * here because we never renaming inside the snapped namespace
> > -	 * except to resplice to another snapdir, and either the old or new
> > -	 * result is a valid result.
> > -	 */
> > -	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
> > -		dentry = dentry->d_parent;
> > -	return dentry;
> > -}
> > -
> > static int __choose_mds(struct ceph_mds_client *mdsc,
> > 			struct ceph_mds_request *req)
> > {
> > @@ -716,30 +724,42 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 	inode = NULL;
> > 	if (req->r_inode) {
> > 		inode = req->r_inode;
> > +		ihold(inode);
> > +	} else if (req->r_locked_dir) {
> > +		inode = req->r_locked_dir;
> > +		ihold(inode);
> 
> 
> this seems incorrect. how about following incremental patch
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index c834d7d..4abc85f 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -725,9 +725,6 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>         if (req->r_inode) {
>                 inode = req->r_inode;
>                 ihold(inode);
> -       } else if (req->r_locked_dir) {
> -               inode = req->r_locked_dir;
> -               ihold(inode);
>         } else if (req->r_dentry) {
>                 /* ignore race with rename; old or new d_parent is okay */
>                 struct dentry *parent;
> @@ -735,7 +732,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>  
>                 rcu_read_lock();
>                 parent = req->r_dentry->d_parent;
> -               dir = d_inode_rcu(parent);
> +               dir = req->r_locked_dir ? : d_inode_rcu(parent);
>  
>                 if (!dir || dir->i_sb != mdsc->fsc->sb) {
>                         /*  not this fs or parent went negative */
> 
> 
> Regards
> Yan, Zheng

Ahh ok...

My original concern there was that we wouldn't have any guarantee that
d_inode(parent) == req->r_locked_dir, and then the follow on checks
might be wrong.

But...if we can rely on the fact that we hold the parent's mutex there
(which I think is what r_locked_dir is supposed to indicate), then I
think we can be certain of it.

I'll fold your patch in with the original.

Thanks!

> > 
> > 	} else if (req->r_dentry) {
> > 		/* ignore race with rename; old or new d_parent is okay */
> > -		struct dentry *parent = req->r_dentry->d_parent;
> > -		struct inode *dir = d_inode(parent);
> > +		struct dentry *parent;
> > +		struct inode *dir;
> > +
> > +		rcu_read_lock();
> > +		parent = req->r_dentry->d_parent;
> > +		dir = d_inode_rcu(parent);
> > 
> > -		if (dir->i_sb != mdsc->fsc->sb) {
> > -			/* not this fs! */
> > +		if (!dir || dir->i_sb != mdsc->fsc->sb) {
> > +			/*  not this fs or parent went negative */
> > 			inode = d_inode(req->r_dentry);
> > +			if (inode)
> > +				ihold(inode);
> > 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
> > 			/* direct snapped/virtual snapdir requests
> > 			 * based on parent dir inode */
> > -			struct dentry *dn = get_nonsnap_parent(parent);
> > -			inode = d_inode(dn);
> > +			inode = get_nonsnap_parent(parent);
> > 			dout("__choose_mds using nonsnap parent %p\n", inode);
> > 		} else {
> > 			/* dentry target */
> > 			inode = d_inode(req->r_dentry);
> > 			if (!inode || mode == USE_AUTH_MDS) {
> > 				/* dir + name */
> > -				inode = dir;
> > +				inode = igrab(dir);
> > 				hash = ceph_dentry_hash(dir, req->r_dentry);
> > 				is_hash = true;
> > +			} else {
> > +				ihold(inode);
> > 			}
> > 		}
> > +		rcu_read_unlock();
> > 	}
> > 
> > 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
> > @@ -768,7 +788,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 				     (int)r, frag.ndist);
> > 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> > 				    CEPH_MDS_STATE_ACTIVE)
> > -					return mds;
> > +					goto out;
> > 			}
> > 
> > 			/* since this file/dir wasn't known to be
> > @@ -783,7 +803,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 				     inode, ceph_vinop(inode), frag.frag, mds);
> > 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> > 				    CEPH_MDS_STATE_ACTIVE)
> > -					return mds;
> > +					goto out;
> > 			}
> > 		}
> > 	}
> > @@ -796,6 +816,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
> > 	if (!cap) {
> > 		spin_unlock(&ci->i_ceph_lock);
> > +		iput(inode);
> > 		goto random;
> > 	}
> > 	mds = cap->session->s_mds;
> > @@ -803,6 +824,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 	     inode, ceph_vinop(inode), mds,
> > 	     cap == ci->i_auth_cap ? "auth " : "", cap);
> > 	spin_unlock(&ci->i_ceph_lock);
> > +out:
> > +	iput(inode);
> > 	return mds;
> > 
> > random:
> > -- 
> > 2.7.4
> > 
>
diff mbox

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c834d7d..4abc85f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -725,9 +725,6 @@  static int __choose_mds(struct ceph_mds_client *mdsc,
        if (req->r_inode) {
                inode = req->r_inode;
                ihold(inode);
-       } else if (req->r_locked_dir) {
-               inode = req->r_locked_dir;
-               ihold(inode);
        } else if (req->r_dentry) {
                /* ignore race with rename; old or new d_parent is okay */
                struct dentry *parent;
@@ -735,7 +732,7 @@  static int __choose_mds(struct ceph_mds_client *mdsc,
 
                rcu_read_lock();
                parent = req->r_dentry->d_parent;
-               dir = d_inode_rcu(parent);
+               dir = req->r_locked_dir ? : d_inode_rcu(parent);
 
                if (!dir || dir->i_sb != mdsc->fsc->sb) {
                        /*  not this fs or parent went negative */