diff mbox series

[RFC,1/3] fs: introduce helper d_path_fast()

Message ID 20210508122530.1971-2-justin.he@arm.com (mailing list archive)
State New
Headers show
Series make '%pD' print full path for file | expand

Commit Message

Justin He May 8, 2021, 12:25 p.m. UTC
This helper is similar to d_path except for not taking seqlock/spinlock.

Signed-off-by: Jia He <justin.he@arm.com>
---
 fs/d_path.c            | 50 ++++++++++++++++++++++++++++++++----------
 include/linux/dcache.h |  1 +
 2 files changed, 39 insertions(+), 12 deletions(-)

Comments

Linus Torvalds May 8, 2021, 3:30 p.m. UTC | #1
On Sat, May 8, 2021 at 5:29 AM Jia He <justin.he@arm.com> wrote:
>
> This helper is similar to d_path except for not taking seqlock/spinlock.

I see why you did it that way, but conditional locking is something we
really really try to avoid in the kernel.

It basically makes a lot of static tools unable to follow the locking
rules, and it makes it hard for people to se what's going on too.

So instead of passing a "bool need_lock" thing down, the way to do
these things is generally to extract the code inside the locked region
into a helper function of its own, and then you have

  __unlocked_version(...)
  {
       .. do the actual work
  }

  locked_version(..)
  {
      take_lock(..)
      retval = __unlocked_version(..);
      release_lock(..);
      return retval;
  }

this prepend_path() case is a bit more complicated because there's two
layers of locking, but I think the pattern should still work fine.

In fact, I think it would clean up prepend_path() and make it more
legible to have the two layers of mount_lock / rename_lock be done in
callers with the restarting being done as a loop in the caller rather
than as "goto restart_*".

              Linus
Al Viro May 8, 2021, 7:13 p.m. UTC | #2
On Sat, May 08, 2021 at 08:30:43AM -0700, Linus Torvalds wrote:

> In fact, I think it would clean up prepend_path() and make it more
> legible to have the two layers of mount_lock / rename_lock be done in
> callers with the restarting being done as a loop in the caller rather
> than as "goto restart_*".

Potentiallty delicate question is how to pass bptr/blen to the caller
of that helper...
Linus Torvalds May 8, 2021, 8:39 p.m. UTC | #3
On Sat, May 8, 2021 at 12:13 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Potentiallty delicate question is how to pass bptr/blen to the caller
> of that helper...

So I was thinking something like this patch..

THIS IS TOTALLY UNTESTED!

I've basically introduced a "struct prepend_buffer" that acts as that
"end,len" pair and that gets passed around. It builds cleanly for me,
which actually makes me fairly happy that it might even be close
right, because the calling conventions would catch any mistake.

It looks superficially sane to me, but again - untested.

The patch ended up being bigger than I expected, but it's all fairly
straightforward - just munging the calling conventions.

Oh, and I did extract out the core of "prepend_path()" into
"prepend_entries()" just to show how it would work.

              Linus
Al Viro May 8, 2021, 9:05 p.m. UTC | #4
On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:

> +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)

If anything, s/path/dentry/, since vfsmnt here will be equal to &mnt->mnt all along.

> +{
> +	struct dentry *dentry = path->dentry;
> +	struct vfsmount *vfsmnt = path->mnt;
> +
> +	while (dentry != root->dentry || vfsmnt != root->mnt) {
> +		int error;
> +		struct dentry * parent;
> +
> +		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
> +			struct mount *parent = READ_ONCE(mnt->mnt_parent);
> +			struct mnt_namespace *mnt_ns;
> +
> +			/* Escaped? */
> +			if (dentry != vfsmnt->mnt_root)
> +				return 3;
> +
> +			/* Global root? */
> +			if (mnt != parent) {
> +				dentry = READ_ONCE(mnt->mnt_mountpoint);
> +				mnt = parent;
> +				vfsmnt = &mnt->mnt;
> +				continue;
> +			}
> +			mnt_ns = READ_ONCE(mnt->mnt_ns);
> +			/* open-coded is_mounted() to use local mnt_ns */
> +			if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
> +				return 1;	// absolute root
> +
> +			return 2;		// detached or not attached yet
> +			break;

?

> +		}
> +		parent = dentry->d_parent;
> +		prefetch(parent);
> +		error = prepend_name(b, &dentry->d_name);
> +		if (error)
> +			break;

return error, surely?

> +		dentry = parent;
> +	}
> +	return 0;
> +}

FWIW, if we go that way, I would make that

	while (dentry != root->dentry || &mnt->mnt != root->mnt) {
		int error;
		struct dentry *parent = READ_ONCE(dentry->d_parent);

		if (unlikely(dentry == mnt->mnt.mnt_root)) {
			struct mount *m = READ_ONCE(mnt->mnt_parent);

			/* Global root? */
			if (unlikely(mnt == m) {
				struct mnt_namespace *mnt_ns;

				mnt_ns = READ_ONCE(mnt->mnt_ns);
				/* open-coded is_mounted() to use local mnt_ns */
				if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
					return 1;	// absolute root
				else
					return 2;	// detached or not attached yet
			}
			// cross into whatever it's mounted on
			dentry = READ_ONCE(mnt->mnt_mountpoint);
			mnt = m;
			continue;
		}

		/* Escaped? */
		if (unlikely(dentry == parent))
			return 3;

		prefetch(parent);
		error = prepend_name(b, &dentry->d_name);
		if (error)
			return error;
		dentry = parent;
	}
	return 0;
Linus Torvalds May 8, 2021, 10:17 p.m. UTC | #5
On Sat, May 8, 2021 at 2:06 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:
>
> > +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
>
> If anything, s/path/dentry/, since vfsmnt here will be equal to &mnt->mnt all along.

Too subtle for me.

And is it? Because mnt is from

     mnt = real_mount(path->mnt);

earlier, while vfsmount is plain "path->mnt".

> > +                     return 2;               // detached or not attached yet
> > +                     break;
>
> ?

Leftover. Good catch.

> > +             parent = dentry->d_parent;
> > +             prefetch(parent);
> > +             error = prepend_name(b, &dentry->d_name);
> > +             if (error)
> > +                     break;
>
> return error, surely?

Surely. Bad conversion to the separate function where I missed one of
the "break" statements.

> FWIW, if we go that way, I would make that

No arguments against that - I tried to keep it with the same structure
it had when it was inside prepend_path().

Which I obviously wasn't very good at (see your fixes above ;), but it
was *meant* to be a minimal patch with no structural change.

                      Linus
Linus Torvalds May 8, 2021, 10:42 p.m. UTC | #6
On Sat, May 8, 2021 at 2:06 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> FWIW, if we go that way, I would make that
>
>         while (dentry != root->dentry || &mnt->mnt != root->mnt) {
>                 int error;
>                 struct dentry *parent = READ_ONCE(dentry->d_parent);

Side note: you've added that READ_ONCE() to the parent reading, and I
think that's a bug-fix regardless. The old code does that plain

                parent = dentry->d_parent;

(after doing the mountpoint stuff). And d_parent isn't actually
guaranteed stable here.

It probably does not matter - we are in a RCU read-locked section, so
it's not like parent will go away, but in theory we might end up with
(for example) pre-fetching a different parent than the one we then
walk down.

But your READ_ONCE() is definitely the right thing to do (whether we
do your re-org or not, and whether we do this "prepend_buffer" thing
or not).

Do you want to do a final version with your fixes?

               Linus
Al Viro May 8, 2021, 10:46 p.m. UTC | #7
On Sat, May 08, 2021 at 03:17:44PM -0700, Linus Torvalds wrote:
> On Sat, May 8, 2021 at 2:06 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:
> >
> > > +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
> >
> > If anything, s/path/dentry/, since vfsmnt here will be equal to &mnt->mnt all along.
> 
> Too subtle for me.
> 
> And is it? Because mnt is from
> 
>      mnt = real_mount(path->mnt);
> 
> earlier, while vfsmount is plain "path->mnt".

static inline struct mount *real_mount(struct vfsmount *mnt)
{
        return container_of(mnt, struct mount, mnt);
}
Linus Torvalds May 8, 2021, 10:47 p.m. UTC | #8
On Sat, May 8, 2021 at 3:42 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> But your READ_ONCE() is definitely the right thing to do (whether we
> do your re-org or not, and whether we do this "prepend_buffer" thing
> or not).

Oh, and looking at it some more, I think it would probably be a good
thing to make __dentry_path() take a

       struct prepend_buffer *orig

argument, the same way prepend_path() does. Also, like prepend_path(),
the terminating NUL should probably be done by the caller.

Doing those two changes would simplify the hackery we now have in
"dentry_path()" due to the "//deleted" games. The whole "restore '/'
that was overwritten by the NUL added by __dentry_path() is
unbelievably ugly.

            Linus
Linus Torvalds May 8, 2021, 10:48 p.m. UTC | #9
On Sat, May 8, 2021 at 3:46 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> static inline struct mount *real_mount(struct vfsmount *mnt)
> {
>         return container_of(mnt, struct mount, mnt);

Too subtle for me indeed.

           Linus
Al Viro May 8, 2021, 11:15 p.m. UTC | #10
On Sat, May 08, 2021 at 10:46:23PM +0000, Al Viro wrote:
> On Sat, May 08, 2021 at 03:17:44PM -0700, Linus Torvalds wrote:
> > On Sat, May 8, 2021 at 2:06 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:
> > >
> > > > +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
> > >
> > > If anything, s/path/dentry/, since vfsmnt here will be equal to &mnt->mnt all along.
> > 
> > Too subtle for me.
> > 
> > And is it? Because mnt is from
> > 
> >      mnt = real_mount(path->mnt);
> > 
> > earlier, while vfsmount is plain "path->mnt".
> 
> static inline struct mount *real_mount(struct vfsmount *mnt)
> {
>         return container_of(mnt, struct mount, mnt);
> }

Basically, struct vfsmount instances are always embedded into struct mount ones.
All information about the mount tree is in the latter (and is visible only if
you manage to include fs/mount.h); here we want to walk towards root, so...

Rationale: a lot places use struct vfsmount pointers, but they've no need to
access all that stuff.  So struct vfsmount got trimmed down, with most of the
things that used to be there migrating into the containing structure.

[Christian Browner Cc'd]
BTW, WTF do we have struct mount.user_ns and struct vfsmount.mnt_userns?
Can they ever be different?  Christian?

	Sigh...  Namespace flavours always remind me of old joke -
Highlander II: There Should've Been Only One...
Al Viro May 8, 2021, 11:18 p.m. UTC | #11
On Sat, May 08, 2021 at 11:15:28PM +0000, Al Viro wrote:

> [Christian Browner Cc'd]

Brauner, that is.  Sorry ;-/
Al Viro May 9, 2021, 2:20 a.m. UTC | #12
On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:
> -static int prepend(char **buffer, int *buflen, const char *str, int namelen)
> +struct prepend_buffer {
> +	char *ptr;
> +	int len;
> +};
> +
> +static int prepend(struct prepend_buffer *b, const char *str, int namelen)
>  {
> -	*buflen -= namelen;
> -	if (*buflen < 0)
> +	b->len -= namelen;
> +	if (b->len < 0)
>  		return -ENAMETOOLONG;
> -	*buffer -= namelen;
> -	memcpy(*buffer, str, namelen);
> +	b->ptr -= namelen;
> +	memcpy(b->ptr, str, namelen);
>  	return 0;
>  }

OK, that part is pretty obvious - pointers to a couple of objects in the same
stack frame replaced with a single pointer to structure consisting of those
two objects.  Might actually be an optimization.

> @@ -35,16 +40,16 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
>   *
>   * Load acquire is needed to make sure that we see that terminating NUL.
>   */
> -static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
> +static int prepend_name(struct prepend_buffer *b, const struct qstr *name)
>  {
>  	const char *dname = smp_load_acquire(&name->name); /* ^^^ */
>  	u32 dlen = READ_ONCE(name->len);
>  	char *p;
>  
> -	*buflen -= dlen + 1;
> -	if (*buflen < 0)
> +	b->len -= dlen + 1;
> +	if (b->len < 0)
>  		return -ENAMETOOLONG;
> -	p = *buffer -= dlen + 1;
> +	p = b->ptr -= dlen + 1;
>  	*p++ = '/';
>  	while (dlen--) {
>  		char c = *dname++;

Ditto.

> @@ -55,6 +60,50 @@ static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
>  	return 0;
>  }
>  
> +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
> +{
> +	struct dentry *dentry = path->dentry;
> +	struct vfsmount *vfsmnt = path->mnt;
> +
> +	while (dentry != root->dentry || vfsmnt != root->mnt) {
> +		int error;
> +		struct dentry * parent;
> +
> +		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
> +			struct mount *parent = READ_ONCE(mnt->mnt_parent);
> +			struct mnt_namespace *mnt_ns;
> +
> +			/* Escaped? */
> +			if (dentry != vfsmnt->mnt_root)
> +				return 3;
> +
> +			/* Global root? */
> +			if (mnt != parent) {
> +				dentry = READ_ONCE(mnt->mnt_mountpoint);
> +				mnt = parent;
> +				vfsmnt = &mnt->mnt;
> +				continue;
> +			}
> +			mnt_ns = READ_ONCE(mnt->mnt_ns);
> +			/* open-coded is_mounted() to use local mnt_ns */
> +			if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
> +				return 1;	// absolute root
> +
> +			return 2;		// detached or not attached yet
> +			break;
> +		}
> +		parent = dentry->d_parent;
> +		prefetch(parent);
> +		error = prepend_name(b, &dentry->d_name);
> +		if (error)
> +			break;
> +
> +		dentry = parent;
> +	}
> +	return 0;
> +}

See other reply.

> +
>  /**
>   * prepend_path - Prepend path string to a buffer
>   * @path: the dentry/vfsmount to report
> @@ -74,15 +123,12 @@ static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
>   */
>  static int prepend_path(const struct path *path,
>  			const struct path *root,
> -			char **buffer, int *buflen)
> +			struct prepend_buffer *orig)
>  {
> -	struct dentry *dentry;
> -	struct vfsmount *vfsmnt;
>  	struct mount *mnt;
>  	int error = 0;
>  	unsigned seq, m_seq = 0;
> -	char *bptr;
> -	int blen;
> +	struct prepend_buffer b;
>  
>  	rcu_read_lock();
>  restart_mnt:
> @@ -90,50 +136,12 @@ static int prepend_path(const struct path *path,
>  	seq = 0;
>  	rcu_read_lock();
>  restart:
> -	bptr = *buffer;
> -	blen = *buflen;
> -	error = 0;
> -	dentry = path->dentry;
> -	vfsmnt = path->mnt;
> -	mnt = real_mount(vfsmnt);
> +	b = *orig;
> +	mnt = real_mount(path->mnt);
>  	read_seqbegin_or_lock(&rename_lock, &seq);
> -	while (dentry != root->dentry || vfsmnt != root->mnt) {
> -		struct dentry * parent;
>  
> -		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
> -			struct mount *parent = READ_ONCE(mnt->mnt_parent);
> -			struct mnt_namespace *mnt_ns;
> +	error = prepend_entries(&b, path, root, mnt);
>  
> -			/* Escaped? */
> -			if (dentry != vfsmnt->mnt_root) {
> -				bptr = *buffer;
> -				blen = *buflen;
> -				error = 3;
> -				break;
> -			}
> -			/* Global root? */
> -			if (mnt != parent) {
> -				dentry = READ_ONCE(mnt->mnt_mountpoint);
> -				mnt = parent;
> -				vfsmnt = &mnt->mnt;
> -				continue;
> -			}
> -			mnt_ns = READ_ONCE(mnt->mnt_ns);
> -			/* open-coded is_mounted() to use local mnt_ns */
> -			if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
> -				error = 1;	// absolute root
> -			else
> -				error = 2;	// detached or not attached yet
> -			break;
> -		}
> -		parent = dentry->d_parent;
> -		prefetch(parent);
> -		error = prepend_name(&bptr, &blen, &dentry->d_name);
> -		if (error)
> -			break;
> -
> -		dentry = parent;
> -	}
>  	if (!(seq & 1))
>  		rcu_read_unlock();
>  	if (need_seqretry(&rename_lock, seq)) {
> @@ -150,14 +158,17 @@ static int prepend_path(const struct path *path,
>  	}
>  	done_seqretry(&mount_lock, m_seq);
>  
> -	if (error >= 0 && bptr == *buffer) {
> -		if (--blen < 0)
> +	// Escaped? No path
> +	if (error == 3)
> +		b = *orig;
> +
> +	if (error >= 0 && b.ptr == orig->ptr) {
> +		if (--b.len < 0)
>  			error = -ENAMETOOLONG;
>  		else
> -			*--bptr = '/';
> +			*--b.ptr = '/';
>  	}
> -	*buffer = bptr;
> -	*buflen = blen;
> +	*orig = b;
>  	return error;
>  }
>  
> @@ -181,34 +192,34 @@ char *__d_path(const struct path *path,
>  	       const struct path *root,
>  	       char *buf, int buflen)
>  {
> -	char *res = buf + buflen;
> +	struct prepend_buffer b = { buf + buflen, buflen };
>  	int error;
>  
> -	prepend(&res, &buflen, "\0", 1);
> -	error = prepend_path(path, root, &res, &buflen);
> +	prepend(&b, "\0", 1);
> +	error = prepend_path(path, root, &b);

Minor yuck: that should be "", 1 (in the original as well).  Same below...
Fairly subtle point: we do *not* need to check for failures here, since
prepend_path() will attempt to produce at least something.  And we'll
catch that failure just fine.  However, we do depend upon buflen being
non-negative here.  If we ever called that (or any other in that family,
really) with buflen == MIN_INT, we'd get seriously unpleasant results.
No such callers exist, thankfully.

>  char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
>  {
> -	char *end = buffer + buflen;
> +	struct prepend_buffer b = { buffer + buflen, buflen };
> +
>  	/* these dentries are never renamed, so d_lock is not needed */
> -	if (prepend(&end, &buflen, " (deleted)", 11) ||
> -	    prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
> -	    prepend(&end, &buflen, "/", 1))  
> -		end = ERR_PTR(-ENAMETOOLONG);
> -	return end;
> +	if (prepend(&b, " (deleted)", 11) ||
> +	    prepend(&b, dentry->d_name.name, dentry->d_name.len) ||
> +	    prepend(&b, "/", 1))
> +		return ERR_PTR(-ENAMETOOLONG);
> +	return b.ptr;
>  }

Umm...  Interesting, especially considering the way dyname_dname() looks like.
char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
                        const char *fmt, ...)
{
        va_list args;
        char temp[64];
        int sz;

        va_start(args, fmt);
        sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
        va_end(args);

        if (sz > sizeof(temp) || sz > buflen)
                return ERR_PTR(-ENAMETOOLONG);

        buffer += buflen - sz;
        return memcpy(buffer, temp, sz);
}

Looks like there's a piece of prepend() open-coded in it.  And
since all ->d_dname() instances are either simple_dname() or end up
with call of dynamic_dname()...

Might make sense to turn that method into
	int (*d_dname)(struct dentry *, struct prepend_buffer *);

Followup patch, obviously, but it might be worth looking into.

Another thing that keeps bugging me about prepend_path() is the
set of return values.  I mean, 0/1/2/3/-ENAMETOOLONG, and all
except 0 are unlikely?  Might as well make that 0/1/2/3/-1, if
not an outright 0/1/2/3/4.  And prepend() could return bool, while
we are at it (true - success, false - overflow)...
Al Viro May 9, 2021, 2:28 a.m. UTC | #13
On Sat, May 08, 2021 at 03:47:38PM -0700, Linus Torvalds wrote:
> On Sat, May 8, 2021 at 3:42 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > But your READ_ONCE() is definitely the right thing to do (whether we
> > do your re-org or not, and whether we do this "prepend_buffer" thing
> > or not).
> 
> Oh, and looking at it some more, I think it would probably be a good
> thing to make __dentry_path() take a
> 
>        struct prepend_buffer *orig
> 
> argument, the same way prepend_path() does. Also, like prepend_path(),
> the terminating NUL should probably be done by the caller.
> 
> Doing those two changes would simplify the hackery we now have in
> "dentry_path()" due to the "//deleted" games. The whole "restore '/'
> that was overwritten by the NUL added by __dentry_path() is
> unbelievably ugly.

	Agreed.  Re READ_ONCE() - we are wrapped into
read_seqbegin_or_lock(&rename_lock, &seq) there, so it's more about
being explicit than about correctness considerations.
Linus Torvalds May 9, 2021, 2:53 a.m. UTC | #14
On Sat, May 8, 2021 at 7:28 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
>         Re READ_ONCE() - we are wrapped into
> read_seqbegin_or_lock(&rename_lock, &seq) there, so it's more about
> being explicit than about correctness considerations.

Well, part of this all is that the next step is that "vsnprintf()"
with '%pD' would basically use prepend_entries() with just the RCU
lock.

That said, even with the rename lock, that will only cause a retry on
rename - it won't necessarily fix any confusion that comes from the
compiler possibly silently re-loading 'parent' multiple times, and
getting different pointers due to a concurrent rename.

Now, those different results should all be individually ok, due to RCU
freeing, but it's _really_ confusing if 'parent' might be two
different things within the same iteration of the loop.

I don't see anything truly horrible that would happen - mainly "we'll
prefetch one parent, and then due to reloading the pointer we might
actually _use_ another parent entirely for the next iteration", but it
really is best to avoid that kind of confusion.

                Linus
Al Viro May 9, 2021, 4:58 a.m. UTC | #15
On Sun, May 09, 2021 at 02:20:32AM +0000, Al Viro wrote:
> Umm...  Interesting, especially considering the way dyname_dname() looks like.
                                                      dynamic_dname(), obviously.

> Looks like there's a piece of prepend() open-coded in it.  And
> since all ->d_dname() instances are either simple_dname() or end up
> with call of dynamic_dname()...
> 
> Might make sense to turn that method into
> 	int (*d_dname)(struct dentry *, struct prepend_buffer *);
> 
> Followup patch, obviously, but it might be worth looking into.
> 
> Another thing that keeps bugging me about prepend_path() is the
> set of return values.  I mean, 0/1/2/3/-ENAMETOOLONG, and all
> except 0 are unlikely?  Might as well make that 0/1/2/3/-1, if
> not an outright 0/1/2/3/4.  And prepend() could return bool, while
> we are at it (true - success, false - overflow)...

OK...  I think I see how to carve it up sanely, will post after I
get some sleep.
Eric W. Biederman May 9, 2021, 10:58 p.m. UTC | #16
Al Viro <viro@zeniv.linux.org.uk> writes:

> On Sat, May 08, 2021 at 10:46:23PM +0000, Al Viro wrote:
>> On Sat, May 08, 2021 at 03:17:44PM -0700, Linus Torvalds wrote:
>> > On Sat, May 8, 2021 at 2:06 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>> > >
>> > > On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:
>> > >
>> > > > +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
>> > >
>> > > If anything, s/path/dentry/, since vfsmnt here will be equal to &mnt->mnt all along.
>> > 
>> > Too subtle for me.
>> > 
>> > And is it? Because mnt is from
>> > 
>> >      mnt = real_mount(path->mnt);
>> > 
>> > earlier, while vfsmount is plain "path->mnt".
>> 
>> static inline struct mount *real_mount(struct vfsmount *mnt)
>> {
>>         return container_of(mnt, struct mount, mnt);
>> }
>
> Basically, struct vfsmount instances are always embedded into struct mount ones.
> All information about the mount tree is in the latter (and is visible only if
> you manage to include fs/mount.h); here we want to walk towards root, so...
>
> Rationale: a lot places use struct vfsmount pointers, but they've no need to
> access all that stuff.  So struct vfsmount got trimmed down, with most of the
> things that used to be there migrating into the containing structure.
>
> [Christian Browner Cc'd]
> BTW, WTF do we have struct mount.user_ns and struct vfsmount.mnt_userns?
> Can they ever be different?  Christian?

I presume you are asking about struct mnt_namespace.user_ns and
struct vfsmount.mnt_userns.

That must the idmapped mounts work.

In short mnt_namespace.user_ns is the user namespace that owns
the mount namespace.

vfsmount.mnt_userns functionally could be reduced to just some struct
uid_gid_map structures hanging off the vfsmount.  It's purpose is
to add a generic translation of uids and gids on from the filesystem
view to the what we want to show userspace.

That code could probably benefit from some refactoring so it is clearer,
and some serious fixes.  I reported it earlier but it looks like there
is some real breakage in chown if you use idmapped mounts.

Eric
Christian Brauner May 10, 2021, 7:20 a.m. UTC | #17
On Sat, May 08, 2021 at 11:15:28PM +0000, Al Viro wrote:
> On Sat, May 08, 2021 at 10:46:23PM +0000, Al Viro wrote:
> > On Sat, May 08, 2021 at 03:17:44PM -0700, Linus Torvalds wrote:
> > > On Sat, May 8, 2021 at 2:06 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > > >
> > > > On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:
> > > >
> > > > > +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
> > > >
> > > > If anything, s/path/dentry/, since vfsmnt here will be equal to &mnt->mnt all along.
> > > 
> > > Too subtle for me.
> > > 
> > > And is it? Because mnt is from
> > > 
> > >      mnt = real_mount(path->mnt);
> > > 
> > > earlier, while vfsmount is plain "path->mnt".
> > 
> > static inline struct mount *real_mount(struct vfsmount *mnt)
> > {
> >         return container_of(mnt, struct mount, mnt);
> > }
> 
> Basically, struct vfsmount instances are always embedded into struct mount ones.
> All information about the mount tree is in the latter (and is visible only if
> you manage to include fs/mount.h); here we want to walk towards root, so...
> 
> Rationale: a lot places use struct vfsmount pointers, but they've no need to
> access all that stuff.  So struct vfsmount got trimmed down, with most of the
> things that used to be there migrating into the containing structure.
> 
> [Christian Browner Cc'd]
> BTW, WTF do we have struct mount.user_ns and struct vfsmount.mnt_userns?
> Can they ever be different?  Christian?

Yes, they can.

> 
> 	Sigh...  Namespace flavours always remind me of old joke -
> Highlander II: There Should've Been Only One...

(I'd prefer if mount propagation would die first.)
Christian Brauner May 10, 2021, 12:51 p.m. UTC | #18
On Sun, May 09, 2021 at 05:58:22PM -0500, Eric W. Biederman wrote:
> Al Viro <viro@zeniv.linux.org.uk> writes:
> 
> > On Sat, May 08, 2021 at 10:46:23PM +0000, Al Viro wrote:
> >> On Sat, May 08, 2021 at 03:17:44PM -0700, Linus Torvalds wrote:
> >> > On Sat, May 8, 2021 at 2:06 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >> > >
> >> > > On Sat, May 08, 2021 at 01:39:45PM -0700, Linus Torvalds wrote:
> >> > >
> >> > > > +static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
> >> > >
> >> > > If anything, s/path/dentry/, since vfsmnt here will be equal to &mnt->mnt all along.
> >> > 
> >> > Too subtle for me.
> >> > 
> >> > And is it? Because mnt is from
> >> > 
> >> >      mnt = real_mount(path->mnt);
> >> > 
> >> > earlier, while vfsmount is plain "path->mnt".
> >> 
> >> static inline struct mount *real_mount(struct vfsmount *mnt)
> >> {
> >>         return container_of(mnt, struct mount, mnt);
> >> }
> >
> > Basically, struct vfsmount instances are always embedded into struct mount ones.
> > All information about the mount tree is in the latter (and is visible only if
> > you manage to include fs/mount.h); here we want to walk towards root, so...
> >
> > Rationale: a lot places use struct vfsmount pointers, but they've no need to
> > access all that stuff.  So struct vfsmount got trimmed down, with most of the
> > things that used to be there migrating into the containing structure.
> >
> > [Christian Browner Cc'd]
> > BTW, WTF do we have struct mount.user_ns and struct vfsmount.mnt_userns?
> > Can they ever be different?  Christian?
> 
> I presume you are asking about struct mnt_namespace.user_ns and
> struct vfsmount.mnt_userns.
> 
> That must the idmapped mounts work.
> 
> In short mnt_namespace.user_ns is the user namespace that owns
> the mount namespace.
> 
> vfsmount.mnt_userns functionally could be reduced to just some struct
> uid_gid_map structures hanging off the vfsmount.  It's purpose is

No. The userns can in the future be used for permission checking when
delegating features per mount.

> to add a generic translation of uids and gids on from the filesystem
> view to the what we want to show userspace.
> 
> That code could probably benefit from some refactoring so it is clearer,
> and some serious fixes.  I reported it earlier but it looks like there
> is some real breakage in chown if you use idmapped mounts.

You mentioned something about chown already some weeks ago here [1] and
never provided any details or reproducer for it. This code is
extensively covered by xfstests and systemd and others are already using
it so far without any issues reported by users. If there is an issue,
it'd be good to fix them and see the tests changed to cover that
particular case.

[1]: https://lore.kernel.org/lkml/20210213130042.828076-1-christian.brauner@ubuntu.com/T/#m3a9df31aa183e8797c70bc193040adfd601399ad
Justin He May 10, 2021, 3:07 p.m. UTC | #19
Hi Linus
I don't know how to display the attachment in this mail.
So I copied here below, together with one comment:
> -----Original Message-----
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Sent: Sunday, May 9, 2021 4:40 AM
> To: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Justin He <Justin.He@arm.com>; Petr Mladek <pmladek@suse.com>; Steven
> Rostedt <rostedt@goodmis.org>; Sergey Senozhatsky
> <senozhatsky@chromium.org>; Andy Shevchenko
> <andriy.shevchenko@linux.intel.com>; Rasmus Villemoes
> <linux@rasmusvillemoes.dk>; Jonathan Corbet <corbet@lwn.net>; Al Viro
> <viro@ftp.linux.org.uk>; Heiko Carstens <hca@linux.ibm.com>; Vasily Gorbik
> <gor@linux.ibm.com>; Christian Borntraeger <borntraeger@de.ibm.com>; Eric
> W . Biederman <ebiederm@xmission.com>; Darrick J. Wong
> <darrick.wong@oracle.com>; Peter Zijlstra (Intel) <peterz@infradead.org>;
> Ira Weiny <ira.weiny@intel.com>; Eric Biggers <ebiggers@google.com>; Ahmed
> S. Darwish <a.darwish@linutronix.de>; open list:DOCUMENTATION <linux-
> doc@vger.kernel.org>; Linux Kernel Mailing List <linux-
> kernel@vger.kernel.org>; linux-s390 <linux-s390@vger.kernel.org>; linux-
> fsdevel <linux-fsdevel@vger.kernel.org>
> Subject: Re: [PATCH RFC 1/3] fs: introduce helper d_path_fast()
>
> On Sat, May 8, 2021 at 12:13 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > Potentiallty delicate question is how to pass bptr/blen to the caller
> > of that helper...
>
> So I was thinking something like this patch..
>
> THIS IS TOTALLY UNTESTED!
>
> I've basically introduced a "struct prepend_buffer" that acts as that
> "end,len" pair and that gets passed around. It builds cleanly for me,
> which actually makes me fairly happy that it might even be close
> right, because the calling conventions would catch any mistake.
>
> It looks superficially sane to me, but again - untested.
>
> The patch ended up being bigger than I expected, but it's all fairly
> straightforward - just munging the calling conventions.
>
> Oh, and I did extract out the core of "prepend_path()" into
> "prepend_entries()" just to show how it would work.
>
>               Linus
> fs/d_path.c | 227 ++++++++++++++++++++++++++++++++----------------------------
> 1 file changed, 123 insertions(+), 104 deletions(-)
>
>diff --git a/fs/d_path.c b/fs/d_path.c
>index 270d62133996..47eb29524271 100644
>--- a/fs/d_path.c
>+++ b/fs/d_path.c
>@@ -8,13 +8,18 @@
> #include <linux/prefetch.h>
> #include "mount.h"
>
>-static int prepend(char **buffer, int *buflen, const char *str, int namelen)
>+struct prepend_buffer {
>+      char *ptr;
>+      int len;
>+};
>+
>+static int prepend(struct prepend_buffer *b, const char *str, int namelen)
> {
>-      *buflen -= namelen;
>-      if (*buflen < 0)
>+      b->len -= namelen;
>+      if (b->len < 0)
>               return -ENAMETOOLONG;
>-      *buffer -= namelen;
>-      memcpy(*buffer, str, namelen);
>+      b->ptr -= namelen;
>+      memcpy(b->ptr, str, namelen);
>       return 0;
> }
>
>@@ -35,16 +40,16 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
>  *
>  * Load acquire is needed to make sure that we see that terminating NUL.
>  */
>-static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
>+static int prepend_name(struct prepend_buffer *b, const struct qstr *name)
> {
>       const char *dname = smp_load_acquire(&name->name); /* ^^^ */
>       u32 dlen = READ_ONCE(name->len);
>       char *p;
>
>-      *buflen -= dlen + 1;
>-      if (*buflen < 0)
>+      b->len -= dlen + 1;
>+      if (b->len < 0)
>               return -ENAMETOOLONG;
>-      p = *buffer -= dlen + 1;
>+      p = b->ptr -= dlen + 1;
>       *p++ = '/';
>       while (dlen--) {
>               char c = *dname++;
>@@ -55,6 +60,50 @@ static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
>       return 0;
> }
>
>+static inline int prepend_entries(struct prepend_buffer *b, const struct path *path, const struct path *root, struct mount *mnt)
>+{
>+      struct dentry *dentry = path->dentry;
>+      struct vfsmount *vfsmnt = path->mnt;
>+
>+      while (dentry != root->dentry || vfsmnt != root->mnt) {
>+              int error;
>+              struct dentry * parent;
>+
>+              if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
>+                      struct mount *parent = READ_ONCE(mnt->mnt_parent);
>+                      struct mnt_namespace *mnt_ns;
>+
>+                      /* Escaped? */
>+                      if (dentry != vfsmnt->mnt_root)
>+                              return 3;
>+
>+                      /* Global root? */
>+                      if (mnt != parent) {
>+                              dentry = READ_ONCE(mnt->mnt_mountpoint);
>+                              mnt = parent;
>+                              vfsmnt = &mnt->mnt;
>+                              continue;
>+                      }
>+                      mnt_ns = READ_ONCE(mnt->mnt_ns);
>+                      /* open-coded is_mounted() to use local mnt_ns */
>+                      if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
>+                              return 1;       // absolute root
>+
>+                      return 2;               // detached or not attached yet
>+                      break;
>+              }
>+              parent = dentry->d_parent;
>+              prefetch(parent);
>+              error = prepend_name(b, &dentry->d_name);
>+              if (error)
>+                      break;
>+
>+              dentry = parent;
>+      }
>+      return 0;
>+}
>+
>+
> /**
>  * prepend_path - Prepend path string to a buffer
>  * @path: the dentry/vfsmount to report
>@@ -74,15 +123,12 @@ static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
>  */
> static int prepend_path(const struct path *path,
>                       const struct path *root,
>-                      char **buffer, int *buflen)
>+                      struct prepend_buffer *orig)
> {
>-      struct dentry *dentry;
>-      struct vfsmount *vfsmnt;
>       struct mount *mnt;
>       int error = 0;
>       unsigned seq, m_seq = 0;
>-      char *bptr;
>-      int blen;
>+      struct prepend_buffer b;
>
>       rcu_read_lock();
> restart_mnt:
>@@ -90,50 +136,12 @@ static int prepend_path(const struct path *path,
>       seq = 0;
>       rcu_read_lock();
> restart:
>-      bptr = *buffer;
>-      blen = *buflen;
>-      error = 0;
>-      dentry = path->dentry;
>-      vfsmnt = path->mnt;
>-      mnt = real_mount(vfsmnt);
>+      b = *orig;
>+      mnt = real_mount(path->mnt);
>       read_seqbegin_or_lock(&rename_lock, &seq);
>-      while (dentry != root->dentry || vfsmnt != root->mnt) {
>-              struct dentry * parent;
>
>-              if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
>-                      struct mount *parent = READ_ONCE(mnt->mnt_parent);
>-                      struct mnt_namespace *mnt_ns;
>+      error = prepend_entries(&b, path, root, mnt);
>
>-                      /* Escaped? */
>-                      if (dentry != vfsmnt->mnt_root) {
>-                              bptr = *buffer;
>-                              blen = *buflen;
>-                              error = 3;
>-                              break;
>-                      }
>-                      /* Global root? */
>-                      if (mnt != parent) {
>-                              dentry = READ_ONCE(mnt->mnt_mountpoint);
>-                              mnt = parent;
>-                              vfsmnt = &mnt->mnt;
>-                              continue;
>-                      }
>-                      mnt_ns = READ_ONCE(mnt->mnt_ns);
>-                      /* open-coded is_mounted() to use local mnt_ns */
>-                      if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
>-                              error = 1;      // absolute root
>-                      else
>-                              error = 2;      // detached or not attached yet
>-                      break;
>-              }
>-              parent = dentry->d_parent;
>-              prefetch(parent);
>-              error = prepend_name(&bptr, &blen, &dentry->d_name);
>-              if (error)
>-                      break;
>-
>-              dentry = parent;
>-      }
>       if (!(seq & 1))
>               rcu_read_unlock();
>       if (need_seqretry(&rename_lock, seq)) {
>@@ -150,14 +158,17 @@ static int prepend_path(const struct path *path,
>       }
>       done_seqretry(&mount_lock, m_seq);
>
>-      if (error >= 0 && bptr == *buffer) {
>-              if (--blen < 0)
>+      // Escaped? No path
>+      if (error == 3)
>+              b = *orig;
>+
>+      if (error >= 0 && b.ptr == orig->ptr) {
>+              if (--b.len < 0)
>                       error = -ENAMETOOLONG;
>               else
>-                      *--bptr = '/';
>+                      *--b.ptr = '/';
>       }
>-      *buffer = bptr;
>-      *buflen = blen;
>+      *orig = b;
>       return error;
> }
>
>@@ -181,34 +192,34 @@ char *__d_path(const struct path *path,
>              const struct path *root,
>              char *buf, int buflen)
> {
>-      char *res = buf + buflen;
>+      struct prepend_buffer b = { buf + buflen, buflen };
>       int error;
>
>-      prepend(&res, &buflen, "\0", 1);
>-      error = prepend_path(path, root, &res, &buflen);
>+      prepend(&b, "\0", 1);
>+      error = prepend_path(path, root, &b);
>
>       if (error < 0)
>               return ERR_PTR(error);
>       if (error > 0)
>               return NULL;
>-      return res;
>+      return b.ptr;
> }
>
> char *d_absolute_path(const struct path *path,
>              char *buf, int buflen)
> {
>       struct path root = {};
>-      char *res = buf + buflen;
>+      struct prepend_buffer b = { buf + buflen, buflen };
>       int error;
>
>-      prepend(&res, &buflen, "\0", 1);
>-      error = prepend_path(path, &root, &res, &buflen);
>+      prepend(&b, "\0", 1);
>+      error = prepend_path(path, &root, &b);
>
>       if (error > 1)
>               error = -EINVAL;
>       if (error < 0)
>               return ERR_PTR(error);
>-      return res;
>+      return b.ptr;
> }
>
> /*
>@@ -216,21 +227,21 @@ char *d_absolute_path(const struct path *path,
>  */
> static int path_with_deleted(const struct path *path,
>                            const struct path *root,
>-                           char **buf, int *buflen)
>+                           struct prepend_buffer *b)
> {
>-      prepend(buf, buflen, "\0", 1);
>+      prepend(b, "\0", 1);
>       if (d_unlinked(path->dentry)) {
>-              int error = prepend(buf, buflen, " (deleted)", 10);
>+              int error = prepend(b, " (deleted)", 10);
>               if (error)
>                       return error;
>       }
>
>-      return prepend_path(path, root, buf, buflen);
>+      return prepend_path(path, root, b);
> }
>
>-static int prepend_unreachable(char **buffer, int *buflen)
>+static int prepend_unreachable(struct prepend_buffer *b)
> {
>-      return prepend(buffer, buflen, "(unreachable)", 13);
>+      return prepend(b, "(unreachable)", 13);
> }
>
> static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
>@@ -261,7 +272,7 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
>  */
> char *d_path(const struct path *path, char *buf, int buflen)
> {
>-      char *res = buf + buflen;
>+      struct prepend_buffer b = { buf + buflen, buflen };
>       struct path root;
>       int error;
>
>@@ -282,12 +293,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
>
>       rcu_read_lock();
>       get_fs_root_rcu(current->fs, &root);
>-      error = path_with_deleted(path, &root, &res, &buflen);
>+      error = path_with_deleted(path, &root, &b);
>       rcu_read_unlock();
>
>       if (error < 0)
>-              res = ERR_PTR(error);
>-      return res;
>+              return ERR_PTR(error);
>+      return b.ptr;
> }
> EXPORT_SYMBOL(d_path);
>
>@@ -314,13 +325,14 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
>
> char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
> {
>-      char *end = buffer + buflen;
>+      struct prepend_buffer b = { buffer + buflen, buflen };
>+
>       /* these dentries are never renamed, so d_lock is not needed */
>-      if (prepend(&end, &buflen, " (deleted)", 11) ||
>-          prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
>-          prepend(&end, &buflen, "/", 1))
>-              end = ERR_PTR(-ENAMETOOLONG);
>-      return end;
>+      if (prepend(&b, " (deleted)", 11) ||
>+          prepend(&b, dentry->d_name.name, dentry->d_name.len) ||
>+          prepend(&b, "/", 1))
>+              return ERR_PTR(-ENAMETOOLONG);
>+      return b.ptr;
> }
>
> /*
>@@ -329,8 +341,9 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
> static char *__dentry_path(const struct dentry *d, char *buf, int buflen)
> {
>       const struct dentry *dentry;
>-      char *end, *retval;
>-      int len, seq = 0;
>+      struct prepend_buffer b;
>+      char *retval;
>+      int seq = 0;
>       int error = 0;
>
>       if (buflen < 2)
>@@ -339,22 +352,22 @@ static char *__dentry_path(const struct dentry *d, char *buf, int buflen)
>       rcu_read_lock();
> restart:
>       dentry = d;
>-      end = buf + buflen;
>-      len = buflen;
>-      prepend(&end, &len, "\0", 1);
>+      b.ptr = buf + buflen;
>+      b.len = buflen;
>+      prepend(&b, "\0", 1);
>       /* Get '/' right */
>-      retval = end-1;
>+      retval = b.ptr-1;
>       *retval = '/';
>       read_seqbegin_or_lock(&rename_lock, &seq);
>       while (!IS_ROOT(dentry)) {
>               const struct dentry *parent = dentry->d_parent;
>
>               prefetch(parent);
>-              error = prepend_name(&end, &len, &dentry->d_name);
>+              error = prepend_name(&b, &dentry->d_name);
>               if (error)
>                       break;
>
>-              retval = end;
>+              retval = b.ptr;
>               dentry = parent;
>       }
>       if (!(seq & 1))
>@@ -379,16 +392,23 @@ EXPORT_SYMBOL(dentry_path_raw);
>
> char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
> {
>-      char *p = NULL;
>+      struct prepend_buffer b = { buf + buflen, buflen };
>       char *retval;
>+      char *p = NULL;
>
>       if (d_unlinked(dentry)) {
>-              p = buf + buflen;
>-              if (prepend(&p, &buflen, "//deleted", 10) != 0)
>+              if (prepend(&b, "//deleted", 10) != 0)
>                       goto Elong;
>-              buflen++;
>+
>+              // save away beginning of "//deleted" string
>+              // and let "__dentry_path()" overwrite one byte
>+              // with the terminating NUL that we'll restore
>+              // below.
>+              p = b.ptr;
>+              b.ptr++;
>+              b.len++;
>       }
>-      retval = __dentry_path(dentry, buf, buflen);
>+      retval = __dentry_path(dentry, b.ptr, b.len);

I didn't quite understand the logic here. Seems it is not equal to
the previous. Should it be s/b.ptr/buf here? Otherwise, in __dentry_path,
it will use the range [b.ptr, b.ptr+b.len] instead of [buf, buf+b.len].
Am I missing anything here?

--
Cheers,
Justin (Jia He)

>       if (!IS_ERR(retval) && p)
>               *p = '/';       /* restore '/' overriden with '\0' */
>       return retval;
>@@ -441,11 +461,10 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
>       error = -ENOENT;
>       if (!d_unlinked(pwd.dentry)) {
>               unsigned long len;
>-              char *cwd = page + PATH_MAX;
>-              int buflen = PATH_MAX;
>+              struct prepend_buffer b = { page + PATH_MAX, PATH_MAX };
>
>-              prepend(&cwd, &buflen, "\0", 1);
>-              error = prepend_path(&pwd, &root, &cwd, &buflen);
>+              prepend(&b, "\0", 1);
>+              error = prepend_path(&pwd, &root, &b);
>               rcu_read_unlock();
>
>               if (error < 0)
>@@ -453,16 +472,16 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
>
>               /* Unreachable from current root */
>               if (error > 0) {
>-                      error = prepend_unreachable(&cwd, &buflen);
>+                      error = prepend_unreachable(&b);
>                       if (error)
>                               goto out;
>               }
>
>               error = -ERANGE;
>-              len = PATH_MAX + page - cwd;
>+              len = PATH_MAX + page - b.ptr;
>               if (len <= size) {
>                       error = len;
>-                      if (copy_to_user(buf, cwd, len))
>+                      if (copy_to_user(buf, b.ptr, len))
>                               error = -EFAULT;
>               }
>       } else {
>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Eric W. Biederman May 10, 2021, 4:16 p.m. UTC | #20
Al Viro <viro@zeniv.linux.org.uk> writes:
>
> Another thing that keeps bugging me about prepend_path() is the
> set of return values.  I mean, 0/1/2/3/-ENAMETOOLONG, and all
> except 0 are unlikely?  Might as well make that 0/1/2/3/-1, if
> not an outright 0/1/2/3/4.  And prepend() could return bool, while
> we are at it (true - success, false - overflow)...

I remember seeing that the different callers of prepend_path treated
those different cases differently.

But now that I look again the return value 3 (escaped) gets lumped
together with 2(detached).


On second look it appears that the two patterns that we actually have
are basically:

char *__d_path(const struct path *path,
	       const struct path *root,
	       char *buf, int buflen)
{
	error = prepend_path(path, root, &res, &buflen);

	if (error < 0)
		return ERR_PTR(error);
	if (error > 0)
		return NULL;
	return res;
}

char *d_absolute_path(const struct path *path,
	       char *buf, int buflen)
{
	error = prepend_path(path, &root, &res, &buflen);

	if (error > 1)
		error = -EINVAL;
	if (error < 0)
		return ERR_PTR(error);
	return res;
}

With d_absolute_path deciding that return value 1 absolute is not an
error.

That does look like there is plenty of room to refactor and make things
clearer.


Eric
Linus Torvalds May 10, 2021, 5:03 p.m. UTC | #21
On Mon, May 10, 2021 at 8:08 AM Justin He <Justin.He@arm.com> wrote:
>
> >
> > char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
> > {
> >-      char *p = NULL;
> >+      struct prepend_buffer b = { buf + buflen, buflen };
> >       char *retval;
> >+      char *p = NULL;
> >
> >       if (d_unlinked(dentry)) {
> >-              p = buf + buflen;
> >-              if (prepend(&p, &buflen, "//deleted", 10) != 0)
> >+              if (prepend(&b, "//deleted", 10) != 0)
> >                       goto Elong;
> >-              buflen++;
> >+
> >+              // save away beginning of "//deleted" string
> >+              // and let "__dentry_path()" overwrite one byte
> >+              // with the terminating NUL that we'll restore
> >+              // below.
> >+              p = b.ptr;
> >+              b.ptr++;
> >+              b.len++;
> >       }
> >-      retval = __dentry_path(dentry, buf, buflen);
> >+      retval = __dentry_path(dentry, b.ptr, b.len);
>
> I didn't quite understand the logic here. Seems it is not equal to
> the previous. Should it be s/b.ptr/buf here? Otherwise, in __dentry_path,
> it will use the range [b.ptr, b.ptr+b.len] instead of [buf, buf+b.len].
> Am I missing anything here?

No, you're right. That __dentry_path() call should get "buf, b.len" as
arguments.

I knew it was squirrelly, but didn't think it through. I actually
wanted to change "__dentry_path()" to take a "struct prepend_buffer",
and not add the NUL at the end (so that the caller would have to do it
first), because that would have made the logic much more
straightforward (and made the semantics the same as the other internal
helpers).

And that would have fixed that bug of mine too. But then I didn't do
it, and just mentioned it  as a later cleanup.

Good catch.

                  Linus
Al Viro May 19, 2021, 12:43 a.m. UTC | #22
First of all, apologies for delays (one of the disks on the main
devel/testing box has become an ex-parrot, with... interesting recovery).

	Here's what I've got for carve-up of cleanups.  This stuff lives
in git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.d_path,
individual patches in followups.  14 commits, all changes are to fs/d_path.c,
total being +133/-191.  Moderately tested, seems to work here.  Review
and testing would be welcome...

	Part 1: trivial preliminary cleanup

1/14) d_path: "\0" is {0,0}, not {0}
	A bunch of places used "\0" as a literal for 1-element array consisting
of NULs.  That should've been "", obviously...

	Part 2: untangling __dentry_path()

2/14) d_path: saner calling conventions for __dentry_path()
	__dentry_path() used to copy the calling conventions for dentry_path().
That was fine for use in dentry_path_raw(), but it created a lot of headache
in dentry_path(), since we might need a suffix (//deleted) in there, without
NUL between it and the pathname itself.  So we had to
	1) (possibly) put /deleted into buffer and remember where it went
	2) let __dentry_path() prepend NUL-terminated pathname
	3) override NUL with / if we had done (1)
Life becomes much easier if __dentry_path() does *NOT* put NUL in there.
Then dentry_path_raw() becomes 'put "" into buffer, then __dentry_path()'
and dentry_path() - 'put "" or "//deleted" into buffer, then __dentry_path()'.

Additionally, we switch the way buffer information is passed to one similar
to what we do in prepend()/prepend_name()/etc., i.e. pass the pointer to the
end of buffer instead of that to beginning.  That's what we'd been using
in __dentry_path() all along, and now that the callers are doing some prepend()
before calling __dentry_path(), they that value already on hand.

3/14)  d_path: regularize handling of root dentry in __dentry_path()
	All path-forming primitives boil down to sequence of prepend_name()
on dentries encountered along the way toward root.  Each time we prepend
/ + dentry name to the buffer.  Normally that does exactly what we want,
but there's a corner case when we don't call prepend_name() at all (in case
of __dentry_path() that happens if we are given root dentry).  We obviously
want to end up with "/", rather than "", so this corner case needs to be
handled.
	__dentry_path() used to manually put '/' in the end of buffer before
doing anything else, to be overwritten by the first call of prepend_name()
if one happens and to be left in place if we don't call prepend_name() at
all.  That required manually checking that we had space in the buffer
(prepend_name() and prepend() take care of such checks themselves) and lead
to clumsy keeping track of return value.
	A better approach is to check if the main loop has added anything
into the buffer and prepend "/" if it hasn't.  A side benefit of using prepend()
is that it does the right thing if we'd already run out of buffer, making
the overflow-handling logics simpler.

NB: the above might be worth putting into commit message.

	Part 3: overflow handling cleanups

We have an overcomplicated handling of overflows.  Primitives (prepend() and
prepend_name()) check if we'd run out of space and return -ENAMETOOLONG.
Then it's propagated all the way out by call chain.  However, the same
primitives are safe to call in case we'd *already* run out of space and
that condition is easily checked at any level of callchain.  The next
5 commits use that to simplify the control flow.

4/14)  d_path: get rid of path_with_deleted()
	expand into the sole caller, rearrange the suffix handing
along the lines of dentry_path().

5/14)  getcwd(2): saner logics around prepend_path() call
	Turn
		prepend_path()
		if it says it has run out of space, fail with ENAMETOOLONG
		if it wants "(unreachable) " prepended
			do so
			if that says it has run out of space, fail with ENAMETOOLONG
	into
		prepend_path()
		if it wants "(unreachable) " prepended
			do so
		if we see we'd run out of space at some point
			fail with ENAMETOOLONG

6/14)  d_path: don't bother with return value of prepend()
	Almost nothing is looking at return value of prepend() and it's
easy to get rid of the last stragglers...

7/14)  d_path: lift -ENAMETOOLONG handling into callers of prepend_path()
	It's easier to have prepend_path() return 0 on overflow and
check for overflow in the callers.  The logics is the same for all callers
(ran out of space => ERR_PTR(-ENAMETOOLONG)), so we get a bit of boilerplate,
but even with that the callers become simpler.  Added boilerplate will be
dealt with a couple of commits down the road.

8/14)  d_path: make prepend_name() boolean
	Unlike the case of prepend(), callers of prepend_name() really want
to see whether it has run out of space - the loops it's called in are
lockless and we could, in principle, end up spinning there for indetermined
amount of iterations.  Dropping out of loop if we run out of space in the
buffers serves as a backstop for (very unlikely) cases.
	However, all we care about is success/failure - we generate
ENAMETOOLONG in the callers, if not callers of callers, so we can bloody well
make it return bool.

	Part 4: introduction of prepend_buffer

9/14)  d_path: introduce struct prepend_buffer
	We've a lot of places where we have pairs of form (pointer to end
of buffer, amount of space left in front of that).  These sit in pairs of
variables located next to each other and usually passed by reference.
Turn those into instances of new type (struct prepend_buffer) and pass
reference to the pair instead of pairs of references to its fields.

Initialization (of form {buf + len, len}) turned into a macro (DECLARE_BUF),
to avoid brainos.  Extraction of string (buffer contents if we hadn't run
out of space, ERR_PTR(-ENAMETOOLONG) otherwise) is done by extract_string();
that eats the leftover boilerplate from earlier in the series.

	Part 5: extracting the lockless part of prepend_path()
The thing that started the entire mess had been an attempt to use d_path
machinery for vsprintf(); that can't grab rename_lock and mount_lock,
so we needed a variant of prepend_path() that would try to go without
those locks.  The obvious approach is to lift the internal loop of
prepend_path() into a new primitive.  However, that needs some massage
first to separate local variables - otherwise we end up with the
argument list from hell.

10/14) d_path: prepend_path(): get rid of vfsmnt
	redundant - we maintain mnt and vfsmnt through the loop,
with the latter being a pointer to mnt->mnt all along.
11/14) d_path: prepend_path(): lift resetting b in case when we'd return 3 out of loop
	the only place in the inner loop where we need p; we use it
to reset b in case we would return 3.  We can easily check that error is 3
after the loop and do resetting there.  Note that we only need that after
the *outer* loop - the body of that starts with assignment to b anyway.
12/14) d_path: prepend_path(): lift the inner loop into a new helper
	ta-da

	Part 6: followups

13/14) d_path: prepend_path() is unlikely to return non-zero
t14/14) getcwd(2): clean up error handling
Linus Torvalds May 19, 2021, 2:39 a.m. UTC | #23
On Tue, May 18, 2021 at 2:44 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
>         Here's what I've got for carve-up of cleanups.

Thanks, these all look logical to me.

I only read through the individual patches, I didn't test or check the
end result, but it all looked like good sane cleanups.

              Linus
Justin He June 22, 2021, 2 p.m. UTC | #24
> -----Original Message-----
> From: Al Viro <viro@ftp.linux.org.uk> On Behalf Of Al Viro
> Sent: Wednesday, May 19, 2021 8:43 AM
> To: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Justin He <Justin.He@arm.com>; Petr Mladek <pmladek@suse.com>; Steven
> Rostedt <rostedt@goodmis.org>; Sergey Senozhatsky
> <senozhatsky@chromium.org>; Andy Shevchenko
> <andriy.shevchenko@linux.intel.com>; Rasmus Villemoes
> <linux@rasmusvillemoes.dk>; Jonathan Corbet <corbet@lwn.net>; Al Viro
> <viro@ftp.linux.org.uk>; Heiko Carstens <hca@linux.ibm.com>; Vasily Gorbik
> <gor@linux.ibm.com>; Christian Borntraeger <borntraeger@de.ibm.com>; Eric
> W . Biederman <ebiederm@xmission.com>; Darrick J. Wong
> <darrick.wong@oracle.com>; Peter Zijlstra (Intel) <peterz@infradead.org>;
> Ira Weiny <ira.weiny@intel.com>; Eric Biggers <ebiggers@google.com>; Ahmed
> S. Darwish <a.darwish@linutronix.de>; open list:DOCUMENTATION <linux-
> doc@vger.kernel.org>; Linux Kernel Mailing List <linux-
> kernel@vger.kernel.org>; linux-s390 <linux-s390@vger.kernel.org>; linux-
> fsdevel <linux-fsdevel@vger.kernel.org>
> Subject: [PATCHSET] d_path cleanups

For the whole patch series, I once tested several cases, most of them are
Related to new '%pD' behavior:
1. print '%pD' with full path of ext4 file
2. mount a ext4 filesystem upon a ext4 filesystem, and print the file
with '%pD'
3. all test_print selftests, including the new '%14pD' '%-14pD'
4. kasnprintf

In summary, please feel free to add/not_add:
Tested-by: Jia He <justin.he@arm.com>

--
Cheers,
Justin (Jia He)
diff mbox series

Patch

diff --git a/fs/d_path.c b/fs/d_path.c
index a69e2cd36e6e..985a5754a9f5 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -61,7 +61,7 @@  static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
  * @root: root vfsmnt/dentry
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
- *
+ * @need_lock: not ignoring renames and changes to the mount tree
  * The function will first try to write out the pathname without taking any
  * lock other than the RCU read lock to make sure that dentries won't go away.
  * It only checks the sequence number of the global rename_lock as any change
@@ -74,7 +74,7 @@  static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
  */
 static int prepend_path(const struct path *path,
 			const struct path *root,
-			char **buffer, int *buflen)
+			char **buffer, int *buflen, bool need_lock)
 {
 	struct dentry *dentry;
 	struct vfsmount *vfsmnt;
@@ -86,7 +86,8 @@  static int prepend_path(const struct path *path,
 
 	rcu_read_lock();
 restart_mnt:
-	read_seqbegin_or_lock(&mount_lock, &m_seq);
+	if (need_lock)
+		read_seqbegin_or_lock(&mount_lock, &m_seq);
 	seq = 0;
 	rcu_read_lock();
 restart:
@@ -96,7 +97,8 @@  static int prepend_path(const struct path *path,
 	dentry = path->dentry;
 	vfsmnt = path->mnt;
 	mnt = real_mount(vfsmnt);
-	read_seqbegin_or_lock(&rename_lock, &seq);
+	if (need_lock)
+		read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
@@ -136,19 +138,21 @@  static int prepend_path(const struct path *path,
 	}
 	if (!(seq & 1))
 		rcu_read_unlock();
-	if (need_seqretry(&rename_lock, seq)) {
+	if (need_lock && need_seqretry(&rename_lock, seq)) {
 		seq = 1;
 		goto restart;
 	}
-	done_seqretry(&rename_lock, seq);
+	if (need_lock)
+		done_seqretry(&rename_lock, seq);
 
 	if (!(m_seq & 1))
 		rcu_read_unlock();
-	if (need_seqretry(&mount_lock, m_seq)) {
+	if (need_lock && need_seqretry(&mount_lock, m_seq)) {
 		m_seq = 1;
 		goto restart_mnt;
 	}
-	done_seqretry(&mount_lock, m_seq);
+	if (need_lock)
+		done_seqretry(&mount_lock, m_seq);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
@@ -185,7 +189,7 @@  char *__d_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
-	error = prepend_path(path, root, &res, &buflen);
+	error = prepend_path(path, root, &res, &buflen, true);
 
 	if (error < 0)
 		return ERR_PTR(error);
@@ -202,7 +206,7 @@  char *d_absolute_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
-	error = prepend_path(path, &root, &res, &buflen);
+	error = prepend_path(path, &root, &res, &buflen, true);
 
 	if (error > 1)
 		error = -EINVAL;
@@ -225,7 +229,7 @@  static int path_with_deleted(const struct path *path,
 			return error;
 	}
 
-	return prepend_path(path, root, buf, buflen);
+	return prepend_path(path, root, buf, buflen, true);
 }
 
 static int prepend_unreachable(char **buffer, int *buflen)
@@ -291,6 +295,28 @@  char *d_path(const struct path *path, char *buf, int buflen)
 }
 EXPORT_SYMBOL(d_path);
 
+/**
+ * d_path_fast - fast return the path of a dentry
+ * This helper is similar to d_path other than taking seqlock/spinlock.
+ */
+char *d_path_fast(const struct path *path, char *buf, int buflen)
+{
+	char *res = buf + buflen;
+	struct path root;
+	int error;
+
+	rcu_read_lock();
+	get_fs_root_rcu(current->fs, &root);
+	prepend(&res, &buflen, "\0", 1);
+	error = prepend_path(path, &root, &res, &buflen, false);
+	rcu_read_unlock();
+
+	if (error < 0)
+		res = ERR_PTR(error);
+	return res;
+}
+EXPORT_SYMBOL(d_path_fast);
+
 /*
  * Helper function for dentry_operations.d_dname() members
  */
@@ -445,7 +471,7 @@  SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		int buflen = PATH_MAX;
 
 		prepend(&cwd, &buflen, "\0", 1);
-		error = prepend_path(&pwd, &root, &cwd, &buflen);
+		error = prepend_path(&pwd, &root, &cwd, &buflen, true);
 		rcu_read_unlock();
 
 		if (error < 0)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c1e48014106f..9a00fb90824f 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -300,6 +300,7 @@  char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 extern char *__d_path(const struct path *, const struct path *, char *, int);
 extern char *d_absolute_path(const struct path *, char *, int);
 extern char *d_path(const struct path *, char *, int);
+extern char *d_path_fast(const struct path *, char *, int);
 extern char *dentry_path_raw(struct dentry *, char *, int);
 extern char *dentry_path(struct dentry *, char *, int);