diff mbox

[v2] vfs: avoid recopying file names in getname_flags

Message ID 1427309152-25129-1-git-send-email-boqun.feng@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Boqun Feng March 25, 2015, 6:45 p.m. UTC
In the current implementation of getname_flags, a file name in the
user-space will be recopied if it takes more space that
EMBEDDED_NAME_MAX, however, at this moment, EMBEDDED_NAME_MAX bytes of
the file name are already copied into kernel address space, the only
reason why the recopy is needed is that "kname", which points to the
string of the file name, needs to be relocated.

And the recopy can be avoided if we change the memory layout of the
`names_cachep` allocation. By putting `struct filename` at the tail of
the allocation instead of the head, relocation of kname is avoided.
Once putting the struct at the tail, each byte in the user space will be
copied only one time, so the recopy is avoided.

Of course, other functions aware of the layout of the `names_cachep`
allocation, i.e., getname_kernel and putname also need to be modified to
adapt to the new layout.

Since we change the layout of `names_cachep` allocation, in order to
figure out whether kname and the struct are separate, we now need to
check whether the file name string starts at the address
EMBEDDED_NAME_MAX bytes before the address of the struct.  As a result,
`iname`, which points the end of `struct filename`, is not needed
anymore.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
v1 --> v2:
Rebase the patch onto the for-next branch of Al's vfs repo.


 fs/namei.c         | 45 ++++++++++++++++++++++++++++-----------------
 include/linux/fs.h |  1 -
 2 files changed, 28 insertions(+), 18 deletions(-)

Comments

Boqun Feng March 29, 2015, 4:27 a.m. UTC | #1
Ping.

This patch has been tested by 0day test bot.

Thanks,
Boqun Feng


On Thu, Mar 26, 2015 at 02:45:52AM +0800, Boqun Feng wrote:
> In the current implementation of getname_flags, a file name in the
> user-space will be recopied if it takes more space that
> EMBEDDED_NAME_MAX, however, at this moment, EMBEDDED_NAME_MAX bytes of
> the file name are already copied into kernel address space, the only
> reason why the recopy is needed is that "kname", which points to the
> string of the file name, needs to be relocated.
> 
> And the recopy can be avoided if we change the memory layout of the
> `names_cachep` allocation. By putting `struct filename` at the tail of
> the allocation instead of the head, relocation of kname is avoided.
> Once putting the struct at the tail, each byte in the user space will be
> copied only one time, so the recopy is avoided.
> 
> Of course, other functions aware of the layout of the `names_cachep`
> allocation, i.e., getname_kernel and putname also need to be modified to
> adapt to the new layout.
> 
> Since we change the layout of `names_cachep` allocation, in order to
> figure out whether kname and the struct are separate, we now need to
> check whether the file name string starts at the address
> EMBEDDED_NAME_MAX bytes before the address of the struct.  As a result,
> `iname`, which points the end of `struct filename`, is not needed
> anymore.
> 
> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> ---
> v1 --> v2:
> Rebase the patch onto the for-next branch of Al's vfs repo.
> 
> 
>  fs/namei.c         | 45 ++++++++++++++++++++++++++++-----------------
>  include/linux/fs.h |  1 -
>  2 files changed, 28 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 7a11ec1..6d04029 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -119,7 +119,7 @@
>   * PATH_MAX includes the nul terminator --RR.
>   */
>  
> -#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
> +#define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))
>  
>  struct filename *
>  getname_flags(const char __user *filename, int flags, int *empty)
> @@ -132,44 +132,53 @@ getname_flags(const char __user *filename, int flags, int *empty)
>  	if (result)
>  		return result;
>  
> -	result = __getname();
> -	if (unlikely(!result))
> +	kname = __getname();
> +	if (unlikely(!kname))
>  		return ERR_PTR(-ENOMEM);
>  
>  	/*
>  	 * First, try to embed the struct filename inside the names_cache
>  	 * allocation
>  	 */
> -	kname = (char *)result->iname;
> +	result = (struct filename *)(kname + EMBEDDED_NAME_MAX);
>  	result->name = kname;
>  
>  	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
>  	if (unlikely(len < 0)) {
> -		__putname(result);
> +		__putname(kname);
>  		return ERR_PTR(len);
>  	}
>  
>  	/*
>  	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
>  	 * separate struct filename so we can dedicate the entire
> -	 * names_cache allocation for the pathname, and re-do the copy from
> +	 * names_cache allocation for the pathname, and continue the copy from
>  	 * userland.
>  	 */
>  	if (unlikely(len == EMBEDDED_NAME_MAX)) {
> -		kname = (char *)result;
> -
>  		result = kzalloc(sizeof(*result), GFP_KERNEL);
>  		if (unlikely(!result)) {
>  			__putname(kname);
>  			return ERR_PTR(-ENOMEM);
>  		}
>  		result->name = kname;
> -		len = strncpy_from_user(kname, filename, PATH_MAX);
> +		/* we can't simply add the number of rest chars we copy to len,
> +		 * since strncpy_from_user may return negative to indicate
> +		 * something is wrong, so we do the addition later, after
> +		 * strncpy_from_user succeeds, and we know we already copy
> +		 * EMBEDDED_NAME_MAX chars.
> +		 */
> +		len = strncpy_from_user(kname + EMBEDDED_NAME_MAX,
> +				filename + EMBEDDED_NAME_MAX,
> +				PATH_MAX - EMBEDDED_NAME_MAX);
> +
>  		if (unlikely(len < 0)) {
>  			__putname(kname);
>  			kfree(result);
>  			return ERR_PTR(len);
>  		}
> +
> +		len += EMBEDDED_NAME_MAX;
>  		if (unlikely(len == PATH_MAX)) {
>  			__putname(kname);
>  			kfree(result);
> @@ -204,26 +213,28 @@ struct filename *
>  getname_kernel(const char * filename)
>  {
>  	struct filename *result;
> +	char *kname;
>  	int len = strlen(filename) + 1;
>  
> -	result = __getname();
> -	if (unlikely(!result))
> +	kname = __getname();
> +	if (unlikely(!kname))
>  		return ERR_PTR(-ENOMEM);
>  
>  	if (len <= EMBEDDED_NAME_MAX) {
> -		result->name = (char *)result->iname;
> +		result = (struct filename *)(kname + EMBEDDED_NAME_MAX);
> +		result->name = kname;
>  	} else if (len <= PATH_MAX) {
>  		struct filename *tmp;
>  
>  		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
>  		if (unlikely(!tmp)) {
> -			__putname(result);
> +			__putname(kname);
>  			return ERR_PTR(-ENOMEM);
>  		}
> -		tmp->name = (char *)result;
> +		tmp->name = kname;
>  		result = tmp;
>  	} else {
> -		__putname(result);
> +		__putname(kname);
>  		return ERR_PTR(-ENAMETOOLONG);
>  	}
>  	memcpy((char *)result->name, filename, len);
> @@ -242,11 +253,11 @@ void putname(struct filename *name)
>  	if (--name->refcnt > 0)
>  		return;
>  
> -	if (name->name != name->iname) {
> +	if (name->name != ((char *)name - EMBEDDED_NAME_MAX)) {
>  		__putname(name->name);
>  		kfree(name);
>  	} else
> -		__putname(name);
> +		__putname(name->name);
>  }
>  
>  static int check_acl(struct inode *inode, int mask)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index dfbd88a..dd67284 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2166,7 +2166,6 @@ struct filename {
>  	const __user char	*uptr;	/* original userland pointer */
>  	struct audit_names	*aname;
>  	int			refcnt;
> -	const char		iname[];
>  };
>  
>  extern long vfs_truncate(struct path *, loff_t);
> -- 
> 2.3.3
>
Richard Weinberger March 29, 2015, 10:13 a.m. UTC | #2
On Wed, Mar 25, 2015 at 7:45 PM, Boqun Feng <boqun.feng@gmail.com> wrote:
> In the current implementation of getname_flags, a file name in the
> user-space will be recopied if it takes more space that
> EMBEDDED_NAME_MAX, however, at this moment, EMBEDDED_NAME_MAX bytes of
> the file name are already copied into kernel address space, the only
> reason why the recopy is needed is that "kname", which points to the
> string of the file name, needs to be relocated.
>
> And the recopy can be avoided if we change the memory layout of the
> `names_cachep` allocation. By putting `struct filename` at the tail of
> the allocation instead of the head, relocation of kname is avoided.
> Once putting the struct at the tail, each byte in the user space will be
> copied only one time, so the recopy is avoided.
>
> Of course, other functions aware of the layout of the `names_cachep`
> allocation, i.e., getname_kernel and putname also need to be modified to
> adapt to the new layout.
>
> Since we change the layout of `names_cachep` allocation, in order to
> figure out whether kname and the struct are separate, we now need to
> check whether the file name string starts at the address
> EMBEDDED_NAME_MAX bytes before the address of the struct.  As a result,
> `iname`, which points the end of `struct filename`, is not needed
> anymore.
>
> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> ---
> v1 --> v2:
> Rebase the patch onto the for-next branch of Al's vfs repo.
>
>
>  fs/namei.c         | 45 ++++++++++++++++++++++++++++-----------------
>  include/linux/fs.h |  1 -
>  2 files changed, 28 insertions(+), 18 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 7a11ec1..6d04029 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -119,7 +119,7 @@
>   * PATH_MAX includes the nul terminator --RR.
>   */
>
> -#define EMBEDDED_NAME_MAX      (PATH_MAX - offsetof(struct filename, iname))
> +#define EMBEDDED_NAME_MAX      (PATH_MAX - sizeof(struct filename))

*confused*
 EMBEDDED_NAME_MAX is currently PATH_MAX - sizeof(struct filename)

Did you mix original and new source while creating the patch?
Richard Weinberger March 29, 2015, 10:14 a.m. UTC | #3
On Sun, Mar 29, 2015 at 6:27 AM, Boqun Feng <boqun.feng@gmail.com> wrote:
> Ping.
>
> This patch has been tested by 0day test bot.

I hope you did more than build test this patch...
Boqun Feng March 29, 2015, 3:10 p.m. UTC | #4
Hi Richard,

On Sun, Mar 29, 2015 at 12:13:29PM +0200, Richard Weinberger wrote:
> On Wed, Mar 25, 2015 at 7:45 PM, Boqun Feng <boqun.feng@gmail.com> wrote:
> > In the current implementation of getname_flags, a file name in the
> > user-space will be recopied if it takes more space that
> > EMBEDDED_NAME_MAX, however, at this moment, EMBEDDED_NAME_MAX bytes of
> > the file name are already copied into kernel address space, the only
> > reason why the recopy is needed is that "kname", which points to the
> > string of the file name, needs to be relocated.
> >
> > And the recopy can be avoided if we change the memory layout of the
> > `names_cachep` allocation. By putting `struct filename` at the tail of
> > the allocation instead of the head, relocation of kname is avoided.
> > Once putting the struct at the tail, each byte in the user space will be
> > copied only one time, so the recopy is avoided.
> >
> > Of course, other functions aware of the layout of the `names_cachep`
> > allocation, i.e., getname_kernel and putname also need to be modified to
> > adapt to the new layout.
> >
> > Since we change the layout of `names_cachep` allocation, in order to
> > figure out whether kname and the struct are separate, we now need to
> > check whether the file name string starts at the address
> > EMBEDDED_NAME_MAX bytes before the address of the struct.  As a result,
> > `iname`, which points the end of `struct filename`, is not needed
> > anymore.
> >
> > Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> > ---
> > v1 --> v2:
> > Rebase the patch onto the for-next branch of Al's vfs repo.
> >
> >
> >  fs/namei.c         | 45 ++++++++++++++++++++++++++++-----------------
> >  include/linux/fs.h |  1 -
> >  2 files changed, 28 insertions(+), 18 deletions(-)
> >
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 7a11ec1..6d04029 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -119,7 +119,7 @@
> >   * PATH_MAX includes the nul terminator --RR.
> >   */
> >
> > -#define EMBEDDED_NAME_MAX      (PATH_MAX - offsetof(struct filename, iname))
> > +#define EMBEDDED_NAME_MAX      (PATH_MAX - sizeof(struct filename))
> 
> *confused*
>  EMBEDDED_NAME_MAX is currently PATH_MAX - sizeof(struct filename)
> 
> Did you mix original and new source while creating the patch?

This patch is based on branch 'for-next' in Al's tree, in that repo
commit b2ddab0e5324aec11620666eccc4f0ff64802131 ("kill struct filename.separate")
changes EMBEDDED_NAME_MAX to (PATH_MAX - offsetof(struct filename, iname))

I put the "based-on" information in the version changing log, seems it's
better to put it in the commit log to make it clearer for reviewers?

Thanks,
Boqun Feng

> 
> -- 
> Thanks,
> //richard
Boqun Feng March 29, 2015, 3:23 p.m. UTC | #5
On Sun, Mar 29, 2015 at 12:14:24PM +0200, Richard Weinberger wrote:
> On Sun, Mar 29, 2015 at 6:27 AM, Boqun Feng <boqun.feng@gmail.com> wrote:
> > Ping.
> >
> > This patch has been tested by 0day test bot.
> 
> I hope you did more than build test this patch...

I did, I boot the new kernel and ran some scripts for creating and
deleting files with various name lengths. Any suggestions for tests,
considering the corner cases are very long file names? Thank you.

For 0day testing bot, I was told that it will run booting and other
test cases, but if there is no error, it won't report. I haven't
received an error report yet since three days before.

Regards,
Boqun

> 
> -- 
> Thanks,
> //richard
Boqun Feng April 7, 2015, 8:38 a.m. UTC | #6
Ping again...

Thanks,
Boqun Feng

On Sun, Mar 29, 2015 at 12:27:44PM +0800, Boqun Feng wrote:
> Ping.
> 
> This patch has been tested by 0day test bot.
> 
> Thanks,
> Boqun Feng
> 
> 
> On Thu, Mar 26, 2015 at 02:45:52AM +0800, Boqun Feng wrote:
> > In the current implementation of getname_flags, a file name in the
> > user-space will be recopied if it takes more space that
> > EMBEDDED_NAME_MAX, however, at this moment, EMBEDDED_NAME_MAX bytes of
> > the file name are already copied into kernel address space, the only
> > reason why the recopy is needed is that "kname", which points to the
> > string of the file name, needs to be relocated.
> > 
> > And the recopy can be avoided if we change the memory layout of the
> > `names_cachep` allocation. By putting `struct filename` at the tail of
> > the allocation instead of the head, relocation of kname is avoided.
> > Once putting the struct at the tail, each byte in the user space will be
> > copied only one time, so the recopy is avoided.
> > 
> > Of course, other functions aware of the layout of the `names_cachep`
> > allocation, i.e., getname_kernel and putname also need to be modified to
> > adapt to the new layout.
> > 
> > Since we change the layout of `names_cachep` allocation, in order to
> > figure out whether kname and the struct are separate, we now need to
> > check whether the file name string starts at the address
> > EMBEDDED_NAME_MAX bytes before the address of the struct.  As a result,
> > `iname`, which points the end of `struct filename`, is not needed
> > anymore.
> > 
> > Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> > ---
> > v1 --> v2:
> > Rebase the patch onto the for-next branch of Al's vfs repo.
> > 
> > 
> >  fs/namei.c         | 45 ++++++++++++++++++++++++++++-----------------
> >  include/linux/fs.h |  1 -
> >  2 files changed, 28 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 7a11ec1..6d04029 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -119,7 +119,7 @@
> >   * PATH_MAX includes the nul terminator --RR.
> >   */
> >  
> > -#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
> > +#define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))
> >  
> >  struct filename *
> >  getname_flags(const char __user *filename, int flags, int *empty)
> > @@ -132,44 +132,53 @@ getname_flags(const char __user *filename, int flags, int *empty)
> >  	if (result)
> >  		return result;
> >  
> > -	result = __getname();
> > -	if (unlikely(!result))
> > +	kname = __getname();
> > +	if (unlikely(!kname))
> >  		return ERR_PTR(-ENOMEM);
> >  
> >  	/*
> >  	 * First, try to embed the struct filename inside the names_cache
> >  	 * allocation
> >  	 */
> > -	kname = (char *)result->iname;
> > +	result = (struct filename *)(kname + EMBEDDED_NAME_MAX);
> >  	result->name = kname;
> >  
> >  	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
> >  	if (unlikely(len < 0)) {
> > -		__putname(result);
> > +		__putname(kname);
> >  		return ERR_PTR(len);
> >  	}
> >  
> >  	/*
> >  	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
> >  	 * separate struct filename so we can dedicate the entire
> > -	 * names_cache allocation for the pathname, and re-do the copy from
> > +	 * names_cache allocation for the pathname, and continue the copy from
> >  	 * userland.
> >  	 */
> >  	if (unlikely(len == EMBEDDED_NAME_MAX)) {
> > -		kname = (char *)result;
> > -
> >  		result = kzalloc(sizeof(*result), GFP_KERNEL);
> >  		if (unlikely(!result)) {
> >  			__putname(kname);
> >  			return ERR_PTR(-ENOMEM);
> >  		}
> >  		result->name = kname;
> > -		len = strncpy_from_user(kname, filename, PATH_MAX);
> > +		/* we can't simply add the number of rest chars we copy to len,
> > +		 * since strncpy_from_user may return negative to indicate
> > +		 * something is wrong, so we do the addition later, after
> > +		 * strncpy_from_user succeeds, and we know we already copy
> > +		 * EMBEDDED_NAME_MAX chars.
> > +		 */
> > +		len = strncpy_from_user(kname + EMBEDDED_NAME_MAX,
> > +				filename + EMBEDDED_NAME_MAX,
> > +				PATH_MAX - EMBEDDED_NAME_MAX);
> > +
> >  		if (unlikely(len < 0)) {
> >  			__putname(kname);
> >  			kfree(result);
> >  			return ERR_PTR(len);
> >  		}
> > +
> > +		len += EMBEDDED_NAME_MAX;
> >  		if (unlikely(len == PATH_MAX)) {
> >  			__putname(kname);
> >  			kfree(result);
> > @@ -204,26 +213,28 @@ struct filename *
> >  getname_kernel(const char * filename)
> >  {
> >  	struct filename *result;
> > +	char *kname;
> >  	int len = strlen(filename) + 1;
> >  
> > -	result = __getname();
> > -	if (unlikely(!result))
> > +	kname = __getname();
> > +	if (unlikely(!kname))
> >  		return ERR_PTR(-ENOMEM);
> >  
> >  	if (len <= EMBEDDED_NAME_MAX) {
> > -		result->name = (char *)result->iname;
> > +		result = (struct filename *)(kname + EMBEDDED_NAME_MAX);
> > +		result->name = kname;
> >  	} else if (len <= PATH_MAX) {
> >  		struct filename *tmp;
> >  
> >  		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
> >  		if (unlikely(!tmp)) {
> > -			__putname(result);
> > +			__putname(kname);
> >  			return ERR_PTR(-ENOMEM);
> >  		}
> > -		tmp->name = (char *)result;
> > +		tmp->name = kname;
> >  		result = tmp;
> >  	} else {
> > -		__putname(result);
> > +		__putname(kname);
> >  		return ERR_PTR(-ENAMETOOLONG);
> >  	}
> >  	memcpy((char *)result->name, filename, len);
> > @@ -242,11 +253,11 @@ void putname(struct filename *name)
> >  	if (--name->refcnt > 0)
> >  		return;
> >  
> > -	if (name->name != name->iname) {
> > +	if (name->name != ((char *)name - EMBEDDED_NAME_MAX)) {
> >  		__putname(name->name);
> >  		kfree(name);
> >  	} else
> > -		__putname(name);
> > +		__putname(name->name);
> >  }
> >  
> >  static int check_acl(struct inode *inode, int mask)
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index dfbd88a..dd67284 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -2166,7 +2166,6 @@ struct filename {
> >  	const __user char	*uptr;	/* original userland pointer */
> >  	struct audit_names	*aname;
> >  	int			refcnt;
> > -	const char		iname[];
> >  };
> >  
> >  extern long vfs_truncate(struct path *, loff_t);
> > -- 
> > 2.3.3
> >
Al Viro April 11, 2015, 11:56 p.m. UTC | #7
On Tue, Apr 07, 2015 at 04:38:26PM +0800, Boqun Feng wrote:
> Ping again...

What exactly does it buy us?  You need a pathname just a bit under 4Kb, which,
with all due respect, is an extremely rare case.  Resulting code is more
complicated, we _still_ copy twice (sure, the second time is for 16 bytes or
so, but...), instead of "compare with the address of embedded array" we get
the loveliness like
> > > +	if (name->name != ((char *)name - EMBEDDED_NAME_MAX)) {
this...   _And_, on top of everything else, we get name and name->name
guaranteed to hit different cachelines, in all cases, including the common
ones.

What for?  It's not as if userland memory had been communicated with by
IP over carrier pigeons, after all, and the cost of 4Kb worth of
(essentially) memcpy() is going to be
	a) incurred in extremely rare case
and
	b) be dwarfed by the work we need to _do_ with what we'd copied.
After all, that pathname is going to be parsed and traversed - all 4Kb
worth of it.

So what's the point?
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro April 12, 2015, 1:13 a.m. UTC | #8
On Sun, Apr 12, 2015 at 12:56:55AM +0100, Al Viro wrote:

> What for?  It's not as if userland memory had been communicated with by
> IP over carrier pigeons, after all, and the cost of 4Kb worth of
> (essentially) memcpy() is going to be
> 	a) incurred in extremely rare case
> and
> 	b) be dwarfed by the work we need to _do_ with what we'd copied.
> After all, that pathname is going to be parsed and traversed - all 4Kb
> worth of it.
> 
> So what's the point?

BTW, looking at the __getname() callers...  Lustre one sure as hell looks
bogus:
        char *tmp = __getname();

        if (!tmp)
                return ERR_PTR(-ENOMEM);

        len = strncpy_from_user(tmp, filename, PATH_MAX);
        if (len == 0)
                ret = -ENOENT;
        else if (len > PATH_MAX)
                ret = -ENAMETOOLONG;

        if (ret) {
                __putname(tmp);
                tmp =  ERR_PTR(ret);
        }
        return tmp;

Note that
	* strncpy_from_user(p, u, n) can return a negative (-EFAULT)
	* strncpy_from_user(p, u, n) cannot return a positive greater than
n.  In case of missing NUL among the n bytes starting at u (and no faults
accessing those) we get exactly n bytes copied and n returned.  In case
when NUL is there, we copy everything up to and including that NUL and
return number of non-NUL bytes copied.

IOW, these failure cases had never been tested.  Name being too long ends up
with non-NUL-terminated array of characters returned, and the very first
caller of ll_getname() feeds it to strlen().  Fault ends up with uninitialized
array...

AFAICS, the damn thing should just use getname() and quit reinventing the
wheel, badly.

As for your question in another thread - AFAICS, it's impossible with the
current code, but not too robust.  Fortunately, it's trivial to make
independent on allocator details - all it takes is
	result = kzalloc(offsetof(struct filename, iname[1]), GFP_KERNEL);
and we are done - result->iname+1 will be within the allocated object,
no matter what.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Boqun Feng April 13, 2015, 6:33 a.m. UTC | #9
Hi Al,

On Sun, Apr 12, 2015 at 12:56:55AM +0100, Al Viro wrote:
> On Tue, Apr 07, 2015 at 04:38:26PM +0800, Boqun Feng wrote:
> > Ping again...
> 
> What exactly does it buy us?  You need a pathname just a bit under 4Kb, which,
> with all due respect, is an extremely rare case.  Resulting code is more
> complicated, we _still_ copy twice (sure, the second time is for 16 bytes or
> so, but...), instead of "compare with the address of embedded array" we get
> the loveliness like
> > > > +	if (name->name != ((char *)name - EMBEDDED_NAME_MAX)) {
> this...   _And_, on top of everything else, we get name and name->name
> guaranteed to hit different cachelines, in all cases, including the common
> ones.
> 
> What for?  It's not as if userland memory had been communicated with by
> IP over carrier pigeons, after all, and the cost of 4Kb worth of
> (essentially) memcpy() is going to be
> 	a) incurred in extremely rare case
> and
> 	b) be dwarfed by the work we need to _do_ with what we'd copied.
> After all, that pathname is going to be parsed and traversed - all 4Kb
> worth of it.
> 
> So what's the point?

Thank you for your response.

Well, my original purpose of doing this is to avoid recopying file
names, I thought although long file names are race, it's worthy if we
can optimize without affecting common cases. But you are right, I fail
to take cachelines into consideration, so comman cases are affected.

Before I totally give it up, I'd like to run some performance tests
about this patch, which I should do before sending the patch, I will do
better next time ;-)
If I find something new, I will let you know.

Thanks again for your comments.

Regards,
Boqun Feng
Boqun Feng April 13, 2015, 7:34 a.m. UTC | #10
On Sun, Apr 12, 2015 at 02:13:18AM +0100, Al Viro wrote:
> 
> BTW, looking at the __getname() callers...  Lustre one sure as hell looks
> bogus:
>         char *tmp = __getname();
> 
>         if (!tmp)
>                 return ERR_PTR(-ENOMEM);
> 
>         len = strncpy_from_user(tmp, filename, PATH_MAX);
>         if (len == 0)
>                 ret = -ENOENT;
>         else if (len > PATH_MAX)
>                 ret = -ENAMETOOLONG;
> 
>         if (ret) {
>                 __putname(tmp);
>                 tmp =  ERR_PTR(ret);
>         }
>         return tmp;
> 
> Note that
> 	* strncpy_from_user(p, u, n) can return a negative (-EFAULT)
> 	* strncpy_from_user(p, u, n) cannot return a positive greater than
> n.  In case of missing NUL among the n bytes starting at u (and no faults
> accessing those) we get exactly n bytes copied and n returned.  In case
> when NUL is there, we copy everything up to and including that NUL and
> return number of non-NUL bytes copied.
> 
> IOW, these failure cases had never been tested.  Name being too long ends up
> with non-NUL-terminated array of characters returned, and the very first
> caller of ll_getname() feeds it to strlen().  Fault ends up with uninitialized
> array...

Yeah.. and it's suprising to see it doesn't make any trouble yet.

> 
> AFAICS, the damn thing should just use getname() and quit reinventing the
> wheel, badly.

I cscoped the kernel code and find 15 __getname() callers, they use the
memory that __getname()s return in quite different ways.

But at least we can divide them into two groups, 1) fill the memory with
names from user space 2) fill the memory with names from kernel space.

For 1) we can use getname() to do the job and for 2) I think first we
need to figure how they are using the memory, because they may generate
names in different ways, and clean them one by one if they need to be.

> 
> As for your question in another thread - AFAICS, it's impossible with the
> current code, but not too robust.  Fortunately, it's trivial to make
> independent on allocator details - all it takes is
> 	result = kzalloc(offsetof(struct filename, iname[1]), GFP_KERNEL);
> and we are done - result->iname+1 will be within the allocated object,
> no matter what.

Thank you for your response. As long as the actual size of result is not
a power of 2, the problem will not happen.(Maybe add a comment before
struct filename)


Regards,
Boqun Feng
diff mbox

Patch

diff --git a/fs/namei.c b/fs/namei.c
index 7a11ec1..6d04029 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -119,7 +119,7 @@ 
  * PATH_MAX includes the nul terminator --RR.
  */
 
-#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
+#define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))
 
 struct filename *
 getname_flags(const char __user *filename, int flags, int *empty)
@@ -132,44 +132,53 @@  getname_flags(const char __user *filename, int flags, int *empty)
 	if (result)
 		return result;
 
-	result = __getname();
-	if (unlikely(!result))
+	kname = __getname();
+	if (unlikely(!kname))
 		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * First, try to embed the struct filename inside the names_cache
 	 * allocation
 	 */
-	kname = (char *)result->iname;
+	result = (struct filename *)(kname + EMBEDDED_NAME_MAX);
 	result->name = kname;
 
 	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
 	if (unlikely(len < 0)) {
-		__putname(result);
+		__putname(kname);
 		return ERR_PTR(len);
 	}
 
 	/*
 	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
 	 * separate struct filename so we can dedicate the entire
-	 * names_cache allocation for the pathname, and re-do the copy from
+	 * names_cache allocation for the pathname, and continue the copy from
 	 * userland.
 	 */
 	if (unlikely(len == EMBEDDED_NAME_MAX)) {
-		kname = (char *)result;
-
 		result = kzalloc(sizeof(*result), GFP_KERNEL);
 		if (unlikely(!result)) {
 			__putname(kname);
 			return ERR_PTR(-ENOMEM);
 		}
 		result->name = kname;
-		len = strncpy_from_user(kname, filename, PATH_MAX);
+		/* we can't simply add the number of rest chars we copy to len,
+		 * since strncpy_from_user may return negative to indicate
+		 * something is wrong, so we do the addition later, after
+		 * strncpy_from_user succeeds, and we know we already copy
+		 * EMBEDDED_NAME_MAX chars.
+		 */
+		len = strncpy_from_user(kname + EMBEDDED_NAME_MAX,
+				filename + EMBEDDED_NAME_MAX,
+				PATH_MAX - EMBEDDED_NAME_MAX);
+
 		if (unlikely(len < 0)) {
 			__putname(kname);
 			kfree(result);
 			return ERR_PTR(len);
 		}
+
+		len += EMBEDDED_NAME_MAX;
 		if (unlikely(len == PATH_MAX)) {
 			__putname(kname);
 			kfree(result);
@@ -204,26 +213,28 @@  struct filename *
 getname_kernel(const char * filename)
 {
 	struct filename *result;
+	char *kname;
 	int len = strlen(filename) + 1;
 
-	result = __getname();
-	if (unlikely(!result))
+	kname = __getname();
+	if (unlikely(!kname))
 		return ERR_PTR(-ENOMEM);
 
 	if (len <= EMBEDDED_NAME_MAX) {
-		result->name = (char *)result->iname;
+		result = (struct filename *)(kname + EMBEDDED_NAME_MAX);
+		result->name = kname;
 	} else if (len <= PATH_MAX) {
 		struct filename *tmp;
 
 		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
 		if (unlikely(!tmp)) {
-			__putname(result);
+			__putname(kname);
 			return ERR_PTR(-ENOMEM);
 		}
-		tmp->name = (char *)result;
+		tmp->name = kname;
 		result = tmp;
 	} else {
-		__putname(result);
+		__putname(kname);
 		return ERR_PTR(-ENAMETOOLONG);
 	}
 	memcpy((char *)result->name, filename, len);
@@ -242,11 +253,11 @@  void putname(struct filename *name)
 	if (--name->refcnt > 0)
 		return;
 
-	if (name->name != name->iname) {
+	if (name->name != ((char *)name - EMBEDDED_NAME_MAX)) {
 		__putname(name->name);
 		kfree(name);
 	} else
-		__putname(name);
+		__putname(name->name);
 }
 
 static int check_acl(struct inode *inode, int mask)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dfbd88a..dd67284 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2166,7 +2166,6 @@  struct filename {
 	const __user char	*uptr;	/* original userland pointer */
 	struct audit_names	*aname;
 	int			refcnt;
-	const char		iname[];
 };
 
 extern long vfs_truncate(struct path *, loff_t);