diff mbox

linux-next: Tree for Jan 20 -- Kernel panic - Unable to mount root fs

Message ID 20150121033600.GN29656@ZenIV.linux.org.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Al Viro Jan. 21, 2015, 3:36 a.m. UTC
On Tue, Jan 20, 2015 at 06:44:34PM -0800, Guenter Roeck wrote:
> >The shit hits the fan earlier - when we end up missing /dev.  There are
> >two places where it could've been created (depending on CONFIG_BLK_DEV_INITRD);
> >	sys_mkdir(collected, mode);
> >in init/initramfs.c (line 353 in linux-next) and
> >         err = sys_mkdir((const char __user __force *) "/dev", 0755);
> >in init/noinitramfs.c (line 32).  The latter would've screamed on failure;
> >could you printk of collected (%s), mode (%o) and return value (%d) in the
> >former and see what happens?
> >
> 
> sys_mkdir .:40775 returned -17
> sys_mkdir usr:40775 returned 0
> sys_mkdir usr/lib:40775 returned -2
> sys_mkdir usr/share:40755 returned -2
> sys_mkdir usr/share/udhcpc:40755 returned -2
> sys_mkdir usr/bin:40775 returned -2
> sys_mkdir usr/sbin:40775 returned -2
> sys_mkdir mnt:40775 returned 0
> sys_mkdir proc:40775 returned 0
> sys_mkdir root:40775 returned 0
> sys_mkdir lib:40775 returned 0
> sys_mkdir lib/modules:40775 returned -2
> sys_mkdir lib/modules/3.9.2:40775 returned -2
> sys_mkdir lib/modules/3.9.2/kernel:40775 returned -2
> 
> with
> 	int err = sys_mkdir(collected, mode);
> 	pr_info("sys_mkdir %s:%o returned %d\n", collected, mode, err);
> added in init/initramfs.c.

Just what is lib/modules/3.9.2 doing there?  In any case, I think I have at
least a plausible direction for digging.  Look:

struct dentry *user_path_create(int dfd, const char __user *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *tmp = getname(pathname);
        struct dentry *res;
        if (IS_ERR(tmp))
                return ERR_CAST(tmp);
        res = kern_path_create(dfd, tmp->name, path, lookup_flags);

struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct nameidata nd;
        int err2;
        int error;
        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

        /*
         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
         * other flags passed in are ignored!
         */
        lookup_flags &= LOOKUP_REVAL;

        error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);

static int do_path_lookup(int dfd, const char *name,
                                unsigned int flags, struct nameidata *nd)
{
        int retval;
        struct filename *filename;

        filename = getname_kernel(name);
        if (unlikely(IS_ERR(filename)))
                return PTR_ERR(filename);
        retval = filename_lookup(dfd, filename, flags, nd);

and we have done getname_kernel() on the name->name of result of getname().
At the very least, it's pointless - we already *have* struct filename for
that sucker.  Now, it shouldn't have screwed the things up - it would better
not, anyway, since we might legitimately have two identical pathname
among the syscall arguments.  However, let's see if this (on top of
linux-next, in addition to the same printks) changes behaviour:

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Guenter Roeck Jan. 21, 2015, 4:01 a.m. UTC | #1
On 01/20/2015 07:36 PM, Al Viro wrote:
> On Tue, Jan 20, 2015 at 06:44:34PM -0800, Guenter Roeck wrote:
>>> The shit hits the fan earlier - when we end up missing /dev.  There are
>>> two places where it could've been created (depending on CONFIG_BLK_DEV_INITRD);
>>> 	sys_mkdir(collected, mode);
>>> in init/initramfs.c (line 353 in linux-next) and
>>>          err = sys_mkdir((const char __user __force *) "/dev", 0755);
>>> in init/noinitramfs.c (line 32).  The latter would've screamed on failure;
>>> could you printk of collected (%s), mode (%o) and return value (%d) in the
>>> former and see what happens?
>>>
>>
>> sys_mkdir .:40775 returned -17
>> sys_mkdir usr:40775 returned 0
>> sys_mkdir usr/lib:40775 returned -2
>> sys_mkdir usr/share:40755 returned -2
>> sys_mkdir usr/share/udhcpc:40755 returned -2
>> sys_mkdir usr/bin:40775 returned -2
>> sys_mkdir usr/sbin:40775 returned -2
>> sys_mkdir mnt:40775 returned 0
>> sys_mkdir proc:40775 returned 0
>> sys_mkdir root:40775 returned 0
>> sys_mkdir lib:40775 returned 0
>> sys_mkdir lib/modules:40775 returned -2
>> sys_mkdir lib/modules/3.9.2:40775 returned -2
>> sys_mkdir lib/modules/3.9.2/kernel:40775 returned -2
>>
>> with
>> 	int err = sys_mkdir(collected, mode);
>> 	pr_info("sys_mkdir %s:%o returned %d\n", collected, mode, err);
>> added in init/initramfs.c.
>
> Just what is lib/modules/3.9.2 doing there?  In any case, I think I have at

Artifact from when I created the root file system (which apparently was with
3.9.2). It is irrelevant for my testing, at least so far, so I never
bothered fixing it.

> least a plausible direction for digging.  Look:
>

> diff --git a/fs/namei.c b/fs/namei.c
> index 323957f..c7d107c 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c

With this patch:

sys_mkdir .:40775 returned -17
sys_mkdir usr:40775 returned 0
sys_mkdir usr/lib:40775 returned 0
sys_mkdir usr/share:40755 returned 0
sys_mkdir usr/share/udhcpc:40755 returned 0
sys_mkdir usr/bin:40775 returned 0
sys_mkdir usr/sbin:40775 returned 0
sys_mkdir mnt:40775 returned 0
sys_mkdir proc:40775 returned 0
sys_mkdir root:40775 returned 0
sys_mkdir lib:40775 returned 0
sys_mkdir lib/modules:40775 returned 0
...

and the problem is fixed.

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Jan. 21, 2015, 4:36 a.m. UTC | #2
On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
> With this patch:
> 
> sys_mkdir .:40775 returned -17
> sys_mkdir usr:40775 returned 0
> sys_mkdir usr/lib:40775 returned 0
> sys_mkdir usr/share:40755 returned 0
> sys_mkdir usr/share/udhcpc:40755 returned 0
> sys_mkdir usr/bin:40775 returned 0
> sys_mkdir usr/sbin:40775 returned 0
> sys_mkdir mnt:40775 returned 0
> sys_mkdir proc:40775 returned 0
> sys_mkdir root:40775 returned 0
> sys_mkdir lib:40775 returned 0
> sys_mkdir lib/modules:40775 returned 0
> ...
> 
> and the problem is fixed.

... except that it simply confirms that something's fishy with getname_kernel()
of ->name of struct filename returned by getname().  IOW, I still do not
understand the mechanism of breakage there.

Another thing I really do not understand is
+               if (inode->i_ino) {
+                       /* valid inode number, use that for the comparison */
+                       if (n->ino != inode->i_ino ||
+                           n->dev != inode->i_sb->s_dev)
+                               continue;
in __audit_inode().  We don't *have* dentries with dentry->d_inode->i_ino == 0.
Ever.  WTF is that about?  Paul?
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sabrina Dubroca Jan. 21, 2015, 11:05 a.m. UTC | #3
2015-01-21, 04:36:38 +0000, Al Viro wrote:
> On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
> > With this patch:
> > 
> > sys_mkdir .:40775 returned -17
> > sys_mkdir usr:40775 returned 0
> > sys_mkdir usr/lib:40775 returned 0
> > sys_mkdir usr/share:40755 returned 0
> > sys_mkdir usr/share/udhcpc:40755 returned 0
> > sys_mkdir usr/bin:40775 returned 0
> > sys_mkdir usr/sbin:40775 returned 0
> > sys_mkdir mnt:40775 returned 0
> > sys_mkdir proc:40775 returned 0
> > sys_mkdir root:40775 returned 0
> > sys_mkdir lib:40775 returned 0
> > sys_mkdir lib/modules:40775 returned 0
> > ...
> > 
> > and the problem is fixed.

This patch also works for me.


> ... except that it simply confirms that something's fishy with getname_kernel()
> of ->name of struct filename returned by getname().  IOW, I still do not
> understand the mechanism of breakage there.

I'm not so sure about that.  I tried to copy name to a new string in
do_path_lookup and that didn't help.

Now, I've removed the

        putname(filename);

line from do_path_lookup and I don't get the panic.


And BTW, I added Guenter's debugging to init/initramfs.c and got:
sys_mkdir dev:40755 returned 0
sys_mkdir root:40700 returned 0

even if it ends up panic'ing.
Guenter Roeck Jan. 21, 2015, 1:32 p.m. UTC | #4
On 01/21/2015 03:05 AM, Sabrina Dubroca wrote:
> 2015-01-21, 04:36:38 +0000, Al Viro wrote:
>> On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
>>> With this patch:
>>>
>>> sys_mkdir .:40775 returned -17
>>> sys_mkdir usr:40775 returned 0
>>> sys_mkdir usr/lib:40775 returned 0
>>> sys_mkdir usr/share:40755 returned 0
>>> sys_mkdir usr/share/udhcpc:40755 returned 0
>>> sys_mkdir usr/bin:40775 returned 0
>>> sys_mkdir usr/sbin:40775 returned 0
>>> sys_mkdir mnt:40775 returned 0
>>> sys_mkdir proc:40775 returned 0
>>> sys_mkdir root:40775 returned 0
>>> sys_mkdir lib:40775 returned 0
>>> sys_mkdir lib/modules:40775 returned 0
>>> ...
>>>
>>> and the problem is fixed.
>
> This patch also works for me.
>
>
>> ... except that it simply confirms that something's fishy with getname_kernel()
>> of ->name of struct filename returned by getname().  IOW, I still do not
>> understand the mechanism of breakage there.
>
> I'm not so sure about that.  I tried to copy name to a new string in
> do_path_lookup and that didn't help.
>
> Now, I've removed the
>
>          putname(filename);
>
> line from do_path_lookup and I don't get the panic.
>
>
> And BTW, I added Guenter's debugging to init/initramfs.c and got:
> sys_mkdir dev:40755 returned 0
> sys_mkdir root:40700 returned 0
>
> even if it ends up panic'ing.
>
Another data point (though I have no idea if it is useful or what it means):

In the working case, path_init sets nd->flags to 0x50 or 0x51.
In the non-working case (ie for all files with a '/' in the name),
it sets nd->flags to 0x10 or 0x11, even though it is always called
with the LOOKUP_RCU bit set in flags.

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thierry Reding Jan. 21, 2015, 2:42 p.m. UTC | #5
On Wed, Jan 21, 2015 at 12:05:39PM +0100, Sabrina Dubroca wrote:
> 2015-01-21, 04:36:38 +0000, Al Viro wrote:
> > On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
> > > With this patch:
> > > 
> > > sys_mkdir .:40775 returned -17
> > > sys_mkdir usr:40775 returned 0
> > > sys_mkdir usr/lib:40775 returned 0
> > > sys_mkdir usr/share:40755 returned 0
> > > sys_mkdir usr/share/udhcpc:40755 returned 0
> > > sys_mkdir usr/bin:40775 returned 0
> > > sys_mkdir usr/sbin:40775 returned 0
> > > sys_mkdir mnt:40775 returned 0
> > > sys_mkdir proc:40775 returned 0
> > > sys_mkdir root:40775 returned 0
> > > sys_mkdir lib:40775 returned 0
> > > sys_mkdir lib/modules:40775 returned 0
> > > ...
> > > 
> > > and the problem is fixed.
> 
> This patch also works for me.
> 
> 
> > ... except that it simply confirms that something's fishy with getname_kernel()
> > of ->name of struct filename returned by getname().  IOW, I still do not
> > understand the mechanism of breakage there.
> 
> I'm not so sure about that.  I tried to copy name to a new string in
> do_path_lookup and that didn't help.
> 
> Now, I've removed the
> 
>         putname(filename);
> 
> line from do_path_lookup and I don't get the panic.

That would indicate that somehow the refcount got unbalanced. Looking
more closely it seems like the various audit_*() function do take a
reference, but maybe that's not enough.

But debugging this further I see no indication that the memory is ever
freed, or otherwise corrupted.

I did collect a bit more data, perhaps that's useful. I started seeing
this issue as well on devices that boot over NFS. After reading this
thread I also realized that another warning that I was seeing might be
related:

	[   28.261930] Warning: unable to open an initial console.

I've added a couple of printks and see that the reason for this is that
/dev/console doesn't get created. /dev however does get created.

	[   11.786627] sys_mkdir dev:40755 returned 0
	...
	[   11.978748] sys_mknod dev/console:20600 returned -2

The chain that fails turns out to be this:

	sys_mknod()
	  sys_mknodat()
	    user_path_create()
	      kern_path_create()
	        do_path_lookup()
	          filename_lookup()
	            path_lookupat()
	              path_init()
	                link_path_walk()
	                  walk_component()

walk_components() ends up calling lookup_slow() and the result is that
inode == NULL and d_is_negative(path->dentry) returns true, therefore
causing -ENOENT to be returned.

I tried to figure out why inode would be NULL at that point or why
d_is_negative() returned true, but I ended up getting completely lost,
so I thought it best to report my findings before I confuse everything.

Is there anything else I can investigate to track this down?

Thierry
Paul Moore Jan. 21, 2015, 3:06 p.m. UTC | #6
On Wednesday, January 21, 2015 04:36:38 AM Al Viro wrote:
> Another thing I really do not understand is
> +               if (inode->i_ino) {
> +                       /* valid inode number, use that for the ...
> +                       if (n->ino != inode->i_ino ||
> +                           n->dev != inode->i_sb->s_dev)
> +                               continue;
> in __audit_inode().  We don't *have* dentries with dentry->d_inode->i_ino ==
> 0. Ever.  WTF is that about?  Paul?

Likely stupidity on my part.  It looks like a typo, that first if conditional 
should check "n->ino" instead of "inode->i_ino"; in __audit_getname() we 
record names without any inode numbers, so we need to see if this is one of 
those records.  Interesting that it passed my testing; either my testing is 
crap (always a strong possibility) or something else came to the rescue.  I'm 
still coming up to speed on the audit/VFS code ...

I'll fix that up and include in the next patchset once we resolve this issue.
Paul Moore Jan. 21, 2015, 3:24 p.m. UTC | #7
On Wednesday, January 21, 2015 03:42:16 PM Thierry Reding wrote:
> On Wed, Jan 21, 2015 at 12:05:39PM +0100, Sabrina Dubroca wrote:
> > 2015-01-21, 04:36:38 +0000, Al Viro wrote:
> > > On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
> > > > With this patch:
> > > > 
> > > > sys_mkdir .:40775 returned -17
> > > > sys_mkdir usr:40775 returned 0
> > > > sys_mkdir usr/lib:40775 returned 0
> > > > sys_mkdir usr/share:40755 returned 0
> > > > sys_mkdir usr/share/udhcpc:40755 returned 0
> > > > sys_mkdir usr/bin:40775 returned 0
> > > > sys_mkdir usr/sbin:40775 returned 0
> > > > sys_mkdir mnt:40775 returned 0
> > > > sys_mkdir proc:40775 returned 0
> > > > sys_mkdir root:40775 returned 0
> > > > sys_mkdir lib:40775 returned 0
> > > > sys_mkdir lib/modules:40775 returned 0
> > > > ...
> > > > 
> > > > and the problem is fixed.
> > 
> > This patch also works for me.
> > 
> > > ... except that it simply confirms that something's fishy with
> > > getname_kernel() of ->name of struct filename returned by getname(). 
> > > IOW, I still do not understand the mechanism of breakage there.
> > 
> > I'm not so sure about that.  I tried to copy name to a new string in
> > do_path_lookup and that didn't help.
> > 
> > Now, I've removed the
> > 
> >         putname(filename);
> > 
> > line from do_path_lookup and I don't get the panic.
> 
> That would indicate that somehow the refcount got unbalanced. Looking
> more closely it seems like the various audit_*() function do take a
> reference, but maybe that's not enough.

I'm thinking the same thing and I think the problem may be that 
__audit_reusename() is not bumping the filename->refcnt.  Can someone who is 
seeing this problem bump the refcnt in __audit_reusename()?

  struct filename *
  __audit_reusename(const __user char *uptr)
  {
        struct audit_context *context = current->audit_context;
        struct audit_names *n;

        list_for_each_entry(n, &context->names_list, list) {
                if (!n->name)
                        continue;
                if (n->name->uptr == uptr) {
+                       n->name->refcnt++;
                        return n->name;
                }
        }
        return NULL;
  }
Thierry Reding Jan. 21, 2015, 3:39 p.m. UTC | #8
On Wed, Jan 21, 2015 at 10:24:11AM -0500, Paul Moore wrote:
> On Wednesday, January 21, 2015 03:42:16 PM Thierry Reding wrote:
> > On Wed, Jan 21, 2015 at 12:05:39PM +0100, Sabrina Dubroca wrote:
> > > 2015-01-21, 04:36:38 +0000, Al Viro wrote:
> > > > On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
> > > > > With this patch:
> > > > > 
> > > > > sys_mkdir .:40775 returned -17
> > > > > sys_mkdir usr:40775 returned 0
> > > > > sys_mkdir usr/lib:40775 returned 0
> > > > > sys_mkdir usr/share:40755 returned 0
> > > > > sys_mkdir usr/share/udhcpc:40755 returned 0
> > > > > sys_mkdir usr/bin:40775 returned 0
> > > > > sys_mkdir usr/sbin:40775 returned 0
> > > > > sys_mkdir mnt:40775 returned 0
> > > > > sys_mkdir proc:40775 returned 0
> > > > > sys_mkdir root:40775 returned 0
> > > > > sys_mkdir lib:40775 returned 0
> > > > > sys_mkdir lib/modules:40775 returned 0
> > > > > ...
> > > > > 
> > > > > and the problem is fixed.
> > > 
> > > This patch also works for me.
> > > 
> > > > ... except that it simply confirms that something's fishy with
> > > > getname_kernel() of ->name of struct filename returned by getname(). 
> > > > IOW, I still do not understand the mechanism of breakage there.
> > > 
> > > I'm not so sure about that.  I tried to copy name to a new string in
> > > do_path_lookup and that didn't help.
> > > 
> > > Now, I've removed the
> > > 
> > >         putname(filename);
> > > 
> > > line from do_path_lookup and I don't get the panic.
> > 
> > That would indicate that somehow the refcount got unbalanced. Looking
> > more closely it seems like the various audit_*() function do take a
> > reference, but maybe that's not enough.
> 
> I'm thinking the same thing and I think the problem may be that 
> __audit_reusename() is not bumping the filename->refcnt.  Can someone who is 
> seeing this problem bump the refcnt in __audit_reusename()?
> 
>   struct filename *
>   __audit_reusename(const __user char *uptr)
>   {
>         struct audit_context *context = current->audit_context;
>         struct audit_names *n;
> 
>         list_for_each_entry(n, &context->names_list, list) {
>                 if (!n->name)
>                         continue;
>                 if (n->name->uptr == uptr) {
> +                       n->name->refcnt++;
>                         return n->name;
>                 }
>         }
>         return NULL;
>   }

That doesn't seem to help, at least in my case.

Thierry
Sabrina Dubroca Jan. 21, 2015, 3:54 p.m. UTC | #9
2015-01-21, 16:39:12 +0100, Thierry Reding wrote:
> On Wed, Jan 21, 2015 at 10:24:11AM -0500, Paul Moore wrote:
> > On Wednesday, January 21, 2015 03:42:16 PM Thierry Reding wrote:
> > > On Wed, Jan 21, 2015 at 12:05:39PM +0100, Sabrina Dubroca wrote:
> > > > 2015-01-21, 04:36:38 +0000, Al Viro wrote:
> > > > > On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
> > > > > > With this patch:
> > > > > > 
> > > > > > sys_mkdir .:40775 returned -17
> > > > > > sys_mkdir usr:40775 returned 0
> > > > > > sys_mkdir usr/lib:40775 returned 0
> > > > > > sys_mkdir usr/share:40755 returned 0
> > > > > > sys_mkdir usr/share/udhcpc:40755 returned 0
> > > > > > sys_mkdir usr/bin:40775 returned 0
> > > > > > sys_mkdir usr/sbin:40775 returned 0
> > > > > > sys_mkdir mnt:40775 returned 0
> > > > > > sys_mkdir proc:40775 returned 0
> > > > > > sys_mkdir root:40775 returned 0
> > > > > > sys_mkdir lib:40775 returned 0
> > > > > > sys_mkdir lib/modules:40775 returned 0
> > > > > > ...
> > > > > > 
> > > > > > and the problem is fixed.
> > > > 
> > > > This patch also works for me.
> > > > 
> > > > > ... except that it simply confirms that something's fishy with
> > > > > getname_kernel() of ->name of struct filename returned by getname(). 
> > > > > IOW, I still do not understand the mechanism of breakage there.
> > > > 
> > > > I'm not so sure about that.  I tried to copy name to a new string in
> > > > do_path_lookup and that didn't help.
> > > > 
> > > > Now, I've removed the
> > > > 
> > > >         putname(filename);
> > > > 
> > > > line from do_path_lookup and I don't get the panic.
> > > 
> > > That would indicate that somehow the refcount got unbalanced. Looking
> > > more closely it seems like the various audit_*() function do take a
> > > reference, but maybe that's not enough.
> > 
> > I'm thinking the same thing and I think the problem may be that 
> > __audit_reusename() is not bumping the filename->refcnt.  Can someone who is 
> > seeing this problem bump the refcnt in __audit_reusename()?
> > 
> >   struct filename *
> >   __audit_reusename(const __user char *uptr)
> >   {
> >         struct audit_context *context = current->audit_context;
> >         struct audit_names *n;
> > 
> >         list_for_each_entry(n, &context->names_list, list) {
> >                 if (!n->name)
> >                         continue;
> >                 if (n->name->uptr == uptr) {
> > +                       n->name->refcnt++;
> >                         return n->name;
> >                 }
> >         }
> >         return NULL;
> >   }
> 
> That doesn't seem to help, at least in my case.

Same here.

Well, it's probably not an audit issue.  I tried audit=0 on the
commandline, and I just rebuilt a kernel with CONFIG_AUDIT=n, and it's
still panicing.  This should have fixed any audit-related issue,
right?
Paul Moore Jan. 21, 2015, 4:16 p.m. UTC | #10
On Wednesday, January 21, 2015 04:54:07 PM Sabrina Dubroca wrote:
> 2015-01-21, 16:39:12 +0100, Thierry Reding wrote:
> > That doesn't seem to help, at least in my case.
> 
> Same here.

Okay, thanks for trying.  Sorry that didn't resolve things.

> Well, it's probably not an audit issue.  I tried audit=0 on the
> commandline, and I just rebuilt a kernel with CONFIG_AUDIT=n, and it's
> still panicing.  This should have fixed any audit-related issue,
> right?

Most likely.  Back to the code I go ...
Guenter Roeck Jan. 21, 2015, 4:21 p.m. UTC | #11
On 01/21/2015 07:54 AM, Sabrina Dubroca wrote:
> 2015-01-21, 16:39:12 +0100, Thierry Reding wrote:
>> On Wed, Jan 21, 2015 at 10:24:11AM -0500, Paul Moore wrote:
>>> On Wednesday, January 21, 2015 03:42:16 PM Thierry Reding wrote:
>>>> On Wed, Jan 21, 2015 at 12:05:39PM +0100, Sabrina Dubroca wrote:
>>>>> 2015-01-21, 04:36:38 +0000, Al Viro wrote:
>>>>>> On Tue, Jan 20, 2015 at 08:01:26PM -0800, Guenter Roeck wrote:
>>>>>>> With this patch:
>>>>>>>
>>>>>>> sys_mkdir .:40775 returned -17
>>>>>>> sys_mkdir usr:40775 returned 0
>>>>>>> sys_mkdir usr/lib:40775 returned 0
>>>>>>> sys_mkdir usr/share:40755 returned 0
>>>>>>> sys_mkdir usr/share/udhcpc:40755 returned 0
>>>>>>> sys_mkdir usr/bin:40775 returned 0
>>>>>>> sys_mkdir usr/sbin:40775 returned 0
>>>>>>> sys_mkdir mnt:40775 returned 0
>>>>>>> sys_mkdir proc:40775 returned 0
>>>>>>> sys_mkdir root:40775 returned 0
>>>>>>> sys_mkdir lib:40775 returned 0
>>>>>>> sys_mkdir lib/modules:40775 returned 0
>>>>>>> ...
>>>>>>>
>>>>>>> and the problem is fixed.
>>>>>
>>>>> This patch also works for me.
>>>>>
>>>>>> ... except that it simply confirms that something's fishy with
>>>>>> getname_kernel() of ->name of struct filename returned by getname().
>>>>>> IOW, I still do not understand the mechanism of breakage there.
>>>>>
>>>>> I'm not so sure about that.  I tried to copy name to a new string in
>>>>> do_path_lookup and that didn't help.
>>>>>
>>>>> Now, I've removed the
>>>>>
>>>>>          putname(filename);
>>>>>
>>>>> line from do_path_lookup and I don't get the panic.
>>>>
>>>> That would indicate that somehow the refcount got unbalanced. Looking
>>>> more closely it seems like the various audit_*() function do take a
>>>> reference, but maybe that's not enough.
>>>
>>> I'm thinking the same thing and I think the problem may be that
>>> __audit_reusename() is not bumping the filename->refcnt.  Can someone who is
>>> seeing this problem bump the refcnt in __audit_reusename()?
>>>
>>>    struct filename *
>>>    __audit_reusename(const __user char *uptr)
>>>    {
>>>          struct audit_context *context = current->audit_context;
>>>          struct audit_names *n;
>>>
>>>          list_for_each_entry(n, &context->names_list, list) {
>>>                  if (!n->name)
>>>                          continue;
>>>                  if (n->name->uptr == uptr) {
>>> +                       n->name->refcnt++;
>>>                          return n->name;
>>>                  }
>>>          }
>>>          return NULL;
>>>    }
>>
>> That doesn't seem to help, at least in my case.
>
> Same here.
>
> Well, it's probably not an audit issue.  I tried audit=0 on the
> commandline, and I just rebuilt a kernel with CONFIG_AUDIT=n, and it's
> still panicing.  This should have fixed any audit-related issue,
> right?
>
I don't have audit enabled, so I don't think that is the problem either
(the refcount increase didn't help, and a WARN(1) added to the code
at the same location did not trigger).

Wonder if we have a use-after-free case and just have been lucky all along.

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Jan. 21, 2015, 5:38 p.m. UTC | #12
On Wed, Jan 21, 2015 at 11:16:23AM -0500, Paul Moore wrote:
> On Wednesday, January 21, 2015 04:54:07 PM Sabrina Dubroca wrote:
> > 2015-01-21, 16:39:12 +0100, Thierry Reding wrote:
> > > That doesn't seem to help, at least in my case.
> > 
> > Same here.
> 
> Okay, thanks for trying.  Sorry that didn't resolve things.
> 
> > Well, it's probably not an audit issue.  I tried audit=0 on the
> > commandline, and I just rebuilt a kernel with CONFIG_AUDIT=n, and it's
> > still panicing.  This should have fixed any audit-related issue,
> > right?
> 
> Most likely.  Back to the code I go ...

FWIW, I really wonder if populate_rootfs() (run ultimately from
kernel_init(), by way of kernel_init_freeable(), do_basic_setup() and
do_initcalls()) ends up with some side effects as far as struct filename
are concerned...

Note that if we _ever_ hit reuse logics there, we are going to get bogus
matches asoddingplenty - *all* those sys_mkdir(), etc. are going to be
with filenames in the same reused buffer.  So if anything in there leaks
from one call to another, we are going to have a mess on hands.

Another place where that can be a problem is devtmpfs - there's a kernel
thread doing actual mkdir, mknod, etc. in that abomination and if _that_
ends up accumulating aushit entries, we'll end up with interesting problems.

Folks, could you print the value of audit_dummy_context() in populate_rootfs()
and in drivers/base/devtmpfs.c:devtmpfsd()?
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Guenter Roeck Jan. 21, 2015, 5:51 p.m. UTC | #13
On 01/21/2015 09:38 AM, Al Viro wrote:
> On Wed, Jan 21, 2015 at 11:16:23AM -0500, Paul Moore wrote:
>> On Wednesday, January 21, 2015 04:54:07 PM Sabrina Dubroca wrote:
>>> 2015-01-21, 16:39:12 +0100, Thierry Reding wrote:
>>>> That doesn't seem to help, at least in my case.
>>>
>>> Same here.
>>
>> Okay, thanks for trying.  Sorry that didn't resolve things.
>>
>>> Well, it's probably not an audit issue.  I tried audit=0 on the
>>> commandline, and I just rebuilt a kernel with CONFIG_AUDIT=n, and it's
>>> still panicing.  This should have fixed any audit-related issue,
>>> right?
>>
>> Most likely.  Back to the code I go ...
>
> FWIW, I really wonder if populate_rootfs() (run ultimately from
> kernel_init(), by way of kernel_init_freeable(), do_basic_setup() and
> do_initcalls()) ends up with some side effects as far as struct filename
> are concerned...
>
> Note that if we _ever_ hit reuse logics there, we are going to get bogus
> matches asoddingplenty - *all* those sys_mkdir(), etc. are going to be
> with filenames in the same reused buffer.  So if anything in there leaks
> from one call to another, we are going to have a mess on hands.
>
> Another place where that can be a problem is devtmpfs - there's a kernel
> thread doing actual mkdir, mknod, etc. in that abomination and if _that_
> ends up accumulating aushit entries, we'll end up with interesting problems.
>
> Folks, could you print the value of audit_dummy_context() in populate_rootfs()
> and in drivers/base/devtmpfs.c:devtmpfsd()?
>
populate_rootfs: audit_dummy_context() returns 1

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Jan. 21, 2015, 6:29 p.m. UTC | #14
On Wed, Jan 21, 2015 at 05:32:13AM -0800, Guenter Roeck wrote:
> Another data point (though I have no idea if it is useful or what it means):
> 
> In the working case, path_init sets nd->flags to 0x50 or 0x51.
> In the non-working case (ie for all files with a '/' in the name),
> it sets nd->flags to 0x10 or 0x11, even though it is always called
> with the LOOKUP_RCU bit set in flags.

Umm...  Are those path_init() succeeding or failing?  Note that path_init()
includes "walk everything except for the last component", so your non-working
case is "have it walk anything at all".  What's failing there?  path_init()
or handling the remaining component?
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Guenter Roeck Jan. 21, 2015, 7:06 p.m. UTC | #15
On 01/21/2015 10:29 AM, Al Viro wrote:
> On Wed, Jan 21, 2015 at 05:32:13AM -0800, Guenter Roeck wrote:
>> Another data point (though I have no idea if it is useful or what it means):
>>
>> In the working case, path_init sets nd->flags to 0x50 or 0x51.
>> In the non-working case (ie for all files with a '/' in the name),
>> it sets nd->flags to 0x10 or 0x11, even though it is always called
>> with the LOOKUP_RCU bit set in flags.
>
> Umm...  Are those path_init() succeeding or failing?  Note that path_init()
> includes "walk everything except for the last component", so your non-working
> case is "have it walk anything at all".  What's failing there?  path_init()
> or handling the remaining component?
>
path_init() returns -2. Guess that explains the unexpected flags ;-).
The failuere is from
	link_path_walk()
		walk_component()

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/namei.c b/fs/namei.c
index 323957f..c7d107c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3314,7 +3314,7 @@  struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	return file;
 }
 
-struct dentry *kern_path_create(int dfd, const char *pathname,
+static struct dentry *__kern_path_create(int dfd, struct filename *name,
 				struct path *path, unsigned int lookup_flags)
 {
 	struct dentry *dentry = ERR_PTR(-EEXIST);
@@ -3329,7 +3329,7 @@  struct dentry *kern_path_create(int dfd, const char *pathname,
 	 */
 	lookup_flags &= LOOKUP_REVAL;
 
-	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
+	error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
 	if (error)
 		return ERR_PTR(error);
 
@@ -3383,6 +3383,19 @@  out:
 	path_put(&nd.path);
 	return dentry;
 }
+
+struct dentry *kern_path_create(int dfd, const char *pathname,
+				struct path *path, unsigned int lookup_flags)
+{
+	struct filename *filename = getname_kernel(pathname);
+	struct dentry *res = ERR_CAST(filename);
+
+	if (!IS_ERR(filename)) {
+		res = __kern_path_create(dfd, filename, path, lookup_flags);
+		putname(filename);
+	}
+	return res;
+}
 EXPORT_SYMBOL(kern_path_create);
 
 void done_path_create(struct path *path, struct dentry *dentry)
@@ -3401,7 +3414,7 @@  struct dentry *user_path_create(int dfd, const char __user *pathname,
 	struct dentry *res;
 	if (IS_ERR(tmp))
 		return ERR_CAST(tmp);
-	res = kern_path_create(dfd, tmp->name, path, lookup_flags);
+	res = __kern_path_create(dfd, tmp, path, lookup_flags);
 	putname(tmp);
 	return res;
 }