diff mbox

[RFC,1/1] shiftfs: uid/gid shifting bind mount

Message ID 1486235972.2484.19.camel@HansenPartnership.com (mailing list archive)
State New, archived
Headers show

Commit Message

James Bottomley Feb. 4, 2017, 7:19 p.m. UTC
This allows any subtree to be uid/gid shifted and bound elsewhere.  It
does this by operating simlarly to overlayfs.  Its primary use is for
shifting the underlying uids of filesystems used to support
unpriviliged (uid shifted) containers.  The usual use case here is
that the container is operating with an uid shifted unprivileged root
but sometimes needs to make use of or work with a filesystem image
that has root at real uid 0.

The mechanism is to allow any subordinate mount namespace to mount a
shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only allowing
it to mount marked subtrees (using the -o mark option as root).  Once
mounted, the subtree is mapped via the super block user namespace so
that the interior ids of the mounting user namespace are the ids
written to the filesystem.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>

---

v1: - based on original shiftfs with uid mappings now done via s_user_ns
---
 fs/Kconfig                 |   8 +
 fs/Makefile                |   1 +
 fs/shiftfs.c               | 728 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/magic.h |   2 +
 4 files changed, 739 insertions(+)
 create mode 100644 fs/shiftfs.c

Comments

Amir Goldstein Feb. 5, 2017, 7:51 a.m. UTC | #1
On Sat, Feb 4, 2017 at 9:19 PM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> This allows any subtree to be uid/gid shifted and bound elsewhere.  It
> does this by operating simlarly to overlayfs.  Its primary use is for
> shifting the underlying uids of filesystems used to support
> unpriviliged (uid shifted) containers.  The usual use case here is
> that the container is operating with an uid shifted unprivileged root
> but sometimes needs to make use of or work with a filesystem image
> that has root at real uid 0.
>
> The mechanism is to allow any subordinate mount namespace to mount a
> shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only allowing
> it to mount marked subtrees (using the -o mark option as root).  Once
> mounted, the subtree is mapped via the super block user namespace so
> that the interior ids of the mounting user namespace are the ids
> written to the filesystem.
>
> Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
>

James,

Allow me to point out some problems in this patch and offer a slightly different
approach.

First of all, the subject says "uid/gid shifting bind mount", but it's not
really a bind mount. What it is is a stackable mount and 2 levels of
stack no less.
So one thing that is missing is increasing of sb->s_stack_depth and that
also means that shiftfs cannot be used to recursively shift uids in child userns
if that was ever the intention.

The other problem is that by forking overlayfs functionality, shiftfs is going
to miss out on overlayfs bug fixes related to user credentials differ
from mounter
credentials, like fd3220d ("ovl: update S_ISGID when setting posix ACLs").
I am not sure that this specific case is relevant to shiftfs, but
there could be other.

So how about, instead of forking a new containers specialized stackable fs,
that the needed functionality be merged into overlayfs code?
I think overlayfs container users may also benefit from shiftfs
functionality, no?
In any case, overlayfs has considerable millage used as fs for containers,
so many issues related to running with different userns may have already been
addressed.

Overlayfs already stores the mounter's credentials and uses them to perform
most of the operations on upper.

I know it wasn't the original purpose of overlayfs to run as a single layer, but
there is nothing really preventing from doing that. In fact, I am
doing just that
with my snapshot mount patches, see:
https://github.com/amir73il/linux/commit/acc6c25eab03c176c9ef736544fab3fba663765d#diff-2b85a3c5bea4263d08a2bdff639192c3
I registered a new fs type ("snapshot"), which reuses most of the existing
overlayfs operations.
With this patch it is possible to mount an overlay with only upper layer, so all
the operations are pass through except for the credentials, e.g.:

mount -t snapshot -o upper=<origin> shiftfs_test <mark location>

If you think this concept is workable, then the functionality of
mounting overlayfs
with only upper should be integrated into plain overlayfs and shiftfs could be
a very thin variant of overlayfs mount using shitfs_fs_type, just for the sake
of having FS_USERNS_MOUNT, e.g:

+ /*
+  * XXX: reusing ovl_mount()/ovl_fill_super(), but could also just reuse
+  * ovl_dentry_operations/ovl_super_operations/ovl_xattr_handlers/ovl_new_inode()
+  */
+static struct file_system_type shiftfs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "shiftfs",
+       .mount          = ovl_mount,
+       .kill_sb        = kill_anon_super,
+       .fs_flags       = FS_USERNS_MOUNT,
+};
+MODULE_ALIAS_FS("shiftfs");
+MODULE_ALIAS("shiftfs");
+#define IS_SHIFTFS_SB(sb) ((sb)->s_type == &shiftfs_type)

And instead of verifying that shiftfs is mounted inside container over shiftfs,
verify that it is mounted over an overlayfs noexec mount e.g.:

+       if (IS_SHIFTFS_SB(sb)) {
+               /*
+                * this leg executes if we're admin capable in
+                * the namespace, so be very careful
+                */
+               if (path.dentry->d_sb->s_magic != OVERLAYFS_MAGIC ||
!(path.dentry->d_sb->s_iflags & SB_I_NOEXEC))
+                       goto out_put;

From users manual POV:

in host:
mount -t overlay -o noexec,upper=<origin> container_visible <mark location>

in container:
mount -t shiftfs -o upper=<mark location> container_writable
<somewhere in my local mount ns>

Thought?
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 6, 2017, 1:18 a.m. UTC | #2
On Sun, 2017-02-05 at 09:51 +0200, Amir Goldstein wrote:
> On Sat, Feb 4, 2017 at 9:19 PM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > This allows any subtree to be uid/gid shifted and bound elsewhere. 
> >  It does this by operating simlarly to overlayfs.  Its primary use 
> > is for shifting the underlying uids of filesystems used to support
> > unpriviliged (uid shifted) containers.  The usual use case here is
> > that the container is operating with an uid shifted unprivileged 
> > root but sometimes needs to make use of or work with a filesystem 
> > image that has root at real uid 0.
> > 
> > The mechanism is to allow any subordinate mount namespace to mount 
> > a shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only
> > allowing it to mount marked subtrees (using the -o mark option as 
> > root).   Once mounted, the subtree is mapped via the super block 
> > user namespace so that the interior ids of the mounting user 
> > namespace are the ids written to the filesystem.
> > 
> > Signed-off-by: James Bottomley <
> > James.Bottomley@HansenPartnership.com>
> > 
> 
> James,
> 
> Allow me to point out some problems in this patch and offer a 
> slightly different approach.
> 
> First of all, the subject says "uid/gid shifting bind mount", but 
> it's not really a bind mount. What it is is a stackable mount and 2 
> levels of stack no less.

The reason for the description is to have it behave exactly like a bind
mount.  You can assert that a bind mount is, in fact, a stacked mount,
but we don't currently.  I'm also not sure where you get your 2 levels
from?

>  So one thing that is missing is increasing of sb->s_stack_depth and
> that also means that shiftfs cannot be used to recursively shift uids
> in child userns if that was ever the intention.

I can't think of a use case that would ever need that, but perhaps
other container people can.

> The other problem is that by forking overlayfs functionality,

So this wouldn't really be the right way to look at it: shiftfs shares
no code with overlayfs at all, so is definitely not a fork.  The only
piece of functionality it has which is similar to overlayfs is the way
it does lookups via a new dentry cache.  However, that functionality is
not unique to overlayfs and if you look, you'll see that
shiftfs_lookup() actually has far more in common with
ecryptfs_lookup().

>  shiftfs is going to miss out on overlayfs bug fixes related to user 
> credentials differ from mounter credentials, like fd3220d ("ovl: 
> update S_ISGID when setting posix ACLs"). I am not sure that this 
> specific case is relevant to shiftfs, but there could be other.

OK, so shiftfs doesn't have this bug and the reason why is
illustrative: basically shiftfs does three things

   1. lookups via a uid/gid shifted dentry cache
   2. shifted credential inode operations permission checks on the
      underlying filesystem
   3. location marking for unprivileged mount

I think we've already seen that 1. isn't from overlayfs but the
functionality could be added to overlayfs, I suppose.  The big problem
is 2.  The overlayfs code emulates the permission checks, which makes
it rather complex (this is where you get your bugs like the above
from).  I did actually look at adding 2. to overlayfs on the theory
that a single layer overlay might be closest to what this is, but
eventually concluded I'd have to take the special cases and add a whole
lot more to them ... it really would increase the maintenance burden
substantially and make the code an unreadable rats nest.

When you think about it this way, it becomes obvious that the clean
separation is if shiftfs functionality is layered on top of overlayfs
and when you do that, doing it as its own filesystem is more logical.

> So how about, instead of forking a new containers specialized 
> stackable fs, that the needed functionality be merged into overlayfs 
> code? I think overlayfs container users may also benefit from shiftfs
> functionality, no?

I think I covered the why not merge the code above.  As to the
functionality, since Docker already has a graph driver, the graph
driver can do the shifting on top of the overlays.

>  In any case, overlayfs has considerable millage used as fs for
> containers, so many issues related to running with different userns
> may have already been addressed.

Overlayfs is s_user_ns blind so it's highly unlikely to have seen any
issues with the user namespaces, let alone addressed them.  This will
also be compounded by the fact that its primary user: docker, has
rather a weak use of the user namespace currently.

The other thing is the use case: Most immutable infrastructure
container systems create the overlays in the host and then bind them
into the container.  This binding is an additional mount operation. 
 Now the could mount from an overlay as an overlay but it's adding
complexity because the container itself cannot control the overlay
(it's a host provided thing) so it is definitely cleaner to make the
second mount a different filesystem (i.e. shiftfs) where the nature of
the overlay is hidden from the container.

> Overlayfs already stores the mounter's credentials and uses them to 
> perform most of the operations on upper.

OK, that's case 2. again.  So I think you may be labouring under the
misapprehension that shiftfs and overlayfs do the same thing with
override credentials?  They don't: overlayfs emulates the permission
lookups and then overrides based on *historical* admin credentials to
force what it's already decided on the underlying fielsystems.  Shiftfs
overrides the *current* credentials with a uid/gid and namespace shift
and then runs the permission checks.  Thus if I wanted to add what
shiftfs does to overlayfs, I'd have to add another load of overriding
based on current credentials in the currently unoverriden emulated
permission checks.  I think you can see that simply running the real
permission checks on the underlying filesystem with overridden
credentials is much simpler.

> I know it wasn't the original purpose of overlayfs to run as a single 
> layer, but there is nothing really preventing from doing that. In 
> fact, I am doing just that with my snapshot mount patches, see:
> https://github.com/amir73il/linux/commit/acc6c25eab03c176c9ef736544fa
> b3fba663765d#diff-2b85a3c5bea4263d08a2bdff639192c3
> I registered a new fs type ("snapshot"), which reuses most of the 
> existing overlayfs operations. With this patch it is possible to 
> mount an overlay with only upper layer, so all the operations are
> pass through except for the credentials, e.g.:
> 
> mount -t snapshot -o upper=<origin> shiftfs_test <mark location>

OK, so since you don't need to special case the permission checks, I
can see why this might work for you because you don't need to modify
overlayfs to do this.  Since I can't consume the overlay code as is, it
doesn't work for me because I'd have to add lots of special case code
to it.

James

> If you think this concept is workable, then the functionality of
> mounting overlayfs with only upper should be integrated into plain 
> overlayfs and shiftfs could be a very thin variant of overlayfs mount 
> using shitfs_fs_type, just for the sake of having FS_USERNS_MOUNT,
> e.g:
> 
> + /*
> +  * XXX: reusing ovl_mount()/ovl_fill_super(), but could also just
> reuse
> +  *
> ovl_dentry_operations/ovl_super_operations/ovl_xattr_handlers/ovl_new
> _inode()
> +  */
> +static struct file_system_type shiftfs_type = {
> +       .owner          = THIS_MODULE,
> +       .name           = "shiftfs",
> +       .mount          = ovl_mount,
> +       .kill_sb        = kill_anon_super,
> +       .fs_flags       = FS_USERNS_MOUNT,
> +};
> +MODULE_ALIAS_FS("shiftfs");
> +MODULE_ALIAS("shiftfs");
> +#define IS_SHIFTFS_SB(sb) ((sb)->s_type == &shiftfs_type)
> 
> And instead of verifying that shiftfs is mounted inside container
> over shiftfs,
> verify that it is mounted over an overlayfs noexec mount e.g.:
> 
> +       if (IS_SHIFTFS_SB(sb)) {
> +               /*
> +                * this leg executes if we're admin capable in
> +                * the namespace, so be very careful
> +                */
> +               if (path.dentry->d_sb->s_magic != OVERLAYFS_MAGIC ||
> !(path.dentry->d_sb->s_iflags & SB_I_NOEXEC))
> +                       goto out_put;
> 
> From users manual POV:
> 
> in host:
> mount -t overlay -o noexec,upper=<origin> container_visible <mark
> location>
> 
> in container:
> mount -t shiftfs -o upper=<mark location> container_writable
> <somewhere in my local mount ns>
> 
> Thought?
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. R. Okajima Feb. 6, 2017, 3:25 a.m. UTC | #3
James Bottomley:
> This allows any subtree to be uid/gid shifted and bound elsewhere.  It
	:::

Interesting.
But I am afraid that the inconsistency problem of the inode numbers will
happen.

shiftfs_new_inode() uses get_next_ino() which means
- 1st time: inodeA is created and cached, inumA is assigned
- after using inodeA, it will be discarded from the cache
- 2nd time: inodeA is looked-up again, and another inode number (inumB)
  is assgined.

This inconsistency will not be a problem for the "pure virtual" fs such
as procfs and sysfs. But your shiftfs is not pure as them. Shiftfs will
be used as a wrapper (or "binder" which means bind-mount) of an orginary
filesystem.
The symptom of this problem from users perspective will be
- find -inum doesn't work
- git-status doesn't work, which keeps st_dev and st_ino and compares
  the current files.
Of course they will be limited to when the target dir is huge and/or
system memory is low. As long as the inode cache is large enough to hold
all necessary inodes, the problem won't happen.

If shiftfs will supports exporting via NFS in the future, the
consistency of inum will be important too.


J. R. Okajima
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Feb. 6, 2017, 6:38 a.m. UTC | #4
On Mon, Feb 6, 2017 at 5:25 AM, J. R. Okajima <hooanon05g@gmail.com> wrote:
> James Bottomley:
>> This allows any subtree to be uid/gid shifted and bound elsewhere.  It
>         :::
>
> Interesting.
> But I am afraid that the inconsistency problem of the inode numbers will
> happen.
>

Yet another example that overlayfs already is in the process of solving
(it is fixed for stat of merged directory inode).
In fact, fir the case of single layer overlay (as well as shiftfs) the
solution is trivial -
preserve underlying inode st_ino/d_ino and use the overlayed fs st_dev.

> shiftfs_new_inode() uses get_next_ino() which means
> - 1st time: inodeA is created and cached, inumA is assigned
> - after using inodeA, it will be discarded from the cache
> - 2nd time: inodeA is looked-up again, and another inode number (inumB)
>   is assgined.
>
> This inconsistency will not be a problem for the "pure virtual" fs such
> as procfs and sysfs. But your shiftfs is not pure as them. Shiftfs will
> be used as a wrapper (or "binder" which means bind-mount) of an orginary
> filesystem.
> The symptom of this problem from users perspective will be
> - find -inum doesn't work
> - git-status doesn't work, which keeps st_dev and st_ino and compares
>   the current files.
> Of course they will be limited to when the target dir is huge and/or
> system memory is low. As long as the inode cache is large enough to hold
> all necessary inodes, the problem won't happen.
>
> If shiftfs will supports exporting via NFS in the future, the
> consistency of inum will be important too.
>
>
> J. R. Okajima
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 6, 2017, 6:46 a.m. UTC | #5
On Mon, 2017-02-06 at 12:25 +0900, J. R. Okajima wrote:
> James Bottomley:
> > This allows any subtree to be uid/gid shifted and bound elsewhere. 
> >  It
> 	:::
> 
> Interesting.
> But I am afraid that the inconsistency problem of the inode numbers 
> will happen.
> 
> shiftfs_new_inode() uses get_next_ino() which means
> - 1st time: inodeA is created and cached, inumA is assigned
> - after using inodeA, it will be discarded from the cache
> - 2nd time: inodeA is looked-up again, and another inode number 
> (inumB)   is assgined.

Yes, I know the problem.  However, I believe most current linux
filesystems no longer guarantee stable, for the lifetime of the file,
inode numbers.  The usual docker container root is overlayfs, which,
similarly doesn't support stable inode numbers.  I see the odd
complaint about docker with overlayfs having unstable inode numbers,
but none seems to have any serious repercussions.

[...]
> If shiftfs will supports exporting via NFS in the future, the
> consistency of inum will be important too.

If it's a problem, then it's fixable with s_export_op, but I was mostly
thinking that because it's not a problem for overlayfs based
containers, it wouldn't be one for shiftfs based ones, which is why I
didn't implement it.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Feb. 6, 2017, 6:59 a.m. UTC | #6
On Mon, Feb 6, 2017 at 3:18 AM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Sun, 2017-02-05 at 09:51 +0200, Amir Goldstein wrote:
>> On Sat, Feb 4, 2017 at 9:19 PM, James Bottomley
>> <James.Bottomley@hansenpartnership.com> wrote:
>> > This allows any subtree to be uid/gid shifted and bound elsewhere.
>> >  It does this by operating simlarly to overlayfs.  Its primary use
>> > is for shifting the underlying uids of filesystems used to support
>> > unpriviliged (uid shifted) containers.  The usual use case here is
>> > that the container is operating with an uid shifted unprivileged
>> > root but sometimes needs to make use of or work with a filesystem
>> > image that has root at real uid 0.
>> >
>> > The mechanism is to allow any subordinate mount namespace to mount
>> > a shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only
>> > allowing it to mount marked subtrees (using the -o mark option as
>> > root).   Once mounted, the subtree is mapped via the super block
>> > user namespace so that the interior ids of the mounting user
>> > namespace are the ids written to the filesystem.
>> >
>> > Signed-off-by: James Bottomley <
>> > James.Bottomley@HansenPartnership.com>
>> >
>>
>> James,
>>
>> Allow me to point out some problems in this patch and offer a
>> slightly different approach.
>>
>> First of all, the subject says "uid/gid shifting bind mount", but
>> it's not really a bind mount. What it is is a stackable mount and 2
>> levels of stack no less.
>
> The reason for the description is to have it behave exactly like a bind
> mount.  You can assert that a bind mount is, in fact, a stacked mount,
> but we don't currently.  I'm also not sure where you get your 2 levels
> from?
>

A bind mount does not incur recursion into VFS code, a stacked fs does.
And there is a programmable limit of stack depth of 2, which stacked
fs need to comply with.
Your proposed setup has 2 stacked fs, the mark shitfs by admin
and the uid shitfs by container user. Or maybe I misunderstood.


>>  So one thing that is missing is increasing of sb->s_stack_depth and
>> that also means that shiftfs cannot be used to recursively shift uids
>> in child userns if that was ever the intention.
>
> I can't think of a use case that would ever need that, but perhaps
> other container people can.
>
>> The other problem is that by forking overlayfs functionality,
>
> So this wouldn't really be the right way to look at it: shiftfs shares
> no code with overlayfs at all, so is definitely not a fork.  The only
> piece of functionality it has which is similar to overlayfs is the way
> it does lookups via a new dentry cache.  However, that functionality is
> not unique to overlayfs and if you look, you'll see that
> shiftfs_lookup() actually has far more in common with
> ecryptfs_lookup().

That's a good point. All stackable file systems may share similar problems
and solutions (e.g. consistent st_ino/st_dev). Perhaps it calls for shared
library code or more generic VFS code.
At the moment ecryptfs is not seeing much development, so everything
happens in overlayfs. If there is going to be more than 1 actively developed
stackable fs, we need to see about that.

>
>>  shiftfs is going to miss out on overlayfs bug fixes related to user
>> credentials differ from mounter credentials, like fd3220d ("ovl:
>> update S_ISGID when setting posix ACLs"). I am not sure that this
>> specific case is relevant to shiftfs, but there could be other.
>
> OK, so shiftfs doesn't have this bug and the reason why is
> illustrative: basically shiftfs does three things
>
>    1. lookups via a uid/gid shifted dentry cache
>    2. shifted credential inode operations permission checks on the
>       underlying filesystem
>    3. location marking for unprivileged mount
>
> I think we've already seen that 1. isn't from overlayfs but the
> functionality could be added to overlayfs, I suppose.  The big problem
> is 2.  The overlayfs code emulates the permission checks, which makes
> it rather complex (this is where you get your bugs like the above
> from).  I did actually look at adding 2. to overlayfs on the theory
> that a single layer overlay might be closest to what this is, but
> eventually concluded I'd have to take the special cases and add a whole
> lot more to them ... it really would increase the maintenance burden
> substantially and make the code an unreadable rats nest.
>

The use cases for uid shifting are still overwelming for me.
I take your word for it that its going to be a maintanace burdon
to add this functionality to overlayfs.

> When you think about it this way, it becomes obvious that the clean
> separation is if shiftfs functionality is layered on top of overlayfs
> and when you do that, doing it as its own filesystem is more logical.
>

Yes, I agree with that statement. This is inline with the solution I outlined
at the end of my previous email, where single layer overlayfs is used
for the host "mark" mount, although I wonder if the same cannot be
achieved with a bind mount?

in host:
mount -t overlay -o noexec,upper=<origin> container_visible <mark location>

in container:
mount -t shiftfs -o <mark location> <somewhere in my local mount ns>
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 6, 2017, 2:41 p.m. UTC | #7
On Mon, 2017-02-06 at 08:59 +0200, Amir Goldstein wrote:
> On Mon, Feb 6, 2017 at 3:18 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Sun, 2017-02-05 at 09:51 +0200, Amir Goldstein wrote:
> > > On Sat, Feb 4, 2017 at 9:19 PM, James Bottomley
> > > <James.Bottomley@hansenpartnership.com> wrote:
> > > > This allows any subtree to be uid/gid shifted and bound 
> > > > elsewhere.  It does this by operating simlarly to overlayfs. 
> > > >  Its primary use is for shifting the underlying uids of 
> > > > filesystems used to support unpriviliged (uid shifted) 
> > > > containers.  The usual use case here is that the container is 
> > > > operating with an uid shifted unprivileged root but sometimes 
> > > > needs to make use of or work with a filesystem image that has
> > > > root at real uid 0.
> > > > 
> > > > The mechanism is to allow any subordinate mount namespace to 
> > > > mount a shiftfs filesystem (by marking it FS_USERNS_MOUNT) but 
> > > > only allowing it to mount marked subtrees (using the -o mark 
> > > > option as root).   Once mounted, the subtree is mapped via the 
> > > > super block user namespace so that the interior ids of the 
> > > > mounting user namespace are the ids written to the filesystem.
> > > > 
> > > > Signed-off-by: James Bottomley <
> > > > James.Bottomley@HansenPartnership.com>
> > > > 
> > > 
> > > James,
> > > 
> > > Allow me to point out some problems in this patch and offer a
> > > slightly different approach.
> > > 
> > > First of all, the subject says "uid/gid shifting bind mount", but
> > > it's not really a bind mount. What it is is a stackable mount and 
> > > 2 levels of stack no less.
> > 
> > The reason for the description is to have it behave exactly like a 
> > bind mount.  You can assert that a bind mount is, in fact, a 
> > stacked mount, but we don't currently.  I'm also not sure where you 
> > get your 2 levels from?
> > 
> 
> A bind mount does not incur recursion into VFS code, a stacked fs 
> does. And there is a programmable limit of stack depth of 2, which 
> stacked fs need to comply with. Your proposed setup has 2 stacked fs, 
> the mark shitfs by admin and the uid shitfs by container user. Or
> maybe I misunderstood.

Oh, right, actually, it wouldn't be 2 because once the unprivileged
mount uses the marked filesystem, what it uses is the mnt and dentry
from the underlying filesystem (what you would have got from a path
lookup on it).

That said, it does perform recursive calls to the underlying filesystem
unlike a true bind mount, so I can add the depth easily enough.

> > >  So one thing that is missing is increasing of sb->s_stack_depth 
> > > and that also means that shiftfs cannot be used to recursively 
> > > shift uids in child userns if that was ever the intention.
> > 
> > I can't think of a use case that would ever need that, but perhaps
> > other container people can.
> > 
> > > The other problem is that by forking overlayfs functionality,
> > 
> > So this wouldn't really be the right way to look at it: shiftfs 
> > shares no code with overlayfs at all, so is definitely not a fork. 
> >  The only piece of functionality it has which is similar to 
> > overlayfs is the way it does lookups via a new dentry cache. 
> >  However, that functionality is not unique to overlayfs and if you 
> > look, you'll see that shiftfs_lookup() actually has far more in 
> > common with ecryptfs_lookup().
> 
> That's a good point. All stackable file systems may share similar 
> problems and solutions (e.g. consistent st_ino/st_dev). Perhaps it 
> calls for shared library code or more generic VFS code. At the moment 
> ecryptfs is not seeing much development, so everything happens in 
> overlayfs. If there is going to be more than 1 actively developed
> stackable fs, we need to see about that.

I believe we already do ... if you look at the lookup functions of each
of them, you see the only common thing is encapsulated in a variant of
the lookup_one_len() functions.  After that, even simple things like
our negative dentry handling differs.

> > >  shiftfs is going to miss out on overlayfs bug fixes related to 
> > > user credentials differ from mounter credentials, like fd3220d 
> > > ("ovl: update S_ISGID when setting posix ACLs"). I am not sure 
> > > that this specific case is relevant to shiftfs, but there could
> > > be other.
> > 
> > OK, so shiftfs doesn't have this bug and the reason why is
> > illustrative: basically shiftfs does three things
> > 
> >    1. lookups via a uid/gid shifted dentry cache
> >    2. shifted credential inode operations permission checks on the
> >       underlying filesystem
> >    3. location marking for unprivileged mount
> > 
> > I think we've already seen that 1. isn't from overlayfs but the
> > functionality could be added to overlayfs, I suppose.  The big 
> > problem is 2.  The overlayfs code emulates the permission checks, 
> > which makes it rather complex (this is where you get your bugs like 
> > the above from).  I did actually look at adding 2. to overlayfs on 
> > the theory that a single layer overlay might be closest to what 
> > this is, but eventually concluded I'd have to take the special 
> > cases and add a whole lot more to them ... it really would increase 
> > the maintenance burden substantially and make the code an
> > unreadable rats nest.
> > 
> 
> The use cases for uid shifting are still overwelming for me.
> I take your word for it that its going to be a maintanace burdon
> to add this functionality to overlayfs.
> 
> > When you think about it this way, it becomes obvious that the clean
> > separation is if shiftfs functionality is layered on top of 
> > overlayfs and when you do that, doing it as its own filesystem is 
> > more logical.
> > 
> 
> Yes, I agree with that statement. This is inline with the solution I 
> outlined at the end of my previous email, where single layer 
> overlayfs is used for the host "mark" mount, although I wonder if the 
> same cannot be achieved with a bind mount?

I understand, but once I can't consume overlayfs to construct it, the
idea of trying to use it becomes a negative not a positive.

We could achieve the same thing using bind mounts, if the vfsmount
structure carried a private field, but it doesn't.  I think given the
prevalence of this structure throughout the mount tree, that's a
deliberate decision to keep it thin.

> in host:
> mount -t overlay -o noexec,upper=<origin> container_visible <mark
> location>
> 
> in container:
> mount -t shiftfs -o <mark location> <somewhere in my local mount ns>

So I'm not sure it's a more widespread problem: mount --bind is usable
inside an unprivileged container, which means you can bridge filesystem
subtrees even only being local container admin.  The problem is
mounting other filesystems types.  Marking a type safe for mounting is
done by the FS_USERNS_MOUNT flag but it means for things like shiftfs
that you do have to restrict the source location, but for most
filesystem types, that source will be a device, so they will need other
checking than a mount mark.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o Feb. 6, 2017, 2:50 p.m. UTC | #8
On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley wrote:
> Yes, I know the problem.  However, I believe most current linux
> filesystems no longer guarantee stable, for the lifetime of the file,
> inode numbers.  The usual docker container root is overlayfs, which,
> similarly doesn't support stable inode numbers.  I see the odd
> complaint about docker with overlayfs having unstable inode numbers,
> but none seems to have any serious repercussions.

Um, no.  Most current linux file systems *do* guarantee stable inode
numbers.  For one thing, NFS would break horribly if you didn't have
stable inode numbers.  Never mind applications which depend on POSIX
semantics.  And you wouldn't be able to save games in rogue or
nethack, either.  :-)

Overlayfs may not, currently, but it's considered a bug.

	      	   	      	       - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 6, 2017, 3:18 p.m. UTC | #9
On Mon, 2017-02-06 at 09:50 -0500, Theodore Ts'o wrote:
> On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley wrote:
> > Yes, I know the problem.  However, I believe most current linux
> > filesystems no longer guarantee stable, for the lifetime of the 
> > file, inode numbers.  The usual docker container root is overlayfs,
> > which, similarly doesn't support stable inode numbers.  I see the 
> > odd complaint about docker with overlayfs having unstable inode
> > numbers, but none seems to have any serious repercussions.
> 
> Um, no.  Most current linux file systems *do* guarantee stable inode
> numbers.  For one thing, NFS would break horribly if you didn't have
> stable inode numbers.  Never mind applications which depend on POSIX
> semantics.  And you wouldn't be able to save games in rogue or
> nethack, either.  :-)

I believe that's why we have the superblock export operations to
manufacture unique filehandles in the absence of inode number
stability.  The generic one uses inode numbers, but it doesn't have to.
 I thought reiserfs (if we can go back that far) was the first
generally used filesystem that didn't guarantee stable inode numbers,
so we have a lot of historical precedence.

Thanks to reiserfs, I thought we also iterated to weak stability
guarantees for inode numbers which mean no inconsistencies in
applications that use inode numbers for caching?  It's still not POSIX,
but I thought it was good enough for most use cases.

> Overlayfs may not, currently, but it's considered a bug.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
lkml@pengaru.com Feb. 6, 2017, 3:38 p.m. UTC | #10
On Mon, Feb 06, 2017 at 07:18:16AM -0800, James Bottomley wrote:
> On Mon, 2017-02-06 at 09:50 -0500, Theodore Ts'o wrote:
> > On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley wrote:
> > > Yes, I know the problem.  However, I believe most current linux
> > > filesystems no longer guarantee stable, for the lifetime of the 
> > > file, inode numbers.  The usual docker container root is overlayfs,
> > > which, similarly doesn't support stable inode numbers.  I see the 
> > > odd complaint about docker with overlayfs having unstable inode
> > > numbers, but none seems to have any serious repercussions.
> > 
> > Um, no.  Most current linux file systems *do* guarantee stable inode
> > numbers.  For one thing, NFS would break horribly if you didn't have
> > stable inode numbers.  Never mind applications which depend on POSIX
> > semantics.  And you wouldn't be able to save games in rogue or
> > nethack, either.  :-)
> 
> I believe that's why we have the superblock export operations to
> manufacture unique filehandles in the absence of inode number
> stability.  The generic one uses inode numbers, but it doesn't have to.
>  I thought reiserfs (if we can go back that far) was the first
> generally used filesystem that didn't guarantee stable inode numbers,
> so we have a lot of historical precedence.
> 
> Thanks to reiserfs, I thought we also iterated to weak stability
> guarantees for inode numbers which mean no inconsistencies in
> applications that use inode numbers for caching?  It's still not POSIX,
> but I thought it was good enough for most use cases.
> 

Even plain tar extraction is sensitive to directory inode stability:
http://git.savannah.gnu.org/cgit/tar.git/tree/src/extract.c?h=release_1_29#n867

This caused errors on overlayfs if the extraction churned through enough
of the dentry cache to evict the relevant directory (can be forced to
reproduce reliably via drop_caches).

Regards,
Vito Caputo
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. R. Okajima Feb. 6, 2017, 4:24 p.m. UTC | #11
James Bottomley:
> Yes, I know the problem.  However, I believe most current linux
> filesystems no longer guarantee stable, for the lifetime of the file,
> inode numbers.  The usual docker container root is overlayfs, which,
> similarly doesn't support stable inode numbers.  I see the odd
> complaint about docker with overlayfs having unstable inode numbers,
> but none seems to have any serious repercussions.

I think it serious.
Reusing the backend fs' inum is a good approach which Amir wrote.
Based on this, I'd suggest you to support the hardlinks.

bakend_dentry = lookup_one_len()
if (d_inode->i_nlink != 1)
	shiftfs_inode = ilookup();
if (!shiftfs_inode) {
	shiftfs_inode = new_inode();
	shiftfs_inode->i_ino = bakend_dentry->d_inode->i_ino;
}


J. R. Okajima
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 6, 2017, 4:29 p.m. UTC | #12
On Mon, 2017-02-06 at 08:38 +0200, Amir Goldstein wrote:
> On Mon, Feb 6, 2017 at 5:25 AM, J. R. Okajima <hooanon05g@gmail.com>
> wrote:
> > James Bottomley:
> > > This allows any subtree to be uid/gid shifted and bound
> > > elsewhere.  It
> >         :::
> > 
> > Interesting.
> > But I am afraid that the inconsistency problem of the inode numbers 
> > will happen.
> > 
> 
> Yet another example that overlayfs already is in the process of 
> solving (it is fixed for stat of merged directory inode).
> In fact, fir the case of single layer overlay (as well as shiftfs) 
> the solution is trivial - preserve underlying inode st_ino/d_ino and 
> use the overlayed fs st_dev.

not sure I follow what st_ino is, do you mean  s_root->d_inode->i_ino?
or did you mean s_dev (which is more traditional)?

The problem with this is there's no way to ensure global uniqueness in
a mapping that goes (ino, ino) -> (ino) (or (s_dev, ino) -> (ino)) and
I believe global uniqueness is more important because the i_ino is used
in the hashed lookups.  Secondly you're not guaranteed that s_root
->d_inode->i_ino is unique ... historically a lot of filesystems use a
well known inode number as the root, that's why filehandles
traditionally used something representing the device and the inode
number (we also have s_dev uniqueness problems for tmpfs which is used
in some overlays).

We can certainly construct a filehandle using an export operations
override that is unique and can be used to lookup the underlying object
(based on the underlying device and inode).

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 6, 2017, 5:32 p.m. UTC | #13
On Mon, 2017-02-06 at 09:38 -0600, lkml@pengaru.com wrote:
> On Mon, Feb 06, 2017 at 07:18:16AM -0800, James Bottomley wrote:
> > On Mon, 2017-02-06 at 09:50 -0500, Theodore Ts'o wrote:
> > > On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley wrote:
> > > > Yes, I know the problem.  However, I believe most current linux
> > > > filesystems no longer guarantee stable, for the lifetime of the
> > > > file, inode numbers.  The usual docker container root is
> > > > overlayfs,
> > > > which, similarly doesn't support stable inode numbers.  I see
> > > > the 
> > > > odd complaint about docker with overlayfs having unstable inode
> > > > numbers, but none seems to have any serious repercussions.
> > > 
> > > Um, no.  Most current linux file systems *do* guarantee stable
> > > inode
> > > numbers.  For one thing, NFS would break horribly if you didn't
> > > have
> > > stable inode numbers.  Never mind applications which depend on
> > > POSIX
> > > semantics.  And you wouldn't be able to save games in rogue or
> > > nethack, either.  :-)
> > 
> > I believe that's why we have the superblock export operations to
> > manufacture unique filehandles in the absence of inode number
> > stability.  The generic one uses inode numbers, but it doesn't have
> > to.
> >  I thought reiserfs (if we can go back that far) was the first
> > generally used filesystem that didn't guarantee stable inode
> > numbers,
> > so we have a lot of historical precedence.
> > 
> > Thanks to reiserfs, I thought we also iterated to weak stability
> > guarantees for inode numbers which mean no inconsistencies in
> > applications that use inode numbers for caching?  It's still not
> > POSIX,
> > but I thought it was good enough for most use cases.
> > 
> 
> Even plain tar extraction is sensitive to directory inode stability:
> http://git.savannah.gnu.org/cgit/tar.git/tree/src/extract.c?h=release
> _1_29#n867
> 
> This caused errors on overlayfs if the extraction churned through 
> enough of the dentry cache to evict the relevant directory (can be 
> forced to reproduce reliably via drop_caches).

Yes, I know the bug.  I think it's up to tar maintainers, but if they
want to support weakly posix filesystems, they should really be using
the filehandle for this check, not device and inode number.

That said, I believe reiserfs was our only other filesystem with weak
inode number stability guarantees and that's hardly in common use
today, so if we can find a solution that gives strong stability
guarantees for out current problem filesystems, there's no reason not
to use it generally.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Feb. 6, 2017, 9:52 p.m. UTC | #14
On Mon, Feb 06, 2017 at 07:18:16AM -0800, James Bottomley wrote:
> On Mon, 2017-02-06 at 09:50 -0500, Theodore Ts'o wrote:
> > On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley wrote:
> > > Yes, I know the problem.  However, I believe most current linux
> > > filesystems no longer guarantee stable, for the lifetime of the 
> > > file, inode numbers.  The usual docker container root is overlayfs,
> > > which, similarly doesn't support stable inode numbers.  I see the 
> > > odd complaint about docker with overlayfs having unstable inode
> > > numbers, but none seems to have any serious repercussions.
> > 
> > Um, no.  Most current linux file systems *do* guarantee stable inode
> > numbers.  For one thing, NFS would break horribly if you didn't have
> > stable inode numbers.  Never mind applications which depend on POSIX
> > semantics.  And you wouldn't be able to save games in rogue or
> > nethack, either.  :-)
> 
> I believe that's why we have the superblock export operations to
> manufacture unique filehandles in the absence of inode number
> stability.

Where did you hear that?

I'd expect an NFS client to handle non-unique filehandles
better than non-unique inode numbers.  I believe our client will -EIO on
encountering an inode number change (see nfs_check_inode_attributes().)

See also https://tools.ietf.org/html/rfc5661#section-10.3.4.

--b.

> The generic one uses inode numbers, but it doesn't have to.
>  I thought reiserfs (if we can go back that far) was the first
> generally used filesystem that didn't guarantee stable inode numbers,
> so we have a lot of historical precedence.
> 
> Thanks to reiserfs, I thought we also iterated to weak stability
> guarantees for inode numbers which mean no inconsistencies in
> applications that use inode numbers for caching?  It's still not POSIX,
> but I thought it was good enough for most use cases.
> 
> > Overlayfs may not, currently, but it's considered a bug.
> 
> James
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 7, 2017, 12:10 a.m. UTC | #15
On Mon, 2017-02-06 at 16:52 -0500, J. Bruce Fields wrote:
> On Mon, Feb 06, 2017 at 07:18:16AM -0800, James Bottomley wrote:
> > On Mon, 2017-02-06 at 09:50 -0500, Theodore Ts'o wrote:
> > > On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley wrote:
> > > > Yes, I know the problem.  However, I believe most current linux
> > > > filesystems no longer guarantee stable, for the lifetime of the
> > > > file, inode numbers.  The usual docker container root is 
> > > > overlayfs, which, similarly doesn't support stable inode 
> > > > numbers.  I see the odd complaint about docker with overlayfs 
> > > > having unstable inode numbers, but none seems to have any
> > > > serious repercussions.
> > > 
> > > Um, no.  Most current linux file systems *do* guarantee stable 
> > > inode numbers.  For one thing, NFS would break horribly if you 
> > > didn't have stable inode numbers.  Never mind applications which 
> > > depend on POSIX semantics.  And you wouldn't be able to save 
> > > games in rogue or nethack, either.  :-)
> > 
> > I believe that's why we have the superblock export operations to
> > manufacture unique filehandles in the absence of inode number
> > stability.
> 
> Where did you hear that?
> 
> I'd expect an NFS client to handle non-unique filehandles
> better than non-unique inode numbers.  I believe our client will -EIO 
> on encountering an inode number change (see
> nfs_check_inode_attributes().)
> 
> See also https://tools.ietf.org/html/rfc5661#section-10.3.4.

Could you clarify your point a bit further, please?  Both the
check_inode_attributes() code and section 10.3.4 are talking about
fileids, which are the things that are constructed in the export_ops
... admittedly a lot of fileid_types are based on inode numbers, but
several aren't.  For those that aren't, I believe NFS doesn't care
about the underlying inode number of the exported file.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Feb. 7, 2017, 1:35 a.m. UTC | #16
On Mon, Feb 06, 2017 at 04:10:11PM -0800, James Bottomley wrote:
> On Mon, 2017-02-06 at 16:52 -0500, J. Bruce Fields wrote:
> > On Mon, Feb 06, 2017 at 07:18:16AM -0800, James Bottomley wrote:
> > > On Mon, 2017-02-06 at 09:50 -0500, Theodore Ts'o wrote:
> > > > On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley wrote:
> > > > > Yes, I know the problem.  However, I believe most current linux
> > > > > filesystems no longer guarantee stable, for the lifetime of the
> > > > > file, inode numbers.  The usual docker container root is 
> > > > > overlayfs, which, similarly doesn't support stable inode 
> > > > > numbers.  I see the odd complaint about docker with overlayfs 
> > > > > having unstable inode numbers, but none seems to have any
> > > > > serious repercussions.
> > > > 
> > > > Um, no.  Most current linux file systems *do* guarantee stable 
> > > > inode numbers.  For one thing, NFS would break horribly if you 
> > > > didn't have stable inode numbers.  Never mind applications which 
> > > > depend on POSIX semantics.  And you wouldn't be able to save 
> > > > games in rogue or nethack, either.  :-)
> > > 
> > > I believe that's why we have the superblock export operations to
> > > manufacture unique filehandles in the absence of inode number
> > > stability.
> > 
> > Where did you hear that?
> > 
> > I'd expect an NFS client to handle non-unique filehandles
> > better than non-unique inode numbers.  I believe our client will -EIO 
> > on encountering an inode number change (see
> > nfs_check_inode_attributes().)
> > 
> > See also https://tools.ietf.org/html/rfc5661#section-10.3.4.
> 
> Could you clarify your point a bit further, please?  Both the
> check_inode_attributes() code and section 10.3.4 are talking about
> fileids, which are the things that are constructed in the export_ops

No, the filehandle structure isn't discussed in the rfc at all, that's
opaque to clients, and the "fileid" you see in the export code isn't
what's discussed here.

The "fileid" here is an NFS attribute, really just the NFS protocol's
name for the inode number.  The server code that returns fileid's:

	if (bmval0 & FATTR4_WORD0_FILEID) {
		p = xdr_reserve_space(xdr, 8);
		if (!p)
			goto out_resource;
		p = xdr_encode_hyper(p, stat.ino);
	}

The client getattr code:

	stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));

--b.

> ... admittedly a lot of fileid_types are based on inode numbers, but
> several aren't.  For those that aren't, I believe NFS doesn't care
> about the underlying inode number of the exported file.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 7, 2017, 9:19 a.m. UTC | #17
On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:
> This allows any subtree to be uid/gid shifted and bound elsewhere.  It
> does this by operating simlarly to overlayfs.  Its primary use is for
> shifting the underlying uids of filesystems used to support
> unpriviliged (uid shifted) containers.  The usual use case here is
> that the container is operating with an uid shifted unprivileged root
> but sometimes needs to make use of or work with a filesystem image
> that has root at real uid 0.
> 
> The mechanism is to allow any subordinate mount namespace to mount a
> shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only allowing
> it to mount marked subtrees (using the -o mark option as root).  Once
> mounted, the subtree is mapped via the super block user namespace so
> that the interior ids of the mounting user namespace are the ids
> written to the filesystem.

Please move this into VFS instead of a stackable fs.  We might need
addtional parameters to getattr/setattr to specify the ID translation,
but that's why better than a horrible hack like this.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Djalal Harouni Feb. 7, 2017, 9:39 a.m. UTC | #18
Hi,

On Tue, Feb 7, 2017 at 10:19 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:
>> This allows any subtree to be uid/gid shifted and bound elsewhere.  It
>> does this by operating simlarly to overlayfs.  Its primary use is for
>> shifting the underlying uids of filesystems used to support
>> unpriviliged (uid shifted) containers.  The usual use case here is
>> that the container is operating with an uid shifted unprivileged root
>> but sometimes needs to make use of or work with a filesystem image
>> that has root at real uid 0.
>>
>> The mechanism is to allow any subordinate mount namespace to mount a
>> shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only allowing
>> it to mount marked subtrees (using the -o mark option as root).  Once
>> mounted, the subtree is mapped via the super block user namespace so
>> that the interior ids of the mounting user namespace are the ids
>> written to the filesystem.
>
> Please move this into VFS instead of a stackable fs.  We might need
> addtional parameters to getattr/setattr to specify the ID translation,
> but that's why better than a horrible hack like this.

I proposed an RFC months ago which implements all of this at the VFS
layer [1], I received some feedback especially from Dave Chinner,
however I failed to fix my bugs and improve it not enough resources...

The problems discussed here about a new filesystem: inodes numbers,
quota and many other things where all noted in that thread and
previous threads about shiftfs. We are turning this to a heavy problem
compared to all other namespaces... other namespaces integrate
perfectly with other subsystems and the rest of layers, there is no
special treatment...

Christoph, for the getattr/setattr it won't work since internally the
resolved path may point to a different mount context where we do not
want the ID translation, and we may end up using the wrong vfsmount. A
simple getattr/setattr won't work unless there are bigger changes
too...

[1] https://lkml.org/lkml/2016/5/4/411
Christoph Hellwig Feb. 7, 2017, 9:53 a.m. UTC | #19
On Tue, Feb 07, 2017 at 10:39:48AM +0100, Djalal Harouni wrote:
> I proposed an RFC months ago which implements all of this at the VFS
> layer [1], I received some feedback especially from Dave Chinner,
> however I failed to fix my bugs and improve it not enough resources...

And none of the issues goes away by hiding them in a stackable fs,
in fact many of them are getting worse.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 7, 2017, 4:37 p.m. UTC | #20
On Tue, 2017-02-07 at 01:19 -0800, Christoph Hellwig wrote:
> On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:
> > This allows any subtree to be uid/gid shifted and bound elsewhere. 
> >  It does this by operating simlarly to overlayfs.  Its primary use 
> > is for shifting the underlying uids of filesystems used to support
> > unpriviliged (uid shifted) containers.  The usual use case here is
> > that the container is operating with an uid shifted unprivileged 
> > root but sometimes needs to make use of or work with a filesystem 
> > image that has root at real uid 0.
> > 
> > The mechanism is to allow any subordinate mount namespace to mount 
> > a shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only
> > allowing it to mount marked subtrees (using the -o mark option as 
> > root).  Once mounted, the subtree is mapped via the super block 
> > user namespace so that the interior ids of the mounting user 
> > namespace are the ids written to the filesystem.
> 
> Please move this into VFS instead of a stackable fs.  We might need
> addtional parameters to getattr/setattr to specify the ID 
> translation, but that's why better than a horrible hack like this.

I would need a lot more than that: getattr controls the cosmetic
permission display to the user, but enforcement is done in the core
permission checks which are inode based.  To make this a real bind
mount, the core permission checks will have to become subtree aware
because knowledge of whether we need a uid shift in the permission
check becomes a subtree property.  Effectively inode_permission would
become dentry_permission and generic_permission would take a dentry
instead of an inode.  This will be a huge amount of VFS and underlying
filesystem churn, since the permissions calls are threaded through a
huge chunk of code.

Is this the approach that you really want?

I suppose I could see the security people linking it because all the
security hooks in the permission code become path aware.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Feb. 7, 2017, 5:59 p.m. UTC | #21
On Tue, Feb 7, 2017 at 6:37 PM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2017-02-07 at 01:19 -0800, Christoph Hellwig wrote:
>> On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:
>> > This allows any subtree to be uid/gid shifted and bound elsewhere.
>> >  It does this by operating simlarly to overlayfs.  Its primary use
>> > is for shifting the underlying uids of filesystems used to support
>> > unpriviliged (uid shifted) containers.  The usual use case here is
>> > that the container is operating with an uid shifted unprivileged
>> > root but sometimes needs to make use of or work with a filesystem
>> > image that has root at real uid 0.
>> >
>> > The mechanism is to allow any subordinate mount namespace to mount
>> > a shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only
>> > allowing it to mount marked subtrees (using the -o mark option as
>> > root).  Once mounted, the subtree is mapped via the super block
>> > user namespace so that the interior ids of the mounting user
>> > namespace are the ids written to the filesystem.
>>
>> Please move this into VFS instead of a stackable fs.  We might need
>> addtional parameters to getattr/setattr to specify the ID
>> translation, but that's why better than a horrible hack like this.
>
> I would need a lot more than that: getattr controls the cosmetic
> permission display to the user, but enforcement is done in the core
> permission checks which are inode based.  To make this a real bind
> mount, the core permission checks will have to become subtree aware
> because knowledge of whether we need a uid shift in the permission
> check becomes a subtree property.  Effectively inode_permission would
> become dentry_permission and generic_permission would take a dentry
> instead of an inode.  This will be a huge amount of VFS and underlying
> filesystem churn, since the permissions calls are threaded through a
> huge chunk of code.
>

I am not even sure that would be enough.
dentry does not contain information about the mount user came from,
and sb contains only information about the user ns of the mounter of
the file system, not the mounter of the bind mount, right?
I think I am missing some big pieces of the big picture.
Would love to hear what Eric has to say.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 7, 2017, 6:10 p.m. UTC | #22
On Tue, Feb 07, 2017 at 07:59:00PM +0200, Amir Goldstein wrote:
> I am not even sure that would be enough.
> dentry does not contain information about the mount user came from,
> and sb contains only information about the user ns of the mounter of
> the file system, not the mounter of the bind mount, right?
> I think I am missing some big pieces of the big picture.
> Would love to hear what Eric has to say.

IFF we want to do what shiftfs does properly we need vfsmount + inode,
no need for the dentry.

But maybe we need to go back and decice if we want to allow uid/gid
remapping for arbitrary subtrees anyway.  Another option would be
to require something like a project as used for project quotas as the
root.  This would also be conveniant as it could storge the used
remapping tables.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 7, 2017, 6:20 p.m. UTC | #23
On Tue, 2017-02-07 at 19:59 +0200, Amir Goldstein wrote:
> On Tue, Feb 7, 2017 at 6:37 PM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Tue, 2017-02-07 at 01:19 -0800, Christoph Hellwig wrote:
> > > On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:
> > > > This allows any subtree to be uid/gid shifted and bound
> > > > elsewhere.
> > > >  It does this by operating simlarly to overlayfs.  Its primary
> > > > use
> > > > is for shifting the underlying uids of filesystems used to
> > > > support
> > > > unpriviliged (uid shifted) containers.  The usual use case here
> > > > is
> > > > that the container is operating with an uid shifted
> > > > unprivileged
> > > > root but sometimes needs to make use of or work with a
> > > > filesystem
> > > > image that has root at real uid 0.
> > > > 
> > > > The mechanism is to allow any subordinate mount namespace to
> > > > mount
> > > > a shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only
> > > > allowing it to mount marked subtrees (using the -o mark option
> > > > as
> > > > root).  Once mounted, the subtree is mapped via the super block
> > > > user namespace so that the interior ids of the mounting user
> > > > namespace are the ids written to the filesystem.
> > > 
> > > Please move this into VFS instead of a stackable fs.  We might
> > > need
> > > addtional parameters to getattr/setattr to specify the ID
> > > translation, but that's why better than a horrible hack like
> > > this.
> > 
> > I would need a lot more than that: getattr controls the cosmetic
> > permission display to the user, but enforcement is done in the core
> > permission checks which are inode based.  To make this a real bind
> > mount, the core permission checks will have to become subtree aware
> > because knowledge of whether we need a uid shift in the permission
> > check becomes a subtree property.  Effectively inode_permission
> > would
> > become dentry_permission and generic_permission would take a dentry
> > instead of an inode.  This will be a huge amount of VFS and
> > underlying
> > filesystem churn, since the permissions calls are threaded through
> > a
> > huge chunk of code.
> > 
> 
> I am not even sure that would be enough.
> dentry does not contain information about the mount user came from,
> and sb contains only information about the user ns of the mounter of
> the file system, not the mounter of the bind mount, right?
> I think I am missing some big pieces of the big picture.
> Would love to hear what Eric has to say.

I'm not really sure until it gets prototyped, but I think the
filesystem user namespace would also have to become a subtree property.

The whole reason for shiftfs being a properly mounted filesystem is
because it needs a super block to capture the namespace it's being
mounted in.

However, when you have a container that you want remapping inside, you
must have a user namespace which owns a mount namespace, so we can
deduce the information from the mount namespace.  All we probably need
the subtree to tell us is if we're shifting or not.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 7, 2017, 7:01 p.m. UTC | #24
On Mon, 2017-02-06 at 20:35 -0500, J. Bruce Fields wrote:
> On Mon, Feb 06, 2017 at 04:10:11PM -0800, James Bottomley wrote:
> > On Mon, 2017-02-06 at 16:52 -0500, J. Bruce Fields wrote:
> > > On Mon, Feb 06, 2017 at 07:18:16AM -0800, James Bottomley wrote:
> > > > On Mon, 2017-02-06 at 09:50 -0500, Theodore Ts'o wrote:
> > > > > On Sun, Feb 05, 2017 at 10:46:23PM -0800, James Bottomley
> > > > > wrote:
> > > > > > Yes, I know the problem.  However, I believe most current
> > > > > > linux
> > > > > > filesystems no longer guarantee stable, for the lifetime of
> > > > > > the
> > > > > > file, inode numbers.  The usual docker container root is 
> > > > > > overlayfs, which, similarly doesn't support stable inode 
> > > > > > numbers.  I see the odd complaint about docker with
> > > > > > overlayfs 
> > > > > > having unstable inode numbers, but none seems to have any
> > > > > > serious repercussions.
> > > > > 
> > > > > Um, no.  Most current linux file systems *do* guarantee
> > > > > stable 
> > > > > inode numbers.  For one thing, NFS would break horribly if
> > > > > you 
> > > > > didn't have stable inode numbers.  Never mind applications
> > > > > which 
> > > > > depend on POSIX semantics.  And you wouldn't be able to save 
> > > > > games in rogue or nethack, either.  :-)
> > > > 
> > > > I believe that's why we have the superblock export operations
> > > > to
> > > > manufacture unique filehandles in the absence of inode number
> > > > stability.
> > > 
> > > Where did you hear that?
> > > 
> > > I'd expect an NFS client to handle non-unique filehandles
> > > better than non-unique inode numbers.  I believe our client will 
> > > -EIO 
> > > on encountering an inode number change (see
> > > nfs_check_inode_attributes().)
> > > 
> > > See also https://tools.ietf.org/html/rfc5661#section-10.3.4.
> > 
> > Could you clarify your point a bit further, please?  Both the
> > check_inode_attributes() code and section 10.3.4 are talking about
> > fileids, which are the things that are constructed in the
> > export_ops
> 
> No, the filehandle structure isn't discussed in the rfc at all,
> that's
> opaque to clients, and the "fileid" you see in the export code isn't
> what's discussed here.
> 
> The "fileid" here is an NFS attribute, really just the NFS protocol's
> name for the inode number.  The server code that returns fileid's:
> 
> 	if (bmval0 & FATTR4_WORD0_FILEID) {
> 		p = xdr_reserve_space(xdr, 8);
> 		if (!p)
> 			goto out_resource;
> 		p = xdr_encode_hyper(p, stat.ino);
> 	}
> 
> The client getattr code:
> 
> 	stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));

OK, I now believe we may be talking about different things.  When I
said

> I believe that's why we have the superblock export operations to
> manufacture unique filehandles in the absence of inode number
> stability. 

I was talking about inode stability in the filesystem underlying the
export.  I believe you're talking about inode number stability
guarantees of the nfs client code itself, which are unrelated to the
inode number guarantees of the exported filesystem?

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 7, 2017, 7:02 p.m. UTC | #25
On Tue, 2017-02-07 at 10:10 -0800, Christoph Hellwig wrote:
> On Tue, Feb 07, 2017 at 07:59:00PM +0200, Amir Goldstein wrote:
> > I am not even sure that would be enough.
> > dentry does not contain information about the mount user came from,
> > and sb contains only information about the user ns of the mounter
> > of
> > the file system, not the mounter of the bind mount, right?
> > I think I am missing some big pieces of the big picture.
> > Would love to hear what Eric has to say.
> 
> IFF we want to do what shiftfs does properly we need vfsmount + 
> inode, no need for the dentry.

Yes, sorry ... I was thinking the dentry contained the mnt, but it
doesn't, that's the path.  However, threading the mnt through looks
substantially harder.

> But maybe we need to go back and decice if we want to allow uid/gid
> remapping for arbitrary subtrees anyway.

So those were the original patches Djalal was referring to.  The
problem there is that a lot of orchestration systems don't store images
they want to bind mount into containers on separately mounted
filesystems, which is what's needed to avoid this being per-subtree. 
 However, the clinching argument for me is that the canonical container
image *is* a subtree (unlike a vm image which has to be mounted).  If
we don't make this work on subtrees people go back to daft stacks for
containers like copying the image subtree into a loopback mounted
filesystem just to make this all work (and then complain about
performance and caching and so on).

>   Another option would be to require something like a project as used 
> for project quotas as the root.  This would also be conveniant as it 
> could storge the used remapping tables.

So this would be like the current project quota except set on a
subtree?  I could see it being done that way but I don't see what
advantage it has over using flags in the subtree itself (the mapping is
known based on the mount namespace, so there's really only a single bit
of information to store).

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 7, 2017, 7:47 p.m. UTC | #26
On Tue, Feb 07, 2017 at 11:01:08AM -0800, James Bottomley wrote:
> I was talking about inode stability in the filesystem underlying the
> export.  I believe you're talking about inode number stability
> guarantees of the nfs client code itself, which are unrelated to the
> inode number guarantees of the exported filesystem?

They are 1:1 correlated for a Linux server at least.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Djalal Harouni Feb. 7, 2017, 7:48 p.m. UTC | #27
On Tue, Feb 7, 2017 at 7:20 PM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2017-02-07 at 19:59 +0200, Amir Goldstein wrote:
>> On Tue, Feb 7, 2017 at 6:37 PM, James Bottomley
>> <James.Bottomley@hansenpartnership.com> wrote:
>> > On Tue, 2017-02-07 at 01:19 -0800, Christoph Hellwig wrote:
>> > > On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:
>> > > > This allows any subtree to be uid/gid shifted and bound
>> > > > elsewhere.
>> > > >  It does this by operating simlarly to overlayfs.  Its primary
>> > > > use
>> > > > is for shifting the underlying uids of filesystems used to
>> > > > support
>> > > > unpriviliged (uid shifted) containers.  The usual use case here
>> > > > is
>> > > > that the container is operating with an uid shifted
>> > > > unprivileged
>> > > > root but sometimes needs to make use of or work with a
>> > > > filesystem
>> > > > image that has root at real uid 0.
>> > > >
>> > > > The mechanism is to allow any subordinate mount namespace to
>> > > > mount
>> > > > a shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only
>> > > > allowing it to mount marked subtrees (using the -o mark option
>> > > > as
>> > > > root).  Once mounted, the subtree is mapped via the super block
>> > > > user namespace so that the interior ids of the mounting user
>> > > > namespace are the ids written to the filesystem.
>> > >
>> > > Please move this into VFS instead of a stackable fs.  We might
>> > > need
>> > > addtional parameters to getattr/setattr to specify the ID
>> > > translation, but that's why better than a horrible hack like
>> > > this.
>> >
>> > I would need a lot more than that: getattr controls the cosmetic
>> > permission display to the user, but enforcement is done in the core
>> > permission checks which are inode based.  To make this a real bind
>> > mount, the core permission checks will have to become subtree aware
>> > because knowledge of whether we need a uid shift in the permission
>> > check becomes a subtree property.  Effectively inode_permission
>> > would
>> > become dentry_permission and generic_permission would take a dentry
>> > instead of an inode.  This will be a huge amount of VFS and
>> > underlying
>> > filesystem churn, since the permissions calls are threaded through
>> > a
>> > huge chunk of code.
>> >
>>
>> I am not even sure that would be enough.
>> dentry does not contain information about the mount user came from,
>> and sb contains only information about the user ns of the mounter of
>> the file system, not the mounter of the bind mount, right?
>> I think I am missing some big pieces of the big picture.
>> Would love to hear what Eric has to say.
>
> I'm not really sure until it gets prototyped, but I think the
> filesystem user namespace would also have to become a subtree property.

Sorry I don't want to derail the thread, but that was already prototyped

> The whole reason for shiftfs being a properly mounted filesystem is
> because it needs a super block to capture the namespace it's being
> mounted in.
>
> However, when you have a container that you want remapping inside, you
> must have a user namespace which owns a mount namespace, so we can
> deduce the information from the mount namespace.  All we probably need
> the subtree to tell us is if we're shifting or not.

That's one of the use cases that you will definitely end up with... if
anyone did read that incomplete VFS RFC proposal:

"2) The solution is based on VFS and mount namespaces, we use the user
namespace of the containing mount namespace to check if we should shift
UIDs/GIDs from/to virtual <=> on-disk view.
If a filesystem was mounted with "vfs_shift_uids" and "vfs_shift_gids"
options, and if it shows up inside a mount namespace that supports VFS
UIDs/GIDs shifts then during each access we will remap UID/GID either
to virtual or to on-disk view using simple helper functions to allow the
access. In case the mount or current mount namespace do not support VFS
UID/GID shifts, we fallback to the old behaviour, no shift is performed." [1]

[1] https://lkml.org/lkml/2016/5/4/411
Christoph Hellwig Feb. 7, 2017, 7:49 p.m. UTC | #28
On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley wrote:
> >   Another option would be to require something like a project as used 
> > for project quotas as the root.  This would also be conveniant as it 
> > could storge the used remapping tables.
> 
> So this would be like the current project quota except set on a
> subtree?  I could see it being done that way but I don't see what
> advantage it has over using flags in the subtree itself (the mapping is
> known based on the mount namespace, so there's really only a single bit
> of information to store).

projects (which are the underling concept for project quotas) are
per-subtree in practice - the flag is set on an inode and then
all directories and files underneath inherit the project ID,
hardlinking outside a project is prohinited.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 7, 2017, 8:05 p.m. UTC | #29
On Tue, 2017-02-07 at 11:49 -0800, Christoph Hellwig wrote:
> On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley wrote:
> > >   Another option would be to require something like a project as
> > > used 
> > > for project quotas as the root.  This would also be conveniant as
> > > it 
> > > could storge the used remapping tables.
> > 
> > So this would be like the current project quota except set on a
> > subtree?  I could see it being done that way but I don't see what
> > advantage it has over using flags in the subtree itself (the
> > mapping is
> > known based on the mount namespace, so there's really only a single
> > bit
> > of information to store).
> 
> projects (which are the underling concept for project quotas) are
> per-subtree in practice - the flag is set on an inode and then
> all directories and files underneath inherit the project ID,
> hardlinking outside a project is prohinited.

OK, this is what I don't understand: how is something that's inode
based limited to be per-subtree?  The way I've seen the VFS operate it
seems that any given inode (and indeed dentry) can appear in many
subtrees so how do I limit them to just one?

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Feb. 7, 2017, 9:01 p.m. UTC | #30
On Tue, Feb 7, 2017 at 10:05 PM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2017-02-07 at 11:49 -0800, Christoph Hellwig wrote:
>> On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley wrote:
>> > >   Another option would be to require something like a project as
>> > > used
>> > > for project quotas as the root.  This would also be conveniant as
>> > > it
>> > > could storge the used remapping tables.
>> >
>> > So this would be like the current project quota except set on a
>> > subtree?  I could see it being done that way but I don't see what
>> > advantage it has over using flags in the subtree itself (the
>> > mapping is
>> > known based on the mount namespace, so there's really only a single
>> > bit
>> > of information to store).
>>
>> projects (which are the underling concept for project quotas) are
>> per-subtree in practice - the flag is set on an inode and then
>> all directories and files underneath inherit the project ID,
>> hardlinking outside a project is prohinited.
>
> OK, this is what I don't understand: how is something that's inode
> based limited to be per-subtree?  The way I've seen the VFS operate it
> seems that any given inode (and indeed dentry) can appear in many
> subtrees so how do I limit them to just one?
>

Project id's are not exactly "subtree" semantic, but inheritance semantics,
which is not the same when non empty directories get their project id changed.
Here is a recap:
https://lwn.net/Articles/623835/

So if you created an empty directory and "marked" it for shiftuid and all
descendants inherited this property you would be able to check that property
on a per inode basis. Not sure that is what you are looking for?
I guess we should define the semantics for the required sub-tree marking,
before we can talk about solutions.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 7, 2017, 10:25 p.m. UTC | #31
On Tue, Feb 07, 2017 at 11:01:29PM +0200, Amir Goldstein wrote:
> Project id's are not exactly "subtree" semantic, but inheritance semantics,
> which is not the same when non empty directories get their project id changed.
> Here is a recap:
> https://lwn.net/Articles/623835/

Yes - but if we abuse them for containers we could refine the semantics
to simply not allow change of project ids from inside containers
based on say capabilities.

> I guess we should define the semantics for the required sub-tree marking,
> before we can talk about solutions.

Good plan.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 7, 2017, 11:42 p.m. UTC | #32
On Tue, 2017-02-07 at 14:25 -0800, Christoph Hellwig wrote:
> On Tue, Feb 07, 2017 at 11:01:29PM +0200, Amir Goldstein wrote:
> > Project id's are not exactly "subtree" semantic, but inheritance
> > semantics,
> > which is not the same when non empty directories get their project
> > id changed.
> > Here is a recap:
> > https://lwn.net/Articles/623835/
> 
> Yes - but if we abuse them for containers we could refine the 
> semantics to simply not allow change of project ids from inside 
> containers based on say capabilities.

We can't really abuse projectid, it's part of the user namespace
mapping (for project quota).  What we can do is have a new id that
behaves like it.

But like I said, we don't really need a ful ID, it would basically just
be a single bit mark to say remap or not when doing permission checks
against this inode.  It would follow some of the project id semantics
(like inheritance from parent dir)

> > I guess we should define the semantics for the required sub-tree 
> > marking, before we can talk about solutions.
> 
> Good plan.

So I've been thinking about how to do this without subtree marking and
yet retain the subtree properties similar to project id.  The advantage
would be that if it can be done using only inode properties, then none
of the permission prototypes need change.  The only real subtree
property we need is ability to bind into an unprivileged mount
namespace, but we already have that.  The gotcha about marking inodes
is that they're all or nothing, so every subtree that gets access to
the inode inherits the mark.  This means that we cannot allow a user
access to a marked inode without the cover of an unprivileged user
namespace, but I think that's fixable in the permission check
(basically if the inode is marked you *only* get access if you have a
user_ns != init_user_ns and we do the permission shifts or you have
user_ns == init_user_ns and you are admin capable).

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josh Triplett Feb. 8, 2017, 1:54 a.m. UTC | #33
On Tue, Feb 07, 2017 at 11:49:33AM -0800, Christoph Hellwig wrote:
> On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley wrote:
> > >   Another option would be to require something like a project as used 
> > > for project quotas as the root.  This would also be conveniant as it 
> > > could storge the used remapping tables.
> > 
> > So this would be like the current project quota except set on a
> > subtree?  I could see it being done that way but I don't see what
> > advantage it has over using flags in the subtree itself (the mapping is
> > known based on the mount namespace, so there's really only a single bit
> > of information to store).
> 
> projects (which are the underling concept for project quotas) are
> per-subtree in practice - the flag is set on an inode and then
> all directories and files underneath inherit the project ID,
> hardlinking outside a project is prohinited.

I'm interested in having a VFS-level way to do more than just a shift;
I'd like to be able to arbitrarily remap IDs between what's on disk and
the system IDs.  If we're talking about developing a VFS-level solution
for this, I'd like to avoid limiting it to just a shift.  (A shift/range
would definitely be the simplest solution for many common container
cases, but not all.)
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Feb. 8, 2017, 6:44 a.m. UTC | #34
On Wed, Feb 8, 2017 at 1:42 AM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2017-02-07 at 14:25 -0800, Christoph Hellwig wrote:
>> On Tue, Feb 07, 2017 at 11:01:29PM +0200, Amir Goldstein wrote:
>> > Project id's are not exactly "subtree" semantic, but inheritance
>> > semantics,
>> > which is not the same when non empty directories get their project
>> > id changed.
>> > Here is a recap:
>> > https://lwn.net/Articles/623835/
>>
>> Yes - but if we abuse them for containers we could refine the
>> semantics to simply not allow change of project ids from inside
>> containers based on say capabilities.
>

You mean something like this:
https://lwn.net/Articles/632917/

With the suggested protected_projects, projid 0 (also inside container)
gets a special meaning, much like user 0, so we may do interesting
things with the projid that is mapped to 0.

> We can't really abuse projectid, it's part of the user namespace
> mapping (for project quota).  What we can do is have a new id that
> behaves like it.
>

Perhaps we *can* use projid without abusing it.
userns already maps projids, but there is no concept of "owning project"
for a userns, nor does it make a lot of sense, because projid is not
part of the credentials.
But if we re-brand it as "container root projid", we can try to use it
for defining semantics to grant unprivileged access to a subtree.

The functionality you are trying to get with shiftfs mark does
sounds a bit like "container root projid":
- inodes with mapped projid MAY be uid/gid shifted
- inodes with unmapped projid MAY NOT

I realize this may be very raw, but its a start. If you like this
direction we can try to develop it.

> But like I said, we don't really need a ful ID, it would basically just
> be a single bit mark to say remap or not when doing permission checks
> against this inode.  It would follow some of the project id semantics
> (like inheritance from parent dir)
>

But a single bit would only work for single level of userns nesting won't it?


>> > I guess we should define the semantics for the required sub-tree
>> > marking, before we can talk about solutions.
>>
>> Good plan.
>
> So I've been thinking about how to do this without subtree marking and
> yet retain the subtree properties similar to project id.  The advantage
> would be that if it can be done using only inode properties, then none
> of the permission prototypes need change.  The only real subtree
> property we need is ability to bind into an unprivileged mount
> namespace, but we already have that.  The gotcha about marking inodes
> is that they're all or nothing, so every subtree that gets access to
> the inode inherits the mark.  This means that we cannot allow a user
> access to a marked inode without the cover of an unprivileged user
> namespace, but I think that's fixable in the permission check
> (basically if the inode is marked you *only* get access if you have a
> user_ns != init_user_ns and we do the permission shifts or you have
> user_ns == init_user_ns and you are admin capable).
>

I didn't follow, but it sounds like your proposed solutions is only
good for single level of userns nesting.
Do you think you can redefine it in terms of "container root projid".
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Konstantin Khlebnikov Feb. 8, 2017, 11:45 a.m. UTC | #35
On 08.02.2017 09:44, Amir Goldstein wrote:
> On Wed, Feb 8, 2017 at 1:42 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
>> On Tue, 2017-02-07 at 14:25 -0800, Christoph Hellwig wrote:
>>> On Tue, Feb 07, 2017 at 11:01:29PM +0200, Amir Goldstein wrote:
>>>> Project id's are not exactly "subtree" semantic, but inheritance
>>>> semantics,
>>>> which is not the same when non empty directories get their project
>>>> id changed.
>>>> Here is a recap:
>>>> https://lwn.net/Articles/623835/
>>>
>>> Yes - but if we abuse them for containers we could refine the
>>> semantics to simply not allow change of project ids from inside
>>> containers based on say capabilities.
>>
>
> You mean something like this:
> https://lwn.net/Articles/632917/
>
> With the suggested protected_projects, projid 0 (also inside container)
> gets a special meaning, much like user 0, so we may do interesting
> things with the projid that is mapped to 0.
>
>> We can't really abuse projectid, it's part of the user namespace
>> mapping (for project quota).  What we can do is have a new id that
>> behaves like it.
>>
>
> Perhaps we *can* use projid without abusing it.
> userns already maps projids, but there is no concept of "owning project"
> for a userns, nor does it make a lot of sense, because projid is not
> part of the credentials.
> But if we re-brand it as "container root projid", we can try to use it
> for defining semantics to grant unprivileged access to a subtree.
>
> The functionality you are trying to get with shiftfs mark does
> sounds a bit like "container root projid":
> - inodes with mapped projid MAY be uid/gid shifted
> - inodes with unmapped projid MAY NOT
>
> I realize this may be very raw, but its a start. If you like this
> direction we can try to develop it.
>
>> But like I said, we don't really need a ful ID, it would basically just
>> be a single bit mark to say remap or not when doing permission checks
>> against this inode.  It would follow some of the project id semantics
>> (like inheritance from parent dir)
>>
>
> But a single bit would only work for single level of userns nesting won't it?
>
>
>>>> I guess we should define the semantics for the required sub-tree
>>>> marking, before we can talk about solutions.
>>>
>>> Good plan.
>>
>> So I've been thinking about how to do this without subtree marking and
>> yet retain the subtree properties similar to project id.  The advantage
>> would be that if it can be done using only inode properties, then none
>> of the permission prototypes need change.  The only real subtree
>> property we need is ability to bind into an unprivileged mount
>> namespace, but we already have that.  The gotcha about marking inodes
>> is that they're all or nothing, so every subtree that gets access to
>> the inode inherits the mark.  This means that we cannot allow a user
>> access to a marked inode without the cover of an unprivileged user
>> namespace, but I think that's fixable in the permission check
>> (basically if the inode is marked you *only* get access if you have a
>> user_ns != init_user_ns and we do the permission shifts or you have
>> user_ns == init_user_ns and you are admin capable).
>>
>
> I didn't follow, but it sounds like your proposed solutions is only
> good for single level of userns nesting.
> Do you think you can redefine it in terms of "container root projid".
>

Looks like all this started from mangling uid/gid or some other metadata.
As usual, I have to propose funny/insane solutions:
proxify filesystem with fuse and mangle everything in userspace.
Or add some kind of userspace-driver remapping/mangling into overlay,
for example using BPF script (I see it everywhere nowdays).
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 8, 2017, 2:57 p.m. UTC | #36
On Wed, 2017-02-08 at 08:44 +0200, Amir Goldstein wrote:
> On Wed, Feb 8, 2017 at 1:42 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Tue, 2017-02-07 at 14:25 -0800, Christoph Hellwig wrote:
> > > On Tue, Feb 07, 2017 at 11:01:29PM +0200, Amir Goldstein wrote:
> > > > Project id's are not exactly "subtree" semantic, but 
> > > > inheritance semantics,
> > > > which is not the same when non empty directories get their
> > > > project
> > > > id changed.
> > > > Here is a recap:
> > > > https://lwn.net/Articles/623835/
> > > 
> > > Yes - but if we abuse them for containers we could refine the
> > > semantics to simply not allow change of project ids from inside
> > > containers based on say capabilities.
> > 
> 
> You mean something like this:
> https://lwn.net/Articles/632917/
> 
> With the suggested protected_projects, projid 0 (also inside 
> container) gets a special meaning, much like user 0, so we may do 
> interesting things with the projid that is mapped to 0.
> 
> > We can't really abuse projectid, it's part of the user namespace
> > mapping (for project quota).  What we can do is have a new id that
> > behaves like it.
> > 
> 
> Perhaps we *can* use projid without abusing it. userns already maps 
> projids, but there is no concept of "owning project" for a userns, 
> nor does it make a lot of sense, because projid is not part of the 
> credentials. But if we re-brand it as "container root projid", we can 
> try to use it for defining semantics to grant unprivileged access to
> a subtree.
> 
> The functionality you are trying to get with shiftfs mark does
> sounds a bit like "container root projid":
> - inodes with mapped projid MAY be uid/gid shifted
> - inodes with unmapped projid MAY NOT
> 
> I realize this may be very raw, but its a start. If you like this
> direction we can try to develop it.

So I don't think hijacking project id is the way to go.  If we do that
we interfere with using project quotas within containers.  Now that
project quotas work for both xfs and ext4, it's no longer really an xfs
specific feature.

I could see adding a shift on a per projectid basis, so project id
still had its quota meaning, but you could get the uid/gid shift from a
given project id.  However, the big kicker is that the only filesystems
you can actually set a projectid on (via the fsxattr) are ext4 and xfs.
 That's too few to make it work universally (we'd at least need btrfs
and possibly a few others).

However, that's just mechanism.  We can begin with a volatile mark and
work out how we want to store it later.  I think following projectid
properties is the important one, so the choice of whether to hijack, or
attach to projectid is preserved but not mandated.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 8, 2017, 3:15 p.m. UTC | #37
On Wed, 2017-02-08 at 08:44 +0200, Amir Goldstein wrote:
> On Wed, Feb 8, 2017 at 1:42 AM, James Bottomley
[...]
> > So I've been thinking about how to do this without subtree marking 
> > and yet retain the subtree properties similar to project id.  The
> > advantage would be that if it can be done using only inode 
> > properties, then none of the permission prototypes need change. 
> >  The only real subtree property we need is ability to bind into an 
> > unprivileged mount namespace, but we already have that.  The gotcha 
> > about marking inodes is that they're all or nothing, so every 
> > subtree that gets access to the inode inherits the mark.  This 
> > means that we cannot allow a user access to a marked inode without 
> > the cover of an unprivileged user namespace, but I think that's 
> > fixable in the permission check (basically if the inode is marked 
> > you *only* get access if you have a user_ns != init_user_ns and we 
> > do the permission shifts or you have user_ns == init_user_ns and
> > you are admin capable).
> > 
> 
> I didn't follow, but it sounds like your proposed solutions is only
> good for single level of userns nesting. Do you think you can
> redefine it in terms of "container root projid".

I don't quite understand what you're getting at.  user_ns mappings
nest, but what we see depends on where you're trying to look at it. 
 Let's take the kernel's view as the primary one.  That's the kuid_t. 
 The user has a different view, the uid_t and now we have the
filesystem view (no actual type for this).  The user view is produced
by from the kernel view by chaining up all the maps from the
current_user_ns and the filesystem view is produced by doing the same
thing for the s_user_ns.  So however many levels of user namespace
nesting we have operating, we only have three views of what an id is:
the user view, the kernel view and the filesystem view.  All nesting
does is change how those views are mapped but it doesn't alter the
number of views.

What the original shiftfs patches (not the ones that use s_user_ns) did
was to introduce effectively an inode view and map between the kernel
and the inode view using the shift mapping parameters; then the inode
view would get mapped through the s_user_ns to become the filesystem
view.  In the s_user_ns version of shiftfs (the current patches),
there's still an inode view, but we know that what we want to write to
disk is the user view, so effectively the user view and the inode view
become the same if the filesystem is marked otherwise the inode view
and the kernel view are the same if it isn't.  That's why I only need a
single bit to tell me if I'm mapping or not and there are two separate
regimes to check the permissions in: the user == inode view and the
kernel == inode view.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 8, 2017, 3:22 p.m. UTC | #38
On Tue, 2017-02-07 at 17:54 -0800, Josh Triplett wrote:
> On Tue, Feb 07, 2017 at 11:49:33AM -0800, Christoph Hellwig wrote:
> > On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley wrote:
> > > >   Another option would be to require something like a project
> > > > as used 
> > > > for project quotas as the root.  This would also be conveniant
> > > > as it 
> > > > could storge the used remapping tables.
> > > 
> > > So this would be like the current project quota except set on a
> > > subtree?  I could see it being done that way but I don't see what
> > > advantage it has over using flags in the subtree itself (the
> > > mapping is
> > > known based on the mount namespace, so there's really only a
> > > single bit
> > > of information to store).
> > 
> > projects (which are the underling concept for project quotas) are
> > per-subtree in practice - the flag is set on an inode and then
> > all directories and files underneath inherit the project ID,
> > hardlinking outside a project is prohinited.
> 
> I'm interested in having a VFS-level way to do more than just a 
> shift; I'd like to be able to arbitrarily remap IDs between what's on 
> disk and the system IDs.

OK, so the shift is effectively an arbitrary remap because it allows
multiple ranges to be mapped (althought the userns currently imposes a
maximum number of five extents but that limit is a bit arbitrary just
to try to limit the amount of space the parametrisation takes).  See
kernel/user_namespace.c:map_id_up/down()

>   If we're talking about developing a VFS-level solution for this, 
> I'd like to avoid limiting it to just a shift.  (A shift/range
> would definitely be the simplest solution for many common container
> cases, but not all.)

I assume the above satisfies you on this point, but raises the
question: do you want an arbitrary shift not parametrised by a user
namespace?  If so how many such shifts do you want ... giving some
details of the use case would be helpful.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josh Triplett Feb. 9, 2017, 10:36 a.m. UTC | #39
On Wed, Feb 08, 2017 at 07:22:45AM -0800, James Bottomley wrote:
> On Tue, 2017-02-07 at 17:54 -0800, Josh Triplett wrote:
> > On Tue, Feb 07, 2017 at 11:49:33AM -0800, Christoph Hellwig wrote:
> > > On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley wrote:
> > > > >   Another option would be to require something like a project
> > > > > as used 
> > > > > for project quotas as the root.  This would also be conveniant
> > > > > as it 
> > > > > could storge the used remapping tables.
> > > > 
> > > > So this would be like the current project quota except set on a
> > > > subtree?  I could see it being done that way but I don't see what
> > > > advantage it has over using flags in the subtree itself (the
> > > > mapping is
> > > > known based on the mount namespace, so there's really only a
> > > > single bit
> > > > of information to store).
> > > 
> > > projects (which are the underling concept for project quotas) are
> > > per-subtree in practice - the flag is set on an inode and then
> > > all directories and files underneath inherit the project ID,
> > > hardlinking outside a project is prohinited.
> > 
> > I'm interested in having a VFS-level way to do more than just a 
> > shift; I'd like to be able to arbitrarily remap IDs between what's on 
> > disk and the system IDs.
> 
> OK, so the shift is effectively an arbitrary remap because it allows
> multiple ranges to be mapped (althought the userns currently imposes a
> maximum number of five extents but that limit is a bit arbitrary just
> to try to limit the amount of space the parametrisation takes).  See
> kernel/user_namespace.c:map_id_up/down()
> 
> >   If we're talking about developing a VFS-level solution for this, 
> > I'd like to avoid limiting it to just a shift.  (A shift/range
> > would definitely be the simplest solution for many common container
> > cases, but not all.)
> 
> I assume the above satisfies you on this point, but raises the
> question: do you want an arbitrary shift not parametrised by a user
> namespace?  If so how many such shifts do you want ... giving some
> details of the use case would be helpful.

The limit of five extents means this may not work in the most general
case, no.

One use case: given an on-disk filesystem, its name-to-number mapping,
and your host name-to-number mapping, mount the filesystem with all the
UIDs bidirectionally mapped to those on your host system.

Another use case: given an on-disk filesystem with potentially arbitrary
UIDs (not necessarily in a clean contiguous block), and a pile of
unprivileged UIDs, mount the filesystem such that every on-disk UID gets
a unique unprivileged UID.

(I have some additional use cases, but they would require the ability to
extend the mapping on the fly without remounting.)
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 9, 2017, 3:34 p.m. UTC | #40
On Thu, 2017-02-09 at 02:36 -0800, Josh Triplett wrote:
> On Wed, Feb 08, 2017 at 07:22:45AM -0800, James Bottomley wrote:
> > On Tue, 2017-02-07 at 17:54 -0800, Josh Triplett wrote:
> > > On Tue, Feb 07, 2017 at 11:49:33AM -0800, Christoph Hellwig
> > > wrote:
> > > > On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley
> > > > wrote:
> > > > > >   Another option would be to require something like a 
> > > > > > project as used for project quotas as the root.  This would 
> > > > > > also be conveniant as it could storge the used remapping
> > > > > > tables.
> > > > > 
> > > > > So this would be like the current project quota except set on 
> > > > > a subtree?  I could see it being done that way but I don't 
> > > > > see what advantage it has over using flags in the subtree 
> > > > > itself (the mapping is known based on the mount namespace, so 
> > > > > there's really only a single bit of information to store).
> > > > 
> > > > projects (which are the underling concept for project quotas) 
> > > > are per-subtree in practice - the flag is set on an inode and 
> > > > then all directories and files underneath inherit the project 
> > > > ID, hardlinking outside a project is prohinited.
> > > 
> > > I'm interested in having a VFS-level way to do more than just a 
> > > shift; I'd like to be able to arbitrarily remap IDs between 
> > > what's on disk and the system IDs.
> > 
> > OK, so the shift is effectively an arbitrary remap because it 
> > allows multiple ranges to be mapped (althought the userns currently
> > imposes a maximum number of five extents but that limit is a bit 
> > arbitrary just to try to limit the amount of space the 
> > parametrisation takes).  See
> > kernel/user_namespace.c:map_id_up/down()
> > 
> > >   If we're talking about developing a VFS-level solution for 
> > > this, I'd like to avoid limiting it to just a shift.  (A 
> > > shift/range would definitely be the simplest solution for many 
> > > common container cases, but not all.)
> > 
> > I assume the above satisfies you on this point, but raises the
> > question: do you want an arbitrary shift not parametrised by a user
> > namespace?  If so how many such shifts do you want ... giving some
> > details of the use case would be helpful.
> 
> The limit of five extents means this may not work in the most general
> case, no.

That's not an API limit, so it can be changed if there's a need.  The
problem was merely how to parametrise a mapping without taking too much
space.

> One use case: given an on-disk filesystem, its name-to-number 
> mapping, and your host name-to-number mapping, mount the filesystem 
> with all the UIDs bidirectionally mapped to those on your host
> system.

This is pretty much what the s_user_ns does.

> Another use case: given an on-disk filesystem with potentially 
> arbitrary UIDs (not necessarily in a clean contiguous block), and a 
> pile of unprivileged UIDs, mount the filesystem such that every on
> -disk UID gets a unique unprivileged UID.

So is this.  Basically anything that begins by mounting gets a super
block and can use the s_user_ns to map from the filesystem view to the
kernel view of ids.  Apart from greater sophistication in the
parametrisation, it sounds like we have all the machinery you need. 
 I'm sure the containers people will consider reasonable patches to
change this.

James

> (I have some additional use cases, but they would require the ability 
> to extend the mapping on the fly without remounting.)
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Feb. 13, 2017, 10:15 a.m. UTC | #41
James Bottomley <James.Bottomley@HansenPartnership.com> writes:

> On Thu, 2017-02-09 at 02:36 -0800, Josh Triplett wrote:
>> On Wed, Feb 08, 2017 at 07:22:45AM -0800, James Bottomley wrote:
>> > On Tue, 2017-02-07 at 17:54 -0800, Josh Triplett wrote:
>> > > On Tue, Feb 07, 2017 at 11:49:33AM -0800, Christoph Hellwig
>> > > wrote:
>> > > > On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley
>> > > > wrote:
>> > > > > >   Another option would be to require something like a 
>> > > > > > project as used for project quotas as the root.  This would 
>> > > > > > also be conveniant as it could storge the used remapping
>> > > > > > tables.
>> > > > > 
>> > > > > So this would be like the current project quota except set on 
>> > > > > a subtree?  I could see it being done that way but I don't 
>> > > > > see what advantage it has over using flags in the subtree 
>> > > > > itself (the mapping is known based on the mount namespace, so 
>> > > > > there's really only a single bit of information to store).
>> > > > 
>> > > > projects (which are the underling concept for project quotas) 
>> > > > are per-subtree in practice - the flag is set on an inode and 
>> > > > then all directories and files underneath inherit the project 
>> > > > ID, hardlinking outside a project is prohinited.
>> > > 
>> > > I'm interested in having a VFS-level way to do more than just a 
>> > > shift; I'd like to be able to arbitrarily remap IDs between 
>> > > what's on disk and the system IDs.
>> > 
>> > OK, so the shift is effectively an arbitrary remap because it 
>> > allows multiple ranges to be mapped (althought the userns currently
>> > imposes a maximum number of five extents but that limit is a bit 
>> > arbitrary just to try to limit the amount of space the 
>> > parametrisation takes).  See
>> > kernel/user_namespace.c:map_id_up/down()
>> > 
>> > >   If we're talking about developing a VFS-level solution for 
>> > > this, I'd like to avoid limiting it to just a shift.  (A 
>> > > shift/range would definitely be the simplest solution for many 
>> > > common container cases, but not all.)
>> > 
>> > I assume the above satisfies you on this point, but raises the
>> > question: do you want an arbitrary shift not parametrised by a user
>> > namespace?  If so how many such shifts do you want ... giving some
>> > details of the use case would be helpful.
>> 
>> The limit of five extents means this may not work in the most general
>> case, no.
>
> That's not an API limit, so it can be changed if there's a need.  The
> problem was merely how to parametrise a mapping without taking too much
> space.
>
>> One use case: given an on-disk filesystem, its name-to-number 
>> mapping, and your host name-to-number mapping, mount the filesystem 
>> with all the UIDs bidirectionally mapped to those on your host
>> system.
>
> This is pretty much what the s_user_ns does.
>
>> Another use case: given an on-disk filesystem with potentially 
>> arbitrary UIDs (not necessarily in a clean contiguous block), and a 
>> pile of unprivileged UIDs, mount the filesystem such that every on
>> -disk UID gets a unique unprivileged UID.
>
> So is this.  Basically anything that begins by mounting gets a super
> block and can use the s_user_ns to map from the filesystem view to the
> kernel view of ids.  Apart from greater sophistication in the
> parametrisation, it sounds like we have all the machinery you need. 
>  I'm sure the containers people will consider reasonable patches to
> change this.

Yes.

And to be clear we have all of that merged now and mostly present and
hooked up in all filesystems without any shiftfs like changes needed.

To use this with a filesystem a last pass needs to be had to verify that
the cases where something does not map are handled cleanly.

Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vivek Goyal Feb. 14, 2017, 11:03 p.m. UTC | #42
On Sun, Feb 05, 2017 at 05:18:11PM -0800, James Bottomley wrote:

[..]
> >  shiftfs is going to miss out on overlayfs bug fixes related to user 
> > credentials differ from mounter credentials, like fd3220d ("ovl: 
> > update S_ISGID when setting posix ACLs"). I am not sure that this 
> > specific case is relevant to shiftfs, but there could be other.
> 
> OK, so shiftfs doesn't have this bug and the reason why is
> illustrative: basically shiftfs does three things
> 
>    1. lookups via a uid/gid shifted dentry cache
>    2. shifted credential inode operations permission checks on the
>       underlying filesystem
>    3. location marking for unprivileged mount
> 
> I think we've already seen that 1. isn't from overlayfs but the
> functionality could be added to overlayfs, I suppose.  The big problem
> is 2.  The overlayfs code emulates the permission checks, which makes
> it rather complex (this is where you get your bugs like the above
> from).  I did actually look at adding 2. to overlayfs on the theory
> that a single layer overlay might be closest to what this is, but
> eventually concluded I'd have to take the special cases and add a whole
> lot more to them ... it really would increase the maintenance burden
> substantially and make the code an unreadable rats nest.

Hi James,

If we merge this functionality in overlayfs, then we could avoid extra
copy of dentry/inode and that might be a significant advantage.

W.r.t permission checks, I am wondering will it make sense to do what
overlayfs is doing for shiftfs. That is permission is checked on
two inodes. We use creds of task for checking permission on
shiftfs/overlay inode and mounter's creds on real inode.

Given we have already shifted the uid/gid for shiftfs inode, I am 
wondering that why can't we simply call generic_permission(shiftfs_inode,
mask) directly in the context of caller. Something like..

shiftfs_permission() {
  err = generic_permission(inode, mask);
  if (err)
  	return err;

  switch_to_mounter_creds;
  err = inode_permission(reali, mask);
  revert_creds();

  return err;
}

Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 14, 2017, 11:45 p.m. UTC | #43
On Tue, 2017-02-14 at 18:03 -0500, Vivek Goyal wrote:
> On Sun, Feb 05, 2017 at 05:18:11PM -0800, James Bottomley wrote:
> 
> [..]
> > >  shiftfs is going to miss out on overlayfs bug fixes related to
> > > user 
> > > credentials differ from mounter credentials, like fd3220d ("ovl: 
> > > update S_ISGID when setting posix ACLs"). I am not sure that this
> > > specific case is relevant to shiftfs, but there could be other.
> > 
> > OK, so shiftfs doesn't have this bug and the reason why is
> > illustrative: basically shiftfs does three things
> > 
> >    1. lookups via a uid/gid shifted dentry cache
> >    2. shifted credential inode operations permission checks on the
> >       underlying filesystem
> >    3. location marking for unprivileged mount
> > 
> > I think we've already seen that 1. isn't from overlayfs but the
> > functionality could be added to overlayfs, I suppose.  The big 
> > problem is 2.  The overlayfs code emulates the permission checks, 
> > which makes it rather complex (this is where you get your bugs like 
> > the above from).  I did actually look at adding 2. to overlayfs on 
> > the theory that a single layer overlay might be closest to what 
> > this is, but eventually concluded I'd have to take the special 
> > cases and add a whole lot more to them ... it really would increase 
> > the maintenance burden substantially and make the code an
> > unreadable rats nest.
> 
> Hi James,
> 
> If we merge this functionality in overlayfs, then we could avoid 
> extra copy of dentry/inode and that might be a significant advantage.

I made that argument to Viro originally when I tried to do all lookups
via the underlying cache.  In the end, it's 192 bytes per dentry and
584 per inode, all of which are reclaimable, so it's not much of an
advantage and it is a great simplification to the code.  In general if
you have a natural separation, you should make the layers reflect it.

My container use case doesn't use overlayfs currently, so to me it
wouldn't provide any advantage whatsoever.

> W.r.t permission checks, I am wondering will it make sense to do what
> overlayfs is doing for shiftfs. That is permission is checked on
> two inodes. We use creds of task for checking permission on
> shiftfs/overlay inode and mounter's creds on real inode.

The mounter's creds for overlay are usually admin ones, so it's local
permission check asks should I? and the later one asks can I? (as in
would my original admin creds allow this).  In many ways, overlayfs is
ignoring the fact that the underlying ->permissions() call might have
failed for some good reason on the current creds.  I don't think any
serious trouble results from this but it strikes me as icky.

> Given we have already shifted the uid/gid for shiftfs inode, I am 
> wondering that why can't we simply call generic_permission(shiftfs_in
> ode, mask) directly in the context of caller. Something like..
> 
> shiftfs_permission() {
>   err = generic_permission(inode, mask);
>   if (err)
>   	return err;
> 
>   switch_to_mounter_creds;
>   err = inode_permission(reali, mask);
>   revert_creds();
> 
>   return err;
> }

Because if the reali->d_iop->permission exists, you should use it.  You
could argue shiftfs_permission should be

	if (iop->permission) {
		oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
		err = iop->permission(reali, mask);
		shiftfs_old_creds(oldcred, &newcred);
	} else
		err = generic_permission(inode, mask);

But really that's a small optimisation.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Djalal Harouni Feb. 15, 2017, 9:33 a.m. UTC | #44
On Mon, Feb 13, 2017 at 11:15 AM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
> James Bottomley <James.Bottomley@HansenPartnership.com> writes:
>
>> On Thu, 2017-02-09 at 02:36 -0800, Josh Triplett wrote:
>>> On Wed, Feb 08, 2017 at 07:22:45AM -0800, James Bottomley wrote:
>>> > On Tue, 2017-02-07 at 17:54 -0800, Josh Triplett wrote:
>>> > > On Tue, Feb 07, 2017 at 11:49:33AM -0800, Christoph Hellwig
>>> > > wrote:
>>> > > > On Tue, Feb 07, 2017 at 11:02:03AM -0800, James Bottomley
>>> > > > wrote:
>>> > > > > >   Another option would be to require something like a
>>> > > > > > project as used for project quotas as the root.  This would
>>> > > > > > also be conveniant as it could storge the used remapping
>>> > > > > > tables.
>>> > > > >
>>> > > > > So this would be like the current project quota except set on
>>> > > > > a subtree?  I could see it being done that way but I don't
>>> > > > > see what advantage it has over using flags in the subtree
>>> > > > > itself (the mapping is known based on the mount namespace, so
>>> > > > > there's really only a single bit of information to store).
>>> > > >
>>> > > > projects (which are the underling concept for project quotas)
>>> > > > are per-subtree in practice - the flag is set on an inode and
>>> > > > then all directories and files underneath inherit the project
>>> > > > ID, hardlinking outside a project is prohinited.
>>> > >
>>> > > I'm interested in having a VFS-level way to do more than just a
>>> > > shift; I'd like to be able to arbitrarily remap IDs between
>>> > > what's on disk and the system IDs.
>>> >
>>> > OK, so the shift is effectively an arbitrary remap because it
>>> > allows multiple ranges to be mapped (althought the userns currently
>>> > imposes a maximum number of five extents but that limit is a bit
>>> > arbitrary just to try to limit the amount of space the
>>> > parametrisation takes).  See
>>> > kernel/user_namespace.c:map_id_up/down()
>>> >
>>> > >   If we're talking about developing a VFS-level solution for
>>> > > this, I'd like to avoid limiting it to just a shift.  (A
>>> > > shift/range would definitely be the simplest solution for many
>>> > > common container cases, but not all.)
>>> >
>>> > I assume the above satisfies you on this point, but raises the
>>> > question: do you want an arbitrary shift not parametrised by a user
>>> > namespace?  If so how many such shifts do you want ... giving some
>>> > details of the use case would be helpful.
>>>
>>> The limit of five extents means this may not work in the most general
>>> case, no.
>>
>> That's not an API limit, so it can be changed if there's a need.  The
>> problem was merely how to parametrise a mapping without taking too much
>> space.
>>
>>> One use case: given an on-disk filesystem, its name-to-number
>>> mapping, and your host name-to-number mapping, mount the filesystem
>>> with all the UIDs bidirectionally mapped to those on your host
>>> system.
>>
>> This is pretty much what the s_user_ns does.
>>
>>> Another use case: given an on-disk filesystem with potentially
>>> arbitrary UIDs (not necessarily in a clean contiguous block), and a
>>> pile of unprivileged UIDs, mount the filesystem such that every on
>>> -disk UID gets a unique unprivileged UID.
>>
>> So is this.  Basically anything that begins by mounting gets a super
>> block and can use the s_user_ns to map from the filesystem view to the
>> kernel view of ids.  Apart from greater sophistication in the
>> parametrisation, it sounds like we have all the machinery you need.
>>  I'm sure the containers people will consider reasonable patches to
>> change this.
>
> Yes.
>
> And to be clear we have all of that merged now and mostly present and
> hooked up in all filesystems without any shiftfs like changes needed.
>
> To use this with a filesystem a last pass needs to be had to verify that
> the cases where something does not map are handled cleanly.

Still this does not answer the question how to dynamically
*attach/share* data or read-only volumes as defined by
orchestration/container tools into several containers. Am I missing
something or is the plan to have per superblock mount for each one ?
Eric W. Biederman Feb. 15, 2017, 9:37 a.m. UTC | #45
Djalal Harouni <tixxdz@gmail.com> writes:

> On Mon, Feb 13, 2017 at 11:15 AM, Eric W. Biederman
> <ebiederm@xmission.com> wrote:
>> James Bottomley <James.Bottomley@HansenPartnership.com> writes:

>>> So is this.  Basically anything that begins by mounting gets a super
>>> block and can use the s_user_ns to map from the filesystem view to the
>>> kernel view of ids.  Apart from greater sophistication in the
>>> parametrisation, it sounds like we have all the machinery you need.
>>>  I'm sure the containers people will consider reasonable patches to
>>> change this.
>>
>> Yes.
>>
>> And to be clear we have all of that merged now and mostly present and
>> hooked up in all filesystems without any shiftfs like changes needed.
>>
>> To use this with a filesystem a last pass needs to be had to verify that
>> the cases where something does not map are handled cleanly.
>
> Still this does not answer the question how to dynamically
> *attach/share* data or read-only volumes as defined by
> orchestration/container tools into several containers. Am I missing
> something or is the plan to have per superblock mount for each one ?

Agreed.  That is a related problem and the problem that shiftfs
is working to solve.

If you only need a single mapping the infrastructure is basically done
in the kernel today.  If you need multiple mappings we need something
more.

Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Djalal Harouni Feb. 15, 2017, 10:04 a.m. UTC | #46
On Wed, Feb 15, 2017 at 10:37 AM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
> Djalal Harouni <tixxdz@gmail.com> writes:
>
>> On Mon, Feb 13, 2017 at 11:15 AM, Eric W. Biederman
>> <ebiederm@xmission.com> wrote:
>>> James Bottomley <James.Bottomley@HansenPartnership.com> writes:
>
>>>> So is this.  Basically anything that begins by mounting gets a super
>>>> block and can use the s_user_ns to map from the filesystem view to the
>>>> kernel view of ids.  Apart from greater sophistication in the
>>>> parametrisation, it sounds like we have all the machinery you need.
>>>>  I'm sure the containers people will consider reasonable patches to
>>>> change this.
>>>
>>> Yes.
>>>
>>> And to be clear we have all of that merged now and mostly present and
>>> hooked up in all filesystems without any shiftfs like changes needed.
>>>
>>> To use this with a filesystem a last pass needs to be had to verify that
>>> the cases where something does not map are handled cleanly.
>>
>> Still this does not answer the question how to dynamically
>> *attach/share* data or read-only volumes as defined by
>> orchestration/container tools into several containers. Am I missing
>> something or is the plan to have per superblock mount for each one ?
>
> Agreed.  That is a related problem and the problem that shiftfs
> is working to solve.
>
> If you only need a single mapping the infrastructure is basically done
> in the kernel today.  If you need multiple mappings we need something
> more.

Yes, I'm asking since there is that vfs+userns proposed approach that
I linked in this thread, that deals with this particular problem: in
which mount namespace<->container the volume appears, maybe that can
be used on top of the s_user_ns ...

Thanks!
Vivek Goyal Feb. 15, 2017, 2:17 p.m. UTC | #47
On Tue, Feb 14, 2017 at 03:45:55PM -0800, James Bottomley wrote:
> On Tue, 2017-02-14 at 18:03 -0500, Vivek Goyal wrote:
> > On Sun, Feb 05, 2017 at 05:18:11PM -0800, James Bottomley wrote:
> > 
> > [..]
> > > >  shiftfs is going to miss out on overlayfs bug fixes related to
> > > > user 
> > > > credentials differ from mounter credentials, like fd3220d ("ovl: 
> > > > update S_ISGID when setting posix ACLs"). I am not sure that this
> > > > specific case is relevant to shiftfs, but there could be other.
> > > 
> > > OK, so shiftfs doesn't have this bug and the reason why is
> > > illustrative: basically shiftfs does three things
> > > 
> > >    1. lookups via a uid/gid shifted dentry cache
> > >    2. shifted credential inode operations permission checks on the
> > >       underlying filesystem
> > >    3. location marking for unprivileged mount
> > > 
> > > I think we've already seen that 1. isn't from overlayfs but the
> > > functionality could be added to overlayfs, I suppose.  The big 
> > > problem is 2.  The overlayfs code emulates the permission checks, 
> > > which makes it rather complex (this is where you get your bugs like 
> > > the above from).  I did actually look at adding 2. to overlayfs on 
> > > the theory that a single layer overlay might be closest to what 
> > > this is, but eventually concluded I'd have to take the special 
> > > cases and add a whole lot more to them ... it really would increase 
> > > the maintenance burden substantially and make the code an
> > > unreadable rats nest.
> > 
> > Hi James,
> > 
> > If we merge this functionality in overlayfs, then we could avoid 
> > extra copy of dentry/inode and that might be a significant advantage.
> 
> I made that argument to Viro originally when I tried to do all lookups
> via the underlying cache.  In the end, it's 192 bytes per dentry and
> 584 per inode, all of which are reclaimable, so it's not much of an
> advantage and it is a great simplification to the code.  In general if
> you have a natural separation, you should make the layers reflect it.

ok.

> 
> My container use case doesn't use overlayfs currently, so to me it
> wouldn't provide any advantage whatsoever.

In docker and other use cases, this probably will be used in conjunction
with overlayfs as containers would like to write data and that should not
go back to image dir and should be sent to container specific dir.

> 
> > W.r.t permission checks, I am wondering will it make sense to do what
> > overlayfs is doing for shiftfs. That is permission is checked on
> > two inodes. We use creds of task for checking permission on
> > shiftfs/overlay inode and mounter's creds on real inode.
> 
> The mounter's creds for overlay are usually admin ones, so it's local
> permission check asks should I? and the later one asks can I? (as in
> would my original admin creds allow this).  In many ways, overlayfs is
> ignoring the fact that the underlying ->permissions() call might have
> failed for some good reason on the current creds.  I don't think any
> serious trouble results from this but it strikes me as icky.

So we do call ->permission() of underlying inode but with the creds of
mounter (as you noted). Given we don't call reali->permission() with
the creds of task, it resulted in issues with disk quota. mounter
had CAP_SYS_RESOURCE so disk quota was being ignored. But that's easily
fixable by taking away CAP_SYS_RESOURCE from mounter's creds if caller
does not have CAP_SYS_RESOURCE.

> 
> > Given we have already shifted the uid/gid for shiftfs inode, I am 
> > wondering that why can't we simply call generic_permission(shiftfs_in
> > ode, mask) directly in the context of caller. Something like..
> > 
> > shiftfs_permission() {
> >   err = generic_permission(inode, mask);
> >   if (err)
> >   	return err;
> > 
> >   switch_to_mounter_creds;
> >   err = inode_permission(reali, mask);
> >   revert_creds();
> > 
> >   return err;
> > }
> 
> Because if the reali->d_iop->permission exists, you should use it.  You
> could argue shiftfs_permission should be
> 
> 	if (iop->permission) {
> 		oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
> 		err = iop->permission(reali, mask);
> 		shiftfs_old_creds(oldcred, &newcred);
> 	} else
> 		err = generic_permission(inode, mask);
> 
> But really that's a small optimisation.

ok. I thought using mounter's creds for real inode checks, will probably
do away with need of modifying caller's user namespace in
shiftfs_get_up_creds().

cred->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, cred->fsuid));
cred->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, cred->fsgid));
cred->user_ns = ssi->userns;

IIUC, we are shifting caller's fsuid and fsgid into caller's user
namespace but at the same time using the user_ns of reali->sb->sb_user_ns.
Feels little twisted to me. May be I am misunderstanding it.

Two levels of checks will simplify this a bit. Top level inode will belong
to the user namespace of caller and checks should pass. And mounter's
creds will have ownership over the real inode so no additional namespace
shifting required there. We could also save these creds at mount time
once and don't have to prepare it for every call. (not sure if it has
significant performance issue or not).  Just thinking aloud...

Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vivek Goyal Feb. 15, 2017, 8:34 p.m. UTC | #48
On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:

[..]
> +static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
> +				     unsigned int flags)
> +{
> +	struct dentry *real = dir->i_private, *new;
> +	struct inode *reali = real->d_inode, *newi;
> +	const struct cred *oldcred, *newcred;
> +
> +	inode_lock(reali);
> +	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> +	new = lookup_one_len(dentry->d_name.name, real, dentry->d_name.len);
> +	shiftfs_old_creds(oldcred, &newcred);
> +	inode_unlock(reali);
> +
> +	if (IS_ERR(new))
> +		return new;
> +
> +	dentry->d_fsdata = new;
> +
> +	if (!new->d_inode)
> +		return NULL;
> +
> +	newi = shiftfs_new_inode(dentry->d_sb, new->d_inode->i_mode, new);
> +	if (!newi) {
> +		dput(new);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	d_splice_alias(newi, dentry);

Hi James,

Should it be "return d_splice_alias()" so that if we find an alias it is
returned back to caller and passed in dentry can be freed. Though I don't
know in what cases alias can be found. And if alias is found how do we
make sure alias_dentry->d_fsdata is pointing to new (real dentry).

> +
> +	return NULL;
> +}

Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 16, 2017, 3:51 p.m. UTC | #49
On Wed, 2017-02-15 at 09:17 -0500, Vivek Goyal wrote:
> On Tue, Feb 14, 2017 at 03:45:55PM -0800, James Bottomley wrote:
> > On Tue, 2017-02-14 at 18:03 -0500, Vivek Goyal wrote:

[...]
> > > Given we have already shifted the uid/gid for shiftfs inode, I am
> > > wondering that why can't we simply call
> > > generic_permission(shiftfs_inode, mask) directly in the context
> > > of caller. Something like..
> > > 
> > > shiftfs_permission() {
> > >   err = generic_permission(inode, mask);
> > >   if (err)
> > >   	return err;
> > > 
> > >   switch_to_mounter_creds;
> > >   err = inode_permission(reali, mask);
> > >   revert_creds();
> > > 
> > >   return err;
> > > }
> > 
> > Because if the reali->d_iop->permission exists, you should use it. 
> >  You could argue shiftfs_permission should be
> > 
> > 	if (iop->permission) {
> > 		oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
> > 		err = iop->permission(reali, mask);
> > 		shiftfs_old_creds(oldcred, &newcred);
> > 	} else
> > 		err = generic_permission(inode, mask);
> > 
> > But really that's a small optimisation.
> 
> ok. I thought using mounter's creds for real inode checks, will 
> probably do away with need of modifying caller's user namespace in
> shiftfs_get_up_creds().

Well, no ... the mounter of a marked superblock is container admin, but
the owner in the filesystem view is real root.  The unprivileged
mounter's credentials aren't sufficient, therefore. 

> cred->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, cred->fsuid));
> cred->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, cred->fsgid));
> cred->user_ns = ssi->userns;
> 
> IIUC, we are shifting caller's fsuid and fsgid into caller's user
> namespace but at the same time using the user_ns of reali->sb
> ->sb_user_ns. Feels little twisted to me. May be I am
> misunderstanding it.

Actually what we're doing is shifting the credentials into the
underlying mount's filesystem view.

> Two levels of checks will simplify this a bit. Top level inode will 
> belong to the user namespace of caller and checks should pass. And 
> mounter's creds will have ownership over the real inode so no 
> additional namespace shifting required there.

That's the problem: for a marked mount, they don't.

>  We could also save these creds at mount time once and don't have to
> prepare it for every call. (not sure if it has significant
> performance issue or not).  Just thinking aloud...

If it proves to be an issue, I suppose the shift could be cached, but I
really don't think it matters that much.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 16, 2017, 3:56 p.m. UTC | #50
On Wed, 2017-02-15 at 15:34 -0500, Vivek Goyal wrote:
> On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:
> 
> [..]
> > +static struct dentry *shiftfs_lookup(struct inode *dir, struct
> > dentry *dentry,
> > +				     unsigned int flags)
> > +{
> > +	struct dentry *real = dir->i_private, *new;
> > +	struct inode *reali = real->d_inode, *newi;
> > +	const struct cred *oldcred, *newcred;
> > +
> > +	inode_lock(reali);
> > +	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > +	new = lookup_one_len(dentry->d_name.name, real, dentry
> > ->d_name.len);
> > +	shiftfs_old_creds(oldcred, &newcred);
> > +	inode_unlock(reali);
> > +
> > +	if (IS_ERR(new))
> > +		return new;
> > +
> > +	dentry->d_fsdata = new;
> > +
> > +	if (!new->d_inode)
> > +		return NULL;
> > +
> > +	newi = shiftfs_new_inode(dentry->d_sb, new->d_inode
> > ->i_mode, new);
> > +	if (!newi) {
> > +		dput(new);
> > +		return ERR_PTR(-ENOMEM);
> > +	}
> > +
> > +	d_splice_alias(newi, dentry);
> 
> Hi James,
> 
> Should it be "return d_splice_alias()" so that if we find an alias it 
> is returned back to caller and passed in dentry can be freed. Though 
> I don't know in what cases alias can be found. And if alias is found 
> how do we make sure alias_dentry->d_fsdata is pointing to new (real
> dentry).

It probably should be for the sake of the pattern.  In our case I don't
think we can have any root aliases because the root dentry is always
pinned in the cache, so cache lookup should always find it.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vivek Goyal Feb. 16, 2017, 4:42 p.m. UTC | #51
On Thu, Feb 16, 2017 at 07:51:58AM -0800, James Bottomley wrote:

[..]
> > Two levels of checks will simplify this a bit. Top level inode will 
> > belong to the user namespace of caller and checks should pass. And 
> > mounter's creds will have ownership over the real inode so no 
> > additional namespace shifting required there.
> 
> That's the problem: for a marked mount, they don't.

In this new model it does not fit directly. 

I was playing with a slightly different approach and modified patches so
that real root still does the mounting and takes an mount option which
specifies which user namespace we want to shift into. Thanks to Eric for
the idea.

mount -t shiftfs -o userns_fd=<fd> source shifted-fs

In this case real-root is mounter and notion of using mounter's creds on
real-inode works.

This requires a user namespace to be created before shiftfs can be mounted
and then container admin should be able to bind mount shifted-fs.

In this model, intervention of real-root is still required to setup
container and shiftfs. I guess that might not satisfy your needs where
unprivileged user should be able to launch container and be able to
make use of shiftfs, IIUC.

Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 16, 2017, 4:58 p.m. UTC | #52
On Thu, 2017-02-16 at 11:42 -0500, Vivek Goyal wrote:
> On Thu, Feb 16, 2017 at 07:51:58AM -0800, James Bottomley wrote:
> 
> [..]
> > > Two levels of checks will simplify this a bit. Top level inode 
> > > will belong to the user namespace of caller and checks should 
> > > pass. And mounter's creds will have ownership over the real inode 
> > > so no additional namespace shifting required there.
> > 
> > That's the problem: for a marked mount, they don't.
> 
> In this new model it does not fit directly. 
> 
> I was playing with a slightly different approach and modified patches 
> so that real root still does the mounting and takes an mount option
> which specifies which user namespace we want to shift into. Thanks to 
> Eric for the idea.
> 
> mount -t shiftfs -o userns_fd=<fd> source shifted-fs

This is a non-starter because it doesn't work for the unprivileged use
case, which is what I'm really interested in.  For fully unprivileged
containers you don't have an orchestration system to ask to build the
container.  You can get init scripts to set stuff up for you, like the
marks, but ideally it should just work even without that (so an inode
flag following project semantics seems really appealing), but after
that the unprivileged user should be able to build their own
containers.

As you saw from the reply to Eric, this approach (which I have tried)
also opens up a whole can of worms for non-FS_USERNS_MOUNT filesystems.

James


> In this case real-root is mounter and notion of using mounter's creds 
> on real-inode works. 
> This requires a user namespace to be created before shiftfs can be 
> mounted and then container admin should be able to bind mount shifted
> -fs.
> 
> In this model, intervention of real-root is still required to setup
> container and shiftfs. I guess that might not satisfy your needs 
> where unprivileged user should be able to launch container and be 
> able to make use of shiftfs, IIUC.
> 
> Vivek
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Feb. 17, 2017, 1:57 a.m. UTC | #53
James Bottomley <James.Bottomley@HansenPartnership.com> writes:

> On Thu, 2017-02-16 at 11:42 -0500, Vivek Goyal wrote:
>> On Thu, Feb 16, 2017 at 07:51:58AM -0800, James Bottomley wrote:
>> 
>> [..]
>> > > Two levels of checks will simplify this a bit. Top level inode 
>> > > will belong to the user namespace of caller and checks should 
>> > > pass. And mounter's creds will have ownership over the real inode 
>> > > so no additional namespace shifting required there.
>> > 
>> > That's the problem: for a marked mount, they don't.
>> 
>> In this new model it does not fit directly. 
>> 
>> I was playing with a slightly different approach and modified patches 
>> so that real root still does the mounting and takes an mount option
>> which specifies which user namespace we want to shift into. Thanks to 
>> Eric for the idea.
>> 
>> mount -t shiftfs -o userns_fd=<fd> source shifted-fs

> This is a non-starter because it doesn't work for the unprivileged use
> case, which is what I'm really interested in.

But I believe it does.  It just requires a bit more work for in the
shiftfs filesystem above.  It should be perfectly possible with the help
of newuidmap to create a user namespace with the desired mappings.

My understanding is that Vivek started with requiring root to mount the
filesystem only as a simplification during development, and that the
plan is to get the basic use case working and then allow unprivileged
mounting.

> For fully unprivileged
> containers you don't have an orchestration system to ask to build the
> container.  You can get init scripts to set stuff up for you, like the
> marks, but ideally it should just work even without that (so an inode
> flag following project semantics seems really appealing), but after
> that the unprivileged user should be able to build their own
> containers.
>
> As you saw from the reply to Eric, this approach (which I have tried)
> also opens up a whole can of worms for non-FS_USERNS_MOUNT filesystems.
>

From what I can see we have two cases we care about.
A) A non-default mapping from the filesystem to the rest of the system
   and roughly s_user_ns provides that but we need a review of the
   filesystems to make certain something has not been forgotten.

B) A filesystem image sitting around in a directory somewhere that
   we want to map differently into different user namespaces while
   using the same files as backing store.

For the second case what is interesting technically is that we want
multiple mappings.  A user namespace appears adequate to specify those
extra mappings (effectively from kuids to kuids).

So we need something to associate the additional mapping with a
directory tree.  A stackable filesystem with it's own s_user_ns field
appears a very straight forward way to do that.  Especially if it can
figure out how to assert that the underlying filesystem image is
read-only (doesn't overlayfs require that?).  Making the entire stack
read-only.

I don't see a problem with that for unprivileged use (except possibly
the read-only enforcement on the unerlying directory tree).

What Vivek is talking about appears to be perfectly correct.  Performing
the underlying filesystem permission checks as the possibly unprivileged
user who mounted shiftfs.  After performing a set of permission checks
(at the shiftfs level) as the user who is accessing the files.


. . .


I think I am missing something but I completely do not understand that
subthread that says use file marks and perform the work in the vfs.
The problem is that fundamentally we need multiple mappings and I don't
see a mark on a file (even an inherited mark) providing the mapping so I
don't see the point.

Which leaves two possible places to store the extra mapping.  In the
struct mount.  Or in a stacked filesystem super_block.  For a stacked
filesystem I can see where to store the extra translation.  In the upper
filesystems upper inode.   And we can perform the practical permission
check on the upper inode as well.

For a vfs level solution it looks like we would have to change all of
the permission checking code in the kernel to have a special case for
this kind of mapping.  Which does not sound maintainable.

So at the moment I don't think a vfs level solution makes any sense.

And then if you have a stacked filesystem with FS_USERNS_MOUNT set so it
can be mounted by an unprivileged user.   I think it makes sense to
check the mounters creds agains the real inode.  To verify the user that
mounted the filesystem has the permission to perform the desired access.

Which makes only allows the mounter as much permisison as the mounter
would have if they performed the work with fuse instead of a special
in-kernel filesystem.

In a DAC model of the world that makes lots of sense.  I don't know what
actually makes sense in a MAC world.  But I am certain that is something
that can be worked through.

Eric




--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 17, 2017, 2:29 a.m. UTC | #54
On Sat, Feb 04, 2017 at 11:19:32AM -0800, James Bottomley wrote:

> +static const struct dentry_operations shiftfs_dentry_ops = {
> +	.d_release	= shiftfs_d_release,
> +	.d_real		= shiftfs_d_real,
> +};

In other words, those dentries are *never* revalidated.  Nevermind that
underlying fs might be mounted elsewhere and be actively modified under
you.

> +static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
> +				     unsigned int flags)
> +{
> +	struct dentry *real = dir->i_private, *new;
> +	struct inode *reali = real->d_inode, *newi;
> +	const struct cred *oldcred, *newcred;
> +
> +	inode_lock(reali);
> +	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> +	new = lookup_one_len(dentry->d_name.name, real, dentry->d_name.len);
> +	shiftfs_old_creds(oldcred, &newcred);
> +	inode_unlock(reali);
> +
> +	if (IS_ERR(new))
> +		return new;
> +
> +	dentry->d_fsdata = new;
> +
> +	if (!new->d_inode)
> +		return NULL;

What happens when somebody comes along and creates the damn thing on the
underlying fs?  _Not_ via your code, that is - using the underlying fs
mounted elsewhere.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 17, 2017, 2:55 a.m. UTC | #55
On Thu, Feb 16, 2017 at 07:56:30AM -0800, James Bottomley wrote:

> > Hi James,
> > 
> > Should it be "return d_splice_alias()" so that if we find an alias it 
> > is returned back to caller and passed in dentry can be freed. Though 
> > I don't know in what cases alias can be found. And if alias is found 
> > how do we make sure alias_dentry->d_fsdata is pointing to new (real
> > dentry).
> 
> It probably should be for the sake of the pattern.  In our case I don't
> think we can have any root aliases because the root dentry is always
> pinned in the cache, so cache lookup should always find it.

What does that have to do with root dentry?  The real reason why that code
works (FVerySVO) is that the damn thing allocates a new inode every time.
Including the hardlinks, BTW.  So d_splice_alias() will always return NULL -
there's no way for any dentries to be pointing to in-core struct inode you've
just allocated.  Short of a use-after-free, that is...

Unless I'm missing something subtle, the whole thing is fucked
in head wrt cache coherency - its dentries are blindly assumed to be
forever valid, no matter what's happening with the underlying filesystem.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Djalal Harouni Feb. 17, 2017, 8:39 a.m. UTC | #56
On Fri, Feb 17, 2017 at 2:57 AM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
> James Bottomley <James.Bottomley@HansenPartnership.com> writes:
>
>> On Thu, 2017-02-16 at 11:42 -0500, Vivek Goyal wrote:
>>> On Thu, Feb 16, 2017 at 07:51:58AM -0800, James Bottomley wrote:
>>>
>>> [..]
>>> > > Two levels of checks will simplify this a bit. Top level inode
>>> > > will belong to the user namespace of caller and checks should
>>> > > pass. And mounter's creds will have ownership over the real inode
>>> > > so no additional namespace shifting required there.
>>> >
>>> > That's the problem: for a marked mount, they don't.
>>>
>>> In this new model it does not fit directly.
>>>
>>> I was playing with a slightly different approach and modified patches
>>> so that real root still does the mounting and takes an mount option
>>> which specifies which user namespace we want to shift into. Thanks to
>>> Eric for the idea.
>>>
>>> mount -t shiftfs -o userns_fd=<fd> source shifted-fs
>
>> This is a non-starter because it doesn't work for the unprivileged use
>> case, which is what I'm really interested in.
>
> But I believe it does.  It just requires a bit more work for in the
> shiftfs filesystem above.  It should be perfectly possible with the help
> of newuidmap to create a user namespace with the desired mappings.
>
> My understanding is that Vivek started with requiring root to mount the
> filesystem only as a simplification during development, and that the
> plan is to get the basic use case working and then allow unprivileged
> mounting.
>
>> For fully unprivileged
>> containers you don't have an orchestration system to ask to build the
>> container.  You can get init scripts to set stuff up for you, like the
>> marks, but ideally it should just work even without that (so an inode
>> flag following project semantics seems really appealing), but after
>> that the unprivileged user should be able to build their own
>> containers.
>>
>> As you saw from the reply to Eric, this approach (which I have tried)
>> also opens up a whole can of worms for non-FS_USERNS_MOUNT filesystems.
>>
>
> From what I can see we have two cases we care about.
> A) A non-default mapping from the filesystem to the rest of the system
>    and roughly s_user_ns provides that but we need a review of the
>    filesystems to make certain something has not been forgotten.
>
> B) A filesystem image sitting around in a directory somewhere that
>    we want to map differently into different user namespaces while
>    using the same files as backing store.
>
> For the second case what is interesting technically is that we want
> multiple mappings.  A user namespace appears adequate to specify those
> extra mappings (effectively from kuids to kuids).
>
> So we need something to associate the additional mapping with a
> directory tree.  A stackable filesystem with it's own s_user_ns field
> appears a very straight forward way to do that.  Especially if it can
> figure out how to assert that the underlying filesystem image is
> read-only (doesn't overlayfs require that?).  Making the entire stack
> read-only.
>
> I don't see a problem with that for unprivileged use (except possibly
> the read-only enforcement on the unerlying directory tree).
>
> What Vivek is talking about appears to be perfectly correct.  Performing
> the underlying filesystem permission checks as the possibly unprivileged
> user who mounted shiftfs.  After performing a set of permission checks
> (at the shiftfs level) as the user who is accessing the files.
>
>
> . . .
>
>
> I think I am missing something but I completely do not understand that
> subthread that says use file marks and perform the work in the vfs.
> The problem is that fundamentally we need multiple mappings and I don't
> see a mark on a file (even an inherited mark) providing the mapping so I
> don't see the point.
>
> Which leaves two possible places to store the extra mapping.  In the
> struct mount.  Or in a stacked filesystem super_block.  For a stacked
> filesystem I can see where to store the extra translation.  In the upper
> filesystems upper inode.   And we can perform the practical permission
> check on the upper inode as well.
>
> For a vfs level solution it looks like we would have to change all of
> the permission checking code in the kernel to have a special case for
> this kind of mapping.  Which does not sound maintainable.

Facts: for basic permissions: 3 files changed, 19 insertions(+), 6
deletions(-)  https://lkml.org/lkml/2016/5/4/417

That made permissions work for basically *all* filesystems. However
yes it does not handle xattr acls...

> So at the moment I don't think a vfs level solution makes any sense.
>
The permissions change was already done when userns were merged. What
you may need is VFS accessors, instead of working directly on
inode->i_uid ask the VFS to give you the right i_uid (which can also
be the case of projectid proposed by Christoph iff I got it right...)
you need it for both ways: to report to userspace and the other way to
pass it to the underlying filesystem for writes/quota which Dave
Chinner pointed out.

Any way seems the ship has settled, so my thoughts at that time were
to follow the change made for i_uid_read(), i_gid_read() helpers where
userns were merged. The code comment says: "Helper functions so that
in most cases filesystems will not need to deal directly with kuid_t
and kgid_t" so the start was from there: VFS should be the one to
handle everything using accessors for both directions. Now if you guys
think that having multiple user namespaces contexts for every
container, mount namepsace user namespace, s_user_ns and shiftfs user
ns ... or multiple APIs that will just add confusion, me I can see
this directly with orchestration/containers developers they just don't
understand what's happening... ? they want something like bind mounts!
A new filesystem is a new filesystem.

Maybe Eric you will find something useful from these comments.

Thanks!
James Bottomley Feb. 17, 2017, 5:19 p.m. UTC | #57
On Fri, 2017-02-17 at 14:57 +1300, Eric W. Biederman wrote:
> I think I am missing something but I completely do not understand 
> that subthread that says use file marks and perform the work in the 
> vfs. The problem is that fundamentally we need multiple mappings and 
> I don't see a mark on a file (even an inherited mark) providing the 
> mapping so I don't see the point.

The point of the mark is that it's a statement by the system
administrator that the underlying subtree is safe to be mounted by an
unprivileged container in the containers user view (i.e. with
current_user_ns() == s_user_ns).  For the unprivileged container
there's no real arbitrary s_user_ns use case because the unprivileged
container must prove it can set up the mapping, so it would likely
always be mounting from within a user_ns with the mapping it wanted.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 17, 2017, 5:34 p.m. UTC | #58
On Fri, 2017-02-17 at 02:55 +0000, Al Viro wrote:
> On Thu, Feb 16, 2017 at 07:56:30AM -0800, James Bottomley wrote:
> 
> > > Hi James,
> > > 
> > > Should it be "return d_splice_alias()" so that if we find an 
> > > alias it is returned back to caller and passed in dentry can be 
> > > freed. Though I don't know in what cases alias can be found. And 
> > > if alias is found how do we make sure alias_dentry->d_fsdata is 
> > > pointing to new (real dentry).
> > 
> > It probably should be for the sake of the pattern.  In our case I 
> > don't think we can have any root aliases because the root dentry is
> > always pinned in the cache, so cache lookup should always find it.
> 
> What does that have to do with root dentry?  The real reason why that 
> code works (FVerySVO) is that the damn thing allocates a new inode 
> every time. Including the hardlinks, BTW.

Yes, this is a known characteristic of stacked filesystems.  Is there
some magic I don't know about that would make it easier to reflect hard
links as aliases?

>   So d_splice_alias() will always return NULL - there's no way for 
> any dentries to be pointing to in-core struct inode you've
> just allocated.  Short of a use-after-free, that is...
> 
> Unless I'm missing something subtle, the whole thing is fucked
> in head wrt cache coherency - its dentries are blindly assumed to be
> forever valid, no matter what's happening with the underlying 
> filesystem.

Hopefully the patch in the previous email fixes this.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vivek Goyal Feb. 17, 2017, 8:35 p.m. UTC | #59
On Fri, Feb 17, 2017 at 09:34:07AM -0800, James Bottomley wrote:
> On Fri, 2017-02-17 at 02:55 +0000, Al Viro wrote:
> > On Thu, Feb 16, 2017 at 07:56:30AM -0800, James Bottomley wrote:
> > 
> > > > Hi James,
> > > > 
> > > > Should it be "return d_splice_alias()" so that if we find an 
> > > > alias it is returned back to caller and passed in dentry can be 
> > > > freed. Though I don't know in what cases alias can be found. And 
> > > > if alias is found how do we make sure alias_dentry->d_fsdata is 
> > > > pointing to new (real dentry).
> > > 
> > > It probably should be for the sake of the pattern.  In our case I 
> > > don't think we can have any root aliases because the root dentry is
> > > always pinned in the cache, so cache lookup should always find it.
> > 
> > What does that have to do with root dentry?  The real reason why that 
> > code works (FVerySVO) is that the damn thing allocates a new inode 
> > every time. Including the hardlinks, BTW.
> 
> Yes, this is a known characteristic of stacked filesystems.  Is there
> some magic I don't know about that would make it easier to reflect hard
> links as aliases?

I think overlayfs had the same issue in the beginning and miklos fixed it.

commit 51f7e52dc943468c6929fa0a82d4afac3c8e9636
Author: Miklos Szeredi <mszeredi@redhat.com>
Date:   Fri Jul 29 12:05:24 2016 +0200

    ovl: share inode for hard link

Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Feb. 20, 2017, 4:24 a.m. UTC | #60
James Bottomley <James.Bottomley@HansenPartnership.com> writes:

> On Fri, 2017-02-17 at 14:57 +1300, Eric W. Biederman wrote:
>> I think I am missing something but I completely do not understand 
>> that subthread that says use file marks and perform the work in the 
>> vfs. The problem is that fundamentally we need multiple mappings and 
>> I don't see a mark on a file (even an inherited mark) providing the 
>> mapping so I don't see the point.
>
> The point of the mark is that it's a statement by the system
> administrator that the underlying subtree is safe to be mounted by an
> unprivileged container in the containers user view (i.e. with
> current_user_ns() == s_user_ns).  For the unprivileged container
> there's no real arbitrary s_user_ns use case because the unprivileged
> container must prove it can set up the mapping, so it would likely
> always be mounting from within a user_ns with the mapping it wanted.

As a statement that it is ok for the unprivileged mapping code to
operate that seems reasonable.  I don't currently the need for such an
ok from the system adminstrator, but if you need it a flag that
propagates to children and child directories seems reasonable.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 21, 2017, 12:48 a.m. UTC | #61
On Tue, 2017-02-07 at 01:24 +0900, J. R. Okajima wrote:
> James Bottomley:
> > Yes, I know the problem.  However, I believe most current linux
> > filesystems no longer guarantee stable, for the lifetime of the 
> > file, inode numbers.  The usual docker container root is overlayfs,
> > which, similarly doesn't support stable inode numbers.  I see the 
> > odd complaint about docker with overlayfs having unstable inode
> > numbers, but none seems to have any serious repercussions.
> 
> I think it serious.
> Reusing the backend fs' inum is a good approach which Amir wrote.
> Based on this, I'd suggest you to support the hardlinks.

I realised as I was trimming down the vestigial inode properties in the
patch that actually shiftfs does use the i_ino from the underlying for
userspace.  The reason why is that it comes from the getattr call in
stat and that's fully what the underlying filesystem returns (including
the inode number).

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. R. Okajima Feb. 21, 2017, 2:57 a.m. UTC | #62
James Bottomley:
> I realised as I was trimming down the vestigial inode properties in the
> patch that actually shiftfs does use the i_ino from the underlying for
> userspace.  The reason why is that it comes from the getattr call in
> stat and that's fully what the underlying filesystem returns (including
> the inode number).

Let me make sure.
- shiftfs has its own inode, but it will never be visible to userspace.
- the inode attr visible to users are equivalent to the underlying one,
  includeing dev:ino pair.
right?
If so, I am afraid it will make users confused. The dev:ino pair is a
system-wide identity, but shiftfs creates the same dev:ino pair with
different owner. Though I don't know whether the actual application or
LSM exists or not who will be damaged by this situation.
For git-status case which I wrote previously, it might not be a problem
as long as dev:ino is unchanged from git index.
But such filesystem looks weird.


J. R. Okajima
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 21, 2017, 4:07 a.m. UTC | #63
On Tue, 2017-02-21 at 11:57 +0900, J. R. Okajima wrote:
> James Bottomley:
> > I realised as I was trimming down the vestigial inode properties in 
> > the patch that actually shiftfs does use the i_ino from the 
> > underlying for userspace.  The reason why is that it comes from the 
> > getattr call in stat and that's fully what the underlying 
> > filesystem returns (including the inode number).
> 
> Let me make sure.
> - shiftfs has its own inode, but it will never be visible to 
> userspace. - the inode attr visible to users are equivalent to the 
> underlying one,   includeing dev:ino pair.
> right?

Yes, it behaves like a bind mount.

> If so, I am afraid it will make users confused. The dev:ino pair is a
> system-wide identity,

I don't believe it will, otherwise they'd have the same confusion over
a real bind mount.  The dev:inum pair identifies an inode.  An inode
may have many paths and shiftfs just adds a path.

>  but shiftfs creates the same dev:ino pair with different owner.

With a different owner view, but that's irrelevant to the underlying
inode.

>  Though I don't know whether the actual application or LSM exists or
> not who will be damaged by this situation.
> For git-status case which I wrote previously, it might not be a 
> problem as long as dev:ino is unchanged from git index.
> But such filesystem looks weird.

It behaves as much as possible like a bind mount and the user view is
standard behaviour, so it can't really be classified as "weird".  What
won't work like a classic bind mount in this scenario is NFS exporting,
but that's about the only thing.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. R. Okajima Feb. 21, 2017, 4:34 a.m. UTC | #64
James Bottomley:
> With a different owner view, but that's irrelevant to the underlying
> inode.

Ok, the different ownership is limited within shitfs (or userns,
container). Good. I might forget that shiftfs wants to behave like
bind-mount.

And I noticed that shiftfs setattr() converts uid/gid before calling
backend fs' ->setattr(). It is good too.
But how about acl? Won't such conversion be necessary for acl too?


J. R. Okajima
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Feb. 22, 2017, 12:01 p.m. UTC | #65
On Mon, 2017-02-20 at 17:24 +1300, Eric W. Biederman wrote:
> James Bottomley <James.Bottomley@HansenPartnership.com> writes:
> 
> > On Fri, 2017-02-17 at 14:57 +1300, Eric W. Biederman wrote:
> > > I think I am missing something but I completely do not understand
> > > that subthread that says use file marks and perform the work in 
> > > the vfs. The problem is that fundamentally we need multiple 
> > > mappings and I don't see a mark on a file (even an inherited 
> > > mark) providing the mapping so I don't see the point.
> > 
> > The point of the mark is that it's a statement by the system
> > administrator that the underlying subtree is safe to be mounted by 
> > an unprivileged container in the containers user view (i.e. with
> > current_user_ns() == s_user_ns).  For the unprivileged container
> > there's no real arbitrary s_user_ns use case because the 
> > unprivileged container must prove it can set up the mapping, so it 
> > would likely always be mounting from within a user_ns with the 
> > mapping it wanted.
> 
> As a statement that it is ok for the unprivileged mapping code to
> operate that seems reasonable.  I don't currently the need for such 
> an ok from the system adminstrator, but if you need it a flag that
> propagates to children and child directories seems reasonable.

The other way to do this is with an extended attribute.  I've played
around with that approach and quite like it: the advantage is that it's
sticky across system reboots; The down side is that it requires
additional VFS code to make sure you can't execute from the non-user_ns
view.

James

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/Kconfig b/fs/Kconfig
index c2a377c..b6adac0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -104,6 +104,14 @@  source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 source "fs/overlayfs/Kconfig"
 
+config SHIFT_FS
+	tristate "UID/GID shifting overlay filesystem for containers"
+	help
+	  This filesystem can overlay any mounted filesystem and shift
+	  the uid/gid the files appear at.  The idea is that
+	  unprivileged containers can use this to mount root volumes
+	  using this technique.
+
 menu "Caches"
 
 source "fs/fscache/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 7bbaca9..2aa3ad4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -128,3 +128,4 @@  obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
+obj-$(CONFIG_SHIFT_FS)		+= shiftfs.o
diff --git a/fs/shiftfs.c b/fs/shiftfs.c
new file mode 100644
index 0000000..a4a1f98
--- /dev/null
+++ b/fs/shiftfs.c
@@ -0,0 +1,728 @@ 
+#include <linux/cred.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/uidgid.h>
+#include <linux/xattr.h>
+
+struct shiftfs_super_info {
+	struct vfsmount *mnt;
+	struct user_namespace *userns;
+	bool mark;
+};
+
+static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
+				       struct dentry *dentry);
+
+enum {
+	OPT_MARK,
+	OPT_LAST,
+};
+
+/* global filesystem options */
+static const match_table_t tokens = {
+	{ OPT_MARK, "mark" },
+	{ OPT_LAST, NULL }
+};
+
+static const struct cred *shiftfs_get_up_creds(struct super_block *sb)
+{
+	struct shiftfs_super_info *ssi = sb->s_fs_info;
+	struct cred *cred = prepare_creds();
+
+	if (!cred)
+		return NULL;
+
+	cred->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, cred->fsuid));
+	cred->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, cred->fsgid));
+	cred->user_ns = ssi->userns;
+
+	return cred;
+}
+
+static const struct cred *shiftfs_new_creds(const struct cred **newcred,
+					    struct super_block *sb)
+{
+	const struct cred *cred = shiftfs_get_up_creds(sb);
+
+	*newcred = cred;
+
+	if (cred)
+		cred = override_creds(cred);
+	else
+		printk(KERN_ERR "shiftfs: Credential override failed: no memory\n");
+
+	return cred;
+}
+
+static void shiftfs_old_creds(const struct cred *oldcred,
+			      const struct cred **newcred)
+{
+	if (!*newcred)
+		return;
+
+	revert_creds(oldcred);
+	put_cred(*newcred);
+}
+
+static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+
+	ssi->mark = false;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case OPT_MARK:
+			ssi->mark = true;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static void shiftfs_d_release(struct dentry *dentry)
+{
+	struct dentry *real = dentry->d_fsdata;
+
+	dput(real);
+}
+
+static struct dentry *shiftfs_d_real(struct dentry *dentry,
+				     const struct inode *inode,
+				     unsigned int flags)
+{
+	struct dentry *real = dentry->d_fsdata;
+
+	if (unlikely(real->d_flags & DCACHE_OP_REAL))
+		return real->d_op->d_real(real, real->d_inode, flags);
+
+	return real;
+}
+
+static const struct dentry_operations shiftfs_dentry_ops = {
+	.d_release	= shiftfs_d_release,
+	.d_real		= shiftfs_d_real,
+};
+
+static int shiftfs_readlink(struct dentry *dentry, char __user *data,
+			    int flags)
+{
+	struct dentry *real = dentry->d_fsdata;
+	const struct inode_operations *iop = real->d_inode->i_op;
+
+	if (iop->readlink)
+		return iop->readlink(real, data, flags);
+
+	return -EINVAL;
+}
+
+static const char *shiftfs_get_link(struct dentry *dentry, struct inode *inode,
+				    struct delayed_call *done)
+{
+	if (dentry) {
+		struct dentry *real = dentry->d_fsdata;
+		struct inode *reali = real->d_inode;
+		const struct inode_operations *iop = reali->i_op;
+		const char *res = ERR_PTR(-EPERM);
+
+		if (iop->get_link)
+			res = iop->get_link(real, reali, done);
+
+		return res;
+	} else {
+		/* RCU lookup not supported */
+		return ERR_PTR(-ECHILD);
+	}
+}
+
+static int shiftfs_setxattr(struct dentry *dentry, struct inode *inode,
+			    const char *name, const void *value,
+			    size_t size, int flags)
+{
+	struct dentry *real = dentry->d_fsdata;
+	int err = -EOPNOTSUPP;
+	const struct cred *oldcred, *newcred;
+
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+	err = vfs_setxattr(real, name, value, size, flags);
+	shiftfs_old_creds(oldcred, &newcred);
+
+	return err;
+}
+
+static int shiftfs_xattr_get(const struct xattr_handler *handler,
+			     struct dentry *dentry, struct inode *inode,
+			     const char *name, void *value, size_t size)
+{
+	struct dentry *real = dentry->d_fsdata;
+	int err;
+	const struct cred *oldcred, *newcred;
+
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+	err = vfs_getxattr(real, name, value, size);
+	shiftfs_old_creds(oldcred, &newcred);
+
+	return err;
+}
+
+static ssize_t shiftfs_listxattr(struct dentry *dentry, char *list,
+				 size_t size)
+{
+	struct dentry *real = dentry->d_fsdata;
+	int err;
+	const struct cred *oldcred, *newcred;
+
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+	err = vfs_listxattr(real, list, size);
+	shiftfs_old_creds(oldcred, &newcred);
+
+	return err;
+}
+
+static int shiftfs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct dentry *real = dentry->d_fsdata;
+	int err;
+	const struct cred *oldcred, *newcred;
+
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+	err = vfs_removexattr(real, name);
+	shiftfs_old_creds(oldcred, &newcred);
+
+	return err;
+}
+
+static int shiftfs_xattr_set(const struct xattr_handler *handler,
+			     struct dentry *dentry, struct inode *inode,
+			     const char *name, const void *value, size_t size,
+			     int flags)
+{
+	if (!value)
+		return shiftfs_removexattr(dentry, name);
+	return shiftfs_setxattr(dentry, inode, name, value, size, flags);
+}
+
+static void shiftfs_fill_inode(struct inode *inode, struct dentry *dentry)
+{
+	struct inode *reali;
+
+	if (!dentry)
+		return;
+
+	reali = dentry->d_inode;
+
+	if (!reali->i_op->get_link)
+		inode->i_opflags |= IOP_NOFOLLOW;
+
+	inode->i_mapping = reali->i_mapping;
+	inode->i_private = dentry;
+
+	i_uid_write(inode, __kuid_val(reali->i_uid));
+	i_gid_write(inode, __kgid_val(reali->i_gid));
+}
+
+static int shiftfs_make_object(struct inode *dir, struct dentry *dentry,
+			       umode_t mode, const char *symlink,
+			       struct dentry *hardlink, bool excl)
+{
+	struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
+	struct inode *reali = real->d_inode, *newi;
+	const struct inode_operations *iop = reali->i_op;
+	int err;
+	const struct cred *oldcred, *newcred;
+	bool op_ok = false;
+
+	if (hardlink) {
+		op_ok = iop->link;
+	} else {
+		switch (mode & S_IFMT) {
+		case S_IFDIR:
+			op_ok = iop->mkdir;
+			break;
+		case S_IFREG:
+			op_ok = iop->create;
+			break;
+		case S_IFLNK:
+			op_ok = iop->symlink;
+		}
+	}
+	if (!op_ok)
+		return -EINVAL;
+
+
+	newi = shiftfs_new_inode(dentry->d_sb, mode, NULL);
+	if (!newi)
+		return -ENOMEM;
+
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+
+	inode_lock_nested(reali, I_MUTEX_PARENT);
+
+	err = -EINVAL;		/* shut gcc up about uninit var */
+	if (hardlink) {
+		struct dentry *realhardlink = hardlink->d_fsdata;
+
+		err = vfs_link(realhardlink, reali, new, NULL);
+	} else {
+		switch (mode & S_IFMT) {
+		case S_IFDIR:
+			err = vfs_mkdir(reali, new, mode);
+			break;
+		case S_IFREG:
+			err = vfs_create(reali, new, mode, excl);
+			break;
+		case S_IFLNK:
+			err = vfs_symlink(reali, new, symlink);
+		}
+	}
+
+	shiftfs_old_creds(oldcred, &newcred);
+
+	if (err)
+		goto out_dput;
+
+	shiftfs_fill_inode(newi, new);
+
+	d_instantiate(dentry, newi);
+
+	new = NULL;
+	newi = NULL;
+
+ out_dput:
+	dput(new);
+	iput(newi);
+	inode_unlock(reali);
+
+	return err;
+}
+
+static int shiftfs_create(struct inode *dir, struct dentry *dentry,
+			  umode_t mode,  bool excl)
+{
+	mode |= S_IFREG;
+
+	return shiftfs_make_object(dir, dentry, mode, NULL, NULL, excl);
+}
+
+static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
+			 umode_t mode)
+{
+	mode |= S_IFDIR;
+
+	return shiftfs_make_object(dir, dentry, mode, NULL, NULL, false);
+}
+
+static int shiftfs_link(struct dentry *hardlink, struct inode *dir,
+			struct dentry *dentry)
+{
+	return shiftfs_make_object(dir, dentry, 0, NULL, hardlink, false);
+}
+
+static int shiftfs_symlink(struct inode *dir, struct dentry *dentry,
+			   const char *symlink)
+{
+	return shiftfs_make_object(dir, dentry, S_IFLNK, symlink, NULL, false);
+}
+
+static int shiftfs_rm(struct inode *dir, struct dentry *dentry, bool rmdir)
+{
+	struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
+	struct inode *reali = real->d_inode;
+	int err;
+	const struct cred *oldcred, *newcred;
+
+	inode_lock_nested(reali, I_MUTEX_PARENT);
+
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+
+	if (rmdir)
+		err = vfs_rmdir(reali, new);
+	else
+		err = vfs_unlink(reali, new, NULL);
+
+	shiftfs_old_creds(oldcred, &newcred);
+	inode_unlock(reali);
+
+	return err;
+}
+
+static int shiftfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	return shiftfs_rm(dir, dentry, false);
+}
+
+static int shiftfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	return shiftfs_rm(dir, dentry, true);
+}
+
+static int shiftfs_rename(struct inode *olddir, struct dentry *old,
+			  struct inode *newdir, struct dentry *new,
+			  unsigned int flags)
+{
+	struct dentry *rodd = olddir->i_private, *rndd = newdir->i_private,
+		*realold = old->d_fsdata,
+		*realnew = new->d_fsdata, *trap;
+	struct inode *realolddir = rodd->d_inode, *realnewdir = rndd->d_inode;
+	int err = -EINVAL;
+	const struct cred *oldcred, *newcred;
+
+	trap = lock_rename(rndd, rodd);
+
+	if (trap == realold || trap == realnew)
+		goto out_unlock;
+
+	oldcred = shiftfs_new_creds(&newcred, old->d_sb);
+
+	err = vfs_rename(realolddir, realold, realnewdir,
+			 realnew, NULL, flags);
+
+	shiftfs_old_creds(oldcred, &newcred);
+
+ out_unlock:
+	unlock_rename(rndd, rodd);
+
+	return err;
+}
+
+static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
+				     unsigned int flags)
+{
+	struct dentry *real = dir->i_private, *new;
+	struct inode *reali = real->d_inode, *newi;
+	const struct cred *oldcred, *newcred;
+
+	inode_lock(reali);
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+	new = lookup_one_len(dentry->d_name.name, real, dentry->d_name.len);
+	shiftfs_old_creds(oldcred, &newcred);
+	inode_unlock(reali);
+
+	if (IS_ERR(new))
+		return new;
+
+	dentry->d_fsdata = new;
+
+	if (!new->d_inode)
+		return NULL;
+
+	newi = shiftfs_new_inode(dentry->d_sb, new->d_inode->i_mode, new);
+	if (!newi) {
+		dput(new);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	d_splice_alias(newi, dentry);
+
+	return NULL;
+}
+
+static int shiftfs_permission(struct inode *inode, int mask)
+{
+	struct dentry *real = inode->i_private;
+	struct inode *reali = real->d_inode;
+	const struct inode_operations *iop = reali->i_op;
+	int err;
+	const struct cred *oldcred, *newcred;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
+	if (iop->permission)
+		err = iop->permission(reali, mask);
+	else
+		err = generic_permission(reali, mask);
+	shiftfs_old_creds(oldcred, &newcred);
+
+	return err;
+}
+
+static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct dentry *real = dentry->d_fsdata;
+	struct inode *reali = real->d_inode;
+	const struct inode_operations *iop = reali->i_op;
+	struct iattr newattr = *attr;
+	const struct cred *oldcred, *newcred;
+	struct super_block *sb = dentry->d_sb;
+	int err;
+
+	newattr.ia_uid = KUIDT_INIT(from_kuid(sb->s_user_ns, attr->ia_uid));
+	newattr.ia_gid = KGIDT_INIT(from_kgid(sb->s_user_ns, attr->ia_gid));
+
+	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+	inode_lock(reali);
+	if (iop->setattr)
+		err = iop->setattr(real, &newattr);
+	else
+		err = simple_setattr(real, &newattr);
+	inode_unlock(reali);
+	shiftfs_old_creds(oldcred, &newcred);
+
+	if (err)
+		return err;
+
+	/* all OK, reflect the change on our inode */
+	setattr_copy(d_inode(dentry), attr);
+	return 0;
+}
+
+static int shiftfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			   struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *real = inode->i_private;
+	struct inode *reali = real->d_inode;
+	const struct inode_operations *iop = reali->i_op;
+	int err = 0;
+
+	mnt = dentry->d_sb->s_fs_info;
+
+	if (iop->getattr)
+		err = iop->getattr(mnt, real, stat);
+	else
+		generic_fillattr(reali, stat);
+
+	if (err)
+		return err;
+
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
+	return 0;
+}
+
+static const struct inode_operations shiftfs_inode_ops = {
+	.lookup		= shiftfs_lookup,
+	.getattr	= shiftfs_getattr,
+	.setattr	= shiftfs_setattr,
+	.permission	= shiftfs_permission,
+	.mkdir		= shiftfs_mkdir,
+	.symlink	= shiftfs_symlink,
+	.get_link	= shiftfs_get_link,
+	.readlink	= shiftfs_readlink,
+	.unlink		= shiftfs_unlink,
+	.rmdir		= shiftfs_rmdir,
+	.rename		= shiftfs_rename,
+	.link		= shiftfs_link,
+	.create		= shiftfs_create,
+	.mknod		= NULL,	/* no special files currently */
+	.listxattr	= shiftfs_listxattr,
+};
+
+static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
+				       struct dentry *dentry)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	mode &= S_IFMT;
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+	inode->i_op = &shiftfs_inode_ops;
+
+	shiftfs_fill_inode(inode, dentry);
+
+	return inode;
+}
+
+static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct shiftfs_super_info *ssi = sb->s_fs_info;
+
+	if (ssi->mark)
+		seq_show_option(m, "mark", NULL);
+
+	return 0;
+}
+
+static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct shiftfs_super_info *ssi = sb->s_fs_info;
+	struct dentry *root = sb->s_root;
+	struct dentry *realroot = root->d_fsdata;
+	struct path realpath = { .mnt = ssi->mnt, .dentry = realroot };
+	int err;
+
+	err = vfs_statfs(&realpath, buf);
+	if (err)
+		return err;
+
+	buf->f_type = sb->s_magic;
+
+	return 0;
+}
+
+static void shiftfs_put_super(struct super_block *sb)
+{
+	struct shiftfs_super_info *ssi = sb->s_fs_info;
+
+	mntput(ssi->mnt);
+	put_user_ns(ssi->userns);
+	kfree(ssi);
+}
+
+static const struct xattr_handler shiftfs_xattr_handler = {
+	.prefix = "",
+	.get    = shiftfs_xattr_get,
+	.set    = shiftfs_xattr_set,
+};
+
+const struct xattr_handler *shiftfs_xattr_handlers[] = {
+	&shiftfs_xattr_handler,
+	NULL
+};
+
+static const struct super_operations shiftfs_super_ops = {
+	.put_super	= shiftfs_put_super,
+	.show_options	= shiftfs_show_options,
+	.statfs		= shiftfs_statfs,
+};
+
+struct shiftfs_data {
+	void *data;
+	const char *path;
+};
+
+static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
+			      int silent)
+{
+	struct shiftfs_data *data = raw_data;
+	char *name = kstrdup(data->path, GFP_KERNEL);
+	int err = -ENOMEM;
+	struct shiftfs_super_info *ssi = NULL;
+	struct path path;
+	struct dentry *dentry;
+
+	if (!name)
+		goto out;
+
+	ssi = kzalloc(sizeof(*ssi), GFP_KERNEL);
+	if (!ssi)
+		goto out;
+
+	err = -EPERM;
+	err = shiftfs_parse_options(ssi, data->data);
+	if (err)
+		goto out;
+
+	/* to mark a mount point, must be real root */
+	if (ssi->mark && !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	/* else to mount a mark, must be userns admin */
+	if (!ssi->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		goto out;
+
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (err)
+		goto out;
+
+	err = -EPERM;
+	if (!S_ISDIR(path.dentry->d_inode->i_mode)) {
+		err = -ENOTDIR;
+		goto out_put;
+	}
+	if (ssi->mark) {
+		/*
+		 * this part is visible unshifted, so make sure no
+		 * executables that could be used to give suid
+		 * privileges
+		 */
+		sb->s_iflags = SB_I_NOEXEC;
+		ssi->mnt = path.mnt;
+		dentry = path.dentry;
+	} else {
+		struct shiftfs_super_info *mp_ssi;
+
+		/*
+		 * this leg executes if we're admin capable in
+		 * the namespace, so be very careful
+		 */
+		if (path.dentry->d_sb->s_magic != SHIFTFS_MAGIC)
+			goto out_put;
+		mp_ssi = path.dentry->d_sb->s_fs_info;
+		if (!mp_ssi->mark)
+			goto out_put;
+		ssi->mnt = mntget(mp_ssi->mnt);
+		dentry = dget(path.dentry->d_fsdata);
+		path_put(&path);
+	}
+	ssi->userns = get_user_ns(dentry->d_sb->s_user_ns);
+	sb->s_fs_info = ssi;
+	sb->s_magic = SHIFTFS_MAGIC;
+	sb->s_op = &shiftfs_super_ops;
+	sb->s_xattr = shiftfs_xattr_handlers;
+	sb->s_d_op = &shiftfs_dentry_ops;
+	sb->s_root = d_make_root(shiftfs_new_inode(sb, S_IFDIR, dentry));
+	sb->s_root->d_fsdata = dentry;
+
+	return 0;
+
+ out_put:
+	path_put(&path);
+ out:
+	kfree(name);
+	kfree(ssi);
+	return err;
+}
+
+static struct dentry *shiftfs_mount(struct file_system_type *fs_type,
+				    int flags, const char *dev_name, void *data)
+{
+	struct shiftfs_data d = { data, dev_name };
+
+	return mount_nodev(fs_type, flags, &d, shiftfs_fill_super);
+}
+
+static struct file_system_type shiftfs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "shiftfs",
+	.mount		= shiftfs_mount,
+	.kill_sb	= kill_anon_super,
+	.fs_flags	= FS_USERNS_MOUNT,
+};
+
+static int __init shiftfs_init(void)
+{
+	return register_filesystem(&shiftfs_type);
+}
+
+static void __exit shiftfs_exit(void)
+{
+	unregister_filesystem(&shiftfs_type);
+}
+
+MODULE_ALIAS_FS("shiftfs");
+MODULE_AUTHOR("James Bottomley");
+MODULE_DESCRIPTION("uid/gid shifting bind filesystem");
+MODULE_LICENSE("GPL v2");
+module_init(shiftfs_init)
+module_exit(shiftfs_exit)
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index e230af2..a2fdb01 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -85,4 +85,6 @@ 
 #define BALLOON_KVM_MAGIC	0x13661366
 #define ZSMALLOC_MAGIC		0x58295829
 
+#define SHIFTFS_MAGIC		0x6a656a62
+
 #endif /* __LINUX_MAGIC_H__ */