[review,6/6] vfs: Cache the results of path_connected

Message ID 8738009i0h.fsf_-_@x220.int.ebiederm.org (mailing list archive)
State New, archived

Commit Message

Eric W. Biederman Aug. 3, 2015, 9:30 p.m. UTC
Add a new field mnt_escape_count in nameidata, initialize it to 0 and
cache the value of read_mnt_escape_count in nd->mnt_escape_count.

This allows a single check in path_connected in the common case where
either the mount has had no escapes (mnt_escape_count == 0) or there
has been an escape and it has been validated that the current path
does not escape.

To keep the cache valid nd->mnt_escape_count must be set to 0 whenever
the nd->path.mnt changes or when nd->path.dentry changes such that
the connectedness of the previous value of nd->path.dentry does
not imply the connectedness of the new value of nd->path.dentry.

Various locations in fs/namei.c are updated to set
nd->mnt_escape_count to 0 as necessary.
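
For reference, a sketch of the helpers this patch builds on
(read_mnt_escape_count and cache_mnt_escape_count come from earlier in
the series and are not shown here; the shape below is a guess, not the
actual code):

/* A sketch, not the actual series code: assumes the escape count lives
 * in struct mount and is incremented twice around an escaping rename
 * (as in the locking outline discussed below), so an odd value means a
 * rename is in flight.
 */
static inline unsigned read_mnt_escape_count(struct vfsmount *mnt)
{
	unsigned ret = READ_ONCE(real_mount(mnt)->mnt_escape_count);

	smp_rmb();	/* pairs with the barriers in the rename path */
	return ret;
}

static inline void cache_mnt_escape_count(unsigned *cache, unsigned count)
{
	/* only cache a stable (even) value; an odd count forces the
	 * next path_connected() call to revalidate with is_subdir()
	 */
	if (likely(!(count & 1)))
		*cache = count;
}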

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namei.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

Comments

Andrei Vagin Aug. 4, 2015, 11:52 a.m. UTC | #1
On Mon, Aug 03, 2015 at 04:30:54PM -0500, Eric W. Biederman wrote:
> 
> Add a new field mnt_escape_count in nameidata, initialize it to 0 and
> cache the value of read_mnt_escape_count in nd->mnt_escape_count.
> 
> This allows a single check in path_connected in the common case where
> either the mount has had no escapes (mnt_escape_count == 0) or there
> has been an escape and it has been validated that the current path
> does not escape.
> 
> To keep the cache valid nd->mnt_escape_count must be set to 0 whenever
> the nd->path.mnt changes or when nd->path.dentry changes such that
> the connectedness of the previous value of nd->path.dentry does
> not imply the connectedness of the new value of nd->path.dentry.
> 
> Various locations in fs/namei.c are updated to set
> nd->mnt_escape_count to 0 as necessary.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/namei.c | 26 +++++++++++++++++++++++---
>  1 file changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index bccd3810ff60..79a5dca073f5 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -514,6 +514,7 @@ struct nameidata {
>  	struct nameidata *saved;
>  	unsigned	root_seq;
>  	int		dfd;
> +	unsigned	mnt_escape_count;
>  };
>  
>  static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
> @@ -572,12 +573,13 @@ static bool path_connected(struct nameidata *nd)
>  	struct vfsmount *mnt = nd->path.mnt;
>  	unsigned escape_count = read_mnt_escape_count(mnt);
>  
> -	if (likely(escape_count == 0))
> +	if (likely(escape_count == nd->mnt_escape_count))
>  		return true;

The size of mnt_escape_count is only 4 bytes.  It looks like it is
possible to perform UINT_MAX / 2 operations in a reasonable time and get
the same value of mnt_escape_count; path_connected() will return true,
but the path may already be detached.  What do you think about this?

Thanks,
Andrew
Eric W. Biederman Aug. 4, 2015, 5:41 p.m. UTC | #2
Andrew Vagin <avagin@gmail.com> writes:

> On Mon, Aug 03, 2015 at 04:30:54PM -0500, Eric W. Biederman wrote:
>> 
>> Add a new field mnt_escape_count in nameidata, initialize it to 0 and
>> cache the value of read_mnt_escape_count in nd->mnt_escape_count.
>> 
>> This allows a single check in path_connected in the common case where
>> either the mount has had no escapes (mnt_escape_count == 0) or there
>> has been an escape and it has been validated that the current path
>> does not escape.
>> 
>> To keep the cache valid nd->mnt_escape_count must be set to 0 whenever
>> the nd->path.mnt changes or when nd->path.dentry changes such that
>> the connectedness of the previous value of nd->path.dentry does
>> not imply the connectedness of the new value of nd->path.dentry.
>> 
>> Various locations in fs/namei.c are updated to set
>> nd->mnt_escape_count to 0 as necessary.
>> 
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  fs/namei.c | 26 +++++++++++++++++++++++---
>>  1 file changed, 23 insertions(+), 3 deletions(-)
>> 
>> diff --git a/fs/namei.c b/fs/namei.c
>> index bccd3810ff60..79a5dca073f5 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -514,6 +514,7 @@ struct nameidata {
>>  	struct nameidata *saved;
>>  	unsigned	root_seq;
>>  	int		dfd;
>> +	unsigned	mnt_escape_count;
>>  };
>>  
>>  static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
>> @@ -572,12 +573,13 @@ static bool path_connected(struct nameidata *nd)
>>  	struct vfsmount *mnt = nd->path.mnt;
>>  	unsigned escape_count = read_mnt_escape_count(mnt);
>>  
>> -	if (likely(escape_count == 0))
>> +	if (likely(escape_count == nd->mnt_escape_count))
>>  		return true;
>
> The size of mnt_escape_count is only 4 bytes.  It looks like it is
> possible to perform UINT_MAX / 2 operations in a reasonable time and get
> the same value of mnt_escape_count; path_connected() will return true,
> but the path may already be detached.  What do you think about this?

It is an interesting question.  

For the locking on rename we have something that looks like:
mutex_lock(&...s_vfs_mutex);
mutex_lock(&p2->d_inode->i_mutex);
mutex_lock(&p1->d_inode->i_mutex);
read_seqlock_excl(&mount_lock);
escape_count++;
write_seqlock(&rename_lock);
write_seqcount_begin(&dentry->d_seq);
write_seqcount_begin_nested(&target->d_seq);
write_seqcount_end_nested(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
write_sequnlock(&rename_lock);
escape_count++;
read_sequnlock_excl(&mount_lock);
mutex_unlock(&p1->d_inode->i_mutex);
mutex_unlock(&p2->d_inode->i_mutex);
mutex_unlock(&...s_vfs_mutex);

That is at least 16 serialized cpu operations, so reaching overflow
would take at least 16 * 2**32 cpu operations.  On a 4GHz cpu, 16 * 2**32
operations would take roughly 16 seconds.  In practice I think it would
take noticeably more than 16 seconds to perform that many renames, as
there is a lot more going on than just the locking I singled out.

A pathname lookup taking 16 seconds seems absurd.  But perhaps it is
possible in the worst case.

The maximum length of a path that can be passed into path_lookup is
4096.  For a lookup to be problematic there must be at least as many
instances of .. as there are of any other path component.  So each pair
of a minimum length path element and a .. element must take at least 5
bytes, which in 4096 bytes leaves room for 819 path elements.  If every
one of those 819 path components triggered a disk seek at 100 seeks per
second I could see a path name lookup potentially taking 8 seconds.

MAXSYMLINKS is 40.  So perhaps, if you have a truly pathological set of
file names, you might make that 320 seconds. Ick.


To get some real figures I have performed a few renames on my 2.5GHz
laptop with ext4 on an ssd.  Performing a simple rename in the same
directory (which involves much less than is required to rename a file
out of a mount point) I get the following timings:

   renames    time
   200,000      1.2s
 2,000,000     17.2s
20,000,000    205.6s
40,000,000    418.5s

At that kind of speed I would expect 4,000,000,000 renames to take 41850
seconds or 11.625 hours.
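
For anyone who wants to reproduce this, a minimal benchmark along these
lines (not my exact test; the file names and default count are
arbitrary) would be:

/* Same-directory rename benchmark: bounce one file between two names. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long i, n = argc > 1 ? atol(argv[1]) : 200000;
	struct timespec t0, t1;
	int fd = open("a", O_CREAT | O_WRONLY, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < n; i++) {
		/* alternate a -> b and b -> a within one directory */
		if (rename(i & 1 ? "b" : "a", i & 1 ? "a" : "b")) {
			perror("rename");
			return 1;
		}
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%ld renames: %.1fs\n", n,
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}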

Going the other way on an old system with spinning rust for a hard drive
I created 1000 nested directories all named a and timed how long it
would take to stat a pathological pathname.  With a simple pathname
without symlinks involved the worst case I have been able to trigger
is a 0.3 second path name lookup time.  Not being satisfied with
that I managed to create a file about 83,968 directories deep and a
set of 40 symlinks set up to get me there in one stat call, the first
symlink being 8192 directories deep.  The worst case time I have been
able to measure from stat -L on the symlink that leads me to that file
is 14.5 seconds.

If my numbers are at all representative I don't see a realistic
possibility of being able to perform enough renames to roll over a 32bit
mnt_escape_count during a path name lookup.  Even my most optimistic
estimate required 16 seconds to perform the renames and I have not been
able to get any pathname lookup to run that long.

At the same time I do think it is probably worth it to make the escape
count an unsigned long, because that is practically free and it removes
any theoretical concern on 64bit.

Am I missing anything?

Eric
J. Bruce Fields Aug. 4, 2015, 7:44 p.m. UTC | #3
On Tue, Aug 04, 2015 at 12:41:32PM -0500, Eric W. Biederman wrote:
> A pathname lookup taking 16 seconds seems absurd.  But perhaps it is
> possible in the worst case.
> 
> The maximum length of a path that can be passed into path_lookup is
> 4096.  For a lookup to be problematic there must be at least as many
> instances of .. as there are of any other path component.  So each pair
> of a minimum length path element and a .. element must take at least 5
> bytes, which in 4096 bytes leaves room for 819 path elements.  If every
> one of those 819 path components triggered a disk seek at 100 seeks per
> second I could see a path name lookup potentially taking 8 seconds.

A lookup on NFS while a server's rebooting or the network's flaky could
take arbitrarily long.  Other network filesystems and fuse can have
similar problems.  Depending on threat model an attacker might have
quite precise control over that timing.  Disk filesystems could have all
the same problems since there's no guarantee the underlying block device
is really local.  Even ignoring that, hardware can be slow or flaky.
And couldn't an allocation in theory block for an arbitrarily long time?

Apologies for just dropping into the middle here!  I haven't read the
rest and don't have the context to know whether any of that's relevant.

--b.
Eric W. Biederman Aug. 4, 2015, 10:58 p.m. UTC | #4
"J. Bruce Fields" <bfields@fieldses.org> writes:

> On Tue, Aug 04, 2015 at 12:41:32PM -0500, Eric W. Biederman wrote:
>> A pathname lookup taking 16 seconds seems absurd.  But perhaps it is
>> possible in the worst case.
>> 
>> The maximum length of a path that can be passed into path_lookup is
>> 4096.  For a lookup to be problematic there must be at least as many
>> instances of .. as there are of any other path component.  So each pair
>> of a minimum length path element and a .. element must take at least 5
>> bytes, which in 4096 bytes leaves room for 819 path elements.  If every
>> one of those 819 path components triggered a disk seek at 100 seeks per
>> second I could see a path name lookup potentially taking 8 seconds.
>
> A lookup on NFS while a server's rebooting or the network's flaky could
> take arbitrarily long.  Other network filesystems and fuse can have
> similar problems.  Depending on threat model an attacker might have
> quite precise control over that timing.  Disk filesystems could have all
> the same problems since there's no guarantee the underlying block device
> is really local.  Even ignoring that, hardware can be slow or flaky.
> And couldn't an allocation in theory block for an arbitrarily long time?
>
> Apologies for just dropping into the middle here!  I haven't read the
> rest and don't have the context to know whether any of that's relevant.

No problem.  The basic question is: can 2 billion renames be performed
on the same filesystem in less time than a single path lookup?  If not,
a 32bit counter suffices.

Most of the issues that slow up lookup also slow up rename so I have
not been focusing on them.

If you could look up the thread and tell me what you think of the issue
with file handle to dentry conversion and bind mounts I would appreciate
it.

I have been testing a little more on my system and it appears that it
takes 60 minutes, give or take, to perform 2 billion renames on ramfs.
A faster cpu (5GHz?) could perhaps get that down to 30 minutes.

With no slowdowns and no weirdness I have been able to get a single
pathname lookup to take just over 2 minutes, and I expect I could get
that to take another minute more.

Those numbers are within a factor of 10 of each other, and I expect
someone clever could finagle something to overcome the rest.  So sigh.

There just is not enough margin in there to be certain of things.  Now
with the small change to make that counter 64bit, that 30 minutes to
wrap the counter becomes 240,000+ years.  I think I can safely not worry
about the issue.  I just need to come up with a good 32bit implementation.
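
One shape a good 32bit implementation might take (a guess on my part,
not a tested patch) is to make the counter 64bit everywhere and have
32bit readers retry under the existing mount_lock sequence counter,
since the writers already increment under that lock:

/* Speculative sketch: a 32bit cpu cannot load a u64 atomically, so
 * reread under the mount_lock seqcount until a stable value is seen.
 */
static u64 read_mnt_escape_count(struct mount *mnt)
{
	unsigned seq;
	u64 count;

	do {
		seq = read_seqbegin(&mount_lock);
		count = mnt->mnt_escape_count;
	} while (read_seqretry(&mount_lock, seq));

	return count;
}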

Eric

J. Bruce Fields Aug. 5, 2015, 3:59 p.m. UTC | #5
On Tue, Aug 04, 2015 at 05:58:59PM -0500, Eric W. Biederman wrote:
> "J. Bruce Fields" <bfields@fieldses.org> writes:
> 
> > On Tue, Aug 04, 2015 at 12:41:32PM -0500, Eric W. Biederman wrote:
> >> A pathname lookup taking 16 seconds seems absurd.  But perhaps it is
> >> possible in the worst case.
> >> 
> >> The maximum length of a path that can be passed into path_lookup is
> >> 4096.  For a lookup to be problematic there must be at least as many
> >> instances of .. as there are of any other path component.  So each pair
> >> of a minimum length path element and a .. element must take at least 5
> >> bytes, which in 4096 bytes leaves room for 819 path elements.  If every
> >> one of those 819 path components triggered a disk seek at 100 seeks per
> >> second I could see a path name lookup potentially taking 8 seconds.
> >
> > A lookup on NFS while a server's rebooting or the network's flaky could
> > take arbitrarily long.  Other network filesystems and fuse can have
> > similar problems.  Depending on threat model an attacker might have
> > quite precise control over that timing.  Disk filesystems could have all
> > the same problems since there's no guarantee the underlying block device
> > is really local.  Even ignoring that, hardware can be slow or flaky.
> > And couldn't an allocation in theory block for an arbitrarily long time?
> >
> > Apologies for just dropping into the middle here!  I haven't read the
> > rest and don't have the context to know whether any of that's relevant.
> 
> No problem.  The basic question is: can 2 billion renames be performed
> on the same filesystem in less time than a single path lookup?  If not,
> a 32bit counter suffices.

Certainly if you have control over an NFS or FUSE server then you can
arrange for that to happen--just delay the lookup until you've processed
enough renames.  I don't know if that's interesting....

> If you could look up the thread and tell me what you think of the issue
> with file handle to dentry conversion and bind mounts I would appreciate
> it.

OK, I see your comments in "[PATCH review 0/6] Bind mount escape fixes",
I'm not sure I understand yet, I'll take a closer look.

--b.
Eric W. Biederman Aug. 5, 2015, 4:28 p.m. UTC | #6
"J. Bruce Fields" <bfields@fieldses.org> writes:

> On Tue, Aug 04, 2015 at 05:58:59PM -0500, Eric W. Biederman wrote:
>>
>> No problem.  The basic question is: can 2 billion renames be performed
>> on the same filesystem in less time than a single path lookup?  If not,
>> a 32bit counter suffices.
>
> Certainly if you have control over an NFS or FUSE server then you can
> arrange for that to happen--just delay the lookup until you've processed
> enough renames.  I don't know if that's interesting....

Not particularly, when the whole point is to start with a bind mount,
do something tricky and then have access to the whole filesystem instead
of just the part of the filesystem exposed by the bind mount.

If you control the filesystem you already have access to the entire
filesystem, so you don't need to do something tricky.

That something tricky is a well placed rename that borks the tree
structure and causes .. to never see the subdirectory that is the root
of the bind mount.
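
Concretely, with made-up paths, the escape looks something like this (a
rough sketch that needs privilege, not a polished reproducer):

/* With /a/b bind-mounted at /mnt2, rename the directory a task is
 * sitting in out of the exposed subtree; ".." from that directory then
 * walks up through /a without ever passing the bind mount's root.
 */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	mkdir("/a", 0755);
	mkdir("/a/b", 0755);
	mkdir("/a/b/c", 0755);
	mkdir("/mnt2", 0755);

	/* expose only the subtree /a/b */
	if (mount("/a/b", "/mnt2", NULL, MS_BIND, NULL)) {
		perror("mount");
		return 1;
	}

	/* a task enters the subtree through the bind mount */
	chdir("/mnt2/c");

	/* the well placed rename: on the underlying filesystem, move c
	 * outside the subtree the bind mount was supposed to expose
	 */
	if (rename("/a/b/c", "/a/c"))
		perror("rename");

	/* without this series, repeated chdir("..") from here escapes
	 * the bind mount; with it, path_connected() fails the lookup
	 */
	chdir("..");
	return 0;
}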

>> If you could look up the thread and tell me what you think of the
>> issue with file handle to dentry conversion and bind mounts I would
>> appreciate it.
>
> OK, I see your comments in "[PATCH review 0/6] Bind mount escape fixes",
> I'm not sure I understand yet, I'll take a closer look.

Thanks.

The file handle reconstitution code can certainly be affected by all of
this, given that it is a failure if reconnect_path can't reconnect the
path of a file handle.  I think it can reasonably be considered an error
in all cases if that path is outside of an exported bind mount, but I
don't know that area particularly well.  The solution might just be:
don't export file handles from bind mounts.

Eric

J. Bruce Fields Aug. 28, 2015, 7:43 p.m. UTC | #7
This response is kind of ridiculously delayed; vacation and a couple of
other things interfered!

On Wed, Aug 05, 2015 at 11:28:55AM -0500, Eric W. Biederman wrote:
> "J. Bruce Fields" <bfields@fieldses.org> writes:
> 
> > On Tue, Aug 04, 2015 at 05:58:59PM -0500, Eric W. Biederman wrote:
> >>
> >> No problem.  The basic question is: can 2 billion renames be performed
> >> on the same filesystem in less time than a single path lookup?  If not,
> >> a 32bit counter suffices.
> >
> > Certainly if you have control over an NFS or FUSE server then you can
> > arrange for that to happen--just delay the lookup until you've processed
> > enough renames.  I don't know if that's interesting....
> 
> Not particularly, when the whole point is to start with a bind mount,
> do something tricky and then have access to the whole filesystem instead
> of just the part of the filesystem exposed by the bind mount.
> 
> If you control the filesystem you already have access to the entire
> filesystem, so you don't need to do something tricky.

I thought there was also a concern about impact on the sanity of the
system as a whole beyond just the contents of one filesystem: e.g. you
don't want an unprivileged user to be able to create an unmountable
mountpoint.

> That something tricky is a well placed rename that borks the tree
> structure and causes .. to never see the subdirectory that is the root
> of the bind mount.
> 
> >> If you could look up the thread and tell me what you think of the
> >> issue with file handle to dentry conversion and bind mounts I would
> >> appreciate it.
> >
> > OK, I see your comments in "[PATCH review 0/6] Bind mount escape fixes",
> > I'm not sure I understand yet, I'll take a closer look.
> 
> Thanks.
> 
> The file handle reconstitution code can certainly be affected by all of
> this, given that it is a failure if reconnect_path can't reconnect the
> path of a file handle.  I think it can reasonably be considered an error
> in all cases if that path is outside of an exported bind mount, but I
> don't know that area particularly well.  The solution might just be:
> don't export file handles from bind mounts.

I don't think there's any new cause for concern here.

I'd quibble with the language "don't export filehandles", *if* by that
you mean "don't tell allow anyone to know filehandles".  They're
guessable, so keeping them secret doesn't guarantee much security.

The dangerous operation is open_by_handle, and people need to understand
that if you allow someone to call that then you're effectively giving
access to the whole filesystem.  That's always been true.  (We don't
really have an efficient way to determine if a non-directory is in a
given subtree anyway.)

Such filehandle-guessing attacks on NFS have long been well-understood.
Subtree checking can prevent them but causes other problems, so isn't the
default.

So the basic rule I think is "don't allow lookup-by-filehandle (or NFS
export) on part of a filesystem unless you'd be willing to allow it on
the whole thing".

--b.
J. Bruce Fields Aug. 28, 2015, 7:45 p.m. UTC | #8
On Fri, Aug 28, 2015 at 03:43:02PM -0400, J. Bruce Fields wrote:
> On Wed, Aug 05, 2015 at 11:28:55AM -0500, Eric W. Biederman wrote:
> > The file handle reconstitution code can certainly be affected by all of
> > this, given that it is a failure if reconnect_path can't reconnect the
> > path of a file handle.  I think it can reasonably be considered an error
> > in all cases if that path is outside of an exported bind mount, but I
> > don't know that area particularly well.  The solution might just be:
> > don't export file handles from bind mounts.
> 
> I don't think there's any new cause for concern here.
> 
> I'd quibble with the language "don't export filehandles", *if* by that
> you mean "don't tell allow anyone to know filehandles".  They're
> guessable, so keeping them secret doesn't guarantee much security.
> 
> The dangerous operation is open_by_handle, and people need to understand
> that if you allow someone to call that then you're effectively giving
> access to the whole filesystem.  That's always been true.  (We don't
> really have an efficient way to determine if a non-directory is in a
> given subtree anyway.)
> 
> Such filehandle-guessing attacks on NFS have long been well-understood.
> Subtree checking can prevent them but causes other problems, so isn't the
> default.
> 
> So the basic rule I think is "don't allow lookup-by-filehandle (or NFS
> export) on part of a filesystem unless you'd be willing to allow it on
> the whole thing".

(So in case it wasn't clear: ACK to just ignoring this, I don't think
your (otherwise interesting) observations point to anything that needs
fixing in the lookup-by-filehandle case.)

--b.
Eric W. Biederman Aug. 31, 2015, 9:17 p.m. UTC | #9
"J. Bruce Fields" <bfields@fieldses.org> writes:

> On Fri, Aug 28, 2015 at 03:43:02PM -0400, J. Bruce Fields wrote:
>> On Wed, Aug 05, 2015 at 11:28:55AM -0500, Eric W. Biederman wrote:
>> > The file handle reconstitution code can certainly be affected by all of
>> > this, given that it is a failure if reconnect_path can't reconnect the
>> > path of a file handle.  I think it can reasonably be considered an error
>> > in all cases if that path is outside of an exported bind mount, but I
>> > don't know that area particularly well.  The solution might just be:
>> > don't export file handles from bind mounts.
>> 
>> I don't think there's any new cause for concern here.
>> 
>> I'd quibble with the language "don't export filehandles", *if* by that
>> you mean "don't tell allow anyone to know filehandles".  They're
>> guessable, so keeping them secret doesn't guarantee much security.
>> 
>> The dangerous operation is open_by_handle, and people need to understand
>> that if you allow someone to call that then you're effectively giving
>> access to the whole filesystem.  That's always been true.  (We don't
>> really have an efficient way to determine if a non-directory is in a
>> given subtree anyway.)
>>
>> Such filehandle-guessing attacks on NFS have long been well-understood.
>> Subtree checking can prevent them but causes other problems, so isn't the
>> default.

Interesting.  I guess it makes sense that filehandles can be guessed.  I
wonder if a crypto variant that is resistant to plain text attacks would
be a practical defense.

We do have d_ancestor/is_subdir, which is a comparatively efficient way
to see if a dentry is in a given subtree.  As it does not need to
perform the permission checks, I believe it would be somewhat cheaper
than the current nfs subtree check code.  I don't know if that would
avoid the known problem with the subtree check code.  Nor do I know if
it would be cheap enough to use for every nfsd operation when a file
handle is received.
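
For concreteness, the check I have in mind is little more than this
(hypothetical code, not anything that exists in nfsd today):

#include <linux/dcache.h>

/* Hypothetical: is_subdir() walks d_parent under rename_lock and does
 * no permission checks and no I/O, unlike nfsd's subtree check.
 */
static bool fh_dentry_in_export(struct dentry *dentry,
				struct dentry *export_root)
{
	return is_subdir(dentry, export_root);
}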

>> So the basic rule I think is "don't allow lookup-by-filehandle (or NFS
>> export) on part of a filesystem unless you'd be willing to allow it on
>> the whole thing".
>
> (So in case it wasn't clear: ACK to just ignoring this, I don't think
> your (otherwise interesting) observations point to anything that needs
> fixing in the lookup-by-filehandle case.)

Thanks for looking into this.  It helps to know that someone who knows
the history of what happens with filehandles has looked at this.

Eric
J. Bruce Fields Sept. 1, 2015, 2:46 p.m. UTC | #10
On Mon, Aug 31, 2015 at 04:17:28PM -0500, Eric W. Biederman wrote:
> "J. Bruce Fields" <bfields@fieldses.org> writes:
> 
> > On Fri, Aug 28, 2015 at 03:43:02PM -0400, J. Bruce Fields wrote:
> >> On Wed, Aug 05, 2015 at 11:28:55AM -0500, Eric W. Biederman wrote:
> >> > The file handle reconstitution code can certainly be affected by all of
> >> > this, given that it is a failure if reconnect_path can't reconnect the
> >> > path of a file handle.  I think it can reasonably be considered an error
> >> > in all cases if that path is outside of an exported bind mount, but I
> >> > don't know that area particularly well.  The solution might just be:
> >> > don't export file handles from bind mounts.
> >> 
> >> I don't think there's any new cause for concern here.
> >> 
> >> I'd quibble with the language "don't export filehandles", *if* by that
> >> you mean "don't tell allow anyone to know filehandles".  They're
> >> guessable, so keeping them secret doesn't guarantee much security.
> >> 
> >> The dangerous operation is open_by_handle, and people need to understand
> >> that if you allow someone to call that then you're effectively giving
> >> access to the whole filesystem.  That's always been true.  (We don't
> >> really have an efficient way to determine if a non-directory is in a
> >> given subtree anyway.)
> >>
> >> Such filehandle-guessing attacks on NFS have long been well-understood.
> >> Subtree checking can prevent them but causes other problems, so isn't the
> >> default.
> 
> Interesting.  I guess it makes sense that filehandles can be guessed.  I
> wonder if a crypto variant that is resistant to plain text attacks would
> be a practical defense.

People have considered it.  I don't think it would be hard: just
generate some permanent server secret and use it to encrypt all
filehandles (and decrypt them again when looking them up).
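
A sketch of the idea (using a MAC rather than real encryption, which
defeats guessing just as well; the names and layout below are invented,
and siphash() merely stands in for whatever keyed hash gets chosen):

#include <linux/siphash.h>
#include <linux/string.h>

static siphash_key_t fh_secret;	/* generated once, kept forever */

struct sealed_fh {
	u64 tag;	/* MAC over the raw handle bytes */
	u8  raw[];	/* the ordinary filehandle */
};

static void seal_fh(struct sealed_fh *fh, const void *raw, size_t len)
{
	memcpy(fh->raw, raw, len);
	fh->tag = siphash(fh->raw, len, &fh_secret);
}

static bool unseal_fh(const struct sealed_fh *fh, size_t raw_len)
{
	/* reject guessed/forged handles before any lookup; a
	 * constant-time compare would be better in real code
	 */
	return fh->tag == siphash(fh->raw, raw_len, &fh_secret);
}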

Some of the reasons I don't think it's been done:

	- Well, it's work, and nobody's really felt that strongly about
	  it.

	- It's usually not that hard to create another filesystem when
	  you need a real security boundary.

	- Filehandles are forever.  But it's hard to keep secrets
	  forever, especially when they have to be transmitted over the
	  network a lot.  (In more detail: client applications can hold
	  long-lived references to filesystem objects through open file
	  descriptors or current working directories.  They expect those
	  to keep working even over server reboots.  We don't even know
	  about those references.  So any filehandle we give out could
	  be looked up at any time in the future.  The only case where
	  we consider it acceptable to return ESTALE is when the
	  object's actually gone.)

> We do have d_ancestor/is_subdir, which is a comparatively efficient way
> to see if a dentry is in a given subtree.  As it does not need to
> perform the permission checks, I believe it would be somewhat cheaper
> than the current nfs subtree check code.  I don't know if that would
> avoid the known problem with the subtree check code.  Nor do I know if
> it would be cheap enough to use for every nfsd operation when a file
> handle is received.

That would solve the problem for directories, but not for files.  For
non-directories we'd need special support from the filesystem (since at
the time we look up the filehandle the parent(s) may not be in the
dcache yet).  Steps to check subtree membership would be roughly (number
of hardlinks) * (average depth).  I think it's probably not worth it.

Anyway, forgive the digressions....

--b.
Eric W. Biederman Sept. 1, 2015, 6 p.m. UTC | #11
"J. Bruce Fields" <bfields@fieldses.org> writes:

> On Mon, Aug 31, 2015 at 04:17:28PM -0500, Eric W. Biederman wrote:
>> "J. Bruce Fields" <bfields@fieldses.org> writes:
>> 
>> > On Fri, Aug 28, 2015 at 03:43:02PM -0400, J. Bruce Fields wrote:
>> >> On Wed, Aug 05, 2015 at 11:28:55AM -0500, Eric W. Biederman wrote:
>> >> > The file handle reconstitution code can certainly be affected by all of
>> >> > this, given that it is a failure if reconnect_path can't reconnect the
>> >> > path of a file handle.  I think it can reasonably be considered an error
>> >> > in all cases if that path is outside of an exported bind mount, but I
>> >> > don't know that area particularly well.  The solution might just be:
>> >> > don't export file handles from bind mounts.
>> >> 
>> >> I don't think there's any new cause for concern here.
>> >> 
>> >> I'd quibble with the language "don't export filehandles", *if* by that
>> >> you mean "don't tell allow anyone to know filehandles".  They're
>> >> guessable, so keeping them secret doesn't guarantee much security.
>> >> 
>> >> The dangerous operation is open_by_handle, and people need to understand
>> >> that if you allow someone to call that then you're effectively giving
>> >> access to the whole filesystem.  That's always been true.  (We don't
>> >> really have an efficient way to determine if a non-directory is in a
>> >> given subtree anyway.)
>> >>
>> >> Such filehandle-guessing attacks on NFS have long been well-understood.
>> >> Subtree checking can prevent them but causes other problems, so isn't the
>> >> default.
>> 
>> Interesting.  I guess it makes sense that filehandles can be guessed.  I
>> wonder if a crypto variant that is resistant to plain text attacks would
>> be a practical defense.
>
> People have considered it.  I don't think it would be hard: just
> generate some permanent server secret and use it to encrypt all
> filehandles (and decrypt them again when looking them up).
>
> Some of the reasons I don't think it's been done:
>
> 	- Well, it's work, and nobody's really felt that strongly about
> 	  it.
>
> 	- It's usually not that hard to create another filesystem when
> 	  you need a real security boundary.
>
> 	- Filehandles are forever.  But it's hard to keep secrets
> 	  forever, especially when they have to be transmitted over the
> 	  network a lot.  (In more detail: client applications can hold
> 	  long-lived references to filesystem objects through open file
> 	  descriptors or current working directories.  They expect those
> 	  to keep working even over server reboots.  We don't even know
> 	  about those references.  So any filehandle we give out could
> 	  be looked up at any time in the future.  The only case where
> 	  we consider it acceptable to return ESTALE is when the
> 	  object's actually gone.)
>
>> We do have d_ancestor/is_subdir, which is a comparatively efficient way
>> to see if a dentry is in a given subtree.  As it does not need to
>> perform the permission checks, I believe it would be somewhat cheaper
>> than the current nfs subtree check code.  I don't know if that would
>> avoid the known problem with the subtree check code.  Nor do I know if
>> it would be cheap enough to use for every nfsd operation when a file
>> handle is received.
>
> That would solve the problem for directories, but not for files.  For
> non-directories we'd need special support from the filesystem (since at
> the time we look up the filehandle the parent(s) may not be in the
> dcache yet).  Steps to check subtree membership would be roughly (number
> of hardlinks) * (average depth).  I think it's probably not worth it.

If viewed that way you are probably right.

I was simply thinking about using the existing subtree check without
the inode_permission test, which works at the cost of a readdir to
reconnect an inode to a directory.  I had not noticed the readdir
before, so I admit it does not sound like a particularly speedy way to
go.

> Anyway, forgive the digressions....

No problem.  Thank you for the discussion.  This has, if nothing else,
allowed me to understand this from a real world perspective, and in
particular which permission checks would be necessary to safely allow
file handles in a user namespace (if we ever decide it is safe to allow
that).

In short, if you did not mount the filesystem you had better not be nfs
exporting the filesystem, or parts of the filesystem, or be allowed to
use file handle access to the filesystem.

Eric
J. Bruce Fields Sept. 1, 2015, 6:11 p.m. UTC | #12
On Tue, Sep 01, 2015 at 01:00:20PM -0500, Eric W. Biederman wrote:
> No problem.  Thank you for the discussion.  This has, if nothing else,
> allowed me to understand this from a real world perspective, and in
> particular which permission checks would be necessary to safely allow
> file handles in a user namespace (if we ever decide it is safe to allow
> that).
> 
> In short, if you did not mount the filesystem you had better not be nfs
> exporting the filesystem, or parts of the filesystem, or be allowed to
> use file handle access to the filesystem.

Agreed.

--b.

Patch

diff --git a/fs/namei.c b/fs/namei.c
index bccd3810ff60..79a5dca073f5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -514,6 +514,7 @@  struct nameidata {
 	struct nameidata *saved;
 	unsigned	root_seq;
 	int		dfd;
+	unsigned	mnt_escape_count;
 };
 
 static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
@@ -572,12 +573,13 @@  static bool path_connected(struct nameidata *nd)
 	struct vfsmount *mnt = nd->path.mnt;
 	unsigned escape_count = read_mnt_escape_count(mnt);
 
-	if (likely(escape_count == 0))
+	if (likely(escape_count == nd->mnt_escape_count))
 		return true;
 
 	if (!is_subdir(nd->path.dentry, mnt->mnt_root))
 		return false;
 
+	cache_mnt_escape_count(&nd->mnt_escape_count, escape_count);
 	return true;
 }
 
@@ -840,6 +842,9 @@  static inline void path_to_nameidata(const struct path *path,
 		if (nd->path.mnt != path->mnt)
 			mntput(nd->path.mnt);
 	}
+	if (unlikely((nd->path.mnt != path->mnt) ||
+		     (nd->path.dentry != path->dentry->d_parent)))
+		nd->mnt_escape_count = 0;
 	nd->path.mnt = path->mnt;
 	nd->path.dentry = path->dentry;
 }
@@ -856,6 +861,7 @@  void nd_jump_link(struct path *path)
 	nd->path = *path;
 	nd->inode = nd->path.dentry->d_inode;
 	nd->flags |= LOOKUP_JUMPED;
+	nd->mnt_escape_count = 0;
 }
 
 static inline void put_link(struct nameidata *nd)
@@ -1040,6 +1046,7 @@  const char *get_link(struct nameidata *nd)
 			nd->inode = nd->path.dentry->d_inode;
 		}
 		nd->flags |= LOOKUP_JUMPED;
+		nd->mnt_escape_count = 0;
 		while (unlikely(*++res == '/'))
 			;
 	}
@@ -1335,6 +1342,7 @@  static int follow_dotdot_rcu(struct nameidata *nd)
 			nd->path.mnt = &mparent->mnt;
 			inode = inode2;
 			nd->seq = seq;
+			nd->mnt_escape_count = 0;
 		}
 	}
 	while (unlikely(d_mountpoint(nd->path.dentry))) {
@@ -1348,6 +1356,7 @@  static int follow_dotdot_rcu(struct nameidata *nd)
 		nd->path.dentry = mounted->mnt.mnt_root;
 		inode = nd->path.dentry->d_inode;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		nd->mnt_escape_count = 0;
 	}
 	nd->inode = inode;
 	return 0;
@@ -1406,8 +1415,9 @@  EXPORT_SYMBOL(follow_down);
 /*
  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
  */
-static void follow_mount(struct path *path)
+static bool follow_mount(struct path *path)
 {
+	bool followed = false;
 	while (d_mountpoint(path->dentry)) {
 		struct vfsmount *mounted = lookup_mnt(path);
 		if (!mounted)
@@ -1416,7 +1426,9 @@  static void follow_mount(struct path *path)
 		mntput(path->mnt);
 		path->mnt = mounted;
 		path->dentry = dget(mounted->mnt_root);
+		followed = true;
 	}
+	return followed;
 }
 
 static int follow_dotdot(struct nameidata *nd)
@@ -1444,8 +1456,10 @@  static int follow_dotdot(struct nameidata *nd)
 		}
 		if (!follow_up(&nd->path))
 			break;
+		nd->mnt_escape_count = 0;
 	}
-	follow_mount(&nd->path);
+	if (follow_mount(&nd->path))
+		nd->mnt_escape_count = 0;
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 }
@@ -1997,6 +2011,7 @@  static const char *path_init(struct nameidata *nd, unsigned flags)
 	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
 	nd->depth = 0;
 	nd->total_link_count = 0;
+	nd->mnt_escape_count = 0;
 	if (flags & LOOKUP_ROOT) {
 		struct dentry *root = nd->root.dentry;
 		struct inode *inode = root->d_inode;
@@ -3026,6 +3041,7 @@  static int do_last(struct nameidata *nd,
 	unsigned seq;
 	struct inode *inode;
 	struct path save_parent = { .dentry = NULL, .mnt = NULL };
+	unsigned save_parent_escape_count = 0;
 	struct path path;
 	bool retried = false;
 	int error;
@@ -3155,6 +3171,9 @@  finish_lookup:
 	} else {
 		save_parent.dentry = nd->path.dentry;
 		save_parent.mnt = mntget(path.mnt);
+		save_parent_escape_count = nd->mnt_escape_count;
+		if (nd->path.dentry != path.dentry->d_parent)
+			nd->mnt_escape_count = 0;
 		nd->path.dentry = path.dentry;
 
 	}
@@ -3227,6 +3246,7 @@  stale_open:
 
 	BUG_ON(save_parent.dentry != dir);
 	path_put(&nd->path);
+	nd->mnt_escape_count = save_parent_escape_count;
 	nd->path = save_parent;
 	nd->inode = dir->d_inode;
 	save_parent.mnt = NULL;