diff mbox

[v4,2/2] procfs/tasks: add a simple per-task procfs hidepid= field

Message ID 1484572984-13388-3-git-send-email-djalal@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Djalal Harouni Jan. 16, 2017, 1:23 p.m. UTC
From: Djalal Harouni <tixxdz@gmail.com>

This adds a new per-task hidepid= flag that is honored by procfs when
presenting /proc to the user, in addition to the existing hidepid= mount
option. So far, hidepid= was exclusively a per-pidns setting. Locking
down a set of processes so that they cannot see other user's processes
without affecting the rest of the system thus currently requires
creation of a private PID namespace, with all the complexity it brings,
including maintaining a stub init process as PID 1 and losing the
ability to see processes of the same user on the rest of the system.

With this patch all acesss and visibility checks in procfs now
honour two fields:

	a) the existing hide_pid field in the PID namespace
	b) the new hide_pid in struct task_struct

Access/visibility is only granted if both fields permit it; the more
restrictive one wins. By default the new task_struct hide_pid value
defaults to 0, which means behaviour is not changed from the status quo.

Setting the per-process hide_pid value is done via a new PR_SET_HIDEPID
prctl() option which takes the same three supported values as the
hidepid= mount option. The per-process hide_pid may only be increased,
never decreased, thus ensuring that once applied, processes can never
escape such a hide_pid jail.  When a process forks it inherits its
parent's hide_pid value.

Suggested usecase: let's say nginx runs as user "www-data". After
dropping privileges it may now call:

	…
	prctl(PR_SET_HIDEPID, 2);
	…

And from that point on neither nginx itself, nor any of its child
processes may see processes in /proc anymore that belong to a different
user than "www-data". Other services running on the same system remain
unaffected.

This should permit Linux distributions to more comprehensively lock down
their services, as it allows an isolated opt-in for hidepid= for
specific services. Previously hidepid= could only be set system-wide,
and then specific services had to be excluded by group membership,
essentially a more complex concept of opt-out.

A tool to test this is available here:
https://gist.github.com/tixxdz/4e6d21071463ad2c5a043984e3efb5a1

Original-author: Lafcadio Wluiki <wluikil@gmail.com>
Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
---
 Documentation/filesystems/proc.txt |  2 ++
 fs/proc/array.c                    |  3 +++
 fs/proc/base.c                     |  8 ++++++--
 include/linux/init_task.h          |  1 +
 include/linux/sched.h              |  1 +
 include/uapi/linux/prctl.h         |  4 ++++
 kernel/fork.c                      |  1 +
 kernel/sys.c                       | 13 +++++++++++++
 8 files changed, 31 insertions(+), 2 deletions(-)

Comments

Djalal Harouni Jan. 16, 2017, 5:12 p.m. UTC | #1
Cc linux-api

On Mon, Jan 16, 2017 at 2:23 PM, Djalal Harouni <tixxdz@gmail.com> wrote:

> From: Djalal Harouni <tixxdz@gmail.com>
>
> This adds a new per-task hidepid= flag that is honored by procfs when
> presenting /proc to the user, in addition to the existing hidepid= mount
> option. So far, hidepid= was exclusively a per-pidns setting. Locking
> down a set of processes so that they cannot see other user's processes
> without affecting the rest of the system thus currently requires
> creation of a private PID namespace, with all the complexity it brings,
> including maintaining a stub init process as PID 1 and losing the
> ability to see processes of the same user on the rest of the system.
>
> With this patch all acesss and visibility checks in procfs now
> honour two fields:
>
>         a) the existing hide_pid field in the PID namespace
>         b) the new hide_pid in struct task_struct
>
> Access/visibility is only granted if both fields permit it; the more
> restrictive one wins. By default the new task_struct hide_pid value
> defaults to 0, which means behaviour is not changed from the status quo.
>
> Setting the per-process hide_pid value is done via a new PR_SET_HIDEPID
> prctl() option which takes the same three supported values as the
> hidepid= mount option. The per-process hide_pid may only be increased,
> never decreased, thus ensuring that once applied, processes can never
> escape such a hide_pid jail.  When a process forks it inherits its
> parent's hide_pid value.
>
> Suggested usecase: let's say nginx runs as user "www-data". After
> dropping privileges it may now call:
>
>         …
>         prctl(PR_SET_HIDEPID, 2);
>         …
>
> And from that point on neither nginx itself, nor any of its child
> processes may see processes in /proc anymore that belong to a different
> user than "www-data". Other services running on the same system remain
> unaffected.
>
> This should permit Linux distributions to more comprehensively lock down
> their services, as it allows an isolated opt-in for hidepid= for
> specific services. Previously hidepid= could only be set system-wide,
> and then specific services had to be excluded by group membership,
> essentially a more complex concept of opt-out.
>
> A tool to test this is available here:
> https://gist.github.com/tixxdz/4e6d21071463ad2c5a043984e3efb5a1
>
> Original-author: Lafcadio Wluiki <wluikil@gmail.com>
> Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
> ---
>  Documentation/filesystems/proc.txt |  2 ++
>  fs/proc/array.c                    |  3 +++
>  fs/proc/base.c                     |  8 ++++++--
>  include/linux/init_task.h          |  1 +
>  include/linux/sched.h              |  1 +
>  include/uapi/linux/prctl.h         |  4 ++++
>  kernel/fork.c                      |  1 +
>  kernel/sys.c                       | 13 +++++++++++++
>  8 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/filesystems/proc.txt
> b/Documentation/filesystems/proc.txt
> index 72624a1..fc95261 100644
> --- a/Documentation/filesystems/proc.txt
> +++ b/Documentation/filesystems/proc.txt
> @@ -164,6 +164,7 @@ read the file /proc/PID/status:
>    Uid:    501     501     501     501
>    Gid:    100     100     100     100
>    FDSize: 256
> +  HidePid:      0
>    Groups: 100 14 16
>    VmPeak:     5004 kB
>    VmSize:     5004 kB
> @@ -228,6 +229,7 @@ Table 1-2: Contents of the status files (as of 4.1)
>   Gid                         Real, effective, saved set, and  file system
> GIDs
>   Umask                       file mode creation mask
>   FDSize                      number of file descriptor slots currently
> allocated
> + HidePid                     process access mode of /proc/<pid>/
>   Groups                      supplementary group list
>   NStgid                      descendant namespace thread group ID
> hierarchy
>   NSpid                       descendant namespace process ID hierarchy
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 51a4213..e6cd1a1 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -163,6 +163,7 @@ static inline void task_state(struct seq_file *m,
> struct pid_namespace *ns,
>         const struct cred *cred;
>         pid_t ppid, tpid = 0, tgid, ngid;
>         unsigned int max_fds = 0;
> +       int hide_pid;
>
>         rcu_read_lock();
>         ppid = pid_alive(p) ?
> @@ -183,6 +184,7 @@ static inline void task_state(struct seq_file *m,
> struct pid_namespace *ns,
>         task_lock(p);
>         if (p->files)
>                 max_fds = files_fdtable(p->files)->max_fds;
> +       hide_pid = p->hide_pid;
>         task_unlock(p);
>         rcu_read_unlock();
>
> @@ -201,6 +203,7 @@ static inline void task_state(struct seq_file *m,
> struct pid_namespace *ns,
>         seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns,
> cred->egid));
>         seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns,
> cred->sgid));
>         seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns,
> cred->fsgid));
> +       seq_put_decimal_ull(m, "\nHidePid:\t", hide_pid);
>         seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
>
>         seq_puts(m, "\nGroups:\t");
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index cd8dd15..596b17f 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -712,7 +712,9 @@ static bool has_pid_permissions(struct pid_namespace
> *pid,
>                                  struct task_struct *task,
>                                  int hide_pid_min)
>  {
> -       if (pid->hide_pid < hide_pid_min)
> +       int hide_pid = max(pid->hide_pid, (int) current->hide_pid);
> +
> +       if (hide_pid < hide_pid_min)
>                 return true;
>         if (in_group_p(pid->pid_gid))
>                 return true;
> @@ -733,7 +735,9 @@ static int proc_pid_permission(struct inode *inode,
> int mask)
>         put_task_struct(task);
>
>         if (!has_perms) {
> -               if (pid->hide_pid == HIDEPID_INVISIBLE) {
> +               int hide_pid = max(pid->hide_pid, (int) current->hide_pid);
> +
> +               if (hide_pid == HIDEPID_INVISIBLE) {
>                         /*
>                          * Let's make getdents(), stat(), and open()
>                          * consistent with each other.  If a process
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 325f649..c87de0e 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -250,6 +250,7 @@ extern struct task_group root_task_group;
>         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
>         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
>         .timer_slack_ns = 50000, /* 50 usec default slack */            \
> +       .hide_pid       = 0,                                            \
>         .pids = {                                                       \
>                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
>                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ad3ec9e..ba9f1d5 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1608,6 +1608,7 @@ struct task_struct {
>         /* unserialized, strictly 'current' */
>         unsigned in_execve:1; /* bit to tell LSMs we're in execve */
>         unsigned in_iowait:1;
> +       unsigned hide_pid:2; /* per-process procfs hidepid= */
>  #if !defined(TIF_RESTORE_SIGMASK)
>         unsigned restore_sigmask:1;
>  #endif
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index a8d0759..ada62b6 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -197,4 +197,8 @@ struct prctl_mm_map {
>  # define PR_CAP_AMBIENT_LOWER          3
>  # define PR_CAP_AMBIENT_CLEAR_ALL      4
>
> +/* Per process, non-revokable procfs hidepid= option */
> +#define PR_SET_HIDEPID 48
> +#define PR_GET_HIDEPID 49
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 11c5c8a..a701a77 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1574,6 +1574,7 @@ static __latent_entropy struct task_struct
> *copy_process(
>  #endif
>
>         p->default_timer_slack_ns = current->timer_slack_ns;
> +       p->hide_pid = current->hide_pid;
>
>         task_io_accounting_init(&p->ioac);
>         acct_clear_integrals(p);
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 842914e..4041ff4 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2261,6 +2261,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long,
> arg2, unsigned long, arg3,
>         case PR_GET_FP_MODE:
>                 error = GET_FP_MODE(me);
>                 break;
> +       case PR_SET_HIDEPID:
> +               if (arg2 < HIDEPID_OFF || arg2 > HIDEPID_INVISIBLE ||
> +                   arg3 || arg4 || arg5)
> +                       return -EINVAL;
> +               if (arg2 < me->hide_pid)
> +                       return -EPERM;
> +               me->hide_pid = arg2;
> +               break;
> +       case PR_GET_HIDEPID:
> +               if (arg2 || arg3 || arg4 || arg5)
> +                       return -EINVAL;
> +               error = me->hide_pid;
> +               break;
>         default:
>                 error = -EINVAL;
>                 break;
> --
> 2.5.5
>
>
Daniel Micay Jan. 16, 2017, 6:24 p.m. UTC | #2
> This should permit Linux distributions to more comprehensively lock
> down
> their services, as it allows an isolated opt-in for hidepid= for
> specific services. Previously hidepid= could only be set system-wide,
> and then specific services had to be excluded by group membership,
> essentially a more complex concept of opt-out.

I think it's a lot easier for them to introduce a proc group and then
figure out the very few exceptions that are needed vs. requiring a huge
number of opt-ins. I don't think the issue is difficulty in deploying
it, it's lack of interest. Android deployed it in 7.x without any major
issues. A good way to get people to use it would be adding proc groups
to major distributions and getting systemd to expose a simple toggle for
this, instead of requiring users to add /proc to fstab (not there by
default with systemd) and hard-wired the correct proc gid for that
distribution. Can then file bugs for packages needing the proc group.
For systemd itself, logind needs it since it drops the capability that
allows bypassing it. Other than that, it's mostly just polkit.
Lafcadio Wluiki Jan. 17, 2017, 9:54 a.m. UTC | #3
Turning on the hidepid group thing for a tightly managed OS like
Android is much easier than doing that for an established distro,
without breaking compatibility. Moreover using groups for this kind of
sandboxing isn't that great anyway as group memberships are "sticky":
whenever some code had it, it can always get it back, by creating a
setgid binary for it. The prctl() feature has the benefit that hidepid
exceptions cannot be made sticky that way...

I think it's pretty unrealistic for big distros such as Debian/Ubuntu
to move to the hidepid group feature. (I mean, if this was easy, then
somebody would already have done that, hidepid is pretty old after
all). OTOH some other opt-in security features (such as systemd's
PrivateTmp= on Fedora) are pretty well used these days, and I don't
think that the prctl would be any different there. The prctl() is a
very easy-to-grasp boolean-like concept after all.

A big benefit of the prctl is that it makes it easy to lock things
down per-process, i.e. to be more fine-grained than just per-user. For
example, my own user on my own system should probably be able to see
the process tree, to make things easy to administer for me. However,
my web browser running under the same user ID probably should not be
able to see the entire process tree, and with the prctl that's very
easy to set up, as this kind of privilege control does not require
privileges with the prctl patch.

And also, let's not forget: what is opt-in on the kernel level could
actually be made opt-out on the service manager level anyway...

Finally, note that the prctl doesn't conflict with the group thing. It
just makes it easier to adopt on a per-service and even per-process
basis, that's all.

Djalal, thank you very much for picking this up!

On Mon, Jan 16, 2017 at 7:24 PM, Daniel Micay <danielmicay@gmail.com> wrote:
>> This should permit Linux distributions to more comprehensively lock
>> down
>> their services, as it allows an isolated opt-in for hidepid= for
>> specific services. Previously hidepid= could only be set system-wide,
>> and then specific services had to be excluded by group membership,
>> essentially a more complex concept of opt-out.
>
> I think it's a lot easier for them to introduce a proc group and then
> figure out the very few exceptions that are needed vs. requiring a huge
> number of opt-ins. I don't think the issue is difficulty in deploying
> it, it's lack of interest. Android deployed it in 7.x without any major
> issues. A good way to get people to use it would be adding proc groups
> to major distributions and getting systemd to expose a simple toggle for
> this, instead of requiring users to add /proc to fstab (not there by
> default with systemd) and hard-wired the correct proc gid for that
> distribution. Can then file bugs for packages needing the proc group.
> For systemd itself, logind needs it since it drops the capability that
> allows bypassing it. Other than that, it's mostly just polkit.
Djalal Harouni Jan. 18, 2017, 10:50 p.m. UTC | #4
On Tue, Jan 17, 2017 at 9:33 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Mon, Jan 16, 2017 at 9:15 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> Cc linux-api
>>
>> On Mon, Jan 16, 2017 at 2:23 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>
>>> From: Djalal Harouni <tixxdz@gmail.com>
>>>
>>> This adds a new per-task hidepid= flag that is honored by procfs when
>>> presenting /proc to the user, in addition to the existing hidepid= mount
>>> option. So far, hidepid= was exclusively a per-pidns setting. Locking
>>> down a set of processes so that they cannot see other user's processes
>>> without affecting the rest of the system thus currently requires
>>> creation of a private PID namespace, with all the complexity it brings,
>>> including maintaining a stub init process as PID 1 and losing the
>>> ability to see processes of the same user on the rest of the system.
>>>
>>> With this patch all acesss and visibility checks in procfs now
>>> honour two fields:
>>>
>>>         a) the existing hide_pid field in the PID namespace
>>>         b) the new hide_pid in struct task_struct
>>>
>>> Access/visibility is only granted if both fields permit it; the more
>>> restrictive one wins. By default the new task_struct hide_pid value
>>> defaults to 0, which means behaviour is not changed from the status quo.
>>>
>>> Setting the per-process hide_pid value is done via a new PR_SET_HIDEPID
>>> prctl() option which takes the same three supported values as the
>>> hidepid= mount option. The per-process hide_pid may only be increased,
>>> never decreased, thus ensuring that once applied, processes can never
>>> escape such a hide_pid jail.  When a process forks it inherits its
>>> parent's hide_pid value.
>>>
>>> Suggested usecase: let's say nginx runs as user "www-data". After
>>> dropping privileges it may now call:
>>>
>>>         …
>>>         prctl(PR_SET_HIDEPID, 2);
>>>         …
>>>
>>> And from that point on neither nginx itself, nor any of its child
>>> processes may see processes in /proc anymore that belong to a different
>>> user than "www-data". Other services running on the same system remain
>>> unaffected.
>
> What affect, if any, does this have on ptrace() permissions?

This should not affect ptrace() permissions or other system calls that
work directly on pids, the test in procfs is related to inodes before
the ptrace check, hmm what do you have in mind ?


> Also, this one-way thing seems wrong to me.  I think it should roughly
> follow the no_new_privs rules instead.  IOW, if you unshare your
> pidns, it gets cleared.  Also, maybe you shouldn't be able to set it

Andy I don't follow here, no_new_privs is never cleared right ? I
can't see the corresponding clear bit code for it.

For this one I want it to act like no_new_privs. Also pidns can be
created with userns which means it can be revoked. For my use case I
want it to be part of *one* single operation where it is set with the
other sandbox operations that are all preserved... instead of setting
it *again* each time where it can already be late.


> without either having CAP_SYS_ADMIN over your userns or having
> no_new_privs set.

For this one I can add it sure. Historically that logic was added to
make seccomp more usable, for this patch the values can't be relaxed,
they are always increased never decreased. However one minor advantage
if you require no_new_privs is that this option hidepid will also
assert that you can't setuid to access some procfs inodes... though
you can also just set 'no_new_privs + hidepid' both of them in any
order. Also it allows unprivileged without userns to setup a minimal
jail while performing some operations that can be blocked by
no_new_privs.

Andy, Kees any other comments please on it ? I'm not sure if overusing
no_new_privs in this case is a good idea. Seems to me that seccomp +
no_new_privs is different than this hidepid feature that overlaps
nicely with no_new_privs.

If there are no responses for this question, then I will just add the
"CAP_SYS_ADMIN || no_new_privs" test in the next iteration.

> --Andy

Thanks!
Andy Lutomirski Jan. 18, 2017, 11:35 p.m. UTC | #5
On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Tue, Jan 17, 2017 at 9:33 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Mon, Jan 16, 2017 at 9:15 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>> Cc linux-api
>>>
>>> On Mon, Jan 16, 2017 at 2:23 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>
>>>> From: Djalal Harouni <tixxdz@gmail.com>
>>>>
>>>> This adds a new per-task hidepid= flag that is honored by procfs when
>>>> presenting /proc to the user, in addition to the existing hidepid= mount
>>>> option. So far, hidepid= was exclusively a per-pidns setting. Locking
>>>> down a set of processes so that they cannot see other user's processes
>>>> without affecting the rest of the system thus currently requires
>>>> creation of a private PID namespace, with all the complexity it brings,
>>>> including maintaining a stub init process as PID 1 and losing the
>>>> ability to see processes of the same user on the rest of the system.
>>>>
>>>> With this patch all acesss and visibility checks in procfs now
>>>> honour two fields:
>>>>
>>>>         a) the existing hide_pid field in the PID namespace
>>>>         b) the new hide_pid in struct task_struct
>>>>
>>>> Access/visibility is only granted if both fields permit it; the more
>>>> restrictive one wins. By default the new task_struct hide_pid value
>>>> defaults to 0, which means behaviour is not changed from the status quo.
>>>>
>>>> Setting the per-process hide_pid value is done via a new PR_SET_HIDEPID
>>>> prctl() option which takes the same three supported values as the
>>>> hidepid= mount option. The per-process hide_pid may only be increased,
>>>> never decreased, thus ensuring that once applied, processes can never
>>>> escape such a hide_pid jail.  When a process forks it inherits its
>>>> parent's hide_pid value.
>>>>
>>>> Suggested usecase: let's say nginx runs as user "www-data". After
>>>> dropping privileges it may now call:
>>>>
>>>>         …
>>>>         prctl(PR_SET_HIDEPID, 2);
>>>>         …
>>>>
>>>> And from that point on neither nginx itself, nor any of its child
>>>> processes may see processes in /proc anymore that belong to a different
>>>> user than "www-data". Other services running on the same system remain
>>>> unaffected.
>>
>> What affect, if any, does this have on ptrace() permissions?
>
> This should not affect ptrace() permissions or other system calls that
> work directly on pids, the test in procfs is related to inodes before
> the ptrace check, hmm what do you have in mind ?
>

I'm wondering what problem you're trying to solve, then.  hidepid
helps lock down procfs, but ISTM you might still want to lock down
other PID-based APIs.

>
>> Also, this one-way thing seems wrong to me.  I think it should roughly
>> follow the no_new_privs rules instead.  IOW, if you unshare your
>> pidns, it gets cleared.  Also, maybe you shouldn't be able to set it
>
> Andy I don't follow here, no_new_privs is never cleared right ? I
> can't see the corresponding clear bit code for it.

I believe that unsharing userns clears no_new_privs.

>
> For this one I want it to act like no_new_privs. Also pidns can be
> created with userns which means it can be revoked. For my use case I
> want it to be part of *one* single operation where it is set with the
> other sandbox operations that are all preserved... instead of setting
> it *again* each time where it can already be late.
>

I don't see the problem as long as this gets implemented carefully
enough.  If you unshare your userns and your pidns, then you should be
able to see all tasks in the new pidns, even if you mount a fresh
procfs pointing at that pidns -- after all, you are privileged in that
namespace.

>
>> without either having CAP_SYS_ADMIN over your userns or having
>> no_new_privs set.
>
> For this one I can add it sure. Historically that logic was added to
> make seccomp more usable, for this patch the values can't be relaxed,
> they are always increased never decreased. However one minor advantage
> if you require no_new_privs is that this option hidepid will also
> assert that you can't setuid to access some procfs inodes... though
> you can also just set 'no_new_privs + hidepid' both of them in any
> order. Also it allows unprivileged without userns to setup a minimal
> jail while performing some operations that can be blocked by
> no_new_privs.
>
> Andy, Kees any other comments please on it ? I'm not sure if overusing
> no_new_privs in this case is a good idea. Seems to me that seccomp +
> no_new_privs is different than this hidepid feature that overlaps
> nicely with no_new_privs.
>
> If there are no responses for this question, then I will just add the
> "CAP_SYS_ADMIN || no_new_privs" test in the next iteration.

I feel like this feature (per-task hidepid) is subtle and complex
enough that it should have a very clear purpose and use case before
it's merged and that we should make sure that there isn't a better way
to accomplish what you're trying to do.
Djalal Harouni Jan. 19, 2017, 1:53 p.m. UTC | #6
On Thu, Jan 19, 2017 at 12:35 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
[...]
>>>>>
>>>>>         …
>>>>>         prctl(PR_SET_HIDEPID, 2);
>>>>>         …
>>>>>
>>>>> And from that point on neither nginx itself, nor any of its child
>>>>> processes may see processes in /proc anymore that belong to a different
>>>>> user than "www-data". Other services running on the same system remain
>>>>> unaffected.
>>>
>>> What affect, if any, does this have on ptrace() permissions?
>>
>> This should not affect ptrace() permissions or other system calls that
>> work directly on pids, the test in procfs is related to inodes before
>> the ptrace check, hmm what do you have in mind ?
>>
>
> I'm wondering what problem you're trying to solve, then.  hidepid
> helps lock down procfs, but ISTM you might still want to lock down
> other PID-based APIs.

Yes but they are already locked based on uid checks. procfs was not
and this patch is specifically to align it, and to reduce the ability
to peek data from other processes.

>>
>>> Also, this one-way thing seems wrong to me.  I think it should roughly
>>> follow the no_new_privs rules instead.  IOW, if you unshare your
>>> pidns, it gets cleared.  Also, maybe you shouldn't be able to set it
>>
>> Andy I don't follow here, no_new_privs is never cleared right ? I
>> can't see the corresponding clear bit code for it.
>
> I believe that unsharing userns clears no_new_privs.
No, it is not cleared, and I can't see the clear bit for it. Maybe due
to userns+filesystems limitations it was not noticed.


>>
>> For this one I want it to act like no_new_privs. Also pidns can be
>> created with userns which means it can be revoked. For my use case I
>> want it to be part of *one* single operation where it is set with the
>> other sandbox operations that are all preserved... instead of setting
>> it *again* each time where it can already be late.
>>
>
> I don't see the problem as long as this gets implemented carefully
> enough.  If you unshare your userns and your pidns, then you should be
> able to see all tasks in the new pidns, even if you mount a fresh
> procfs pointing at that pidns -- after all, you are privileged in that
> namespace.

That's already the case, if you are privileged you can see all tasks,
the code is written that the per-task hidepid does not overwrite
capabilities.


>>
>>> without either having CAP_SYS_ADMIN over your userns or having
>>> no_new_privs set.
>>
>> For this one I can add it sure. Historically that logic was added to
>> make seccomp more usable, for this patch the values can't be relaxed,
>> they are always increased never decreased. However one minor advantage
>> if you require no_new_privs is that this option hidepid will also
>> assert that you can't setuid to access some procfs inodes... though
>> you can also just set 'no_new_privs + hidepid' both of them in any
>> order. Also it allows unprivileged without userns to setup a minimal
>> jail while performing some operations that can be blocked by
>> no_new_privs.
>>
>> Andy, Kees any other comments please on it ? I'm not sure if overusing
>> no_new_privs in this case is a good idea. Seems to me that seccomp +
>> no_new_privs is different than this hidepid feature that overlaps
>> nicely with no_new_privs.
>>
>> If there are no responses for this question, then I will just add the
>> "CAP_SYS_ADMIN || no_new_privs" test in the next iteration.
>
> I feel like this feature (per-task hidepid) is subtle and complex
> enough that it should have a very clear purpose and use case before
> it's merged and that we should make sure that there isn't a better way
> to accomplish what you're trying to do.

Sure, the hidepid mount option is old enough, and this per-task
hidepid is clearly defined only for procfs and per task, we can't add
another switch that's relate to both a filesystem and pid namespaces,
it will be a bit complicated and not really useful for cases that are
in *same* pidns where *each* one have to mount its procfs, it will
propagate. Also as noted by Lafcadio, the gid thing is a bit hard to
use now.

Thanks!
Andy Lutomirski Jan. 19, 2017, 7:52 p.m. UTC | #7
On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Thu, Jan 19, 2017 at 12:35 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>> Also, this one-way thing seems wrong to me.  I think it should roughly
>>>> follow the no_new_privs rules instead.  IOW, if you unshare your
>>>> pidns, it gets cleared.  Also, maybe you shouldn't be able to set it
>>>
>>> Andy I don't follow here, no_new_privs is never cleared right ? I
>>> can't see the corresponding clear bit code for it.
>>
>> I believe that unsharing userns clears no_new_privs.
> No, it is not cleared, and I can't see the clear bit for it. Maybe due
> to userns+filesystems limitations it was not noticed.

Hmm, maybe I remembered wrong.

>> I feel like this feature (per-task hidepid) is subtle and complex
>> enough that it should have a very clear purpose and use case before
>> it's merged and that we should make sure that there isn't a better way
>> to accomplish what you're trying to do.
>
> Sure, the hidepid mount option is old enough, and this per-task
> hidepid is clearly defined only for procfs and per task, we can't add
> another switch that's relate to both a filesystem and pid namespaces,
> it will be a bit complicated and not really useful for cases that are
> in *same* pidns where *each* one have to mount its procfs, it will
> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to
> use now.

What I'm trying to say is that I want to understand a complete,
real-world use case.  Adding a security-related per-task flag is can
be quite messy and requires a lot of careful thought to get right, and
I'd rather avoid it if at all possible.

I'm imaging something like a new RestrictPidVisisbility= option in
systemd.  I agree that this is currently a mess to do.  But maybe a
simpler solution would be to add a new mount option local_hidepid to
procfs.  If you set that option, then it overrides hidepid for that
instance.  Most of these semi-sandboxed daemon processes already have
their own mount namespace, so the overhead should be minimal.

--Andy
Lafcadio Wluiki Jan. 20, 2017, 3:44 p.m. UTC | #8
On Thu, Jan 19, 2017 at 12:35 AM, Andy Lutomirski <luto@amacapital.net> wrote:

>>>>> And from that point on neither nginx itself, nor any of its child
>>>>> processes may see processes in /proc anymore that belong to a different
>>>>> user than "www-data". Other services running on the same system remain
>>>>> unaffected.
>>>
>>> What affect, if any, does this have on ptrace() permissions?
>>
>> This should not affect ptrace() permissions or other system calls that
>> work directly on pids, the test in procfs is related to inodes before
>> the ptrace check, hmm what do you have in mind ?
>
> I'm wondering what problem you're trying to solve, then.  hidepid
> helps lock down procfs, but ISTM you might still want to lock down
> other PID-based APIs.

The (pseudo-) mount option hidepid= is about reducing information
leakage, that's all. It's not about enforcing access to PIDs if you
happen to know them.

This patch set simply make this global option usable locally, that's
all. It does not change semantics, it does not alter permission
checks, it doesn't turn the thing into something different than it is
right now. It just permits to turn on its effect in a more local way,
without having to change the whole system.

>>> Also, this one-way thing seems wrong to me.  I think it should roughly
>>> follow the no_new_privs rules instead.  IOW, if you unshare your
>>> pidns, it gets cleared.  Also, maybe you shouldn't be able to set it
>>
>> Andy I don't follow here, no_new_privs is never cleared right ? I
>> can't see the corresponding clear bit code for it.
>
> I believe that unsharing userns clears no_new_privs.

Reverting the a zero hidepid value if you create your own pid/user
namespace sounds OK to me. After all, if you create your own kingdom
anyway,  it should be up to you what you want to be able to see inside
of it...

> I feel like this feature (per-task hidepid) is subtle and complex
> enough that it should have a very clear purpose and use case before
> it's merged and that we should make sure that there isn't a better way
> to accomplish what you're trying to do.

The purpose for me is really reducing information leakage, the same
thing hidepid= always has done, and not more. To quote the proc(5) man
page about this:

"… This doesn't hide the fact that a process with a specific PID value
exists (it can be learned by other means, for example, by "kill -0
$PID"), but it hides a process's UID and GID, which could otherwise be
learned by employing stat(2) on a /proc/[pid] directory.  This greatly
complicates an attacker's task of gathering information about running
processes  (e.g.,  discovering whether some daemon is running with
elevated privileges, whether another user is running some sensitive
program, whether other users are running any program at all, and so
on). …"

And, yeah, I think this is useful for services, but to make it usable
in general-purpose distros an individual per-service and per-process
opt-in would be much better than a global opt-in.

Yes, I think there's value in reducing information leakage. And yes,
what was true when the proc(5) man page was written, is still relevant
today, I think.

L.
Lafcadio Wluiki Jan. 20, 2017, 3:56 p.m. UTC | #9
On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:

>> Sure, the hidepid mount option is old enough, and this per-task
>> hidepid is clearly defined only for procfs and per task, we can't add
>> another switch that's relate to both a filesystem and pid namespaces,
>> it will be a bit complicated and not really useful for cases that are
>> in *same* pidns where *each* one have to mount its procfs, it will
>> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to
>> use now.
>
> What I'm trying to say is that I want to understand a complete,
> real-world use case.  Adding a security-related per-task flag is can
> be quite messy and requires a lot of careful thought to get right, and
> I'd rather avoid it if at all possible.
>
> I'm imaging something like a new RestrictPidVisisbility= option in
> systemd.  I agree that this is currently a mess to do.  But maybe a

It's not just a "mess" to do, it's not possible afaics. The hidepid
thing is after all not really a mount option, but an option of the pid
namespace. Hence, if you want to restrict the visibility for one
service only, then you have to get your own PID namespace, but that
does a lot more than just hide visibility: it renumbers everything
after all...

> simpler solution would be to add a new mount option local_hidepid to
> procfs.  If you set that option, then it overrides hidepid for that
> instance.  Most of these semi-sandboxed daemon processes already have
> their own mount namespace, so the overhead should be minimal.

When I worked on the patches originally, I actually wanted to
implement this as true per-superblock procfs mount option. But this is
really hard to do, as the private superblock pointer of the procfs
instance currently points to the pid namespace, and breaking that up,
so that you can have multiple procfs superblocks per pid namespace is
a ton of work, and I doubt anyone would really like the complexity
this brings, just for adding a single 2bit option...

The per-process option is much simpler code-wise. It also has
semantical benefits: if the thing isn't a mount option it is
accessible with absolutely minimal privileges, as it does not imply
namespaces and mounting. This means, my Firefox can run with hidepid
turned on without my KDE Konsole instance also having to turn it on.
Or to say this differently: your suggested RestrictPidVisibility=
works nicely both for "systemd" as PID 1 and for "systemd --user" as
user "lafcadio", when it is per-process, but is much more complex to
implement if it was a true mount option.

L.
Djalal Harouni Jan. 20, 2017, 4:33 p.m. UTC | #10
On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
[...]
>> Sure, the hidepid mount option is old enough, and this per-task
>> hidepid is clearly defined only for procfs and per task, we can't add
>> another switch that's relate to both a filesystem and pid namespaces,
>> it will be a bit complicated and not really useful for cases that are
>> in *same* pidns where *each* one have to mount its procfs, it will
>> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to
>> use now.
>
> What I'm trying to say is that I want to understand a complete,
> real-world use case.  Adding a security-related per-task flag is can
> be quite messy and requires a lot of careful thought to get right, and
> I'd rather avoid it if at all possible.

I do agree, but that's not what we are proposing here. This use case
is limited we do not manipulate the creds of the task, there are no
security transitions. The task does not change, its only related to
procfs and pid entries there. Also the flag applies only to current
task and not on remote ones... Nothing new here it's an extension of
procfs hidepid.

> I'm imaging something like a new RestrictPidVisisbility= option in
> systemd.  I agree that this is currently a mess to do.  But maybe a

Yes that's one use case, If we manage to land this I'll follow up with
it... plus there is, I've a use case related to kubernetes where I do
want to reduce the number of processes inside containers per pod to
minimal. Some other cases are: lock down children where being
unprivileged. Also as noted in other replies on today's desktop
systems, under a normal user session, the user should see all
processes of the system where the media player, browser etc have no
business to see the process tree. This can be easily implemented when
launching apps without the need to regain privileges...

> simpler solution would be to add a new mount option local_hidepid to
> procfs.  If you set that option, then it overrides hidepid for that
> instance.  Most of these semi-sandboxed daemon processes already have
> their own mount namespace, so the overhead should be minimal.

Andy If that could work :-/    we have to re-write or adapt lot of
things inside procfs... plus:
Procfs is a miror to the current pid namespace. Mount options are not
procfs but rather pid namespace. That would not work.
Also having multiple mount namespaces where each one with its setup
and having to migrate tasks between these environements: I would
rather have the security information or context or any minor flag
attached to the task itself rather than the object.
The prctl() interface is really simple for userspace. The kernel
change is not intrusive, and current approach does not require any
privileges. For others you may have to gain privileges or ask some one
to set it up. The current tendency is to allow more unprivileged
code/containers... to setup such mini jails. Note to mention that this
schema is simple and can be isolated from other complexities, set it
once and that's it.


> --Andy
Andy Lutomirski Jan. 21, 2017, 12:53 a.m. UTC | #11
On Fri, Jan 20, 2017 at 8:33 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
> [...]
>>> Sure, the hidepid mount option is old enough, and this per-task
>>> hidepid is clearly defined only for procfs and per task, we can't add
>>> another switch that's relate to both a filesystem and pid namespaces,
>>> it will be a bit complicated and not really useful for cases that are
>>> in *same* pidns where *each* one have to mount its procfs, it will
>>> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to
>>> use now.
>>
>> What I'm trying to say is that I want to understand a complete,
>> real-world use case.  Adding a security-related per-task flag is can
>> be quite messy and requires a lot of careful thought to get right, and
>> I'd rather avoid it if at all possible.
>
> I do agree, but that's not what we are proposing here. This use case
> is limited we do not manipulate the creds of the task, there are no
> security transitions. The task does not change, its only related to
> procfs and pid entries there. Also the flag applies only to current
> task and not on remote ones... Nothing new here it's an extension of
> procfs hidepid.
>
>> I'm imaging something like a new RestrictPidVisisbility= option in
>> systemd.  I agree that this is currently a mess to do.  But maybe a
>
> Yes that's one use case, If we manage to land this I'll follow up with
> it... plus there is, I've a use case related to kubernetes where I do
> want to reduce the number of processes inside containers per pod to
> minimal. Some other cases are: lock down children where being
> unprivileged. Also as noted in other replies on today's desktop
> systems, under a normal user session, the user should see all
> processes of the system where the media player, browser etc have no
> business to see the process tree. This can be easily implemented when
> launching apps without the need to regain privileges...
>
>> simpler solution would be to add a new mount option local_hidepid to
>> procfs.  If you set that option, then it overrides hidepid for that
>> instance.  Most of these semi-sandboxed daemon processes already have
>> their own mount namespace, so the overhead should be minimal.
>
> Andy If that could work :-/    we have to re-write or adapt lot of
> things inside procfs... plus:
> Procfs is a miror to the current pid namespace. Mount options are not
> procfs but rather pid namespace. That would not work.

I agree that the kernel change to do it per task is very simple.  But
this is an unfortunate slippery slope.  What if you want to block off
everything in /proc that isn't associated with a PID?  What if you
want to suppress /sys access?  What if you want ot block *all*
non-current PIDs from being revealed in /proc?  What if you want to
hide /proc/PID/cmdline?

I think that the right solution here is to fix procfs to understand
per-superblock mount options.

--Andy
Djalal Harouni Jan. 23, 2017, 11:46 a.m. UTC | #12
On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Jan 20, 2017 at 8:33 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> [...]
>>>> Sure, the hidepid mount option is old enough, and this per-task
>>>> hidepid is clearly defined only for procfs and per task, we can't add
>>>> another switch that's relate to both a filesystem and pid namespaces,
>>>> it will be a bit complicated and not really useful for cases that are
>>>> in *same* pidns where *each* one have to mount its procfs, it will
>>>> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to
>>>> use now.
>>>
>>> What I'm trying to say is that I want to understand a complete,
>>> real-world use case.  Adding a security-related per-task flag is can
>>> be quite messy and requires a lot of careful thought to get right, and
>>> I'd rather avoid it if at all possible.
>>
>> I do agree, but that's not what we are proposing here. This use case
>> is limited we do not manipulate the creds of the task, there are no
>> security transitions. The task does not change, its only related to
>> procfs and pid entries there. Also the flag applies only to current
>> task and not on remote ones... Nothing new here it's an extension of
>> procfs hidepid.
>>
>>> I'm imaging something like a new RestrictPidVisisbility= option in
>>> systemd.  I agree that this is currently a mess to do.  But maybe a
>>
>> Yes that's one use case, If we manage to land this I'll follow up with
>> it... plus there is, I've a use case related to kubernetes where I do
>> want to reduce the number of processes inside containers per pod to
>> minimal. Some other cases are: lock down children where being
>> unprivileged. Also as noted in other replies on today's desktop
>> systems, under a normal user session, the user should see all
>> processes of the system where the media player, browser etc have no
>> business to see the process tree. This can be easily implemented when
>> launching apps without the need to regain privileges...
>>
>>> simpler solution would be to add a new mount option local_hidepid to
>>> procfs.  If you set that option, then it overrides hidepid for that
>>> instance.  Most of these semi-sandboxed daemon processes already have
>>> their own mount namespace, so the overhead should be minimal.
>>
>> Andy If that could work :-/    we have to re-write or adapt lot of
>> things inside procfs... plus:
>> Procfs is a miror to the current pid namespace. Mount options are not
>> procfs but rather pid namespace. That would not work.
>
> I agree that the kernel change to do it per task is very simple.  But
> this is an unfortunate slippery slope.  What if you want to block off
> everything in /proc that isn't associated with a PID?  What if you
> want to suppress /sys access?  What if you want ot block *all*
> non-current PIDs from being revealed in /proc?  What if you want to
> hide /proc/PID/cmdline?

For /sys we mount an inaccessible directory on top, we even do that
for some static /sys and /proc inodes, of course that doesn't scale
but we try... please see below.

For non-current PIDs from being revealed in /proc, actually the use
case did not come, it will be complex to handle TOCTOU, other races
etc. We don't want that and we don't have a use case for it. The patch
here is a clear parent -> child relation.

> I think that the right solution here is to fix procfs to understand
> per-superblock mount options.

Unfortunately and as also noted by Lafcadio and you this is too
complex. Also from what you have said above and from what /proc
reports to userspace and today's use cases with containers, namespaces
etc. maybe the kernel needs a new way to report *some* kernel objects
and other things to userspace which are not based on /proc... or move
them out of /proc... in some cases the kernel may need to know if the
calling process is in a namespace...
but lets please stay focused here, fixing procfs is a bit out of the
scope for this *specific* use case and patch, we don't have the
resources to explore something new...
The aim here is a simple fix of 2bits that preserves the semantics of
procfs and hidepid, at same time makes the hidepid option local to
current process. It does not require or mess up with privileges,
namespaces etc. Easy to review and maintain.

Also as said in other emails we have clear use cases: some
cloud/container providers tax users for extra processes and resources
that they do not really need, we have mini jails for desktop systems
too... all this can be improved.

Thanks!

> --Andy
Andy Lutomirski Jan. 23, 2017, 8:07 p.m. UTC | #13
On Mon, Jan 23, 2017 at 3:46 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> I agree that the kernel change to do it per task is very simple.  But
>> this is an unfortunate slippery slope.  What if you want to block off
>> everything in /proc that isn't associated with a PID?  What if you
>> want to suppress /sys access?  What if you want ot block *all*
>> non-current PIDs from being revealed in /proc?  What if you want to
>> hide /proc/PID/cmdline?
>
> For /sys we mount an inaccessible directory on top, we even do that
> for some static /sys and /proc inodes, of course that doesn't scale
> but we try... please see below.

So you do have a private fs namespace.

>
>> I think that the right solution here is to fix procfs to understand
>> per-superblock mount options.
>
> Unfortunately and as also noted by Lafcadio and you this is too
> complex.
> ... but lets please stay focused here, fixing procfs is a bit out of the
> scope for this *specific* use case and patch, we don't have the
> resources to explore something new...

I'm not the final authority on this (that's probably Eric), but NAK.
For upstream Linux, you can't say "doing it right is too hard so we're
going to introduce a hackish ABI with questionable security
properties".

The right solution is IMO quite clearly to fix /proc.  This isn't even
particularly hard -- there are only 17 instances of s_fs_info in
fs/proc/.

> ... Easy to review and maintain.

Au contraire.  It's a pain in the arse to review because of the
security implications and it's unpleasant to maintain because it's a
special case in core kernel code.
Djalal Harouni Jan. 26, 2017, 1:20 p.m. UTC | #14
On Mon, Jan 23, 2017 at 9:07 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Mon, Jan 23, 2017 at 3:46 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> I agree that the kernel change to do it per task is very simple.  But
>>> this is an unfortunate slippery slope.  What if you want to block off
>>> everything in /proc that isn't associated with a PID?  What if you
>>> want to suppress /sys access?  What if you want ot block *all*
>>> non-current PIDs from being revealed in /proc?  What if you want to
>>> hide /proc/PID/cmdline?
>>
>> For /sys we mount an inaccessible directory on top, we even do that
>> for some static /sys and /proc inodes, of course that doesn't scale
>> but we try... please see below.
>
> So you do have a private fs namespace.

Yes for some cases and they are cheap, however I don't see the relation here ?


>>
>>> I think that the right solution here is to fix procfs to understand
>>> per-superblock mount options.
>>
>> Unfortunately and as also noted by Lafcadio and you this is too
>> complex.
>> ... but lets please stay focused here, fixing procfs is a bit out of the
>> scope for this *specific* use case and patch, we don't have the
>> resources to explore something new...
>
> I'm not the final authority on this (that's probably Eric), but NAK.

(I don't know the final authority. The *only* thing I know is that we
have technical problems here that are *not* fixed, and we try to fix
them).


> For upstream Linux, you can't say "doing it right is too hard so we're
> going to introduce a hackish ABI with questionable security
> properties".

No one said this.

Maybe you think that procfs is the right way, but I certainly can't
predict how much damage any fundamental change on procfs will make.
procfs is a special fs that has its own rules and hacks... everyone
would like to avoid major changes on it...

Could you please explain in clear words what are the benefits to use
mount, retain or regain CAP_SYS_ADMIN for something that we can set
without any privileges ?


> The right solution is IMO quite clearly to fix /proc.  This isn't even
> particularly hard -- there are only 17 instances of s_fs_info in
> fs/proc/.

pid namespaces are tied to procfs to the heart, any change on procfs
has also to count on pid namespaces, flush cached entries of a task
from each /proc... ? pid namespaces have then to be made smarter.
Forward port vulnerability and bug fixes that have been stacked in
procfs ? or at least do not break them. "procfs has always been a
special fs" by git logs. Also cases of some specific directories
/proc/fs/nfs and maybe devices id  ?
persistent of mount options and other userspace information -
https://lkml.org/lkml/2012/3/26/486

These prevent me from asserting that's the best way... where in a
previous thread you said that we should go with the easiest way to fix
the problem.
Lafcadio Wluiki Feb. 10, 2017, 2:40 p.m. UTC | #15
On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote:

> I agree that the kernel change to do it per task is very simple.  But
> this is an unfortunate slippery slope.  What if you want to block off
> everything in /proc that isn't associated with a PID?  What if you
> want to suppress /sys access?  What if you want ot block *all*
> non-current PIDs from being revealed in /proc?  What if you want to
> hide /proc/PID/cmdline?

These are all what-ifs that noone was interested in so far. I have a
very specific itch I want to scratch, and I found a reasonably generic
concept of exposing it. But now you are trying to lead this all down
onto a paths asking for features noone is actually really interested
in.

> I think that the right solution here is to fix procfs to understand
> per-superblock mount options.

Andy, this is really not helpful. Doing that is far from easy, that's
a major undertaking: asking us to rework procfs in such a major way I
only can understand as an indirect way for you to say "go away" to
us... Expecting newcomers to kernel work to basically clean up an
entire kernel subsystem, dealing with all the politics involved is
just not going to work.

And again: I am very sure the proposed prctl() based solution is
actually much much nicer than any procfs superblock based one, since
it works naturally, and easily for unprivileged processes too!
Anything that requires mount options means privileges are required in
some form or another. Yes userns would open that up, but that comes
with a huge amount of additional problems.

Anyway, I understand it is your intention to derail this. I can accept
that. It's a pity though.

L.
Andy Lutomirski Feb. 10, 2017, 4:18 p.m. UTC | #16
On Fri, Feb 10, 2017 at 6:40 AM, Lafcadio Wluiki <wluikil@gmail.com> wrote:
> On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>
>> I agree that the kernel change to do it per task is very simple.  But
>> this is an unfortunate slippery slope.  What if you want to block off
>> everything in /proc that isn't associated with a PID?  What if you
>> want to suppress /sys access?  What if you want ot block *all*
>> non-current PIDs from being revealed in /proc?  What if you want to
>> hide /proc/PID/cmdline?
>
> These are all what-ifs that noone was interested in so far. I have a
> very specific itch I want to scratch, and I found a reasonably generic
> concept of exposing it. But now you are trying to lead this all down
> onto a paths asking for features noone is actually really interested
> in.

If you have an itch and want to scratch it by adding a new API, the
API has to be right, full stop.

>
>> I think that the right solution here is to fix procfs to understand
>> per-superblock mount options.
>
> Andy, this is really not helpful. Doing that is far from easy, that's
> a major undertaking: asking us to rework procfs in such a major way I
> only can understand as an indirect way for you to say "go away" to
> us... Expecting newcomers to kernel work to basically clean up an
> entire kernel subsystem, dealing with all the politics involved is
> just not going to work.

Once upon a time, a newbish kernel hacker named Andy wrote some code
to change the way the vDSO worked.  The maintainers told me that it
was a nice thought and that it was too messy and wasn't acceptable in
its present form.  So the patchset got quite a bit longer, I learned
more about the mess that was the vDSO, and I fixed it.  The
maintainers were right.

It's just not the case that a newcomer gets to add fields to
task_struct that a more experienced developer wouldn't get to add.
Sorry.

>
> And again: I am very sure the proposed prctl() based solution is
> actually much much nicer than any procfs superblock based one, since
> it works naturally, and easily for unprivileged processes too!
> Anything that requires mount options means privileges are required in
> some form or another. Yes userns would open that up, but that comes
> with a huge amount of additional problems.

If you want to design a generic mechanism by which a task can change
the way that pseudo filesystems look for itself and its children
without using mount options, and you come up with something that is
sane, safe, generic, and reasonable clean, you're welcome to do so.
This sounds like a considerably more challenging undertaking than just
fixing procfs.

>
> Anyway, I understand it is your intention to derail this. I can accept
> that. It's a pity though.

I have no intention of derailing improvements to hidepid.  In fact,
I'd love to see improvements here.  I'm just saying that the current
proposal isn't the way to do it.
Kees Cook Feb. 10, 2017, 11:44 p.m. UTC | #17
On Wed, Jan 18, 2017 at 3:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> Andy I don't follow here, no_new_privs is never cleared right ? I
>> can't see the corresponding clear bit code for it.
>
> I believe that unsharing userns clears no_new_privs.

Seriously? That's kind of ... weird. I mean, I guess you're
priv-confined in a way, but that seems fragile.

-Kees
Andy Lutomirski Feb. 13, 2017, 7:01 p.m. UTC | #18
On Fri, Feb 10, 2017 at 3:44 PM, Kees Cook <keescook@chromium.org> wrote:
> On Wed, Jan 18, 2017 at 3:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>> Andy I don't follow here, no_new_privs is never cleared right ? I
>>> can't see the corresponding clear bit code for it.
>>
>> I believe that unsharing userns clears no_new_privs.
>
> Seriously? That's kind of ... weird. I mean, I guess you're
> priv-confined in a way, but that seems fragile.
>

I appear to have made this up.  Either I genuinely pulled it out of
thin air or it was discussed and not done.

$ setpriv --nnp unshare -Ur cat /proc/self/status |grep NoNewPrivs
NoNewPrivs:    1

If it were to be done, it ought to be quite safe except for possible LSM issues.

--Andy
Kees Cook Feb. 13, 2017, 7:15 p.m. UTC | #19
On Mon, Feb 13, 2017 at 11:01 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Feb 10, 2017 at 3:44 PM, Kees Cook <keescook@chromium.org> wrote:
>> On Wed, Jan 18, 2017 at 3:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>> Andy I don't follow here, no_new_privs is never cleared right ? I
>>>> can't see the corresponding clear bit code for it.
>>>
>>> I believe that unsharing userns clears no_new_privs.
>>
>> Seriously? That's kind of ... weird. I mean, I guess you're
>> priv-confined in a way, but that seems fragile.
>>
>
> I appear to have made this up.  Either I genuinely pulled it out of
> thin air or it was discussed and not done.
>
> $ setpriv --nnp unshare -Ur cat /proc/self/status |grep NoNewPrivs
> NoNewPrivs:    1
>
> If it were to be done, it ought to be quite safe except for possible LSM issues.

Okay, cool. Thanks. (Also, where does "setpriv" live? I must need a
new set of util-linux or something?)

-Kees
Christian Kujau Feb. 14, 2017, 4:11 a.m. UTC | #20
On Mon, 13 Feb 2017, Kees Cook wrote:
> Okay, cool. Thanks. (Also, where does "setpriv" live? I must need a
> new set of util-linux or something?)

Indeed, a newer version of util-linux[0] should do, although 
Debian/testing appears to have an extra package just for "setpriv":

  https://packages.debian.org/stretch/setpriv

C.

[0] https://git.kernel.org/cgit/utils/util-linux/util-linux.git/commit/?id=5600c40
diff mbox

Patch

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 72624a1..fc95261 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -164,6 +164,7 @@  read the file /proc/PID/status:
   Uid:    501     501     501     501
   Gid:    100     100     100     100
   FDSize: 256
+  HidePid:      0
   Groups: 100 14 16
   VmPeak:     5004 kB
   VmSize:     5004 kB
@@ -228,6 +229,7 @@  Table 1-2: Contents of the status files (as of 4.1)
  Gid                         Real, effective, saved set, and  file system GIDs
  Umask                       file mode creation mask
  FDSize                      number of file descriptor slots currently allocated
+ HidePid                     process access mode of /proc/<pid>/
  Groups                      supplementary group list
  NStgid                      descendant namespace thread group ID hierarchy
  NSpid                       descendant namespace process ID hierarchy
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 51a4213..e6cd1a1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -163,6 +163,7 @@  static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	const struct cred *cred;
 	pid_t ppid, tpid = 0, tgid, ngid;
 	unsigned int max_fds = 0;
+	int hide_pid;
 
 	rcu_read_lock();
 	ppid = pid_alive(p) ?
@@ -183,6 +184,7 @@  static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	task_lock(p);
 	if (p->files)
 		max_fds = files_fdtable(p->files)->max_fds;
+	hide_pid = p->hide_pid;
 	task_unlock(p);
 	rcu_read_unlock();
 
@@ -201,6 +203,7 @@  static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid));
 	seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
 	seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
+	seq_put_decimal_ull(m, "\nHidePid:\t", hide_pid);
 	seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
 
 	seq_puts(m, "\nGroups:\t");
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cd8dd15..596b17f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -712,7 +712,9 @@  static bool has_pid_permissions(struct pid_namespace *pid,
 				 struct task_struct *task,
 				 int hide_pid_min)
 {
-	if (pid->hide_pid < hide_pid_min)
+	int hide_pid = max(pid->hide_pid, (int) current->hide_pid);
+
+	if (hide_pid < hide_pid_min)
 		return true;
 	if (in_group_p(pid->pid_gid))
 		return true;
@@ -733,7 +735,9 @@  static int proc_pid_permission(struct inode *inode, int mask)
 	put_task_struct(task);
 
 	if (!has_perms) {
-		if (pid->hide_pid == HIDEPID_INVISIBLE) {
+		int hide_pid = max(pid->hide_pid, (int) current->hide_pid);
+
+		if (hide_pid == HIDEPID_INVISIBLE) {
 			/*
 			 * Let's make getdents(), stat(), and open()
 			 * consistent with each other.  If a process
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 325f649..c87de0e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -250,6 +250,7 @@  extern struct task_group root_task_group;
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
 	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.hide_pid	= 0,						\
 	.pids = {							\
 		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
 		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9e..ba9f1d5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1608,6 +1608,7 @@  struct task_struct {
 	/* unserialized, strictly 'current' */
 	unsigned in_execve:1; /* bit to tell LSMs we're in execve */
 	unsigned in_iowait:1;
+	unsigned hide_pid:2; /* per-process procfs hidepid= */
 #if !defined(TIF_RESTORE_SIGMASK)
 	unsigned restore_sigmask:1;
 #endif
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a8d0759..ada62b6 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -197,4 +197,8 @@  struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER		3
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
+/* Per process, non-revokable procfs hidepid= option */
+#define PR_SET_HIDEPID 48
+#define PR_GET_HIDEPID 49
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..a701a77 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1574,6 +1574,7 @@  static __latent_entropy struct task_struct *copy_process(
 #endif
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
+	p->hide_pid = current->hide_pid;
 
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 842914e..4041ff4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2261,6 +2261,19 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SET_HIDEPID:
+		if (arg2 < HIDEPID_OFF || arg2 > HIDEPID_INVISIBLE ||
+		    arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (arg2 < me->hide_pid)
+			return -EPERM;
+		me->hide_pid = arg2;
+		break;
+	case PR_GET_HIDEPID:
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = me->hide_pid;
+		break;
 	default:
 		error = -EINVAL;
 		break;