Message ID | 1484572984-13388-3-git-send-email-djalal@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Cc linux-api On Mon, Jan 16, 2017 at 2:23 PM, Djalal Harouni <tixxdz@gmail.com> wrote: > From: Djalal Harouni <tixxdz@gmail.com> > > This adds a new per-task hidepid= flag that is honored by procfs when > presenting /proc to the user, in addition to the existing hidepid= mount > option. So far, hidepid= was exclusively a per-pidns setting. Locking > down a set of processes so that they cannot see other user's processes > without affecting the rest of the system thus currently requires > creation of a private PID namespace, with all the complexity it brings, > including maintaining a stub init process as PID 1 and losing the > ability to see processes of the same user on the rest of the system. > > With this patch all acesss and visibility checks in procfs now > honour two fields: > > a) the existing hide_pid field in the PID namespace > b) the new hide_pid in struct task_struct > > Access/visibility is only granted if both fields permit it; the more > restrictive one wins. By default the new task_struct hide_pid value > defaults to 0, which means behaviour is not changed from the status quo. > > Setting the per-process hide_pid value is done via a new PR_SET_HIDEPID > prctl() option which takes the same three supported values as the > hidepid= mount option. The per-process hide_pid may only be increased, > never decreased, thus ensuring that once applied, processes can never > escape such a hide_pid jail. When a process forks it inherits its > parent's hide_pid value. > > Suggested usecase: let's say nginx runs as user "www-data". After > dropping privileges it may now call: > > … > prctl(PR_SET_HIDEPID, 2); > … > > And from that point on neither nginx itself, nor any of its child > processes may see processes in /proc anymore that belong to a different > user than "www-data". Other services running on the same system remain > unaffected. > > This should permit Linux distributions to more comprehensively lock down > their services, as it allows an isolated opt-in for hidepid= for > specific services. Previously hidepid= could only be set system-wide, > and then specific services had to be excluded by group membership, > essentially a more complex concept of opt-out. > > A tool to test this is available here: > https://gist.github.com/tixxdz/4e6d21071463ad2c5a043984e3efb5a1 > > Original-author: Lafcadio Wluiki <wluikil@gmail.com> > Signed-off-by: Djalal Harouni <tixxdz@gmail.com> > --- > Documentation/filesystems/proc.txt | 2 ++ > fs/proc/array.c | 3 +++ > fs/proc/base.c | 8 ++++++-- > include/linux/init_task.h | 1 + > include/linux/sched.h | 1 + > include/uapi/linux/prctl.h | 4 ++++ > kernel/fork.c | 1 + > kernel/sys.c | 13 +++++++++++++ > 8 files changed, 31 insertions(+), 2 deletions(-) > > diff --git a/Documentation/filesystems/proc.txt > b/Documentation/filesystems/proc.txt > index 72624a1..fc95261 100644 > --- a/Documentation/filesystems/proc.txt > +++ b/Documentation/filesystems/proc.txt > @@ -164,6 +164,7 @@ read the file /proc/PID/status: > Uid: 501 501 501 501 > Gid: 100 100 100 100 > FDSize: 256 > + HidePid: 0 > Groups: 100 14 16 > VmPeak: 5004 kB > VmSize: 5004 kB > @@ -228,6 +229,7 @@ Table 1-2: Contents of the status files (as of 4.1) > Gid Real, effective, saved set, and file system > GIDs > Umask file mode creation mask > FDSize number of file descriptor slots currently > allocated > + HidePid process access mode of /proc/<pid>/ > Groups supplementary group list > NStgid descendant namespace thread group ID > hierarchy > NSpid descendant namespace process ID hierarchy > diff --git a/fs/proc/array.c b/fs/proc/array.c > index 51a4213..e6cd1a1 100644 > --- a/fs/proc/array.c > +++ b/fs/proc/array.c > @@ -163,6 +163,7 @@ static inline void task_state(struct seq_file *m, > struct pid_namespace *ns, > const struct cred *cred; > pid_t ppid, tpid = 0, tgid, ngid; > unsigned int max_fds = 0; > + int hide_pid; > > rcu_read_lock(); > ppid = pid_alive(p) ? > @@ -183,6 +184,7 @@ static inline void task_state(struct seq_file *m, > struct pid_namespace *ns, > task_lock(p); > if (p->files) > max_fds = files_fdtable(p->files)->max_fds; > + hide_pid = p->hide_pid; > task_unlock(p); > rcu_read_unlock(); > > @@ -201,6 +203,7 @@ static inline void task_state(struct seq_file *m, > struct pid_namespace *ns, > seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, > cred->egid)); > seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, > cred->sgid)); > seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, > cred->fsgid)); > + seq_put_decimal_ull(m, "\nHidePid:\t", hide_pid); > seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); > > seq_puts(m, "\nGroups:\t"); > diff --git a/fs/proc/base.c b/fs/proc/base.c > index cd8dd15..596b17f 100644 > --- a/fs/proc/base.c > +++ b/fs/proc/base.c > @@ -712,7 +712,9 @@ static bool has_pid_permissions(struct pid_namespace > *pid, > struct task_struct *task, > int hide_pid_min) > { > - if (pid->hide_pid < hide_pid_min) > + int hide_pid = max(pid->hide_pid, (int) current->hide_pid); > + > + if (hide_pid < hide_pid_min) > return true; > if (in_group_p(pid->pid_gid)) > return true; > @@ -733,7 +735,9 @@ static int proc_pid_permission(struct inode *inode, > int mask) > put_task_struct(task); > > if (!has_perms) { > - if (pid->hide_pid == HIDEPID_INVISIBLE) { > + int hide_pid = max(pid->hide_pid, (int) current->hide_pid); > + > + if (hide_pid == HIDEPID_INVISIBLE) { > /* > * Let's make getdents(), stat(), and open() > * consistent with each other. If a process > diff --git a/include/linux/init_task.h b/include/linux/init_task.h > index 325f649..c87de0e 100644 > --- a/include/linux/init_task.h > +++ b/include/linux/init_task.h > @@ -250,6 +250,7 @@ extern struct task_group root_task_group; > .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ > .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ > .timer_slack_ns = 50000, /* 50 usec default slack */ \ > + .hide_pid = 0, \ > .pids = { \ > [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ > [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ > diff --git a/include/linux/sched.h b/include/linux/sched.h > index ad3ec9e..ba9f1d5 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1608,6 +1608,7 @@ struct task_struct { > /* unserialized, strictly 'current' */ > unsigned in_execve:1; /* bit to tell LSMs we're in execve */ > unsigned in_iowait:1; > + unsigned hide_pid:2; /* per-process procfs hidepid= */ > #if !defined(TIF_RESTORE_SIGMASK) > unsigned restore_sigmask:1; > #endif > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h > index a8d0759..ada62b6 100644 > --- a/include/uapi/linux/prctl.h > +++ b/include/uapi/linux/prctl.h > @@ -197,4 +197,8 @@ struct prctl_mm_map { > # define PR_CAP_AMBIENT_LOWER 3 > # define PR_CAP_AMBIENT_CLEAR_ALL 4 > > +/* Per process, non-revokable procfs hidepid= option */ > +#define PR_SET_HIDEPID 48 > +#define PR_GET_HIDEPID 49 > + > #endif /* _LINUX_PRCTL_H */ > diff --git a/kernel/fork.c b/kernel/fork.c > index 11c5c8a..a701a77 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1574,6 +1574,7 @@ static __latent_entropy struct task_struct > *copy_process( > #endif > > p->default_timer_slack_ns = current->timer_slack_ns; > + p->hide_pid = current->hide_pid; > > task_io_accounting_init(&p->ioac); > acct_clear_integrals(p); > diff --git a/kernel/sys.c b/kernel/sys.c > index 842914e..4041ff4 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -2261,6 +2261,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, > arg2, unsigned long, arg3, > case PR_GET_FP_MODE: > error = GET_FP_MODE(me); > break; > + case PR_SET_HIDEPID: > + if (arg2 < HIDEPID_OFF || arg2 > HIDEPID_INVISIBLE || > + arg3 || arg4 || arg5) > + return -EINVAL; > + if (arg2 < me->hide_pid) > + return -EPERM; > + me->hide_pid = arg2; > + break; > + case PR_GET_HIDEPID: > + if (arg2 || arg3 || arg4 || arg5) > + return -EINVAL; > + error = me->hide_pid; > + break; > default: > error = -EINVAL; > break; > -- > 2.5.5 > >
> This should permit Linux distributions to more comprehensively lock > down > their services, as it allows an isolated opt-in for hidepid= for > specific services. Previously hidepid= could only be set system-wide, > and then specific services had to be excluded by group membership, > essentially a more complex concept of opt-out. I think it's a lot easier for them to introduce a proc group and then figure out the very few exceptions that are needed vs. requiring a huge number of opt-ins. I don't think the issue is difficulty in deploying it, it's lack of interest. Android deployed it in 7.x without any major issues. A good way to get people to use it would be adding proc groups to major distributions and getting systemd to expose a simple toggle for this, instead of requiring users to add /proc to fstab (not there by default with systemd) and hard-wired the correct proc gid for that distribution. Can then file bugs for packages needing the proc group. For systemd itself, logind needs it since it drops the capability that allows bypassing it. Other than that, it's mostly just polkit.
Turning on the hidepid group thing for a tightly managed OS like Android is much easier than doing that for an established distro, without breaking compatibility. Moreover using groups for this kind of sandboxing isn't that great anyway as group memberships are "sticky": whenever some code had it, it can always get it back, by creating a setgid binary for it. The prctl() feature has the benefit that hidepid exceptions cannot be made sticky that way... I think it's pretty unrealistic for big distros such as Debian/Ubuntu to move to the hidepid group feature. (I mean, if this was easy, then somebody would already have done that, hidepid is pretty old after all). OTOH some other opt-in security features (such as systemd's PrivateTmp= on Fedora) are pretty well used these days, and I don't think that the prctl would be any different there. The prctl() is a very easy-to-grasp boolean-like concept after all. A big benefit of the prctl is that it makes it easy to lock things down per-process, i.e. to be more fine-grained than just per-user. For example, my own user on my own system should probably be able to see the process tree, to make things easy to administer for me. However, my web browser running under the same user ID probably should not be able to see the entire process tree, and with the prctl that's very easy to set up, as this kind of privilege control does not require privileges with the prctl patch. And also, let's not forget: what is opt-in on the kernel level could actually be made opt-out on the service manager level anyway... Finally, note that the prctl doesn't conflict with the group thing. It just makes it easier to adopt on a per-service and even per-process basis, that's all. Djalal, thank you very much for picking this up! On Mon, Jan 16, 2017 at 7:24 PM, Daniel Micay <danielmicay@gmail.com> wrote: >> This should permit Linux distributions to more comprehensively lock >> down >> their services, as it allows an isolated opt-in for hidepid= for >> specific services. Previously hidepid= could only be set system-wide, >> and then specific services had to be excluded by group membership, >> essentially a more complex concept of opt-out. > > I think it's a lot easier for them to introduce a proc group and then > figure out the very few exceptions that are needed vs. requiring a huge > number of opt-ins. I don't think the issue is difficulty in deploying > it, it's lack of interest. Android deployed it in 7.x without any major > issues. A good way to get people to use it would be adding proc groups > to major distributions and getting systemd to expose a simple toggle for > this, instead of requiring users to add /proc to fstab (not there by > default with systemd) and hard-wired the correct proc gid for that > distribution. Can then file bugs for packages needing the proc group. > For systemd itself, logind needs it since it drops the capability that > allows bypassing it. Other than that, it's mostly just polkit.
On Tue, Jan 17, 2017 at 9:33 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Mon, Jan 16, 2017 at 9:15 AM, Djalal Harouni <tixxdz@gmail.com> wrote: >> Cc linux-api >> >> On Mon, Jan 16, 2017 at 2:23 PM, Djalal Harouni <tixxdz@gmail.com> wrote: >>> >>> From: Djalal Harouni <tixxdz@gmail.com> >>> >>> This adds a new per-task hidepid= flag that is honored by procfs when >>> presenting /proc to the user, in addition to the existing hidepid= mount >>> option. So far, hidepid= was exclusively a per-pidns setting. Locking >>> down a set of processes so that they cannot see other user's processes >>> without affecting the rest of the system thus currently requires >>> creation of a private PID namespace, with all the complexity it brings, >>> including maintaining a stub init process as PID 1 and losing the >>> ability to see processes of the same user on the rest of the system. >>> >>> With this patch all acesss and visibility checks in procfs now >>> honour two fields: >>> >>> a) the existing hide_pid field in the PID namespace >>> b) the new hide_pid in struct task_struct >>> >>> Access/visibility is only granted if both fields permit it; the more >>> restrictive one wins. By default the new task_struct hide_pid value >>> defaults to 0, which means behaviour is not changed from the status quo. >>> >>> Setting the per-process hide_pid value is done via a new PR_SET_HIDEPID >>> prctl() option which takes the same three supported values as the >>> hidepid= mount option. The per-process hide_pid may only be increased, >>> never decreased, thus ensuring that once applied, processes can never >>> escape such a hide_pid jail. When a process forks it inherits its >>> parent's hide_pid value. >>> >>> Suggested usecase: let's say nginx runs as user "www-data". After >>> dropping privileges it may now call: >>> >>> … >>> prctl(PR_SET_HIDEPID, 2); >>> … >>> >>> And from that point on neither nginx itself, nor any of its child >>> processes may see processes in /proc anymore that belong to a different >>> user than "www-data". Other services running on the same system remain >>> unaffected. > > What affect, if any, does this have on ptrace() permissions? This should not affect ptrace() permissions or other system calls that work directly on pids, the test in procfs is related to inodes before the ptrace check, hmm what do you have in mind ? > Also, this one-way thing seems wrong to me. I think it should roughly > follow the no_new_privs rules instead. IOW, if you unshare your > pidns, it gets cleared. Also, maybe you shouldn't be able to set it Andy I don't follow here, no_new_privs is never cleared right ? I can't see the corresponding clear bit code for it. For this one I want it to act like no_new_privs. Also pidns can be created with userns which means it can be revoked. For my use case I want it to be part of *one* single operation where it is set with the other sandbox operations that are all preserved... instead of setting it *again* each time where it can already be late. > without either having CAP_SYS_ADMIN over your userns or having > no_new_privs set. For this one I can add it sure. Historically that logic was added to make seccomp more usable, for this patch the values can't be relaxed, they are always increased never decreased. However one minor advantage if you require no_new_privs is that this option hidepid will also assert that you can't setuid to access some procfs inodes... though you can also just set 'no_new_privs + hidepid' both of them in any order. Also it allows unprivileged without userns to setup a minimal jail while performing some operations that can be blocked by no_new_privs. Andy, Kees any other comments please on it ? I'm not sure if overusing no_new_privs in this case is a good idea. Seems to me that seccomp + no_new_privs is different than this hidepid feature that overlaps nicely with no_new_privs. If there are no responses for this question, then I will just add the "CAP_SYS_ADMIN || no_new_privs" test in the next iteration. > --Andy Thanks!
On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote: > On Tue, Jan 17, 2017 at 9:33 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> On Mon, Jan 16, 2017 at 9:15 AM, Djalal Harouni <tixxdz@gmail.com> wrote: >>> Cc linux-api >>> >>> On Mon, Jan 16, 2017 at 2:23 PM, Djalal Harouni <tixxdz@gmail.com> wrote: >>>> >>>> From: Djalal Harouni <tixxdz@gmail.com> >>>> >>>> This adds a new per-task hidepid= flag that is honored by procfs when >>>> presenting /proc to the user, in addition to the existing hidepid= mount >>>> option. So far, hidepid= was exclusively a per-pidns setting. Locking >>>> down a set of processes so that they cannot see other user's processes >>>> without affecting the rest of the system thus currently requires >>>> creation of a private PID namespace, with all the complexity it brings, >>>> including maintaining a stub init process as PID 1 and losing the >>>> ability to see processes of the same user on the rest of the system. >>>> >>>> With this patch all acesss and visibility checks in procfs now >>>> honour two fields: >>>> >>>> a) the existing hide_pid field in the PID namespace >>>> b) the new hide_pid in struct task_struct >>>> >>>> Access/visibility is only granted if both fields permit it; the more >>>> restrictive one wins. By default the new task_struct hide_pid value >>>> defaults to 0, which means behaviour is not changed from the status quo. >>>> >>>> Setting the per-process hide_pid value is done via a new PR_SET_HIDEPID >>>> prctl() option which takes the same three supported values as the >>>> hidepid= mount option. The per-process hide_pid may only be increased, >>>> never decreased, thus ensuring that once applied, processes can never >>>> escape such a hide_pid jail. When a process forks it inherits its >>>> parent's hide_pid value. >>>> >>>> Suggested usecase: let's say nginx runs as user "www-data". After >>>> dropping privileges it may now call: >>>> >>>> … >>>> prctl(PR_SET_HIDEPID, 2); >>>> … >>>> >>>> And from that point on neither nginx itself, nor any of its child >>>> processes may see processes in /proc anymore that belong to a different >>>> user than "www-data". Other services running on the same system remain >>>> unaffected. >> >> What affect, if any, does this have on ptrace() permissions? > > This should not affect ptrace() permissions or other system calls that > work directly on pids, the test in procfs is related to inodes before > the ptrace check, hmm what do you have in mind ? > I'm wondering what problem you're trying to solve, then. hidepid helps lock down procfs, but ISTM you might still want to lock down other PID-based APIs. > >> Also, this one-way thing seems wrong to me. I think it should roughly >> follow the no_new_privs rules instead. IOW, if you unshare your >> pidns, it gets cleared. Also, maybe you shouldn't be able to set it > > Andy I don't follow here, no_new_privs is never cleared right ? I > can't see the corresponding clear bit code for it. I believe that unsharing userns clears no_new_privs. > > For this one I want it to act like no_new_privs. Also pidns can be > created with userns which means it can be revoked. For my use case I > want it to be part of *one* single operation where it is set with the > other sandbox operations that are all preserved... instead of setting > it *again* each time where it can already be late. > I don't see the problem as long as this gets implemented carefully enough. If you unshare your userns and your pidns, then you should be able to see all tasks in the new pidns, even if you mount a fresh procfs pointing at that pidns -- after all, you are privileged in that namespace. > >> without either having CAP_SYS_ADMIN over your userns or having >> no_new_privs set. > > For this one I can add it sure. Historically that logic was added to > make seccomp more usable, for this patch the values can't be relaxed, > they are always increased never decreased. However one minor advantage > if you require no_new_privs is that this option hidepid will also > assert that you can't setuid to access some procfs inodes... though > you can also just set 'no_new_privs + hidepid' both of them in any > order. Also it allows unprivileged without userns to setup a minimal > jail while performing some operations that can be blocked by > no_new_privs. > > Andy, Kees any other comments please on it ? I'm not sure if overusing > no_new_privs in this case is a good idea. Seems to me that seccomp + > no_new_privs is different than this hidepid feature that overlaps > nicely with no_new_privs. > > If there are no responses for this question, then I will just add the > "CAP_SYS_ADMIN || no_new_privs" test in the next iteration. I feel like this feature (per-task hidepid) is subtle and complex enough that it should have a very clear purpose and use case before it's merged and that we should make sure that there isn't a better way to accomplish what you're trying to do.
On Thu, Jan 19, 2017 at 12:35 AM, Andy Lutomirski <luto@amacapital.net> wrote: > On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote: [...] >>>>> >>>>> … >>>>> prctl(PR_SET_HIDEPID, 2); >>>>> … >>>>> >>>>> And from that point on neither nginx itself, nor any of its child >>>>> processes may see processes in /proc anymore that belong to a different >>>>> user than "www-data". Other services running on the same system remain >>>>> unaffected. >>> >>> What affect, if any, does this have on ptrace() permissions? >> >> This should not affect ptrace() permissions or other system calls that >> work directly on pids, the test in procfs is related to inodes before >> the ptrace check, hmm what do you have in mind ? >> > > I'm wondering what problem you're trying to solve, then. hidepid > helps lock down procfs, but ISTM you might still want to lock down > other PID-based APIs. Yes but they are already locked based on uid checks. procfs was not and this patch is specifically to align it, and to reduce the ability to peek data from other processes. >> >>> Also, this one-way thing seems wrong to me. I think it should roughly >>> follow the no_new_privs rules instead. IOW, if you unshare your >>> pidns, it gets cleared. Also, maybe you shouldn't be able to set it >> >> Andy I don't follow here, no_new_privs is never cleared right ? I >> can't see the corresponding clear bit code for it. > > I believe that unsharing userns clears no_new_privs. No, it is not cleared, and I can't see the clear bit for it. Maybe due to userns+filesystems limitations it was not noticed. >> >> For this one I want it to act like no_new_privs. Also pidns can be >> created with userns which means it can be revoked. For my use case I >> want it to be part of *one* single operation where it is set with the >> other sandbox operations that are all preserved... instead of setting >> it *again* each time where it can already be late. >> > > I don't see the problem as long as this gets implemented carefully > enough. If you unshare your userns and your pidns, then you should be > able to see all tasks in the new pidns, even if you mount a fresh > procfs pointing at that pidns -- after all, you are privileged in that > namespace. That's already the case, if you are privileged you can see all tasks, the code is written that the per-task hidepid does not overwrite capabilities. >> >>> without either having CAP_SYS_ADMIN over your userns or having >>> no_new_privs set. >> >> For this one I can add it sure. Historically that logic was added to >> make seccomp more usable, for this patch the values can't be relaxed, >> they are always increased never decreased. However one minor advantage >> if you require no_new_privs is that this option hidepid will also >> assert that you can't setuid to access some procfs inodes... though >> you can also just set 'no_new_privs + hidepid' both of them in any >> order. Also it allows unprivileged without userns to setup a minimal >> jail while performing some operations that can be blocked by >> no_new_privs. >> >> Andy, Kees any other comments please on it ? I'm not sure if overusing >> no_new_privs in this case is a good idea. Seems to me that seccomp + >> no_new_privs is different than this hidepid feature that overlaps >> nicely with no_new_privs. >> >> If there are no responses for this question, then I will just add the >> "CAP_SYS_ADMIN || no_new_privs" test in the next iteration. > > I feel like this feature (per-task hidepid) is subtle and complex > enough that it should have a very clear purpose and use case before > it's merged and that we should make sure that there isn't a better way > to accomplish what you're trying to do. Sure, the hidepid mount option is old enough, and this per-task hidepid is clearly defined only for procfs and per task, we can't add another switch that's relate to both a filesystem and pid namespaces, it will be a bit complicated and not really useful for cases that are in *same* pidns where *each* one have to mount its procfs, it will propagate. Also as noted by Lafcadio, the gid thing is a bit hard to use now. Thanks!
On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote: > On Thu, Jan 19, 2017 at 12:35 AM, Andy Lutomirski <luto@amacapital.net> wrote: >> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote: >>>> Also, this one-way thing seems wrong to me. I think it should roughly >>>> follow the no_new_privs rules instead. IOW, if you unshare your >>>> pidns, it gets cleared. Also, maybe you shouldn't be able to set it >>> >>> Andy I don't follow here, no_new_privs is never cleared right ? I >>> can't see the corresponding clear bit code for it. >> >> I believe that unsharing userns clears no_new_privs. > No, it is not cleared, and I can't see the clear bit for it. Maybe due > to userns+filesystems limitations it was not noticed. Hmm, maybe I remembered wrong. >> I feel like this feature (per-task hidepid) is subtle and complex >> enough that it should have a very clear purpose and use case before >> it's merged and that we should make sure that there isn't a better way >> to accomplish what you're trying to do. > > Sure, the hidepid mount option is old enough, and this per-task > hidepid is clearly defined only for procfs and per task, we can't add > another switch that's relate to both a filesystem and pid namespaces, > it will be a bit complicated and not really useful for cases that are > in *same* pidns where *each* one have to mount its procfs, it will > propagate. Also as noted by Lafcadio, the gid thing is a bit hard to > use now. What I'm trying to say is that I want to understand a complete, real-world use case. Adding a security-related per-task flag is can be quite messy and requires a lot of careful thought to get right, and I'd rather avoid it if at all possible. I'm imaging something like a new RestrictPidVisisbility= option in systemd. I agree that this is currently a mess to do. But maybe a simpler solution would be to add a new mount option local_hidepid to procfs. If you set that option, then it overrides hidepid for that instance. Most of these semi-sandboxed daemon processes already have their own mount namespace, so the overhead should be minimal. --Andy
On Thu, Jan 19, 2017 at 12:35 AM, Andy Lutomirski <luto@amacapital.net> wrote: >>>>> And from that point on neither nginx itself, nor any of its child >>>>> processes may see processes in /proc anymore that belong to a different >>>>> user than "www-data". Other services running on the same system remain >>>>> unaffected. >>> >>> What affect, if any, does this have on ptrace() permissions? >> >> This should not affect ptrace() permissions or other system calls that >> work directly on pids, the test in procfs is related to inodes before >> the ptrace check, hmm what do you have in mind ? > > I'm wondering what problem you're trying to solve, then. hidepid > helps lock down procfs, but ISTM you might still want to lock down > other PID-based APIs. The (pseudo-) mount option hidepid= is about reducing information leakage, that's all. It's not about enforcing access to PIDs if you happen to know them. This patch set simply make this global option usable locally, that's all. It does not change semantics, it does not alter permission checks, it doesn't turn the thing into something different than it is right now. It just permits to turn on its effect in a more local way, without having to change the whole system. >>> Also, this one-way thing seems wrong to me. I think it should roughly >>> follow the no_new_privs rules instead. IOW, if you unshare your >>> pidns, it gets cleared. Also, maybe you shouldn't be able to set it >> >> Andy I don't follow here, no_new_privs is never cleared right ? I >> can't see the corresponding clear bit code for it. > > I believe that unsharing userns clears no_new_privs. Reverting the a zero hidepid value if you create your own pid/user namespace sounds OK to me. After all, if you create your own kingdom anyway, it should be up to you what you want to be able to see inside of it... > I feel like this feature (per-task hidepid) is subtle and complex > enough that it should have a very clear purpose and use case before > it's merged and that we should make sure that there isn't a better way > to accomplish what you're trying to do. The purpose for me is really reducing information leakage, the same thing hidepid= always has done, and not more. To quote the proc(5) man page about this: "… This doesn't hide the fact that a process with a specific PID value exists (it can be learned by other means, for example, by "kill -0 $PID"), but it hides a process's UID and GID, which could otherwise be learned by employing stat(2) on a /proc/[pid] directory. This greatly complicates an attacker's task of gathering information about running processes (e.g., discovering whether some daemon is running with elevated privileges, whether another user is running some sensitive program, whether other users are running any program at all, and so on). …" And, yeah, I think this is useful for services, but to make it usable in general-purpose distros an individual per-service and per-process opt-in would be much better than a global opt-in. Yes, I think there's value in reducing information leakage. And yes, what was true when the proc(5) man page was written, is still relevant today, I think. L.
On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> Sure, the hidepid mount option is old enough, and this per-task >> hidepid is clearly defined only for procfs and per task, we can't add >> another switch that's relate to both a filesystem and pid namespaces, >> it will be a bit complicated and not really useful for cases that are >> in *same* pidns where *each* one have to mount its procfs, it will >> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to >> use now. > > What I'm trying to say is that I want to understand a complete, > real-world use case. Adding a security-related per-task flag is can > be quite messy and requires a lot of careful thought to get right, and > I'd rather avoid it if at all possible. > > I'm imaging something like a new RestrictPidVisisbility= option in > systemd. I agree that this is currently a mess to do. But maybe a It's not just a "mess" to do, it's not possible afaics. The hidepid thing is after all not really a mount option, but an option of the pid namespace. Hence, if you want to restrict the visibility for one service only, then you have to get your own PID namespace, but that does a lot more than just hide visibility: it renumbers everything after all... > simpler solution would be to add a new mount option local_hidepid to > procfs. If you set that option, then it overrides hidepid for that > instance. Most of these semi-sandboxed daemon processes already have > their own mount namespace, so the overhead should be minimal. When I worked on the patches originally, I actually wanted to implement this as true per-superblock procfs mount option. But this is really hard to do, as the private superblock pointer of the procfs instance currently points to the pid namespace, and breaking that up, so that you can have multiple procfs superblocks per pid namespace is a ton of work, and I doubt anyone would really like the complexity this brings, just for adding a single 2bit option... The per-process option is much simpler code-wise. It also has semantical benefits: if the thing isn't a mount option it is accessible with absolutely minimal privileges, as it does not imply namespaces and mounting. This means, my Firefox can run with hidepid turned on without my KDE Konsole instance also having to turn it on. Or to say this differently: your suggested RestrictPidVisibility= works nicely both for "systemd" as PID 1 and for "systemd --user" as user "lafcadio", when it is per-process, but is much more complex to implement if it was a true mount option. L.
On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote: [...] >> Sure, the hidepid mount option is old enough, and this per-task >> hidepid is clearly defined only for procfs and per task, we can't add >> another switch that's relate to both a filesystem and pid namespaces, >> it will be a bit complicated and not really useful for cases that are >> in *same* pidns where *each* one have to mount its procfs, it will >> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to >> use now. > > What I'm trying to say is that I want to understand a complete, > real-world use case. Adding a security-related per-task flag is can > be quite messy and requires a lot of careful thought to get right, and > I'd rather avoid it if at all possible. I do agree, but that's not what we are proposing here. This use case is limited we do not manipulate the creds of the task, there are no security transitions. The task does not change, its only related to procfs and pid entries there. Also the flag applies only to current task and not on remote ones... Nothing new here it's an extension of procfs hidepid. > I'm imaging something like a new RestrictPidVisisbility= option in > systemd. I agree that this is currently a mess to do. But maybe a Yes that's one use case, If we manage to land this I'll follow up with it... plus there is, I've a use case related to kubernetes where I do want to reduce the number of processes inside containers per pod to minimal. Some other cases are: lock down children where being unprivileged. Also as noted in other replies on today's desktop systems, under a normal user session, the user should see all processes of the system where the media player, browser etc have no business to see the process tree. This can be easily implemented when launching apps without the need to regain privileges... > simpler solution would be to add a new mount option local_hidepid to > procfs. If you set that option, then it overrides hidepid for that > instance. Most of these semi-sandboxed daemon processes already have > their own mount namespace, so the overhead should be minimal. Andy If that could work :-/ we have to re-write or adapt lot of things inside procfs... plus: Procfs is a miror to the current pid namespace. Mount options are not procfs but rather pid namespace. That would not work. Also having multiple mount namespaces where each one with its setup and having to migrate tasks between these environements: I would rather have the security information or context or any minor flag attached to the task itself rather than the object. The prctl() interface is really simple for userspace. The kernel change is not intrusive, and current approach does not require any privileges. For others you may have to gain privileges or ask some one to set it up. The current tendency is to allow more unprivileged code/containers... to setup such mini jails. Note to mention that this schema is simple and can be isolated from other complexities, set it once and that's it. > --Andy
On Fri, Jan 20, 2017 at 8:33 AM, Djalal Harouni <tixxdz@gmail.com> wrote: > On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote: > [...] >>> Sure, the hidepid mount option is old enough, and this per-task >>> hidepid is clearly defined only for procfs and per task, we can't add >>> another switch that's relate to both a filesystem and pid namespaces, >>> it will be a bit complicated and not really useful for cases that are >>> in *same* pidns where *each* one have to mount its procfs, it will >>> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to >>> use now. >> >> What I'm trying to say is that I want to understand a complete, >> real-world use case. Adding a security-related per-task flag is can >> be quite messy and requires a lot of careful thought to get right, and >> I'd rather avoid it if at all possible. > > I do agree, but that's not what we are proposing here. This use case > is limited we do not manipulate the creds of the task, there are no > security transitions. The task does not change, its only related to > procfs and pid entries there. Also the flag applies only to current > task and not on remote ones... Nothing new here it's an extension of > procfs hidepid. > >> I'm imaging something like a new RestrictPidVisisbility= option in >> systemd. I agree that this is currently a mess to do. But maybe a > > Yes that's one use case, If we manage to land this I'll follow up with > it... plus there is, I've a use case related to kubernetes where I do > want to reduce the number of processes inside containers per pod to > minimal. Some other cases are: lock down children where being > unprivileged. Also as noted in other replies on today's desktop > systems, under a normal user session, the user should see all > processes of the system where the media player, browser etc have no > business to see the process tree. This can be easily implemented when > launching apps without the need to regain privileges... > >> simpler solution would be to add a new mount option local_hidepid to >> procfs. If you set that option, then it overrides hidepid for that >> instance. Most of these semi-sandboxed daemon processes already have >> their own mount namespace, so the overhead should be minimal. > > Andy If that could work :-/ we have to re-write or adapt lot of > things inside procfs... plus: > Procfs is a miror to the current pid namespace. Mount options are not > procfs but rather pid namespace. That would not work. I agree that the kernel change to do it per task is very simple. But this is an unfortunate slippery slope. What if you want to block off everything in /proc that isn't associated with a PID? What if you want to suppress /sys access? What if you want ot block *all* non-current PIDs from being revealed in /proc? What if you want to hide /proc/PID/cmdline? I think that the right solution here is to fix procfs to understand per-superblock mount options. --Andy
On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote: > On Fri, Jan 20, 2017 at 8:33 AM, Djalal Harouni <tixxdz@gmail.com> wrote: >> On Thu, Jan 19, 2017 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: >>> On Thu, Jan 19, 2017 at 5:53 AM, Djalal Harouni <tixxdz@gmail.com> wrote: >> [...] >>>> Sure, the hidepid mount option is old enough, and this per-task >>>> hidepid is clearly defined only for procfs and per task, we can't add >>>> another switch that's relate to both a filesystem and pid namespaces, >>>> it will be a bit complicated and not really useful for cases that are >>>> in *same* pidns where *each* one have to mount its procfs, it will >>>> propagate. Also as noted by Lafcadio, the gid thing is a bit hard to >>>> use now. >>> >>> What I'm trying to say is that I want to understand a complete, >>> real-world use case. Adding a security-related per-task flag is can >>> be quite messy and requires a lot of careful thought to get right, and >>> I'd rather avoid it if at all possible. >> >> I do agree, but that's not what we are proposing here. This use case >> is limited we do not manipulate the creds of the task, there are no >> security transitions. The task does not change, its only related to >> procfs and pid entries there. Also the flag applies only to current >> task and not on remote ones... Nothing new here it's an extension of >> procfs hidepid. >> >>> I'm imaging something like a new RestrictPidVisisbility= option in >>> systemd. I agree that this is currently a mess to do. But maybe a >> >> Yes that's one use case, If we manage to land this I'll follow up with >> it... plus there is, I've a use case related to kubernetes where I do >> want to reduce the number of processes inside containers per pod to >> minimal. Some other cases are: lock down children where being >> unprivileged. Also as noted in other replies on today's desktop >> systems, under a normal user session, the user should see all >> processes of the system where the media player, browser etc have no >> business to see the process tree. This can be easily implemented when >> launching apps without the need to regain privileges... >> >>> simpler solution would be to add a new mount option local_hidepid to >>> procfs. If you set that option, then it overrides hidepid for that >>> instance. Most of these semi-sandboxed daemon processes already have >>> their own mount namespace, so the overhead should be minimal. >> >> Andy If that could work :-/ we have to re-write or adapt lot of >> things inside procfs... plus: >> Procfs is a miror to the current pid namespace. Mount options are not >> procfs but rather pid namespace. That would not work. > > I agree that the kernel change to do it per task is very simple. But > this is an unfortunate slippery slope. What if you want to block off > everything in /proc that isn't associated with a PID? What if you > want to suppress /sys access? What if you want ot block *all* > non-current PIDs from being revealed in /proc? What if you want to > hide /proc/PID/cmdline? For /sys we mount an inaccessible directory on top, we even do that for some static /sys and /proc inodes, of course that doesn't scale but we try... please see below. For non-current PIDs from being revealed in /proc, actually the use case did not come, it will be complex to handle TOCTOU, other races etc. We don't want that and we don't have a use case for it. The patch here is a clear parent -> child relation. > I think that the right solution here is to fix procfs to understand > per-superblock mount options. Unfortunately and as also noted by Lafcadio and you this is too complex. Also from what you have said above and from what /proc reports to userspace and today's use cases with containers, namespaces etc. maybe the kernel needs a new way to report *some* kernel objects and other things to userspace which are not based on /proc... or move them out of /proc... in some cases the kernel may need to know if the calling process is in a namespace... but lets please stay focused here, fixing procfs is a bit out of the scope for this *specific* use case and patch, we don't have the resources to explore something new... The aim here is a simple fix of 2bits that preserves the semantics of procfs and hidepid, at same time makes the hidepid option local to current process. It does not require or mess up with privileges, namespaces etc. Easy to review and maintain. Also as said in other emails we have clear use cases: some cloud/container providers tax users for extra processes and resources that they do not really need, we have mini jails for desktop systems too... all this can be improved. Thanks! > --Andy
On Mon, Jan 23, 2017 at 3:46 AM, Djalal Harouni <tixxdz@gmail.com> wrote: > On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote: >> I agree that the kernel change to do it per task is very simple. But >> this is an unfortunate slippery slope. What if you want to block off >> everything in /proc that isn't associated with a PID? What if you >> want to suppress /sys access? What if you want ot block *all* >> non-current PIDs from being revealed in /proc? What if you want to >> hide /proc/PID/cmdline? > > For /sys we mount an inaccessible directory on top, we even do that > for some static /sys and /proc inodes, of course that doesn't scale > but we try... please see below. So you do have a private fs namespace. > >> I think that the right solution here is to fix procfs to understand >> per-superblock mount options. > > Unfortunately and as also noted by Lafcadio and you this is too > complex. > ... but lets please stay focused here, fixing procfs is a bit out of the > scope for this *specific* use case and patch, we don't have the > resources to explore something new... I'm not the final authority on this (that's probably Eric), but NAK. For upstream Linux, you can't say "doing it right is too hard so we're going to introduce a hackish ABI with questionable security properties". The right solution is IMO quite clearly to fix /proc. This isn't even particularly hard -- there are only 17 instances of s_fs_info in fs/proc/. > ... Easy to review and maintain. Au contraire. It's a pain in the arse to review because of the security implications and it's unpleasant to maintain because it's a special case in core kernel code.
On Mon, Jan 23, 2017 at 9:07 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Mon, Jan 23, 2017 at 3:46 AM, Djalal Harouni <tixxdz@gmail.com> wrote: >> On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote: >>> I agree that the kernel change to do it per task is very simple. But >>> this is an unfortunate slippery slope. What if you want to block off >>> everything in /proc that isn't associated with a PID? What if you >>> want to suppress /sys access? What if you want ot block *all* >>> non-current PIDs from being revealed in /proc? What if you want to >>> hide /proc/PID/cmdline? >> >> For /sys we mount an inaccessible directory on top, we even do that >> for some static /sys and /proc inodes, of course that doesn't scale >> but we try... please see below. > > So you do have a private fs namespace. Yes for some cases and they are cheap, however I don't see the relation here ? >> >>> I think that the right solution here is to fix procfs to understand >>> per-superblock mount options. >> >> Unfortunately and as also noted by Lafcadio and you this is too >> complex. >> ... but lets please stay focused here, fixing procfs is a bit out of the >> scope for this *specific* use case and patch, we don't have the >> resources to explore something new... > > I'm not the final authority on this (that's probably Eric), but NAK. (I don't know the final authority. The *only* thing I know is that we have technical problems here that are *not* fixed, and we try to fix them). > For upstream Linux, you can't say "doing it right is too hard so we're > going to introduce a hackish ABI with questionable security > properties". No one said this. Maybe you think that procfs is the right way, but I certainly can't predict how much damage any fundamental change on procfs will make. procfs is a special fs that has its own rules and hacks... everyone would like to avoid major changes on it... Could you please explain in clear words what are the benefits to use mount, retain or regain CAP_SYS_ADMIN for something that we can set without any privileges ? > The right solution is IMO quite clearly to fix /proc. This isn't even > particularly hard -- there are only 17 instances of s_fs_info in > fs/proc/. pid namespaces are tied to procfs to the heart, any change on procfs has also to count on pid namespaces, flush cached entries of a task from each /proc... ? pid namespaces have then to be made smarter. Forward port vulnerability and bug fixes that have been stacked in procfs ? or at least do not break them. "procfs has always been a special fs" by git logs. Also cases of some specific directories /proc/fs/nfs and maybe devices id ? persistent of mount options and other userspace information - https://lkml.org/lkml/2012/3/26/486 These prevent me from asserting that's the best way... where in a previous thread you said that we should go with the easiest way to fix the problem.
On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote: > I agree that the kernel change to do it per task is very simple. But > this is an unfortunate slippery slope. What if you want to block off > everything in /proc that isn't associated with a PID? What if you > want to suppress /sys access? What if you want ot block *all* > non-current PIDs from being revealed in /proc? What if you want to > hide /proc/PID/cmdline? These are all what-ifs that noone was interested in so far. I have a very specific itch I want to scratch, and I found a reasonably generic concept of exposing it. But now you are trying to lead this all down onto a paths asking for features noone is actually really interested in. > I think that the right solution here is to fix procfs to understand > per-superblock mount options. Andy, this is really not helpful. Doing that is far from easy, that's a major undertaking: asking us to rework procfs in such a major way I only can understand as an indirect way for you to say "go away" to us... Expecting newcomers to kernel work to basically clean up an entire kernel subsystem, dealing with all the politics involved is just not going to work. And again: I am very sure the proposed prctl() based solution is actually much much nicer than any procfs superblock based one, since it works naturally, and easily for unprivileged processes too! Anything that requires mount options means privileges are required in some form or another. Yes userns would open that up, but that comes with a huge amount of additional problems. Anyway, I understand it is your intention to derail this. I can accept that. It's a pity though. L.
On Fri, Feb 10, 2017 at 6:40 AM, Lafcadio Wluiki <wluikil@gmail.com> wrote: > On Sat, Jan 21, 2017 at 1:53 AM, Andy Lutomirski <luto@amacapital.net> wrote: > >> I agree that the kernel change to do it per task is very simple. But >> this is an unfortunate slippery slope. What if you want to block off >> everything in /proc that isn't associated with a PID? What if you >> want to suppress /sys access? What if you want ot block *all* >> non-current PIDs from being revealed in /proc? What if you want to >> hide /proc/PID/cmdline? > > These are all what-ifs that noone was interested in so far. I have a > very specific itch I want to scratch, and I found a reasonably generic > concept of exposing it. But now you are trying to lead this all down > onto a paths asking for features noone is actually really interested > in. If you have an itch and want to scratch it by adding a new API, the API has to be right, full stop. > >> I think that the right solution here is to fix procfs to understand >> per-superblock mount options. > > Andy, this is really not helpful. Doing that is far from easy, that's > a major undertaking: asking us to rework procfs in such a major way I > only can understand as an indirect way for you to say "go away" to > us... Expecting newcomers to kernel work to basically clean up an > entire kernel subsystem, dealing with all the politics involved is > just not going to work. Once upon a time, a newbish kernel hacker named Andy wrote some code to change the way the vDSO worked. The maintainers told me that it was a nice thought and that it was too messy and wasn't acceptable in its present form. So the patchset got quite a bit longer, I learned more about the mess that was the vDSO, and I fixed it. The maintainers were right. It's just not the case that a newcomer gets to add fields to task_struct that a more experienced developer wouldn't get to add. Sorry. > > And again: I am very sure the proposed prctl() based solution is > actually much much nicer than any procfs superblock based one, since > it works naturally, and easily for unprivileged processes too! > Anything that requires mount options means privileges are required in > some form or another. Yes userns would open that up, but that comes > with a huge amount of additional problems. If you want to design a generic mechanism by which a task can change the way that pseudo filesystems look for itself and its children without using mount options, and you come up with something that is sane, safe, generic, and reasonable clean, you're welcome to do so. This sounds like a considerably more challenging undertaking than just fixing procfs. > > Anyway, I understand it is your intention to derail this. I can accept > that. It's a pity though. I have no intention of derailing improvements to hidepid. In fact, I'd love to see improvements here. I'm just saying that the current proposal isn't the way to do it.
On Wed, Jan 18, 2017 at 3:35 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote: >> Andy I don't follow here, no_new_privs is never cleared right ? I >> can't see the corresponding clear bit code for it. > > I believe that unsharing userns clears no_new_privs. Seriously? That's kind of ... weird. I mean, I guess you're priv-confined in a way, but that seems fragile. -Kees
On Fri, Feb 10, 2017 at 3:44 PM, Kees Cook <keescook@chromium.org> wrote: > On Wed, Jan 18, 2017 at 3:35 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote: >>> Andy I don't follow here, no_new_privs is never cleared right ? I >>> can't see the corresponding clear bit code for it. >> >> I believe that unsharing userns clears no_new_privs. > > Seriously? That's kind of ... weird. I mean, I guess you're > priv-confined in a way, but that seems fragile. > I appear to have made this up. Either I genuinely pulled it out of thin air or it was discussed and not done. $ setpriv --nnp unshare -Ur cat /proc/self/status |grep NoNewPrivs NoNewPrivs: 1 If it were to be done, it ought to be quite safe except for possible LSM issues. --Andy
On Mon, Feb 13, 2017 at 11:01 AM, Andy Lutomirski <luto@amacapital.net> wrote: > On Fri, Feb 10, 2017 at 3:44 PM, Kees Cook <keescook@chromium.org> wrote: >> On Wed, Jan 18, 2017 at 3:35 PM, Andy Lutomirski <luto@amacapital.net> wrote: >>> On Wed, Jan 18, 2017 at 2:50 PM, Djalal Harouni <tixxdz@gmail.com> wrote: >>>> Andy I don't follow here, no_new_privs is never cleared right ? I >>>> can't see the corresponding clear bit code for it. >>> >>> I believe that unsharing userns clears no_new_privs. >> >> Seriously? That's kind of ... weird. I mean, I guess you're >> priv-confined in a way, but that seems fragile. >> > > I appear to have made this up. Either I genuinely pulled it out of > thin air or it was discussed and not done. > > $ setpriv --nnp unshare -Ur cat /proc/self/status |grep NoNewPrivs > NoNewPrivs: 1 > > If it were to be done, it ought to be quite safe except for possible LSM issues. Okay, cool. Thanks. (Also, where does "setpriv" live? I must need a new set of util-linux or something?) -Kees
On Mon, 13 Feb 2017, Kees Cook wrote: > Okay, cool. Thanks. (Also, where does "setpriv" live? I must need a > new set of util-linux or something?) Indeed, a newer version of util-linux[0] should do, although Debian/testing appears to have an extra package just for "setpriv": https://packages.debian.org/stretch/setpriv C. [0] https://git.kernel.org/cgit/utils/util-linux/util-linux.git/commit/?id=5600c40
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 72624a1..fc95261 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -164,6 +164,7 @@ read the file /proc/PID/status: Uid: 501 501 501 501 Gid: 100 100 100 100 FDSize: 256 + HidePid: 0 Groups: 100 14 16 VmPeak: 5004 kB VmSize: 5004 kB @@ -228,6 +229,7 @@ Table 1-2: Contents of the status files (as of 4.1) Gid Real, effective, saved set, and file system GIDs Umask file mode creation mask FDSize number of file descriptor slots currently allocated + HidePid process access mode of /proc/<pid>/ Groups supplementary group list NStgid descendant namespace thread group ID hierarchy NSpid descendant namespace process ID hierarchy diff --git a/fs/proc/array.c b/fs/proc/array.c index 51a4213..e6cd1a1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -163,6 +163,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; unsigned int max_fds = 0; + int hide_pid; rcu_read_lock(); ppid = pid_alive(p) ? @@ -183,6 +184,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_lock(p); if (p->files) max_fds = files_fdtable(p->files)->max_fds; + hide_pid = p->hide_pid; task_unlock(p); rcu_read_unlock(); @@ -201,6 +203,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid)); + seq_put_decimal_ull(m, "\nHidePid:\t", hide_pid); seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); seq_puts(m, "\nGroups:\t"); diff --git a/fs/proc/base.c b/fs/proc/base.c index cd8dd15..596b17f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -712,7 +712,9 @@ static bool has_pid_permissions(struct pid_namespace *pid, struct task_struct *task, int hide_pid_min) { - if (pid->hide_pid < hide_pid_min) + int hide_pid = max(pid->hide_pid, (int) current->hide_pid); + + if (hide_pid < hide_pid_min) return true; if (in_group_p(pid->pid_gid)) return true; @@ -733,7 +735,9 @@ static int proc_pid_permission(struct inode *inode, int mask) put_task_struct(task); if (!has_perms) { - if (pid->hide_pid == HIDEPID_INVISIBLE) { + int hide_pid = max(pid->hide_pid, (int) current->hide_pid); + + if (hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 325f649..c87de0e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -250,6 +250,7 @@ extern struct task_group root_task_group; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + .hide_pid = 0, \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index ad3ec9e..ba9f1d5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1608,6 +1608,7 @@ struct task_struct { /* unserialized, strictly 'current' */ unsigned in_execve:1; /* bit to tell LSMs we're in execve */ unsigned in_iowait:1; + unsigned hide_pid:2; /* per-process procfs hidepid= */ #if !defined(TIF_RESTORE_SIGMASK) unsigned restore_sigmask:1; #endif diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index a8d0759..ada62b6 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -197,4 +197,8 @@ struct prctl_mm_map { # define PR_CAP_AMBIENT_LOWER 3 # define PR_CAP_AMBIENT_CLEAR_ALL 4 +/* Per process, non-revokable procfs hidepid= option */ +#define PR_SET_HIDEPID 48 +#define PR_GET_HIDEPID 49 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8a..a701a77 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1574,6 +1574,7 @@ static __latent_entropy struct task_struct *copy_process( #endif p->default_timer_slack_ns = current->timer_slack_ns; + p->hide_pid = current->hide_pid; task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/sys.c b/kernel/sys.c index 842914e..4041ff4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2261,6 +2261,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SET_HIDEPID: + if (arg2 < HIDEPID_OFF || arg2 > HIDEPID_INVISIBLE || + arg3 || arg4 || arg5) + return -EINVAL; + if (arg2 < me->hide_pid) + return -EPERM; + me->hide_pid = arg2; + break; + case PR_GET_HIDEPID: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = me->hide_pid; + break; default: error = -EINVAL; break;