Message ID | 20240924141001.116584-1-tycho@tycho.pizza (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [RFC] exec: add a flag for "reasonable" execveat() comm | expand |
Tycho Andersen <tycho@tycho.pizza> writes: > From: Tycho Andersen <tandersen@netflix.com> > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > switching to execveat() for service execution, but can't, because the > contents of /proc/pid/comm are the file descriptor which was used, > instead of the path to the binary. This makes the output of tools like > top and ps useless, especially in a world where most fds are opened > CLOEXEC so the number is truly meaningless. > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > contents of argv[0], instead of the fdno. The kernel allows prctl(PR_SET_NAME, ...) without any permission checks so adding an AT_ flag to use argv[0] instead of the execed filename seems reasonable. Maybe the flag should be called AT_NAME_ARGV0. That said I am trying to remember why we picked /dev/fd/N, as the filename. My memory is that we couldn't think of anything more reasonable to use. Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system call") unfortunately doesn't clarify anything for me, except that /dev/fd/N was a reasonable choice. I am thinking the code could reasonably try: get_fs_root_rcu(current->fs, &root); path = __d_path(file->f_path, root, buf, buflen); To see if a path to the file from the current root directory can be found. For files that are not reachable from the current root the code still needs to fall back to /dev/fd/N. Do you think you can investigate that and see if that would generate a reasonable task->comm? If for no other reason than because it would generate a usable result for #! scripts, without /proc mounted. It looks like a reasonable case can be made that while /dev/fd/N is a good path for interpreters, it is never a good choice for comm, so perhaps we could always use argv[0] if the fdpath is of the form /dev/fd/N. 
All of that said I am not a fan of the implementation below as it has the side effect of replacing /dev/fd/N with a filename that is not usable by #! interpreters. So I suggest an implementation that affects task->comm and not brpm->filename. Eric > Signed-off-by: Tycho Andersen <tandersen@netflix.com> > Suggested-by: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> > CC: Aleksa Sarai <cyphar@cyphar.com> > --- > There is some question about what to name the flag; it seems to me that > "everyone wants this" instead of the fdno, but probably "REASONABLE" is not > a good choice. > > Also, requiring the arg to alloc_bprm() is a bit ugly: kernel-based execs > will never use this, so they just have to pass an empty thing. We could > introduce a bprm_fixup_comm() to do the munging there, but then the code > paths start to diverge, which is maybe not nice. I left it this way because > this is the smallest patch in terms of size, but I'm happy to change it. > > Finally, here is a small set of test programs, I'm happy to turn them into > kselftests if we agree on an API > > #include <stdio.h> > #include <unistd.h> > #include <stdlib.h> > #include <sys/types.h> > #include <sys/stat.h> > #include <fcntl.h> > > int main(void) > { > int fd; > char buf[128]; > > fd = open("/proc/self/comm", O_RDONLY); > if (fd < 0) { > perror("open comm"); > exit(1); > } > > if (read(fd, buf, 128) < 0) { > perror("read"); > exit(1); > } > > printf("comm: %s", buf); > exit(0); > } > > #define _GNU_SOURCE > #include <stdio.h> > #include <syscall.h> > #include <stdbool.h> > #include <unistd.h> > #include <fcntl.h> > #include <stdlib.h> > #include <errno.h> > #include <sys/wait.h> > > #ifndef AT_EMPTY_PATH > #define AT_EMPTY_PATH 0x1000 /* Allow empty relative */ > #endif > > #ifndef AT_EXEC_REASONABLE_COMM > #define AT_EXEC_REASONABLE_COMM 0x200 > #endif > > int main(int argc, char *argv[]) > { > pid_t pid; > int status; > bool wants_reasonable_comm = argc > 1; > > pid = fork(); > if (pid < 0) { > 
perror("fork"); > exit(1); > } > > if (pid == 0) { > int fd; > long ret, flags; > > fd = open("./catprocselfcomm", O_PATH); > if (fd < 0) { > perror("open catprocselfname"); > exit(1); > } > > flags = AT_EMPTY_PATH; > if (wants_reasonable_comm) > flags |= AT_EXEC_REASONABLE_COMM; > syscall(__NR_execveat, fd, "", (char *[]){"./catprocselfcomm", NULL}, NULL, flags); > fprintf(stderr, "execveat failed %d\n", errno); > exit(1); > } > > if (waitpid(pid, &status, 0) != pid) { > fprintf(stderr, "wrong child\n"); > exit(1); > } > > if (!WIFEXITED(status)) { > fprintf(stderr, "exit status %x\n", status); > exit(1); > } > > if (WEXITSTATUS(status) != 0) { > fprintf(stderr, "child failed\n"); > exit(1); > } > > return 0; > } > --- > fs/exec.c | 22 ++++++++++++++++++---- > include/uapi/linux/fcntl.h | 3 ++- > 2 files changed, 20 insertions(+), 5 deletions(-) > > diff --git a/fs/exec.c b/fs/exec.c > index dad402d55681..36434feddb7b 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1569,11 +1569,15 @@ static void free_bprm(struct linux_binprm *bprm) > kfree(bprm); > } > > -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) > +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, > + struct user_arg_ptr argv, int flags) > { > struct linux_binprm *bprm; > struct file *file; > int retval = -ENOMEM; > + bool needs_comm_fixup = flags & AT_EXEC_REASONABLE_COMM; > + > + flags &= ~AT_EXEC_REASONABLE_COMM; > > file = do_open_execat(fd, filename, flags); > if (IS_ERR(file)) > @@ -1590,11 +1594,20 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl > if (fd == AT_FDCWD || filename->name[0] == '/') { > bprm->filename = filename->name; > } else { > - if (filename->name[0] == '\0') > + if (needs_comm_fixup) { > + const char __user *p = get_user_arg_ptr(argv, 0); > + > + retval = -EFAULT; > + if (!p) > + goto out_free; > + > + bprm->fdpath = strndup_user(p, MAX_ARG_STRLEN); > + } else if (filename->name[0] == 
'\0') > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); > else > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", > fd, filename->name); > + retval = -ENOMEM; > if (!bprm->fdpath) > goto out_free; > > @@ -1969,7 +1982,7 @@ static int do_execveat_common(int fd, struct filename *filename, > * further execve() calls fail. */ > current->flags &= ~PF_NPROC_EXCEEDED; > > - bprm = alloc_bprm(fd, filename, flags); > + bprm = alloc_bprm(fd, filename, argv, flags); > if (IS_ERR(bprm)) { > retval = PTR_ERR(bprm); > goto out_ret; > @@ -2034,6 +2047,7 @@ int kernel_execve(const char *kernel_filename, > struct linux_binprm *bprm; > int fd = AT_FDCWD; > int retval; > + struct user_arg_ptr user_argv = {}; > > /* It is non-sense for kernel threads to call execve */ > if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) > @@ -2043,7 +2057,7 @@ int kernel_execve(const char *kernel_filename, > if (IS_ERR(filename)) > return PTR_ERR(filename); > > - bprm = alloc_bprm(fd, filename, 0); > + bprm = alloc_bprm(fd, filename, user_argv, 0); > if (IS_ERR(bprm)) { > retval = PTR_ERR(bprm); > goto out_ret; > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h > index 87e2dec79fea..7178d1e4a3de 100644 > --- a/include/uapi/linux/fcntl.h > +++ b/include/uapi/linux/fcntl.h > @@ -100,7 +100,8 @@ > /* Reserved for per-syscall flags 0xff. */ > #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic > links. */ > -/* Reserved for per-syscall flags 0x200 */ > +#define AT_EXEC_REASONABLE_COMM 0x200 /* Use argv[0] for comm in > + execveat */ > #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ > #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount > traversal. */ > > base-commit: baeb9a7d8b60b021d907127509c44507539c15e5
On September 24, 2024 10:39:35 AM PDT, "Eric W. Biederman" <ebiederm@xmission.com> wrote: >Tycho Andersen <tycho@tycho.pizza> writes: > >> From: Tycho Andersen <tandersen@netflix.com> >> >> Zbigniew mentioned at Linux Plumber's that systemd is interested in >> switching to execveat() for service execution, but can't, because the >> contents of /proc/pid/comm are the file descriptor which was used, >> instead of the path to the binary. This makes the output of tools like >> top and ps useless, especially in a world where most fds are opened >> CLOEXEC so the number is truly meaningless. And just to double check: systemd's use would be entirely cosmetic, yes? >> >> This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the >> contents of argv[0], instead of the fdno. > >The kernel allows prctl(PR_SET_NAME, ...) without any permission >checks so adding an AT_ flat to use argv[0] instead of the execed >filename seems reasonable. > >Maybe the flag should be called AT_NAME_ARGV0. If we add an AT flag I like this name. > > >That said I am trying to remember why we picked /dev/fd/N, as the >filename. > >My memory is that we couldn't think of anything more reasonable to use. >Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system >call") unfortunately doesn't clarify anything for me, except that >/dev/fd/N was a reasonable choice. > >I am thinking the code could reasonably try: > get_fs_root_rcu(current->fs, &root); > path = __d_path(file->f_path, root, buf, buflen); > >To see if a path to the file from the current root directory can be >found. For files that are not reachable from the current root the code >still need to fallback to /dev/fd/N. > >Do you think you can investigate that and see if that would generate >a reasonable task->comm? > >If for no other reason than because it would generate a usable result >for #! scripts, without /proc mounted. 
> > >It looks like a reasonable case can be made that while /dev/fd/N is >a good path for interpreters, it is never a good choice for comm, >so perhaps we could always use argv[0] if the fdpath is of the >form /dev/fd/N. I haven't had a chance to go look closely yet, but this was the same thought I had when I first read this RFC. Nobody really wants a dev path in comm. Can we do this unconditionally? (And if argv0 is empty, use dev path...) >All of that said I am not a fan of the implementation below as it has >the side effect of replacing /dev/fd/N with a filename that is not >usable by #! interpreters. So I suggest an implementation that affects >task->comm and not bprm->filename. Also agreed. There is already enough fiddly usage of the bprm filename/interpreter/fdpath members -- the argv0 stuff should be distinct. Perhaps store a pointer to argv0 during arg copy? I need to go look but I'm still AFK/OoO... -Kees
On Tue, Sep 24, 2024 at 02:37:13PM -0700, Kees Cook wrote: > > > On September 24, 2024 10:39:35 AM PDT, "Eric W. Biederman" <ebiederm@xmission.com> wrote: > >Tycho Andersen <tycho@tycho.pizza> writes: > > > >> From: Tycho Andersen <tandersen@netflix.com> > >> > >> Zbigniew mentioned at Linux Plumber's that systemd is interested in > >> switching to execveat() for service execution, but can't, because the > >> contents of /proc/pid/comm are the file descriptor which was used, > >> instead of the path to the binary. This makes the output of tools like > >> top and ps useless, especially in a world where most fds are opened > >> CLOEXEC so the number is truly meaningless. > > And just to double check: systemd's use would be entirely cosmetic, yes? I think it's not really systemd, but their concern for admins looking at `ps` and being confused by "4 is using lots of CPU". IIUC systemd won't actually use the value at all. Zbigniew can confirm though. > >> > >> This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > >> contents of argv[0], instead of the fdno. > > > >The kernel allows prctl(PR_SET_NAME, ...) without any permission > >checks so adding an AT_ flat to use argv[0] instead of the execed > >filename seems reasonable. > > > >Maybe the flag should be called AT_NAME_ARGV0. > > If we add an AT flag I like this name. +1 > > > > > >That said I am trying to remember why we picked /dev/fd/N, as the > >filename. > > > >My memory is that we couldn't think of anything more reasonable to use. > >Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system > >call") unfortunately doesn't clarify anything for me, except that > >/dev/fd/N was a reasonable choice. > > > >I am thinking the code could reasonably try: > > get_fs_root_rcu(current->fs, &root); > > path = __d_path(file->f_path, root, buf, buflen); > > > >To see if a path to the file from the current root directory can be > >found. 
For files that are not reachable from the current root the code > >still need to fallback to /dev/fd/N. > > > >Do you think you can investigate that and see if that would generate > >a reasonable task->comm? > > > >If for no other reason than because it would generate a usable result > >for #! scripts, without /proc mounted. > > > > > >It looks like a reasonable case can be made that while /dev/fd/N is > >a good path for interpreters, it is never a good choice for comm, > >so perhaps we could always use argv[0] if the fdpath is of the > >form /dev/fd/N. > > I haven't had a chance to go look closely yet, but this was the same thought I had when I first read this RFC. Nobody really wants a dev path in comm. Can we do this unconditionally? (And if argv0 is empty, use dev path...) We can, I was just worried about the behavior change. But it seems we are all in violent agreement that the current behavior isn't very good, so maybe it's fine to change. > >All of that said I am not a fan of the implementation below as it has > >the side effect of replacing /dev/fd/N with a filename that is not > >usable by #! interpreters. So I suggest an implementation that affects > >task->comm and not brpm->filename. > > Also agreed. There is already enough fiddly usage of the bprm filename/interpreter/fdpath members -- the argv0 stuff should be distinct. Perhaps store a pointer to argv0 during arg copy? I need to go look but I'm still AFK/OoO... Yeah, on second thought we could do something like: diff --git a/fs/exec.c b/fs/exec.c index 36434feddb7b..a45ea270cc43 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1416,7 +1416,10 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + if (needs_comm_fixup) + __set_task_comm(me, argv0, true); + else + __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. 
We are no longer part of the thread group */ and then we don't need to mess with bprm at all. Seems much cleaner. I will see about the get_fs_root_rcu(current->fs, &root); path = __d_path(file->f_path, root, buf, buflen); that Eric suggested and how that works with the above. Tycho
diff --git a/fs/exec.c b/fs/exec.c index dad402d55681..36434feddb7b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1569,11 +1569,15 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, + struct user_arg_ptr argv, int flags) { struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM; + bool needs_comm_fixup = flags & AT_EXEC_REASONABLE_COMM; + + flags &= ~AT_EXEC_REASONABLE_COMM; file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) @@ -1590,11 +1594,20 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { - if (filename->name[0] == '\0') + if (needs_comm_fixup) { + const char __user *p = get_user_arg_ptr(argv, 0); + + retval = -EFAULT; + if (!p) + goto out_free; + + bprm->fdpath = strndup_user(p, MAX_ARG_STRLEN); + } else if (filename->name[0] == '\0') bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); + retval = -ENOMEM; if (!bprm->fdpath) goto out_free; @@ -1969,7 +1982,7 @@ static int do_execveat_common(int fd, struct filename *filename, * further execve() calls fail. 
*/ current->flags &= ~PF_NPROC_EXCEEDED; - bprm = alloc_bprm(fd, filename, flags); + bprm = alloc_bprm(fd, filename, argv, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -2034,6 +2047,7 @@ int kernel_execve(const char *kernel_filename, struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; + struct user_arg_ptr user_argv = {}; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) @@ -2043,7 +2057,7 @@ int kernel_execve(const char *kernel_filename, if (IS_ERR(filename)) return PTR_ERR(filename); - bprm = alloc_bprm(fd, filename, 0); + bprm = alloc_bprm(fd, filename, user_argv, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 87e2dec79fea..7178d1e4a3de 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -100,7 +100,8 @@ /* Reserved for per-syscall flags 0xff. */ #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ -/* Reserved for per-syscall flags 0x200 */ +#define AT_EXEC_REASONABLE_COMM 0x200 /* Use argv[0] for comm in + execveat */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal. */