@@ -437,9 +437,14 @@ new set of credentials by calling::
struct cred *prepare_creds(void);
-this locks current->cred_replace_mutex and then allocates and constructs a
-duplicate of the current process's credentials, returning with the mutex still
-held if successful. It returns NULL if not successful (out of memory).
+this allocates and constructs a duplicate of the current process's credentials.
+It returns NULL if not successful (out of memory).
+
+If called from __do_execve_file, the mutex current->signal->cred_guard_mutex
+is acquired before this function gets called, and released after setting
+current->signal->cred_locked_for_ptrace. The same mutex is acquired later,
+while the credentials and the process mmap are actually changed, and
+current->signal->cred_locked_for_ptrace is reset again.
The mutex prevents ``ptrace()`` from altering the ptrace state of a process
while security checks on credentials construction and changing is taking place
@@ -466,9 +471,8 @@ by calling::
This will alter various aspects of the credentials and the process, giving the
LSM a chance to do likewise, then it will use ``rcu_assign_pointer()`` to
-actually commit the new credentials to ``current->cred``, it will release
-``current->cred_replace_mutex`` to allow ``ptrace()`` to take place, and it
-will notify the scheduler and others of the changes.
+actually commit the new credentials to ``current->cred``, and it will notify
+the scheduler and others of the changes.
This function is guaranteed to return 0, so that it can be tail-called at the
end of such functions as ``sys_setresuid()``.
@@ -486,8 +490,7 @@ invoked::
void abort_creds(struct cred *new);
-This releases the lock on ``current->cred_replace_mutex`` that
-``prepare_creds()`` got and then releases the new credentials.
+This releases the new credentials.
A typical credentials alteration function would look something like this::
@@ -1266,6 +1266,12 @@ int flush_old_exec(struct linux_binprm * bprm)
if (retval)
goto out;
+ retval = mutex_lock_killable(¤t->signal->cred_guard_mutex);
+ if (retval)
+ goto out;
+
+ bprm->called_flush_old_exec = 1;
+
/*
* Must be called _before_ exec_mmap() as bprm->mm is
* not visibile until then. This also enables the update
@@ -1398,28 +1404,41 @@ void finalize_exec(struct linux_binprm *bprm)
EXPORT_SYMBOL(finalize_exec);
/*
- * Prepare credentials and lock ->cred_guard_mutex.
+ * Prepare credentials and set ->cred_locked_for_ptrace.
* install_exec_creds() commits the new creds and drops the lock.
* Or, if exec fails before, free_bprm() should release ->cred and
* and unlock.
*/
static int prepare_bprm_creds(struct linux_binprm *bprm)
{
+ int ret;
+
if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
+ ret = -EAGAIN;
+ if (unlikely(current->signal->cred_locked_for_ptrace))
+ goto out;
+
+ ret = -ENOMEM;
bprm->cred = prepare_exec_creds();
- if (likely(bprm->cred))
- return 0;
+ if (likely(bprm->cred)) {
+ current->signal->cred_locked_for_ptrace = true;
+ ret = 0;
+ }
+out:
mutex_unlock(¤t->signal->cred_guard_mutex);
- return -ENOMEM;
+ return ret;
}
static void free_bprm(struct linux_binprm *bprm)
{
free_arg_pages(bprm);
if (bprm->cred) {
+ if (!bprm->called_flush_old_exec)
+ mutex_lock(¤t->signal->cred_guard_mutex);
+ current->signal->cred_locked_for_ptrace = false;
mutex_unlock(¤t->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
@@ -1469,6 +1488,7 @@ void install_exec_creds(struct linux_binprm *bprm)
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
+ current->signal->cred_locked_for_ptrace = false;
mutex_unlock(¤t->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
@@ -44,7 +44,11 @@ struct linux_binprm {
* exec has happened. Used to sanitize execution environment
* and to set AT_SECURE auxv for glibc.
*/
- secureexec:1;
+ secureexec:1,
+ /*
+ * Set by flush_old_exec, when the cred_change_mutex is taken.
+ */
+ called_flush_old_exec:1;
#ifdef __alpha__
unsigned int taso:1;
#endif
@@ -225,6 +225,7 @@ struct signal_struct {
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace) */
+ bool cred_locked_for_ptrace; /* set while in execve */
} __randomize_layout;
/*
@@ -26,6 +26,7 @@
.multiprocess = HLIST_HEAD_INIT,
.rlim = INIT_RLIMITS,
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
+ .cred_locked_for_ptrace = false,
#ifdef CONFIG_POSIX_TIMERS
.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
.cputimer = {
@@ -676,7 +676,7 @@ void __init cred_init(void)
*
* Returns the new credentials or NULL if out of memory.
*
- * Does not take, and does not return holding current->cred_replace_mutex.
+ * Does not take, and does not return holding ->cred_guard_mutex.
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
@@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
+ sig->cred_locked_for_ptrace = false;
return 0;
}
@@ -395,6 +395,10 @@ static int ptrace_attach(struct task_struct *task, long request,
if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
goto out;
+ retval = -EAGAIN;
+ if (task->signal->cred_locked_for_ptrace)
+ goto unlock_creds;
+
task_lock(task);
retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
task_unlock(task);
@@ -204,7 +204,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
if (!mm || IS_ERR(mm)) {
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
/*
- * Explicitly map EACCES to EPERM as EPERM is a more a
+ * Explicitly map EACCES to EPERM as EPERM is a more
* appropriate error code for process_vw_readv/writev
*/
if (rc == -EACCES)
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -iquote../../../../include/uapi -Wall
+CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall
-TEST_GEN_PROGS := get_syscall_info peeksiginfo
+TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess
include ../lib.mk
new file mode 100644
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2020 Bernd Edlinger <bernd.edlinger@hotmail.de>
+ * All rights reserved.
+ *
+ * Check whether /proc/$pid/mem can be accessed without causing deadlocks
+ * when de_thread is blocked with ->cred_guard_mutex held.
+ */
+
+#include "../kselftest_harness.h"
+#include <stdio.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+
+static void *thread(void *arg)
+{
+ ptrace(PTRACE_TRACEME, 0, 0L, 0L);
+ return NULL;
+}
+
+TEST(vmaccess)
+{
+ int f, pid = fork();
+ char mm[64];
+
+ if (!pid) {
+ pthread_t pt;
+ pthread_create(&pt, NULL, thread, NULL);
+ pthread_join(pt, NULL);
+ execlp("true", "true", NULL);
+ }
+
+ sleep(1);
+ sprintf(mm, "/proc/%d/mem", pid);
+ f = open(mm, O_RDONLY);
+ ASSERT_LE(0, f);
+ close(f);
+ f = kill(pid, SIGCONT);
+ ASSERT_EQ(0, f);
+}
+
+TEST(attach)
+{
+ int f, pid = fork();
+
+ if (!pid) {
+ pthread_t pt;
+ pthread_create(&pt, NULL, thread, NULL);
+ pthread_join(pt, NULL);
+ execlp("true", "true", NULL);
+ }
+
+ sleep(1);
+ f = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+ ASSERT_EQ(EAGAIN, errno);
+ ASSERT_EQ(f, -1);
+ f = kill(pid, SIGCONT);
+ ASSERT_EQ(0, f);
+}
+
+TEST_HARNESS_MAIN
This fixes a deadlock in the tracer when tracing a multi-threaded application that calls execve while more than one thread are running. I observed that when running strace on the gcc test suite, it always blocks after a while, when expect calls execve, because other threads have to be terminated. They send ptrace events, but the strace is no longer able to respond, since it is blocked in vm_access. The deadlock is always happening when strace needs to access the tracees process mmap, while another thread in the tracee starts to execve a child process, but that cannot continue until the PTRACE_EVENT_EXIT is handled and the WIFEXITED event is received: strace D 0 30614 30584 0x00000000 Call Trace: __schedule+0x3ce/0x6e0 schedule+0x5c/0xd0 schedule_preempt_disabled+0x15/0x20 __mutex_lock.isra.13+0x1ec/0x520 __mutex_lock_killable_slowpath+0x13/0x20 mutex_lock_killable+0x28/0x30 mm_access+0x27/0xa0 process_vm_rw_core.isra.3+0xff/0x550 process_vm_rw+0xdd/0xf0 __x64_sys_process_vm_readv+0x31/0x40 do_syscall_64+0x64/0x220 entry_SYSCALL_64_after_hwframe+0x44/0xa9 expect D 0 31933 30876 0x80004003 Call Trace: __schedule+0x3ce/0x6e0 schedule+0x5c/0xd0 flush_old_exec+0xc4/0x770 load_elf_binary+0x35a/0x16c0 search_binary_handler+0x97/0x1d0 __do_execve_file.isra.40+0x5d4/0x8a0 __x64_sys_execve+0x49/0x60 do_syscall_64+0x64/0x220 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The proposed solution is to take the cred_guard_mutex only in a critical section at the beginning, and at the end of the execve function, and let PTRACE_ATTACH fail with EAGAIN while execve is not complete, but other functions like vm_access are allowed to complete normally. I also took the opportunity to improve the documentation of prepare_creds, which is obviously out of sync. Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de> --- Documentation/security/credentials.rst | 19 +++++---- fs/exec.c | 28 ++++++++++++-- include/linux/binfmts.h | 6 ++- include/linux/sched/signal.h | 1 + init/init_task.c | 1 + kernel/cred.c | 2 +- kernel/fork.c | 1 + kernel/ptrace.c | 4 ++ mm/process_vm_access.c | 2 +- tools/testing/selftests/ptrace/Makefile | 4 +- tools/testing/selftests/ptrace/vmaccess.c | 64 +++++++++++++++++++++++++++++++ 11 files changed, 115 insertions(+), 17 deletions(-) create mode 100644 tools/testing/selftests/ptrace/vmaccess.c v2: adds a test case which passes when this patch is applied. v3: fixes the issue without introducing a new mutex.