[v2] exit: perform randomness and pid work without tasklist_lock

Message ID 20250128160743.3142544-1-mjguzik@gmail.com (mailing list archive)
State New
Series [v2] exit: perform randomness and pid work without tasklist_lock

Commit Message

Mateusz Guzik Jan. 28, 2025, 4:07 p.m. UTC
Both add_device_randomness() and attach_pid()/detach_pid() have their
own synchronisation mechanisms.

The clone side calls them *without* the tasklist_lock held, meaning
parallel calls can contend on their locks.

The exit side calls them *with* the tasklist_lock held, which means the
hold time is avoidably extended by waiting on either of the two locks,
in turn exacerbating contention on tasklist_lock itself.

Postponing the work until after the lock is dropped bumps the thread
creation/destruction rate by 15% on a 24-core VM.
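
In pattern form, the change amounts to the following (a simplified
sketch mirroring the diff below; error handling and the remaining pid
types are elided):

struct release_task_post post = { 0 };

write_lock_irq(&tasklist_lock);
/* only unhash under the lock; hand the pid back instead of freeing it */
post.pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
post.randomness = p->se.sum_exec_runtime;
write_unlock_irq(&tasklist_lock);

/* these take their own locks, now without tasklist_lock held */
add_device_randomness(&post.randomness, sizeof(post.randomness));
if (post.pids[PIDTYPE_PID])
	free_pid(post.pids[PIDTYPE_PID]);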

Bench (plop into will-it-scale):
$ cat tests/threadspawn1.c

char *testcase_description = "Thread creation and teardown";

static void *worker(void *arg)
{
        return (NULL);
}

void testcase(unsigned long long *iterations, unsigned long nr)
{
        pthread_t thread;
        int error;

        while (1) {
                error = pthread_create(&thread, NULL, worker, NULL);
                assert(error == 0);
                error = pthread_join(thread, NULL);
                assert(error == 0);
                (*iterations)++;
        }
}

Run:
$ ./threadspawn1_processes -t 24

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- introduce a struct to collect work
- move out pids as well

there is more which can be pulled out

this may look suspicious:
+	proc_flush_pid(p->thread_pid);

AFAICS this is constant for the duration of the task's lifetime, so I
don't think there is a problem.

 include/linux/pid.h |  1 +
 kernel/exit.c       | 53 +++++++++++++++++++++++++++++++++------------
 kernel/pid.c        | 23 +++++++++++++++-----
 3 files changed, 58 insertions(+), 19 deletions(-)

Comments

Oleg Nesterov Jan. 28, 2025, 6:29 p.m. UTC | #1
(Add Eric).


On 01/28, Mateusz Guzik wrote:
>
> Both add_device_randomness() and attach_pid()/detach_pid()

So afaics this patch does 2 different things, and I do think this needs
2 separate patches. Can you split this change please?

As for add_device_randomness(): I must have missed something, but I still can't
understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
to release_task() and avoid release_task_post->randomness.

You said:

	I wanted to keep the load where it was

but why??? Again, I must have missed something, but to me this simply adds
unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
task-to-release != current, so what is the point?

Oleg.

> have their
> own synchronisation mechanisms.
>
> The clone side calls them *without* the tasklist_lock held, meaning
> parallel calls can contend on their locks.
>
> The exit side calls them *with* the tasklist_lock held, which means the
> hold time is avoidably extended by waiting on either of the two locks,
> in turn exacerbating contention on tasklist_lock itself.
>
> Postponing the work until after the lock is dropped bumps the thread
> creation/destruction rate by 15% on a 24-core VM.
>
> Bench (plop into will-it-scale):
> $ cat tests/threadspawn1.c
>
> char *testcase_description = "Thread creation and teardown";
>
> static void *worker(void *arg)
> {
>         return (NULL);
> }
>
> void testcase(unsigned long long *iterations, unsigned long nr)
> {
>         pthread_t thread;
>         int error;
>
>         while (1) {
>                 error = pthread_create(&thread, NULL, worker, NULL);
>                 assert(error == 0);
>                 error = pthread_join(thread, NULL);
>                 assert(error == 0);
>                 (*iterations)++;
>         }
> }
>
> Run:
> $ ./threadspawn1_processes -t 24
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>
> v2:
> - introduce a struct to collect work
> - move out pids as well
>
> there is more which can be pulled out
>
> this may look suspicious:
> +	proc_flush_pid(p->thread_pid);
>
> AFAICS this is constant for the duration of the task's lifetime, so I
> don't think there is a problem.
>
>  include/linux/pid.h |  1 +
>  kernel/exit.c       | 53 +++++++++++++++++++++++++++++++++------------
>  kernel/pid.c        | 23 +++++++++++++++-----
>  3 files changed, 58 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 98837a1ff0f3..6e9fcacd02cd 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -101,6 +101,7 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
>   * these helpers must be called with the tasklist_lock write-held.
>   */
>  extern void attach_pid(struct task_struct *task, enum pid_type);
> +extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
>  extern void detach_pid(struct task_struct *task, enum pid_type);
>  extern void change_pid(struct task_struct *task, enum pid_type,
>  			struct pid *pid);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 1dcddfe537ee..4e452d3e3a89 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -122,14 +122,23 @@ static __init int kernel_exit_sysfs_init(void)
>  late_initcall(kernel_exit_sysfs_init);
>  #endif
>
> -static void __unhash_process(struct task_struct *p, bool group_dead)
> +/*
> + * For things release_task would like to do *after* tasklist_lock is released.
> + */
> +struct release_task_post {
> +	unsigned long long randomness;
> +	struct pid *pids[PIDTYPE_MAX];
> +};
> +
> +static void __unhash_process(struct release_task_post *post, struct task_struct *p,
> +			     bool group_dead)
>  {
>  	nr_threads--;
> -	detach_pid(p, PIDTYPE_PID);
> +	post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
>  	if (group_dead) {
> -		detach_pid(p, PIDTYPE_TGID);
> -		detach_pid(p, PIDTYPE_PGID);
> -		detach_pid(p, PIDTYPE_SID);
> +		post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
> +		post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
> +		post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
>
>  		list_del_rcu(&p->tasks);
>  		list_del_init(&p->sibling);
> @@ -141,7 +150,8 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
>  /*
>   * This function expects the tasklist_lock write-locked.
>   */
> -static void __exit_signal(struct task_struct *tsk)
> +static void __exit_signal(struct release_task_post *post,
> +			  struct task_struct *tsk)
>  {
>  	struct signal_struct *sig = tsk->signal;
>  	bool group_dead = thread_group_leader(tsk);
> @@ -174,8 +184,7 @@ static void __exit_signal(struct task_struct *tsk)
>  			sig->curr_target = next_thread(tsk);
>  	}
>
> -	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
> -			      sizeof(unsigned long long));
> +	post->randomness = tsk->se.sum_exec_runtime;
>
>  	/*
>  	 * Accumulate here the counters for all threads as they die. We could
> @@ -197,7 +206,7 @@ static void __exit_signal(struct task_struct *tsk)
>  	task_io_accounting_add(&sig->ioac, &tsk->ioac);
>  	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
>  	sig->nr_threads--;
> -	__unhash_process(tsk, group_dead);
> +	__unhash_process(post, tsk, group_dead);
>  	write_sequnlock(&sig->stats_lock);
>
>  	/*
> @@ -240,9 +249,13 @@ void __weak release_thread(struct task_struct *dead_task)
>  void release_task(struct task_struct *p)
>  {
>  	struct task_struct *leader;
> -	struct pid *thread_pid;
>  	int zap_leader;
> +	struct release_task_post post;
>  repeat:
> +	memset(&post, 0, sizeof(post));
> +
> +	proc_flush_pid(p->thread_pid);
> +
>  	/* don't need to get the RCU readlock here - the process is dead and
>  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
>  	rcu_read_lock();
> @@ -253,8 +266,7 @@ void release_task(struct task_struct *p)
>
>  	write_lock_irq(&tasklist_lock);
>  	ptrace_release_task(p);
> -	thread_pid = get_pid(p->thread_pid);
> -	__exit_signal(p);
> +	__exit_signal(&post, p);
>
>  	/*
>  	 * If we are the last non-leader member of the thread
> @@ -276,8 +288,21 @@ void release_task(struct task_struct *p)
>  	}
>
>  	write_unlock_irq(&tasklist_lock);
> -	proc_flush_pid(thread_pid);
> -	put_pid(thread_pid);
> +
> +	/*
> +	 * Process clean up deferred to after we drop the tasklist lock.
> +	 */
> +	add_device_randomness((const void*) &post.randomness,
> +			      sizeof(unsigned long long));
> +	if (post.pids[PIDTYPE_PID])
> +		free_pid(post.pids[PIDTYPE_PID]);
> +	if (post.pids[PIDTYPE_TGID])
> +		free_pid(post.pids[PIDTYPE_TGID]);
> +	if (post.pids[PIDTYPE_PGID])
> +		free_pid(post.pids[PIDTYPE_PGID]);
> +	if (post.pids[PIDTYPE_SID])
> +		free_pid(post.pids[PIDTYPE_SID]);
> +
>  	release_thread(p);
>  	put_task_struct_rcu_user(p);
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3a10a7b6fcf8..047cdbcef5cf 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -343,7 +343,7 @@ void attach_pid(struct task_struct *task, enum pid_type type)
>  	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
>  }
>
> -static void __change_pid(struct task_struct *task, enum pid_type type,
> +static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
>  			struct pid *new)
>  {
>  	struct pid **pid_ptr = task_pid_ptr(task, type);
> @@ -362,20 +362,33 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
>
>  	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
>  		if (pid_has_task(pid, tmp))
> -			return;
> +			return NULL;
>
> -	free_pid(pid);
> +	return pid;
> +}
> +
> +struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
> +{
> +	return __change_pid(task, type, NULL);
>  }
>
>  void detach_pid(struct task_struct *task, enum pid_type type)
>  {
> -	__change_pid(task, type, NULL);
> +	struct pid *pid;
> +
> +	pid = detach_pid_return(task, type);
> +	if (pid)
> +		free_pid(pid);
>  }
>
>  void change_pid(struct task_struct *task, enum pid_type type,
>  		struct pid *pid)
>  {
> -	__change_pid(task, type, pid);
> +	struct pid *opid;
> +
> +	opid = __change_pid(task, type, pid);
> +	if (opid)
> +		free_pid(opid);
>  	attach_pid(task, type);
>  }
>
> --
> 2.43.0
>
Mateusz Guzik Jan. 28, 2025, 6:38 p.m. UTC | #2
On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> (Add Eric).
>
>
> On 01/28, Mateusz Guzik wrote:
> >
> > Both add_device_randomness() and attach_pid()/detach_pid()
>
> So afaics this patch does 2 different things, and I do think this needs
> 2 separate patches. Can you split this change please?
>

No problem, I will send a v3 provided there are no issues reported
concerning the pid stuff.

Maybe I'll pull a few more things out to further justify the struct.

> As for add_device_randomness(): I must have missed something, but I still can't
> understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
> to release_task() and avoid release_task_post->randomness.
>
> You said:
>
>         I wanted to keep the load where it was
>
> but why??? Again, I must have missed something, but to me this simply adds
> unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
> task-to-release != current, so what is the point?
>

Perhaps I should preface this by saying it is not a hill I'm going to die on. :->

This is the spot which is known to work, and release_task does not
otherwise access the area. So for all I know someone will change it
later to be freeable, zeroed for "hardening" or some other crap, and
the read, once moved later, will quietly break to always add the same
value. So by default I don't want to change this aspect.

However, if you insist on just moving the read, I'll be more than
happy to do that in v3.
Oleg Nesterov Jan. 28, 2025, 7:22 p.m. UTC | #3
On 01/28, Mateusz Guzik wrote:
>
> On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> No problem, I will send a v3 provided there are no issues reported
> concerning the pid stuff.

Great, thanks.

BTW, I didn't look at the pid stuff yet. I _feel_ that this can be simplified
too, but I am half asleep already, so most probably I am wrong.

> > As for add_device_randomness(): I must have missed something, but I still can't
> > understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
> > to release_task() and avoid release_task_post->randomness.
> >
> > You said:
> >
> >         I wanted to keep the load where it was
> >
> > but why??? Again, I must have missed something, but to me this simply adds
> > unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
> > task-to-release != current, so what is the point?
> >
>
> Perhaps I should preface this by saying it is not a hill I'm going to die on. :->
>
> This is the spot which is known to work, and release_task does not
> otherwise access the area. So for all I know someone will change it
> later to be freeable, zeroed for "hardening"

sum_exec_runtime can't be freed/zeroed/etc in any case.

Again, please note that task-to-release can still be running, especially
if it is current.

> always add the same value.

I don't think that "add the same value" matters at all in this case,
but please correct me.

> So by default I don't want to change this aspect.
>
> However, if you insist on just moving the read, I'll be more than
> happy to do that in v3.

Thanks, see above ;)

Oleg.
Mateusz Guzik Jan. 30, 2025, 11:01 a.m. UTC | #4
On Tue, Jan 28, 2025 at 8:22 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 01/28, Mateusz Guzik wrote:
> >
> > On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > No problem, I will send a v3 provided there are no issues reported
> > concerning the pid stuff.
>
> Great, thanks.
>
> BTW, I didn't look at the pid stuff yet. I _feel_ that this can be simplified
> too, but I am half asleep already, so most probably I am wrong.
>

I looked at the pid code beyond the issue at hand.

The lock protecting it uses irq disablement to guard against
tasklist_lock users coming from an interrupt.

AFAICS this can be legally arranged so that the pidmap_lock is *never*
taken while tasklist_lock is held.

So far the problematic ordering only stems from free_pid() calls (not
only on exit), which can all be moved out.

This will reduce total tasklist_lock hold time *and* whack the irq
trip, speeding things up single-threaded.

I'll hack it up when I get around to it, maybe this week.

BTW, with the current patch, under highly parallel thread
creation/destruction it is pidmap_lock rather than tasklist_lock that
is the main bottleneck.
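
To make the irq angle concrete, a rough sketch (illustration only, not
the actual kernel/pid.c code, and assuming both the reordering above
and no other interrupt-context users of pidmap_lock):

unsigned long flags;

/* today: pidmap_lock can nest inside tasklist_lock, which interrupt
 * handlers may take, so irqs have to be masked around it */
spin_lock_irqsave(&pidmap_lock, flags);
idr_remove(&upid->ns->idr, upid->nr);
spin_unlock_irqrestore(&pidmap_lock, flags);

/* with free_pid() never called under tasklist_lock, the plain
 * variant would suffice */
spin_lock(&pidmap_lock);
idr_remove(&upid->ns->idr, upid->nr);
spin_unlock(&pidmap_lock);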

Patch

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f3..6e9fcacd02cd 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -101,6 +101,7 @@  extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
  * these helpers must be called with the tasklist_lock write-held.
  */
 extern void attach_pid(struct task_struct *task, enum pid_type);
+extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
 extern void detach_pid(struct task_struct *task, enum pid_type);
 extern void change_pid(struct task_struct *task, enum pid_type,
 			struct pid *pid);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1dcddfe537ee..4e452d3e3a89 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,14 +122,23 @@  static __init int kernel_exit_sysfs_init(void)
 late_initcall(kernel_exit_sysfs_init);
 #endif
 
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+	unsigned long long randomness;
+	struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+			     bool group_dead)
 {
 	nr_threads--;
-	detach_pid(p, PIDTYPE_PID);
+	post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
 	if (group_dead) {
-		detach_pid(p, PIDTYPE_TGID);
-		detach_pid(p, PIDTYPE_PGID);
-		detach_pid(p, PIDTYPE_SID);
+		post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
+		post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
+		post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
@@ -141,7 +150,8 @@  static void __unhash_process(struct task_struct *p, bool group_dead)
 /*
  * This function expects the tasklist_lock write-locked.
  */
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post,
+			  struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
@@ -174,8 +184,7 @@  static void __exit_signal(struct task_struct *tsk)
 			sig->curr_target = next_thread(tsk);
 	}
 
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
-			      sizeof(unsigned long long));
+	post->randomness = tsk->se.sum_exec_runtime;
 
 	/*
 	 * Accumulate here the counters for all threads as they die. We could
@@ -197,7 +206,7 @@  static void __exit_signal(struct task_struct *tsk)
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
-	__unhash_process(tsk, group_dead);
+	__unhash_process(post, tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 
 	/*
@@ -240,9 +249,13 @@  void __weak release_thread(struct task_struct *dead_task)
 void release_task(struct task_struct *p)
 {
 	struct task_struct *leader;
-	struct pid *thread_pid;
 	int zap_leader;
+	struct release_task_post post;
 repeat:
+	memset(&post, 0, sizeof(post));
+
+	proc_flush_pid(p->thread_pid);
+
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
@@ -253,8 +266,7 @@  void release_task(struct task_struct *p)
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
-	__exit_signal(p);
+	__exit_signal(&post, p);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -276,8 +288,21 @@  void release_task(struct task_struct *p)
 	}
 
 	write_unlock_irq(&tasklist_lock);
-	proc_flush_pid(thread_pid);
-	put_pid(thread_pid);
+
+	/*
+	 * Process clean up deferred to after we drop the tasklist lock.
+	 */
+	add_device_randomness((const void*) &post.randomness,
+			      sizeof(unsigned long long));
+	if (post.pids[PIDTYPE_PID])
+		free_pid(post.pids[PIDTYPE_PID]);
+	if (post.pids[PIDTYPE_TGID])
+		free_pid(post.pids[PIDTYPE_TGID]);
+	if (post.pids[PIDTYPE_PGID])
+		free_pid(post.pids[PIDTYPE_PGID]);
+	if (post.pids[PIDTYPE_SID])
+		free_pid(post.pids[PIDTYPE_SID]);
+
 	release_thread(p);
 	put_task_struct_rcu_user(p);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 3a10a7b6fcf8..047cdbcef5cf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -343,7 +343,7 @@  void attach_pid(struct task_struct *task, enum pid_type type)
 	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 }
 
-static void __change_pid(struct task_struct *task, enum pid_type type,
+static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
 			struct pid *new)
 {
 	struct pid **pid_ptr = task_pid_ptr(task, type);
@@ -362,20 +362,33 @@  static void __change_pid(struct task_struct *task, enum pid_type type,
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 		if (pid_has_task(pid, tmp))
-			return;
+			return NULL;
 
-	free_pid(pid);
+	return pid;
+}
+
+struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
+{
+	return __change_pid(task, type, NULL);
 }
 
 void detach_pid(struct task_struct *task, enum pid_type type)
 {
-	__change_pid(task, type, NULL);
+	struct pid *pid;
+
+	pid = detach_pid_return(task, type);
+	if (pid)
+		free_pid(pid);
 }
 
 void change_pid(struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
-	__change_pid(task, type, pid);
+	struct pid *opid;
+
+	opid = __change_pid(task, type, pid);
+	if (opid)
+		free_pid(opid);
 	attach_pid(task, type);
 }