diff mbox series

[RFC,12/20] kthread: Implement preferred affinity

Message ID 20240726215701.19459-13-frederic@kernel.org (mailing list archive)
State New
Headers show
Series None | expand

Commit Message

Frederic Weisbecker July 26, 2024, 9:56 p.m. UTC
Affining kthreads follows one of three existing patterns:

1) Per-CPU kthreads must stay affine to a single CPU and never execute
   relevant code on any other CPU. This is currently handled by smpboot
   code which takes care of CPU-hotplug operations.

2) Kthreads that _have_ to be affine to a specific set of CPUs and can't
   run anywhere else. The affinity is set through kthread_bind_mask()
   and the subsystem takes care by itself to handle CPU-hotplug operations.

3) Kthreads that have a _preferred_ affinity but that can run anywhere
   without breaking correctness. Userspace can overwrite the affinity.
   It is set manually like any other task and CPU-hotplug is supposed
   to be handled by the relevant subsystem so that the task is properly
   reaffined whenever a given CPU from the preferred affinity comes up
   or down. Also care must be taken so that the preferred affinity
   doesn't cross housekeeping cpumask boundaries.

Currently the preferred affinity pattern has at least 4 identified
users, with more or less success when it comes to handling CPU-hotplug
operations and the housekeeping cpumask.

Provide an infrastructure to handle this use case pattern. A new
kthread_affine_preferred() API is introduced, to be used just like
kthread_bind_mask(), right after kthread creation and before the first
wake up. The kthread is then affine right away to the cpumask passed
through the API if it has online housekeeping CPUs. Otherwise it will
be affine to all online housekeeping CPUs as a last resort.

It is aware of CPU hotplug events such that:

* When a housekeeping CPU goes up and is part of the preferred affinity
  of a given kthread, it is added to its applied affinity set (and
  possibly the default last resort online housekeeping set is removed
  from the set).

* When a housekeeping CPU goes down while it was part of the preferred
  affinity of a kthread, it is removed from the kthread's applied
  affinity. The last resort is to affine the kthread to all online
  housekeeping CPUs.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/cpuhotplug.h |   1 +
 include/linux/kthread.h    |   1 +
 kernel/kthread.c           | 121 +++++++++++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+)

Comments

Frederic Weisbecker July 26, 2024, 10:31 p.m. UTC | #1
Le Fri, Jul 26, 2024 at 11:56:48PM +0200, Frederic Weisbecker a écrit :
> +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
> +{
> +	struct kthread *kthread = to_kthread(p);
> +	cpumask_var_t affinity;
> +	unsigned long flags;
> +	int ret;
> +
> +	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
> +		WARN_ON(1);
> +		return -EINVAL;
> +	}
> +
> +	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
> +	if (!kthread->preferred_affinity) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	mutex_lock(&kthreads_hotplug_lock);
> +	cpumask_copy(kthread->preferred_affinity, mask);
> +	list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
> +	kthread_fetch_affinity(kthread, affinity);
> +
> +	/* It's safe because the task is inactive. */
> +	raw_spin_lock_irqsave(&p->pi_lock, flags);
> +	do_set_cpus_allowed(p, mask);

s/mask/affinity
Vlastimil Babka July 30, 2024, 3:49 p.m. UTC | #2
On 7/26/24 11:56 PM, Frederic Weisbecker wrote:
> Affining kthreads follow either of three existing different patterns:
> 
> 1) Per-CPU kthreads must stay affine to a single CPU and never execute
>    relevant code on any other CPU. This is currently handled by smpboot
>    code which takes care of CPU-hotplug operations.
> 
> 2) Kthreads that _have_ to be affine to a specific set of CPUs and can't
>    run anywhere else. The affinity is set through kthread_bind_mask()
>    and the subsystem takes care by itself to handle CPU-hotplug operations.
> 
> 3) Kthreads that have a _preferred_ affinity but that can run anywhere
>    without breaking correctness. Userspace can overwrite the affinity.
>    It is set manually like any other task and CPU-hotplug is supposed
>    to be handled by the relevant subsystem so that the task is properly
>    reaffined whenever a given CPU from the preferred affinity comes up
>    or down. Also care must be taken so that the preferred affinity
>    doesn't cross housekeeping cpumask boundaries.
> 
> Currently the preferred affinity pattern has at least 4 identified
> users, with more or less success when it comes to handle CPU-hotplug
> operations and housekeeping cpumask.
> 
> Provide an infrastructure to handle this usecase patter. A new
> kthread_affine_preferred() API is introduced, to be used just like
> kthread_bind_mask(), right after kthread creation and before the first
> wake up. The kthread is then affine right away to the cpumask passed
> through the API if it has online housekeeping CPUs. Otherwise it will
> be affine to all online housekeeping CPUs as a last resort.
> 
> It is aware of CPU hotplug events such that:
> 
> * When a housekeeping CPU goes up and is part of the preferred affinity
>   of a given kthread, it is added to its applied affinity set (and
>   possibly the default last resort online housekeeping set is removed
>   from the set).
> 
> * When a housekeeping CPU goes down while it was part of the preferred
>   affinity of a kthread, it is removed from the kthread's applied
>   affinity. The last resort is to affine the kthread to all online
>   housekeeping CPUs.
> 
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

Nit:

> +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
> +{
> +	struct kthread *kthread = to_kthread(p);
> +	cpumask_var_t affinity;
> +	unsigned long flags;
> +	int ret;
> +
> +	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
> +		WARN_ON(1);
> +		return -EINVAL;
> +	}
> +

Should we also fail if kthread->preferred_affinity already exist? In
case somebody calls this twice.

Also for some of the use cases (kswapd, kcompactd) it would make sense
to be able to add cpus of a node as they are onlined. Which seems we
didn't do, except some corner case handling in kcompactd, but maybe we
should? I wonder if the current implementation of onlining a completely
new node with cpus does the right thing as a result of the individual
onlining operations, or we end up with being affined to a single cpu (or
none).

But that would need some kind of kthread_affine_preferred_update()
implementation?

> +	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
> +	if (!kthread->preferred_affinity) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	mutex_lock(&kthreads_hotplug_lock);
> +	cpumask_copy(kthread->preferred_affinity, mask);
> +	list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
> +	kthread_fetch_affinity(kthread, affinity);
> +
> +	/* It's safe because the task is inactive. */
> +	raw_spin_lock_irqsave(&p->pi_lock, flags);
> +	do_set_cpus_allowed(p, mask);
> +	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> +
> +	mutex_unlock(&kthreads_hotplug_lock);
> +out:
> +	free_cpumask_var(affinity);
> +
> +	return 0;
> +}
> +
> +static int kthreads_hotplug_update(void)
> +{
> +	cpumask_var_t affinity;
> +	struct kthread *k;
> +	int err = 0;
> +
> +	if (list_empty(&kthreads_hotplug))
> +		return 0;
> +
> +	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
> +		if (WARN_ON_ONCE(!k->preferred_affinity)) {
> +			err = -EINVAL;
> +			break;
> +		}
> +		kthread_fetch_affinity(k, affinity);
> +		set_cpus_allowed_ptr(k->task, affinity);
> +	}
> +
> +	free_cpumask_var(affinity);
> +
> +	return err;
> +}
> +
> +static int kthreads_offline_cpu(unsigned int cpu)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&kthreads_hotplug_lock);
> +	cpumask_clear_cpu(cpu, &kthread_online_mask);
> +	ret = kthreads_hotplug_update();
> +	mutex_unlock(&kthreads_hotplug_lock);
> +
> +	return ret;
> +}
> +
> +static int kthreads_online_cpu(unsigned int cpu)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&kthreads_hotplug_lock);
> +	cpumask_set_cpu(cpu, &kthread_online_mask);
> +	ret = kthreads_hotplug_update();
> +	mutex_unlock(&kthreads_hotplug_lock);
> +
> +	return ret;
> +}
> +
> +static int kthreads_init(void)
> +{
> +	return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
> +				kthreads_online_cpu, kthreads_offline_cpu);
> +}
> +early_initcall(kthreads_init);
> +
>  void __kthread_init_worker(struct kthread_worker *worker,
>  				const char *name,
>  				struct lock_class_key *key)
Frederic Weisbecker Aug. 5, 2024, 2:28 p.m. UTC | #3
Le Tue, Jul 30, 2024 at 05:49:51PM +0200, Vlastimil Babka a écrit :
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Nit:
> 
> > +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
> > +{
> > +	struct kthread *kthread = to_kthread(p);
> > +	cpumask_var_t affinity;
> > +	unsigned long flags;
> > +	int ret;
> > +
> > +	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
> > +		WARN_ON(1);
> > +		return -EINVAL;
> > +	}
> > +
> 
> Should we also fail if kthread->preferred_affinity already exist? In
> case somebody calls this twice.

Good point!

> 
> Also for some of the use cases (kswapd, kcompactd) it would make sense
> to be able to add cpus of a node as they are onlined. Which seems we
> didn't do, except some corner case handling in kcompactd, but maybe we
> should? I wonder if the current implementation of onlining a completely
> new node with cpus does the right thing as a result of the individual
> onlining operations, or we end up with being affined to a single cpu (or
> none).
> 
> But that would need some kind of kthread_affine_preferred_update()
> implementation?

So you mean that the "for_each_node_state()" loop in kcompactd doesn't
handle all possible nodes but only those online when it's called? Or
am I confused?

If all users of preferred affinity were to use NUMA nodes, it could be
a good idea to do a flavour of kernel/smpboot.c which would handle
per-node kthreads instead of per-cpu kthreads. I initially thought
about that. It would have handled all the lifecycle of those kthreads,
including creation, against hotplug. Unfortunately RCU doesn't rely on
per-NUMA nodes but rather use its own tree.

If there were more users of real per-NUMA-node kthreads than kswapd and
kcompactd, that would of course be well worth considering.

Thanks.
Vlastimil Babka Aug. 5, 2024, 2:53 p.m. UTC | #4
On 8/5/24 16:28, Frederic Weisbecker wrote:
> Le Tue, Jul 30, 2024 at 05:49:51PM +0200, Vlastimil Babka a écrit :
>> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>> 
>> Nit:
>> 
>> > +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
>> > +{
>> > +	struct kthread *kthread = to_kthread(p);
>> > +	cpumask_var_t affinity;
>> > +	unsigned long flags;
>> > +	int ret;
>> > +
>> > +	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
>> > +		WARN_ON(1);
>> > +		return -EINVAL;
>> > +	}
>> > +
>> 
>> Should we also fail if kthread->preferred_affinity already exist? In
>> case somebody calls this twice.
> 
> Good point!
> 
>> 
>> Also for some of the use cases (kswapd, kcompactd) it would make sense
>> to be able to add cpus of a node as they are onlined. Which seems we
>> didn't do, except some corner case handling in kcompactd, but maybe we
>> should? I wonder if the current implementation of onlining a completely
>> new node with cpus does the right thing as a result of the individual
>> onlining operations, or we end up with being affined to a single cpu (or
>> none).
>> 
>> But that would need some kind of kthread_affine_preferred_update()
>> implementation?
> 
> So you mean that the "for_each_node_state()" loop in kcompactd doesn't
> handle all possible nodes but only those online when it's called? Or
> am I confused?

If you mean the loop in kcompactd_init() then indeed, but we also have a
hook in online_pages() to start new threads on newly onlined nodes, so
that's not a problem.

The problem (I think) I see is cpumask_of_node(pgdat->node_id) is a snapshot
of cpus running on the NUMA node at the time, and is never updated later as new
cpus might be brought up.

kcompactd_cpu_online() does try to update that when cpus are onlined (in a
clumsy way), there was nothing like that for kswapd and after your series
this update is also removed for kcompactd.

> If all users of preferred affinity were to use NUMA nodes, it could be
> a good idea to do a flavour of kernel/smpboot.c which would handle
> per-node kthreads instead of per-cpu kthreads. I initially thought
> about that. It would have handled all the lifecycle of those kthreads,
> including creation, against hotplug. Unfortunately RCU doesn't rely on
> per-NUMA nodes but rather use its own tree.
> 
> If there be more users of real per NUMA nodes kthreads than kswapd and
> kcompactd, of course that would be much worth considering.

Yeah it's not that compelling, but a way to update the preferred affine mask
in response to cpu hotplug events, that kswapd and kcompactd could use,
would be sufficient. And maybe more widely useful.

I guess there could be a callback defined for kthread to provide a new
preferred_affinity, that you'd call from kthreads_hotplug_update() ?
And kcompactd and kswapd could both use the same callback that interprets
kthread_data() as pgdat and fetches a new cpumask of it?

> Thanks.
Frederic Weisbecker Aug. 5, 2024, 4:23 p.m. UTC | #5
Le Mon, Aug 05, 2024 at 04:53:51PM +0200, Vlastimil Babka a écrit :
> If you mean the loop in kcompactd_init() then indeed, but we also have a
> hook in online_pages() to start new threads on newly onlined nodes, so
> that's not a problem.
> 
> The problem (I think) I see is cpumask_of_node(pgdat->node_id) is a snapshot
> of cpus running on the NUMA node the time, and is never updated later as new
> cpus might be brought up.

Oh I see now...

> 
> kcompactd_cpu_online() does try to update that when cpus are onlined (in a
> clumsy way), there was nothing like that for kswapd and after your series
> this update is also removed for kcompactd.

Ok...

> 
> > If all users of preferred affinity were to use NUMA nodes, it could be
> > a good idea to do a flavour of kernel/smpboot.c which would handle
> > per-node kthreads instead of per-cpu kthreads. I initially thought
> > about that. It would have handled all the lifecycle of those kthreads,
> > including creation, against hotplug. Unfortunately RCU doesn't rely on
> > per-NUMA nodes but rather use its own tree.
> > 
> > If there be more users of real per NUMA nodes kthreads than kswapd and
> > kcompactd, of course that would be much worth considering.
> 
> Yeah it's not that compelling, but a way to update the preferred affine mask
> in response to cpu hotplug events, that kswapd and kcompactd could use,
> would be sufficient. And maybe more widely useful.
> 
> I guess there could be a callback defined for kthread to provide a new
> preferred_affinity, that you'd call from kthreads_hotplug_update() ?
> And kcompactd and kswapd could both use the same callback that interprets
> kthread_data() as pgdat and fetches a new cpumask of it?

It's too bad we don't have a way to have a cpumask_possible_of_node(). I've
looked into the guts of numa but that doesn't look easy to do.

Or there could be kthread_set_preferred_node()... ?

Thanks.

> 
> > Thanks.
>
Vlastimil Babka Aug. 5, 2024, 9:25 p.m. UTC | #6
On 8/5/24 18:23, Frederic Weisbecker wrote:
> Le Mon, Aug 05, 2024 at 04:53:51PM +0200, Vlastimil Babka a écrit :
>> If you mean the loop in kcompactd_init() then indeed, but we also have a
>> hook in online_pages() to start new threads on newly onlined nodes, so
>> that's not a problem.
>> 
>> The problem (I think) I see is cpumask_of_node(pgdat->node_id) is a snapshot
>> of cpus running on the NUMA node the time, and is never updated later as new
>> cpus might be brought up.
> 
> Oh I see now...
> 
>> 
>> kcompactd_cpu_online() does try to update that when cpus are onlined (in a
>> clumsy way), there was nothing like that for kswapd and after your series
>> this update is also removed for kcompactd.
> 
> Ok...
> 
>> 
>> > If all users of preferred affinity were to use NUMA nodes, it could be
>> > a good idea to do a flavour of kernel/smpboot.c which would handle
>> > per-node kthreads instead of per-cpu kthreads. I initially thought
>> > about that. It would have handled all the lifecycle of those kthreads,
>> > including creation, against hotplug. Unfortunately RCU doesn't rely on
>> > per-NUMA nodes but rather use its own tree.
>> > 
>> > If there be more users of real per NUMA nodes kthreads than kswapd and
>> > kcompactd, of course that would be much worth considering.
>> 
>> Yeah it's not that compelling, but a way to update the preferred affine mask
>> in response to cpu hotplug events, that kswapd and kcompactd could use,
>> would be sufficient. And maybe more widely useful.
>> 
>> I guess there could be a callback defined for kthread to provide a new
>> preferred_affinity, that you'd call from kthreads_hotplug_update() ?
>> And kcompactd and kswapd could both use the same callback that interprets
>> kthread_data() as pgdat and fetches a new cpumask of it?
> 
> It's too bad we don't have a way to have a cpumask_possible_of_node(). I've
> looked into the guts of numa but that doesn't look easy to do.

That was my impression as well. Maybe not even possible because exact cpu
ids might not be pre-determined like this?

> Or there could be kthread_set_preferred_node()... ?

Possible instead of the callback idea suggested above?
kthreads_hotplug_update() could check if this is set and construct the mask
accordingly.

> Thanks.
> 
>> 
>> > Thanks.
>>
Frederic Weisbecker Aug. 5, 2024, 11:59 p.m. UTC | #7
On Mon, Aug 05, 2024 at 11:25:59PM +0200, Vlastimil Babka wrote:
> > It's too bad we don't have a way to have a cpumask_possible_of_node(). I've
> > looked into the guts of numa but that doesn't look easy to do.
> 
> That was my impression as well. Maybe not even possible because exact cpu
> ids might not be pre-determined like this?

Probably.

> 
> > Or there could be kthread_set_preferred_node()... ?
> 
> Possible instead of the callback idea suggested above?
> kthreads_hotplug_update() could check if this is set and construct the mask
> accordingly.

Or even better, callers of kthread_create_on_node() with actual node passed
(!NUMA_NO_NODE) can be preferably affined to the corresponding node by default
unless told otherwise (that is unless kthread_bind() or
kthread_set_preferred_affinity() has been called before the first wake up, and
that includes kthread_create_on_cpu()).

There are a few callers concerned: kswapd, kcompactd, some drivers:
drivers/block/mtip32xx/mtip32xx.c, drivers/firmware/stratix10-svc.c,
kernel/dma/map_benchmark.c, net/sunrpc/svc.c

After all kthread_create_on_cpu() affines to the corresponding CPU. So
it sounds natural that kthread_create_on_node() affines to the corresponding
node.

And then it's handled on hotplug just as a special case of preferred affinity.

Or is there something that wouldn't make that work?

Thanks.


> 
> > Thanks.
> > 
> >> 
> >> > Thanks.
> >> 
>
Vlastimil Babka Aug. 6, 2024, 11:08 a.m. UTC | #8
On 8/6/24 01:59, Frederic Weisbecker wrote:
> On Mon, Aug 05, 2024 at 11:25:59PM +0200, Vlastimil Babka wrote:
>> > It's too bad we don't have a way to have a cpumask_possible_of_node(). I've
>> > looked into the guts of numa but that doesn't look easy to do.
>> 
>> That was my impression as well. Maybe not even possible because exact cpu
>> ids might not be pre-determined like this?
> 
> Probably.
> 
>> 
>> > Or there could be kthread_set_preferred_node()... ?
>> 
>> Possible instead of the callback idea suggested above?
>> kthreads_hotplug_update() could check if this is set and construct the mask
>> accordingly.
> 
> Or even better, callers of kthread_create_on_node() with actual node passed
> (!NUMA_NO_NODE) can be preferrably affined to the corresponding node by default
> unless told otherwise (that is unless kthread_bind() or
> kthread_set_preferred_affinity() has been called before the first wake up, and
> that includes kthread_create_on_cpu()).

Sounds logical and great!

> There are a few callers concerned: kswapd, kcompactd, some drivers:
> drivers/block/mtip32xx/mtip32xx.c, drivers/firmware/stratix10-svc.c,
> kernel/dma/map_benchmark.c, net/sunrpc/svc.c
> 
> After all kthread_create_on_cpu() affines to the corresponding CPU. So
> it sounds natural that kthread_create_on_node() affines to the corresponding
> node.

Yes.

> And then it's handled on hotplug just as a special case of preferred affinity.
> 
> Or is there something that wouldn't make that work?

Hopefully not.

> Thanks.
> 
> 
>> 
>> > Thanks.
>> > 
>> >> 
>> >> > Thanks.
>> >> 
>>
diff mbox series

Patch

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 7a5785f405b6..5c204bd0fed6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -238,6 +238,7 @@  enum cpuhp_state {
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RANDOM_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
+	CPUHP_AP_KTHREADS_ONLINE,
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 40,
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index b11f53c1ba2e..30209bdf83a2 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -85,6 +85,7 @@  kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
 void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
+int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 int kthread_stop_put(struct task_struct *k);
 bool kthread_should_stop(void);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ecb719f54f7a..cfa6e1b8d933 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,6 +35,10 @@  static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+static struct cpumask kthread_online_mask;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
@@ -64,6 +68,9 @@  struct kthread {
 #endif
 	/* To store the full name if task comm is truncated. */
 	char *full_name;
+	struct task_struct *task;
+	struct list_head hotplug_node;
+	struct cpumask *preferred_affinity;
 };
 
 enum KTHREAD_BITS {
@@ -124,6 +131,7 @@  bool set_kthread_struct(struct task_struct *p)
 	init_completion(&kthread->parked);
 	p->vfork_done = &kthread->exited;
 
+	kthread->task = p;
 	p->worker_private = kthread;
 	return true;
 }
@@ -314,6 +322,16 @@  void __noreturn kthread_exit(long result)
 {
 	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
+	if (kthread->preferred_affinity) {
+		mutex_lock(&kthreads_hotplug_lock);
+		list_del(&kthread->hotplug_node);
+		/* Make sure the kthread never gets re-affined globally */
+		set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
+		mutex_unlock(&kthreads_hotplug_lock);
+
+		kfree(kthread->preferred_affinity);
+		kthread->preferred_affinity = NULL;
+	}
 	do_exit(0);
 }
 EXPORT_SYMBOL(kthread_exit);
@@ -779,6 +797,109 @@  int kthreadd(void *unused)
 	return 0;
 }
 
+/*
+ * Compute the affinity to apply to kthread @k and store it in @mask.
+ *
+ * The result is the preferred affinity of @k restricted to the CPUs
+ * recorded in kthread_online_mask and to the HK_TYPE_KTHREAD housekeeping
+ * CPUs. Should that intersection be empty, @mask falls back to the whole
+ * HK_TYPE_KTHREAD housekeeping cpumask.
+ */
+static void kthread_fetch_affinity(struct kthread *k, struct cpumask *mask)
+{
+	const struct cpumask *hk_mask = housekeeping_cpumask(HK_TYPE_KTHREAD);
+
+	cpumask_and(mask, k->preferred_affinity, &kthread_online_mask);
+	/* cpumask_and() returns false when the resulting mask is empty. */
+	if (!cpumask_and(mask, mask, hk_mask))
+		cpumask_copy(mask, hk_mask);
+}
+
+/**
+ * kthread_affine_preferred - Define a kthread's preferred affinity
+ * @p: kthread to affine; it must be inactive and not started yet.
+ * @mask: preferred cpumask for @p. It is copied, so the caller keeps
+ *        ownership of it.
+ *
+ * Record @mask as the preferred affinity of @p, register @p on the
+ * kthreads_hotplug list so that its affinity is re-evaluated on CPU hotplug
+ * events, and immediately apply the affinity computed by
+ * kthread_fetch_affinity().
+ *
+ * Return: 0 on success, -EINVAL if @p is not inactive, has already been
+ * started or already has a preferred affinity, -ENOMEM on allocation
+ * failure.
+ */
+int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+{
+	struct kthread *kthread = to_kthread(p);
+	cpumask_var_t affinity;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	/*
+	 * A second call would leak the previous mask and add hotplug_node
+	 * to the list twice.
+	 */
+	if (WARN_ON_ONCE(kthread->preferred_affinity))
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+	if (!kthread->preferred_affinity) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_copy(kthread->preferred_affinity, mask);
+	list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+	kthread_fetch_affinity(kthread, affinity);
+
+	/* It's safe because the task is inactive. */
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/* Apply the computed affinity, not the raw preferred mask. */
+	do_set_cpus_allowed(p, affinity);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	mutex_unlock(&kthreads_hotplug_lock);
+out:
+	free_cpumask_var(affinity);
+
+	return ret;
+}
+
+/*
+ * Recompute and apply the affinity of every kthread registered on the
+ * kthreads_hotplug list. Both callers in this file invoke it with
+ * kthreads_hotplug_lock held, right after updating kthread_online_mask.
+ *
+ * Returns 0 on success (including when the list is empty), -ENOMEM if the
+ * temporary cpumask can't be allocated, or -EINVAL if a listed kthread has
+ * no preferred affinity, in which case the walk stops at that entry.
+ */
+static int kthreads_hotplug_update(void)
+{
+	cpumask_var_t effective;
+	struct kthread *k;
+	int ret = 0;
+
+	if (list_empty(&kthreads_hotplug))
+		return 0;
+
+	if (!zalloc_cpumask_var(&effective, GFP_KERNEL))
+		return -ENOMEM;
+
+	list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+		/* Only kthreads with a preferred mask are expected here. */
+		if (WARN_ON_ONCE(!k->preferred_affinity)) {
+			ret = -EINVAL;
+			goto out_free;
+		}
+		kthread_fetch_affinity(k, effective);
+		set_cpus_allowed_ptr(k->task, effective);
+	}
+
+out_free:
+	free_cpumask_var(effective);
+
+	return ret;
+}
+
+/*
+ * CPU hotplug callback passed to cpuhp_setup_state() by kthreads_init():
+ * drop @cpu from kthread_online_mask and re-evaluate the affinity of the
+ * registered kthreads, all under kthreads_hotplug_lock.
+ *
+ * Returns the result of kthreads_hotplug_update().
+ */
+static int kthreads_offline_cpu(unsigned int cpu)
+{
+	int err;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_clear_cpu(cpu, &kthread_online_mask);
+	err = kthreads_hotplug_update();
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	return err;
+}
+
+/*
+ * CPU hotplug callback passed to cpuhp_setup_state() by kthreads_init():
+ * add @cpu to kthread_online_mask and re-evaluate the affinity of the
+ * registered kthreads, all under kthreads_hotplug_lock.
+ *
+ * Returns the result of kthreads_hotplug_update().
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+	int err;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_set_cpu(cpu, &kthread_online_mask);
+	err = kthreads_hotplug_update();
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	return err;
+}
+
+/*
+ * Register kthreads_online_cpu() and kthreads_offline_cpu() with the CPU
+ * hotplug core under the CPUHP_AP_KTHREADS_ONLINE state. Invoked through
+ * early_initcall(); returns whatever cpuhp_setup_state() reports.
+ */
+static int kthreads_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+				kthreads_online_cpu, kthreads_offline_cpu);
+
+	return ret;
+}
+early_initcall(kthreads_init);
+
 void __kthread_init_worker(struct kthread_worker *worker,
 				const char *name,
 				struct lock_class_key *key)