
[v11,08/16] sched: Allow task CPU affinity to be restricted on asymmetric systems

Message ID 20210730112443.23245-9-will@kernel.org
State New, archived
Series Add support for 32-bit tasks on asymmetric AArch32 systems

Commit Message

Will Deacon July 30, 2021, 11:24 a.m. UTC
Asymmetric systems may not offer the same level of userspace ISA support
across all CPUs, meaning that some applications cannot be executed by
some CPUs. As a concrete example, upcoming arm64 big.LITTLE designs do
not feature support for 32-bit applications on both clusters.

Although userspace can carefully manage the affinity masks for such
tasks, one place where it is particularly problematic is execve()
because the CPU on which the execve() is occurring may be incompatible
with the new application image. In such a situation, it is desirable to
restrict the affinity mask of the task and ensure that the new image is
entered on a compatible CPU. From userspace's point of view, this looks
the same as if the incompatible CPUs have been hotplugged off in the
task's affinity mask. Similarly, if a subsequent execve() reverts to
a compatible image, then the old affinity is restored if it is still
valid.

In preparation for restricting the affinity mask for compat tasks on
arm64 systems without uniform support for 32-bit applications, introduce
{force,relax}_compatible_cpus_allowed_ptr(), which respectively restrict
and restore the affinity mask for a task based on the compatible CPUs.
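
For illustration only (the sketch below is not part of this patch), an
architecture could drive the two new helpers from its execve() hook along
these lines. The call site arch_setup_new_exec() is a real hook, but the
asymmetry check and the exact placement are assumptions made up for this
example:

/* Illustrative sketch; the asymmetry check is a placeholder. */
void arch_setup_new_exec(void)
{
	if (!arch_has_asymmetric_32bit_el0())	/* hypothetical check */
		return;

	if (is_compat_task())
		/* 32-bit image: shrink affinity to the 32-bit-capable CPUs. */
		force_compatible_cpus_allowed_ptr(current);
	else
		/* 64-bit image: restore any affinity saved by a previous exec. */
		relax_compatible_cpus_allowed_ptr(current);
}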

Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Quentin Perret <qperret@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/sched.h |   2 +
 kernel/sched/core.c   | 180 ++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h  |   1 +
 3 files changed, 167 insertions(+), 16 deletions(-)

Comments

Peter Zijlstra Aug. 17, 2021, 3:10 p.m. UTC | #1
On Fri, Jul 30, 2021 at 12:24:35PM +0100, Will Deacon wrote:
> @@ -2783,20 +2778,173 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
>  
>  	__do_set_cpus_allowed(p, new_mask, flags);
>  
> -	return affine_move_task(rq, p, &rf, dest_cpu, flags);
> +	if (flags & SCA_USER)
> +		release_user_cpus_ptr(p);
> +
> +	return affine_move_task(rq, p, rf, dest_cpu, flags);
>  
>  out:
> -	task_rq_unlock(rq, p, &rf);
> +	task_rq_unlock(rq, p, rf);
>  
>  	return ret;
>  }

> +void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
> +{
> +	unsigned long flags;
> +	struct cpumask *mask = p->user_cpus_ptr;
> +
> +	/*
> +	 * Try to restore the old affinity mask. If this fails, then
> +	 * we free the mask explicitly to avoid it being inherited across
> +	 * a subsequent fork().
> +	 */
> +	if (!mask || !__sched_setaffinity(p, mask))
> +		return;
> +
> +	raw_spin_lock_irqsave(&p->pi_lock, flags);
> +	release_user_cpus_ptr(p);
> +	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> +}

Both these are a problem on RT.

The easiest recourse is simply never freeing the CPU mask (except on
exit). The alternative is something like the below I suppose..

I'm leaning towards the former option, wdyt?

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2733,6 +2733,7 @@ static int __set_cpus_allowed_ptr_locked
 	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	bool kthread = p->flags & PF_KTHREAD;
+	struct cpumask *user_mask = NULL;
 	unsigned int dest_cpu;
 	int ret = 0;
 
@@ -2792,9 +2793,13 @@ static int __set_cpus_allowed_ptr_locked
 	__do_set_cpus_allowed(p, new_mask, flags);
 
 	if (flags & SCA_USER)
-		release_user_cpus_ptr(p);
+		swap(user_mask, p->user_cpus_ptr);
+
+	ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+
+	kfree(user_mask);
 
-	return affine_move_task(rq, p, rf, dest_cpu, flags);
+	return ret;
 
 out:
 	task_rq_unlock(rq, p, rf);
@@ -2954,8 +2959,10 @@ void relax_compatible_cpus_allowed_ptr(s
 		return;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	release_user_cpus_ptr(p);
+	p->user_cpus_ptr = NULL;
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	kfree(mask);
 }
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
Peter Zijlstra Aug. 17, 2021, 3:41 p.m. UTC | #2
On Fri, Jul 30, 2021 at 12:24:35PM +0100, Will Deacon wrote:
> +	struct rq_flags rf;
> +	struct rq *rq;
> +	int err;
> +	struct cpumask *user_mask = NULL;

> +	cpumask_var_t new_mask;
> +	const struct cpumask *override_mask = task_cpu_possible_mask(p);

> +	unsigned long flags;
> +	struct cpumask *mask = p->user_cpus_ptr;

I've fixed all that up to be proper reverse x-mas trees; similar for
other patches.
Will Deacon Aug. 18, 2021, 10:42 a.m. UTC | #3
Hi Peter,

On Tue, Aug 17, 2021 at 05:10:53PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 30, 2021 at 12:24:35PM +0100, Will Deacon wrote:
> > @@ -2783,20 +2778,173 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
> >  
> >  	__do_set_cpus_allowed(p, new_mask, flags);
> >  
> > -	return affine_move_task(rq, p, &rf, dest_cpu, flags);
> > +	if (flags & SCA_USER)
> > +		release_user_cpus_ptr(p);
> > +
> > +	return affine_move_task(rq, p, rf, dest_cpu, flags);
> >  
> >  out:
> > -	task_rq_unlock(rq, p, &rf);
> > +	task_rq_unlock(rq, p, rf);
> >  
> >  	return ret;
> >  }
> 
> > +void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
> > +{
> > +	unsigned long flags;
> > +	struct cpumask *mask = p->user_cpus_ptr;
> > +
> > +	/*
> > +	 * Try to restore the old affinity mask. If this fails, then
> > +	 * we free the mask explicitly to avoid it being inherited across
> > +	 * a subsequent fork().
> > +	 */
> > +	if (!mask || !__sched_setaffinity(p, mask))
> > +		return;
> > +
> > +	raw_spin_lock_irqsave(&p->pi_lock, flags);
> > +	release_user_cpus_ptr(p);
> > +	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> > +}
> 
> Both these are a problem on RT.

Ah, sorry. I didn't realise you couldn't _free_ with a raw lock held in RT.
Is there somewhere I can read up on that?

> The easiest recourse is simply never freeing the CPU mask (except on
> exit). The alternative is something like the below I suppose..
> 
> I'm leaning towards the former option, wdyt?

Deferring the freeing until exit feels a little fiddly, as we still
want to clear ->user_cpus_ptr on affinity changes when SCA_USER is set,
so we'd have to keep track of the mask somewhere and reuse it instead
of allocating a new one if we need it later on. Do-able, but it feels a
bit nasty, particularly across fork().

As for your other suggestion:

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2733,6 +2733,7 @@ static int __set_cpus_allowed_ptr_locked
>  	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
>  	const struct cpumask *cpu_valid_mask = cpu_active_mask;
>  	bool kthread = p->flags & PF_KTHREAD;
> +	struct cpumask *user_mask = NULL;
>  	unsigned int dest_cpu;
>  	int ret = 0;
>  
> @@ -2792,9 +2793,13 @@ static int __set_cpus_allowed_ptr_locked
>  	__do_set_cpus_allowed(p, new_mask, flags);
>  
>  	if (flags & SCA_USER)
> -		release_user_cpus_ptr(p);
> +		swap(user_mask, p->user_cpus_ptr);
> +
> +	ret = affine_move_task(rq, p, rf, dest_cpu, flags);
> +
> +	kfree(user_mask);
>  
> -	return affine_move_task(rq, p, rf, dest_cpu, flags);
> +	return ret;
>  
>  out:
>  	task_rq_unlock(rq, p, rf);
> @@ -2954,8 +2959,10 @@ void relax_compatible_cpus_allowed_ptr(s
>  		return;
>  
>  	raw_spin_lock_irqsave(&p->pi_lock, flags);
> -	release_user_cpus_ptr(p);
> +	p->user_cpus_ptr = NULL;
>  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> +
> +	kfree(mask);

I think the idea looks good, but perhaps we could wrap things up a bit:

/* Comment about why this is useful with RT */
static cpumask_t *clear_user_cpus_ptr(struct task_struct *p)
{
	struct cpumask *user_mask = NULL;

	swap(user_mask, p->user_cpus_ptr);
	return user_mask;
}

void release_user_cpus_ptr(struct task_struct *p)
{
	kfree(clear_user_cpus_ptr(p));
}

Then just use clear_user_cpus_ptr() in sched/core.c where we know what
we're doing (well, at least one of us does!).

Will
Will Deacon Aug. 18, 2021, 10:43 a.m. UTC | #4
On Tue, Aug 17, 2021 at 05:41:42PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 30, 2021 at 12:24:35PM +0100, Will Deacon wrote:
> > +	struct rq_flags rf;
> > +	struct rq *rq;
> > +	int err;
> > +	struct cpumask *user_mask = NULL;
> 
> > +	cpumask_var_t new_mask;
> > +	const struct cpumask *override_mask = task_cpu_possible_mask(p);
> 
> > +	unsigned long flags;
> > +	struct cpumask *mask = p->user_cpus_ptr;
> 
> I've fixed all that up to be proper reverse x-mas trees; similar for
> other patches.

Thanks.

Will
Peter Zijlstra Aug. 18, 2021, 10:56 a.m. UTC | #5
On Wed, Aug 18, 2021 at 11:42:28AM +0100, Will Deacon wrote:
> As for your other suggestion:
> 
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -2733,6 +2733,7 @@ static int __set_cpus_allowed_ptr_locked
> >  	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
> >  	const struct cpumask *cpu_valid_mask = cpu_active_mask;
> >  	bool kthread = p->flags & PF_KTHREAD;
> > +	struct cpumask *user_mask = NULL;
> >  	unsigned int dest_cpu;
> >  	int ret = 0;
> >  
> > @@ -2792,9 +2793,13 @@ static int __set_cpus_allowed_ptr_locked
> >  	__do_set_cpus_allowed(p, new_mask, flags);
> >  
> >  	if (flags & SCA_USER)
> > -		release_user_cpus_ptr(p);
> > +		swap(user_mask, p->user_cpus_ptr);
> > +
> > +	ret = affine_move_task(rq, p, rf, dest_cpu, flags);
> > +
> > +	kfree(user_mask);
> >  
> > -	return affine_move_task(rq, p, rf, dest_cpu, flags);
> > +	return ret;
> >  
> >  out:
> >  	task_rq_unlock(rq, p, rf);
> > @@ -2954,8 +2959,10 @@ void relax_compatible_cpus_allowed_ptr(s
> >  		return;
> >  
> >  	raw_spin_lock_irqsave(&p->pi_lock, flags);
> > -	release_user_cpus_ptr(p);
> > +	p->user_cpus_ptr = NULL;
> >  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> > +
> > +	kfree(mask);
> 
> I think the idea looks good, but perhaps we could wrap things up a bit:
> 
> /* Comment about why this is useful with RT */
> static cpumask_t *clear_user_cpus_ptr(struct task_struct *p)
> {
> 	struct cpumask *user_mask = NULL;
> 
> 	swap(user_mask, p->user_cpus_ptr);
> 	return user_mask;
> }
> 
> void release_user_cpus_ptr(struct task_struct *p)
> {
> 	kfree(clear_user_cpus_ptr(p));
> }
> 
> Then just use clear_user_cpus_ptr() in sched/core.c where we know what
> we're doing (well, at least one of us does!).

OK, I'll go make it like that.
Peter Zijlstra Aug. 18, 2021, 11:06 a.m. UTC | #6
On Wed, Aug 18, 2021 at 11:42:28AM +0100, Will Deacon wrote:

> Ah, sorry. I didn't realise you couldn't _free_ with a raw lock held in RT.
> Is there somewhere I can read up on that?

It's because the allocators use spinlock_t, which cannot nest inside
raw_spinlock_t.
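
To make the constraint concrete, here is a minimal sketch of the pattern
this thread converges on (illustrative only; the helper name is made up):
detach the mask while holding the raw lock, then kfree() it only after the
lock has been dropped, because on PREEMPT_RT the allocator's spinlock_t is
a sleeping lock and must not be taken under a raw_spinlock_t:

static void drop_user_cpus_ptr(struct task_struct *p)	/* hypothetical name */
{
	struct cpumask *user_mask = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	swap(user_mask, p->user_cpus_ptr);	/* detach under the raw lock */
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	kfree(user_mask);			/* free only once the lock is dropped */
}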
Peter Zijlstra Aug. 18, 2021, 11:53 a.m. UTC | #7
On Wed, Aug 18, 2021 at 12:56:41PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 18, 2021 at 11:42:28AM +0100, Will Deacon wrote:

> > I think the idea looks good, but perhaps we could wrap things up a bit:
> > 
> > /* Comment about why this is useful with RT */
> > static cpumask_t *clear_user_cpus_ptr(struct task_struct *p)
> > {
> > 	struct cpumask *user_mask = NULL;
> > 
> > 	swap(user_mask, p->user_cpus_ptr);
> > 	return user_mask;
> > }
> > 
> > void release_user_cpus_ptr(struct task_struct *p)
> > {
> > 	kfree(clear_user_cpus_ptr(p));
> > }
> > 
> > Then just use clear_user_cpus_ptr() in sched/core.c where we know what
> > we're doing (well, at least one of us does!).
> 
> OK, I'll go make it like that.

Something like so then?

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2497,10 +2497,18 @@ int dup_user_cpus_ptr(struct task_struct
 	return 0;
 }
 
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+	struct cpumask *user_mask = NULL;
+
+	swap(p->user_cpus_ptr, user_mask);
+
+	return user_mask;
+}
+
 void release_user_cpus_ptr(struct task_struct *p)
 {
-	kfree(p->user_cpus_ptr);
-	p->user_cpus_ptr = NULL;
+	kfree(clear_user_cpus_ptr(p));
 }
 
 /*
@@ -2733,6 +2741,7 @@ static int __set_cpus_allowed_ptr_locked
 	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	bool kthread = p->flags & PF_KTHREAD;
+	struct cpumask *user_mask = NULL;
 	unsigned int dest_cpu;
 	int ret = 0;
 
@@ -2792,9 +2801,13 @@ static int __set_cpus_allowed_ptr_locked
 	__do_set_cpus_allowed(p, new_mask, flags);
 
 	if (flags & SCA_USER)
-		release_user_cpus_ptr(p);
+		user_mask = clear_user_cpus_ptr(p);
 
-	return affine_move_task(rq, p, rf, dest_cpu, flags);
+	ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+
+	kfree(user_mask);
+
+	return ret;
 
 out:
 	task_rq_unlock(rq, p, rf);
@@ -2941,20 +2954,22 @@ __sched_setaffinity(struct task_struct *
  */
 void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
 {
+	struct cpumask *user_mask = p->user_cpus_ptr;
 	unsigned long flags;
-	struct cpumask *mask = p->user_cpus_ptr;
 
 	/*
 	 * Try to restore the old affinity mask. If this fails, then
 	 * we free the mask explicitly to avoid it being inherited across
 	 * a subsequent fork().
 	 */
-	if (!mask || !__sched_setaffinity(p, mask))
+	if (!user_mask || !__sched_setaffinity(p, user_mask))
 		return;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	release_user_cpus_ptr(p);
+	user_mask = clear_user_cpus_ptr(p);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	kfree(user_mask);
 }
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
Will Deacon Aug. 18, 2021, 12:19 p.m. UTC | #8
On Wed, Aug 18, 2021 at 01:53:28PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 18, 2021 at 12:56:41PM +0200, Peter Zijlstra wrote:
> > On Wed, Aug 18, 2021 at 11:42:28AM +0100, Will Deacon wrote:
> 
> > > I think the idea looks good, but perhaps we could wrap things up a bit:
> > > 
> > > /* Comment about why this is useful with RT */
> > > static cpumask_t *clear_user_cpus_ptr(struct task_struct *p)
> > > {
> > > 	struct cpumask *user_mask = NULL;
> > > 
> > > 	swap(user_mask, p->user_cpus_ptr);
> > > 	return user_mask;
> > > }
> > > 
> > > void release_user_cpus_ptr(struct task_struct *p)
> > > {
> > > 	kfree(clear_user_cpus_ptr(p));
> > > }
> > > 
> > > Then just use clear_user_cpus_ptr() in sched/core.c where we know what
> > > we're doing (well, at least one of us does!).
> > 
> > OK, I'll go make it like that.
> 
> Something like so then?

Looks good to me, thanks!

Will

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 91dab7a62aa1..2ebe3d6f8f0c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1708,6 +1708,8 @@  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new
 extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
 extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
 extern void release_user_cpus_ptr(struct task_struct *p);
+extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
+extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
 #else
 static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d4219d366103..aec75ec1d257 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2707,27 +2707,22 @@  static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 }
 
 /*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
  */
-static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask,
-				  u32 flags)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+					 const struct cpumask *new_mask,
+					 u32 flags,
+					 struct rq *rq,
+					 struct rq_flags *rf)
+	__releases(rq->lock)
+	__releases(p->pi_lock)
 {
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
 	unsigned int dest_cpu;
-	struct rq_flags rf;
-	struct rq *rq;
 	int ret = 0;
 	bool kthread = p->flags & PF_KTHREAD;
 
-	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
 	if (kthread || is_migration_disabled(p)) {
@@ -2783,20 +2778,173 @@  static int __set_cpus_allowed_ptr(struct task_struct *p,
 
 	__do_set_cpus_allowed(p, new_mask, flags);
 
-	return affine_move_task(rq, p, &rf, dest_cpu, flags);
+	if (flags & SCA_USER)
+		release_user_cpus_ptr(p);
+
+	return affine_move_task(rq, p, rf, dest_cpu, flags);
 
 out:
-	task_rq_unlock(rq, p, &rf);
+	task_rq_unlock(rq, p, rf);
 
 	return ret;
 }
 
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, u32 flags)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(p, &rf);
+	return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+}
+
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+ * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+				     struct cpumask *new_mask,
+				     const struct cpumask *subset_mask)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+	int err;
+	struct cpumask *user_mask = NULL;
+
+	if (!p->user_cpus_ptr) {
+		user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+
+		if (!user_mask)
+			return -ENOMEM;
+	}
+
+	rq = task_rq_lock(p, &rf);
+
+	/*
+	 * Forcefully restricting the affinity of a deadline task is
+	 * likely to cause problems, so fail and noisily override the
+	 * mask entirely.
+	 */
+	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+		err = -EPERM;
+		goto err_unlock;
+	}
+
+	if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+		err = -EINVAL;
+		goto err_unlock;
+	}
+
+	/*
+	 * We're about to butcher the task affinity, so keep track of what
+	 * the user asked for in case we're able to restore it later on.
+	 */
+	if (user_mask) {
+		cpumask_copy(user_mask, p->cpus_ptr);
+		p->user_cpus_ptr = user_mask;
+	}
+
+	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+
+err_unlock:
+	task_rq_unlock(rq, p, &rf);
+	kfree(user_mask);
+	return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+	cpumask_var_t new_mask;
+	const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+	alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+	/*
+	 * __migrate_task() can fail silently in the face of concurrent
+	 * offlining of the chosen destination CPU, so take the hotplug
+	 * lock to ensure that the migration succeeds.
+	 */
+	cpus_read_lock();
+	if (!cpumask_available(new_mask))
+		goto out_set_mask;
+
+	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+		goto out_free_mask;
+
+	/*
+	 * We failed to find a valid subset of the affinity mask for the
+	 * task, so override it based on its cpuset hierarchy.
+	 */
+	cpuset_cpus_allowed(p, new_mask);
+	override_mask = new_mask;
+
+out_set_mask:
+	if (printk_ratelimit()) {
+		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+				task_pid_nr(p), p->comm,
+				cpumask_pr_args(override_mask));
+	}
+
+	WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+	cpus_read_unlock();
+	free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+ * @p->user_cpus_ptr.
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+	unsigned long flags;
+	struct cpumask *mask = p->user_cpus_ptr;
+
+	/*
+	 * Try to restore the old affinity mask. If this fails, then
+	 * we free the mask explicitly to avoid it being inherited across
+	 * a subsequent fork().
+	 */
+	if (!mask || !__sched_setaffinity(p, mask))
+		return;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	release_user_cpus_ptr(p);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -7613,7 +7761,7 @@  __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
 	if (retval)
 		goto out_free_new_mask;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14a41a243f7b..e88c2d399f0d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2234,6 +2234,7 @@  extern struct task_struct *pick_next_task_idle(struct rq *rq);
 #define SCA_CHECK		0x01
 #define SCA_MIGRATE_DISABLE	0x02
 #define SCA_MIGRATE_ENABLE	0x04
+#define SCA_USER		0x08
 
 #ifdef CONFIG_SMP