[23/27] x86/fpu: Defer FPU state load until return to userspace

Message ID 20190403164156.19645-24-bigeasy@linutronix.de (mailing list archive)
State New, archived
Series [01/27] x86/fpu: Remove fpu->initialized usage in __fpu__restore_sig()

Commit Message

Sebastian Andrzej Siewior April 3, 2019, 4:41 p.m. UTC
From: Rik van Riel <riel@surriel.com>

Defer loading of FPU state until return to userspace. This gives
the kernel the potential to skip loading FPU state for tasks that
stay in kernel mode, or for tasks that end up with repeated
invocations of kernel_fpu_begin() & kernel_fpu_end().
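
In essence the deferral is a check on the exit-to-usermode path (a
condensed sketch of the arch/x86/entry/common.c hunk below):

	/* in prepare_exit_to_usermode(), after the exit-work loop */
	fpregs_assert_state_consistent();
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();	/* load current->thread.fpu.state */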

The __fpregs_changes_{begin|end}() section ensures that the registers
remain unchanged. Otherwise a context switch or a BH could save the
registers to its FPU context and the processor's FPU registers would
become random if modified at the same time.
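
Such a section boils down to (simplified from the fpu__save() hunk
below, where it is spelled fpregs_lock()/fpregs_unlock()):

	fpregs_lock();		/* preempt_disable() + local_bh_disable() */
	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		copy_fpregs_to_fpstate(fpu);	/* registers are live, save them */
	fpregs_unlock();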

KVM swaps the host/guest registers on the entry/exit path. I kept the
flow as is: first it ensures that the registers are loaded, then it
saves the current (host) state before it loads the guest's registers.
The swap is done at the very end with interrupts disabled so the state
should not change anymore before the guest is entered. The read/save
version seems to be cheaper compared to memcpy() in a micro benchmark.
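
The guest-side swap, condensed from the kvm_load_guest_fpu() hunk below:

	fpregs_lock();
	copy_fpregs_to_fpstate(&current->thread.fpu);	/* save host state */
	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
				~XFEATURE_MASK_PKRU);	/* load guest state */
	fpregs_mark_activate();
	fpregs_unlock();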

Each thread gets TIF_NEED_FPU_LOAD set as part of fork() / fpu__copy().
For kernel threads, this flag is never cleared, which avoids saving /
restoring the FPU state for kernel threads and during in-kernel usage of
the FPU registers.

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/entry/common.c             |   8 +++
 arch/x86/include/asm/fpu/api.h      |  22 +++++-
 arch/x86/include/asm/fpu/internal.h |  27 ++++---
 arch/x86/include/asm/trace/fpu.h    |   5 +-
 arch/x86/kernel/fpu/core.c          | 105 +++++++++++++++++++++-------
 arch/x86/kernel/fpu/signal.c        |  48 ++++++++-----
 arch/x86/kernel/process.c           |   2 +-
 arch/x86/kernel/process_32.c        |   5 +-
 arch/x86/kernel/process_64.c        |   5 +-
 arch/x86/kvm/x86.c                  |  20 ++++--
 10 files changed, 181 insertions(+), 66 deletions(-)

Comments

Borislav Petkov April 12, 2019, 2:36 p.m. UTC | #1
On Wed, Apr 03, 2019 at 06:41:52PM +0200, Sebastian Andrzej Siewior wrote:
> @@ -226,10 +236,9 @@ static void fpu__initialize(struct fpu *fpu)
>  {
>  	WARN_ON_FPU(fpu != &current->thread.fpu);
>  
> +	set_thread_flag(TIF_NEED_FPU_LOAD);
>  	fpstate_init(&fpu->state);
>  	trace_x86_fpu_init_state(fpu);
> -
> -	trace_x86_fpu_activate_state(fpu);

That is called nowhere after this patch.

Shouldn't it be called below, before fpregs_activate() because
fpregs_activate() does trace_x86_fpu_regs_activated()?

>  /*
> @@ -308,6 +317,8 @@ void fpu__drop(struct fpu *fpu)
>   */
>  static inline void copy_init_fpstate_to_fpregs(void)
>  {
> +	fpregs_lock();
> +
>  	if (use_xsave())
>  		copy_kernel_to_xregs(&init_fpstate.xsave, -1);
>  	else if (static_cpu_has(X86_FEATURE_FXSR))
> @@ -317,6 +328,9 @@ static inline void copy_init_fpstate_to_fpregs(void)
>  
>  	if (boot_cpu_has(X86_FEATURE_OSPKE))
>  		copy_init_pkru_to_fpregs();
> +
> +	fpregs_mark_activate();
> +	fpregs_unlock();
>  }
>  
>  /*
> @@ -339,6 +353,45 @@ void fpu__clear(struct fpu *fpu)
>  		copy_init_fpstate_to_fpregs();
>  }
>  
> +/*
> + * Load FPU context before returning to userspace.
> + */
> +void switch_fpu_return(void)
> +{
> +	if (!static_cpu_has(X86_FEATURE_FPU))
> +		return;
> +
> +	__fpregs_load_activate();
> +}
> +EXPORT_SYMBOL_GPL(switch_fpu_return);
> +
> +#ifdef CONFIG_X86_DEBUG_FPU
> +/*
> + * If current FPU state according to its tracking (loaded FPU ctx on this CPU)
> + * is not valid then we must have TIF_NEED_FPU_LOAD set so the context is loaded on
> + * return to userland.
> + */
> +void fpregs_assert_state_consistent(void)
> +{
> +       struct fpu *fpu = &current->thread.fpu;
> +
> +       if (test_thread_flag(TIF_NEED_FPU_LOAD))
> +               return;
> +       WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
> +}
> +EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
> +#endif
> +
> +void fpregs_mark_activate(void)
> +{
> +	struct fpu *fpu = &current->thread.fpu;
> +

<--- here?

> +	fpregs_activate(fpu);
> +	fpu->last_cpu = smp_processor_id();
> +	clear_thread_flag(TIF_NEED_FPU_LOAD);
> +}
> +EXPORT_SYMBOL_GPL(fpregs_mark_activate);
Sebastian Andrzej Siewior April 12, 2019, 3:24 p.m. UTC | #2
On 2019-04-12 16:36:15 [+0200], Borislav Petkov wrote:
> On Wed, Apr 03, 2019 at 06:41:52PM +0200, Sebastian Andrzej Siewior wrote:
> > @@ -226,10 +236,9 @@ static void fpu__initialize(struct fpu *fpu)
> >  {
> >  	WARN_ON_FPU(fpu != &current->thread.fpu);
> >  
> > +	set_thread_flag(TIF_NEED_FPU_LOAD);
> >  	fpstate_init(&fpu->state);
> >  	trace_x86_fpu_init_state(fpu);
> > -
> > -	trace_x86_fpu_activate_state(fpu);
> 
> That is called nowhere after this patch.

Isn't it called from fpu__clear()?

> Shouldn't it be called below, before fpregs_activate() because
> fpregs_activate() does trace_x86_fpu_regs_activated()?

Why? fpu__initialize() wipes the FPU state and starts from zero.
fpregs_mark_activate(), on the other hand, marks that this FPU context
is currently active.

Sebastian
Borislav Petkov April 12, 2019, 4:22 p.m. UTC | #3
On Fri, Apr 12, 2019 at 05:24:37PM +0200, Sebastian Andrzej Siewior wrote:
> Isn't it called from fpu__clear()?

$ git grep trace_x86_fpu_activate_state
$

all 23 patches applied. Grepping the later patches doesn't give
trace_x86_fpu_activate_state() either.

> > Shouldn't it be called below, before fpregs_activate() because
> > fpregs_activate() does trace_x86_fpu_regs_activated()?
> 
> Why? fpu__initialize() wipes the FPU state and starts from zero.
> fpregs_mark_activate(), on the other hand, marks that this FPU context
> is currently active.

Well, then the naming still needs adjusting.

"trace_x86_fpu_activate_state" reads to me like the state is being
activated here, at the call site. And fpregs_mark_activate() marks which
*fpu is the active one.

Hmm.
Sebastian Andrzej Siewior April 12, 2019, 4:37 p.m. UTC | #4
On 2019-04-12 18:22:13 [+0200], Borislav Petkov wrote:
> On Fri, Apr 12, 2019 at 05:24:37PM +0200, Sebastian Andrzej Siewior wrote:
> > Isn't it called from fpu__clear()?
> 
> $ git grep trace_x86_fpu_activate_state
> $
> 
> all 23 patches applied. Grepping the later patches doesn't give
> trace_x86_fpu_activate_state() either.
> 
> > > Shouldn't it be called below, before fpregs_activate() because
> > > fpregs_activate() does trace_x86_fpu_regs_activated()?
> > 
> > Why? fpu__initialize() wipes the FPU state and starts from zero.
> > fpregs_mark_activate(), on the other hand, marks that this FPU context
> > is currently active.
> 
> Well, then the naming still needs adjusting.
> 
> "trace_x86_fpu_activate_state" reads to me like the state is being
> activated here, at the call site. And fpregs_mark_activate() marks which
> *fpu is the active one.

bah. You are referring to trace_x86_fpu_activate_state. I parsed this as
fpu__initialize(). Sorry for that.

trace_x86_fpu_activate_state is unused and we should do something about
it.  Adding it to fpregs_mark_activate() seems to make sense.
We also have this:
 fpregs_mark_activate()
   fpregs_activate()
      trace_x86_fpu_regs_activated()

(as you mentioned) so we would always record both trace points.
Therefore I would suggest to remove it.
Maybe we could add a new one to __fpregs_load_activate() in case we
avoid loading registers because of fpregs_state_valid(). This might make
sense.
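
Roughly (just a sketch on top of the new __fpregs_load_activate(); the
trace point name here is made up):

	static inline void __fpregs_load_activate(void)
	{
		struct fpu *fpu = &current->thread.fpu;
		int cpu = smp_processor_id();

		if (!fpregs_state_valid(fpu, cpu)) {
			copy_kernel_to_fpregs(&fpu->state);
			fpregs_activate(fpu);
			fpu->last_cpu = cpu;
		} else {
			trace_x86_fpu_regs_still_valid(fpu);	/* hypothetical */
		}
		clear_thread_flag(TIF_NEED_FPU_LOAD);
	}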

Sebastian
Borislav Petkov April 12, 2019, 4:48 p.m. UTC | #5
On Fri, Apr 12, 2019 at 06:37:41PM +0200, Sebastian Andrzej Siewior wrote:
> (as you mentioned) so we would always record both trace points.
> Therefore I would suggest to remove it.

Remove which one?

Recording both TPs seems to make sense unless it doesn't make a whole
lotta sense to have:

fpregs_mark_activate
|-> trace_x86_fpu_activate_state		<-- TP1
|-> fpregs_activate
    |-> trace_x86_fpu_regs_activated		<-- TP2


Yeah, looks like the two are too much and too close for no good
reason. There's nothing particularly spectacular in-between in
fpregs_activate().

> > Maybe we could add a new one to __fpregs_load_activate() in case we
> avoid loading registers because of fpregs_state_valid(). This might make
> sense.

But that's only the switch_fpu_return() path. Is fpregs_mark_activate()
going to use only the trace_x86_fpu_regs_activated() one? Note the
"d" at the end.

  [ Btw, those two names need adjusting too: who came up with such close,
     confusing names?!
     ]
Sebastian Andrzej Siewior April 12, 2019, 5:19 p.m. UTC | #6
On 2019-04-12 18:48:28 [+0200], Borislav Petkov wrote:
> On Fri, Apr 12, 2019 at 06:37:41PM +0200, Sebastian Andrzej Siewior wrote:
> > (as you mentioned) so we would always record both trace points.
> > Therefore I would suggest to remove it.
> 
> Remove which one?

remove x86_fpu_activate_state.

> Recording both TPs seems to make sense unless it doesn't make a whole
> lotta sense to have:
> 
> fpregs_mark_activate
> |-> trace_x86_fpu_activate_state		<-- TP1
> |-> fpregs_activate
>     |-> trace_x86_fpu_regs_activated		<-- TP2

> Yeah, looks like the two are too much and too close for no good
> reason. There's nothing particularly spectacular in-between in
> fpregs_activate().

correct.

> > Maybe we could add a new one to __fpregs_load_activate() in case we
> > avoid loading registers because of fpregs_state_valid(). This might make
> > sense.
> 
> But that's only the switch_fpu_return() path. Is fpregs_mark_activate()
> going to use only the trace_x86_fpu_regs_activated() one? Note the
> "d" at the end.

You could have a trace-point in case we return to userland and we don't
load FPU registers because it looks like they are valid.
It was just an idea.

We could also rip all tracepoints out, rethink the situation and add new
ones based on current code.
- on schedule out, we may need to save registers (depending on
  TIF_NEED_FPU_LOAD) which is new. Before the series we always did.

- on schedule in do nothing but set that TIF bit. This is probably
  boring.

- on return to userland we should load the registers but can avoid it if
  we assume that they are valid for the current task
  (fpregs_state_valid())

- in kernel_fpu_begin() we may trash the task's FPU state (by saving its
  registers or by resetting fpu_fpregs_owner_ctx).

Those might be interesting.
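
Each of those could reuse the existing x86_fpu event class, for instance
(event name invented, only to illustrate the idea):

	DEFINE_EVENT(x86_fpu, x86_fpu_sched_out_save,	/* hypothetical */
		TP_PROTO(struct fpu *fpu),
		TP_ARGS(fpu)
	);

called from switch_fpu_prepare() only when we actually save the
registers.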

Currently we have:
  "x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx"

and we have to find out what happens based on where that TP was
recorded. Also I'm not sure if the recorded xfeatures change over time.
I think they do not…

>   [ Btw, those two names need adjusting too: who came up with such close,
>      confusing names?!
>      ]

you mean trace_x86_fpu_activate_state and trace_x86_fpu_regs_activated?
They were added in d1898b733619b ("x86/fpu: Add tracepoints to dump FPU
state at key points") and we wouldn't have any otherwise.

Sebastian
Borislav Petkov April 12, 2019, 5:29 p.m. UTC | #7
On Fri, Apr 12, 2019 at 07:19:55PM +0200, Sebastian Andrzej Siewior wrote:
> remove x86_fpu_activate_state.

Ok, zapping it as part of this patch.

> We could also rip all tracepoints out, rethink the situation and add new
> ones based on current code.

Well, since this changes FPU regs handling considerably, I think the
only correct step would be to adjust the tracepoints. BUT(!) we should
not forget we're not supposed to break luserspace.

> - on schedule out, we may need to save registers (depending on
>   TIF_NEED_FPU_LOAD) which is new. Before the series we always did.

That makes sense.

> - on schedule in do nothing but set that TIF bit. This is probably
>   boring.

Yah.

> - on return to userland we should load the registers but can avoid it if
>   we assume that they are valid for the current task
>   (fpregs_state_valid())

That is interesting info.

> - in kernel_fpu_begin() we may trash the task's FPU state (by saving its
>   registers or by resetting fpu_fpregs_owner_ctx).

Do we care?

You mean, in case you have workloads which might involve a lot of
in-kernel FPU use which would punish task context switches?

> Those might be interesting.
> 
> Currently we have:
>   "x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx"
> 
> and we have to find out what happens based on where that TP was
> recorded. Also I'm not sure if the recorded xfeatures change over time.
> I think they do not…

Good question.

> you mean trace_x86_fpu_activate_state and trace_x86_fpu_regs_activated?
> They were added in d1898b733619b ("x86/fpu: Add tracepoints to dump FPU
> state at key points") and we wouldn't have any otherwise.

I guess it made sense then...
Sebastian Andrzej Siewior April 15, 2019, 9:14 a.m. UTC | #8
On 2019-04-12 19:29:31 [+0200], Borislav Petkov wrote:
> > - in kernel_fpu_begin() we may trash the task's FPU state (by saving its
> >   registers or by resetting fpu_fpregs_owner_ctx).
> 
> Do we care?
> 
> You mean, in case you have workloads which might involve a lot of
> in-kernel FPU use which would punish task context switches?

Well, it depends on how you look at it. So for instance:
- Does kernel_fpu_begin() save the task's FPU registers or are they
  already stored and it is a nop?
- The task avoids loading its registers on return to userland, but FPU
  usage in BH made it impossible.
- Is there any FPU usage in BH?

And there is this: my program usually works but with this scheduling
pattern and FPU usage in the kernel it computes a wrong result.
 
Sebastian

Patch

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 7bc105f47d21a..13e8e29af6ab7 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -31,6 +31,7 @@ 
 #include <asm/vdso.h>
 #include <linux/uaccess.h>
 #include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -196,6 +197,13 @@  __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
 		exit_to_usermode_loop(regs, cached_flags);
 
+	/* Reload ti->flags; we may have rescheduled above. */
+	cached_flags = READ_ONCE(ti->flags);
+
+	fpregs_assert_state_consistent();
+	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
+		switch_fpu_return();
+
 #ifdef CONFIG_COMPAT
 	/*
 	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 73e684160f354..fc3dd044e16dd 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -10,7 +10,7 @@ 
 
 #ifndef _ASM_X86_FPU_API_H
 #define _ASM_X86_FPU_API_H
-#include <linux/preempt.h>
+#include <linux/bottom_half.h>
 
 /*
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
@@ -22,17 +22,37 @@ 
 extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
 extern bool irq_fpu_usable(void);
+extern void fpregs_mark_activate(void);
 
+/*
+ * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
+ * A context switch will (and a softirq might) save the CPU's FPU registers to
+ * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in a random
+ * state.
+ */
 static inline void fpregs_lock(void)
 {
 	preempt_disable();
+	local_bh_disable();
 }
 
 static inline void fpregs_unlock(void)
 {
+	local_bh_enable();
 	preempt_enable();
 }
 
+#ifdef CONFIG_X86_DEBUG_FPU
+extern void fpregs_assert_state_consistent(void);
+#else
+static inline void fpregs_assert_state_consistent(void) { }
+#endif
+
+/*
+ * Load the task FPU state before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
 /*
  * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
  *
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 037472d0be140..7a143b86d3a65 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -30,7 +30,7 @@  extern void fpu__prepare_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
 extern void fpu__drop(struct fpu *fpu);
-extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
+extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
 extern void fpu__clear(struct fpu *fpu);
 extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
 extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
@@ -528,13 +528,20 @@  static inline void fpregs_activate(struct fpu *fpu)
 	trace_x86_fpu_regs_activated(fpu);
 }
 
-static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
+static inline void __fpregs_load_activate(void)
 {
+	struct fpu *fpu = &current->thread.fpu;
+	int cpu = smp_processor_id();
+
+	if (WARN_ON_ONCE(current->mm == NULL))
+		return;
+
 	if (!fpregs_state_valid(fpu, cpu)) {
-		if (current->mm)
-			copy_kernel_to_fpregs(&fpu->state);
+		copy_kernel_to_fpregs(&fpu->state);
 		fpregs_activate(fpu);
+		fpu->last_cpu = cpu;
 	}
+	clear_thread_flag(TIF_NEED_FPU_LOAD);
 }
 
 /*
@@ -545,8 +552,8 @@  static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
  *  - switch_fpu_prepare() saves the old state.
  *    This is done within the context of the old process.
  *
- *  - switch_fpu_finish() restores the new state as
- *    necessary.
+ *  - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
+ *    will get loaded on return to userspace, or when the kernel needs it.
  *
  * The FPU context is only stored/restore for user task and ->mm is used to
  * distinguish between kernel and user threads.
@@ -576,10 +583,10 @@  switch_fpu_prepare(struct fpu *old_fpu, int cpu)
  */
 
 /*
- * Set up the userspace FPU context for the new task, if the task
- * has used the FPU.
+ * Load PKRU from the FPU context if available. Delay the loading of the
+ * complete FPU state until the return to userland.
  */
-static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
+static inline void switch_fpu_finish(struct fpu *new_fpu)
 {
 	struct pkru_state *pk;
 	u32 pkru_val = init_pkru_value;
@@ -587,7 +594,7 @@  static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 	if (!static_cpu_has(X86_FEATURE_FPU))
 		return;
 
-	__fpregs_load_activate(new_fpu, cpu);
+	set_thread_flag(TIF_NEED_FPU_LOAD);
 
 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
 		return;
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index bd65f6ba950f8..91a1422091ceb 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -13,19 +13,22 @@  DECLARE_EVENT_CLASS(x86_fpu,
 
 	TP_STRUCT__entry(
 		__field(struct fpu *, fpu)
+		__field(bool, load_fpu)
 		__field(u64, xfeatures)
 		__field(u64, xcomp_bv)
 		),
 
 	TP_fast_assign(
 		__entry->fpu		= fpu;
+		__entry->load_fpu	= test_thread_flag(TIF_NEED_FPU_LOAD);
 		if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
 			__entry->xfeatures = fpu->state.xsave.header.xfeatures;
 			__entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
 		}
 	),
-	TP_printk("x86/fpu: %p xfeatures: %llx xcomp_bv: %llx",
+	TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
 			__entry->fpu,
+			__entry->load_fpu,
 			__entry->xfeatures,
 			__entry->xcomp_bv
 	)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 739ca3ae2bdcd..7d68404a88d52 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -102,23 +102,20 @@  static void __kernel_fpu_begin(void)
 	kernel_fpu_disable();
 
 	if (current->mm) {
-		/*
-		 * Ignore return value -- we don't care if reg state
-		 * is clobbered.
-		 */
-		copy_fpregs_to_fpstate(fpu);
-	} else {
-		__cpu_invalidate_fpregs_state();
+		if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+			set_thread_flag(TIF_NEED_FPU_LOAD);
+			/*
+			 * Ignore return value -- we don't care if reg state
+			 * is clobbered.
+			 */
+			copy_fpregs_to_fpstate(fpu);
+		}
 	}
+	__cpu_invalidate_fpregs_state();
 }
 
 static void __kernel_fpu_end(void)
 {
-	struct fpu *fpu = &current->thread.fpu;
-
-	if (current->mm)
-		copy_kernel_to_fpregs(&fpu->state);
-
 	kernel_fpu_enable();
 }
 
@@ -145,14 +142,17 @@  void fpu__save(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
-	preempt_disable();
+	fpregs_lock();
 	trace_x86_fpu_before_save(fpu);
 
-	if (!copy_fpregs_to_fpstate(fpu))
-		copy_kernel_to_fpregs(&fpu->state);
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+		if (!copy_fpregs_to_fpstate(fpu)) {
+			copy_kernel_to_fpregs(&fpu->state);
+		}
+	}
 
 	trace_x86_fpu_after_save(fpu);
-	preempt_enable();
+	fpregs_unlock();
 }
 EXPORT_SYMBOL_GPL(fpu__save);
 
@@ -185,8 +185,11 @@  void fpstate_init(union fpregs_state *state)
 }
 EXPORT_SYMBOL_GPL(fpstate_init);
 
-int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+int fpu__copy(struct task_struct *dst, struct task_struct *src)
 {
+	struct fpu *dst_fpu = &dst->thread.fpu;
+	struct fpu *src_fpu = &src->thread.fpu;
+
 	dst_fpu->last_cpu = -1;
 
 	if (!static_cpu_has(X86_FEATURE_FPU))
@@ -201,16 +204,23 @@  int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 	memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
 
 	/*
-	 * Save current FPU registers directly into the child
-	 * FPU context, without any memory-to-memory copying.
+	 * If the FPU registers are not current just memcpy() the state.
+	 * Otherwise save current FPU registers directly into the child's FPU
+	 * context, without any memory-to-memory copying.
 	 *
 	 * ( The function 'fails' in the FNSAVE case, which destroys
-	 *   register contents so we have to copy them back. )
+	 *   register contents so we have to load them back. )
 	 */
-	if (!copy_fpregs_to_fpstate(dst_fpu)) {
-		memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size);
-		copy_kernel_to_fpregs(&src_fpu->state);
-	}
+	fpregs_lock();
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);
+
+	else if (!copy_fpregs_to_fpstate(dst_fpu))
+		copy_kernel_to_fpregs(&dst_fpu->state);
+
+	fpregs_unlock();
+
+	set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
 
 	trace_x86_fpu_copy_src(src_fpu);
 	trace_x86_fpu_copy_dst(dst_fpu);
@@ -226,10 +236,9 @@  static void fpu__initialize(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
+	set_thread_flag(TIF_NEED_FPU_LOAD);
 	fpstate_init(&fpu->state);
 	trace_x86_fpu_init_state(fpu);
-
-	trace_x86_fpu_activate_state(fpu);
 }
 
 /*
@@ -308,6 +317,8 @@  void fpu__drop(struct fpu *fpu)
  */
 static inline void copy_init_fpstate_to_fpregs(void)
 {
+	fpregs_lock();
+
 	if (use_xsave())
 		copy_kernel_to_xregs(&init_fpstate.xsave, -1);
 	else if (static_cpu_has(X86_FEATURE_FXSR))
@@ -317,6 +328,9 @@  static inline void copy_init_fpstate_to_fpregs(void)
 
 	if (boot_cpu_has(X86_FEATURE_OSPKE))
 		copy_init_pkru_to_fpregs();
+
+	fpregs_mark_activate();
+	fpregs_unlock();
 }
 
 /*
@@ -339,6 +353,45 @@  void fpu__clear(struct fpu *fpu)
 		copy_init_fpstate_to_fpregs();
 }
 
+/*
+ * Load FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return;
+
+	__fpregs_load_activate();
+}
+EXPORT_SYMBOL_GPL(switch_fpu_return);
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * If current FPU state according to its tracking (loaded FPU ctx on this CPU)
+ * is not valid then we must have TIF_NEED_FPU_LOAD set so the context is loaded on
+ * return to userland.
+ */
+void fpregs_assert_state_consistent(void)
+{
+       struct fpu *fpu = &current->thread.fpu;
+
+       if (test_thread_flag(TIF_NEED_FPU_LOAD))
+               return;
+       WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
+}
+EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
+#endif
+
+void fpregs_mark_activate(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	fpregs_activate(fpu);
+	fpu->last_cpu = smp_processor_id();
+	clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+EXPORT_SYMBOL_GPL(fpregs_mark_activate);
+
 /*
  * x87 math exception handling:
  */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 2421fb17f643d..a5b086ec426a5 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -269,11 +269,9 @@  static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 	struct fpu *fpu = &tsk->thread.fpu;
 	int state_size = fpu_kernel_xstate_size;
 	struct user_i387_ia32_struct env;
-	union fpregs_state *state;
 	u64 xfeatures = 0;
 	int fx_only = 0;
 	int ret = 0;
-	void *tmp;
 
 	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
 			 IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -308,14 +306,18 @@  static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		}
 	}
 
-	tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL);
-	if (!tmp)
-		return -ENOMEM;
-	state = PTR_ALIGN(tmp, 64);
+	/*
+	 * The current state of the FPU registers does not matter. By setting
+	 * TIF_NEED_FPU_LOAD unconditionally it is ensured that our xstate is
+	 * not modified on context switch and that the xstate is considered to
+	 * be loaded again on return to userland (overriding last_cpu avoids
+	 * the optimisation).
+	 */
+	set_thread_flag(TIF_NEED_FPU_LOAD);
+	__fpu_invalidate_fpregs_state(fpu);
 
 	if ((unsigned long)buf_fx % 64)
 		fx_only = 1;
-
 	/*
 	 * For 32-bit frames with fxstate, copy the fxstate so it can be
 	 * reconstructed later.
@@ -330,43 +332,51 @@  static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		u64 init_bv = xfeatures_mask & ~xfeatures;
 
 		if (using_compacted_format()) {
-			ret = copy_user_to_xstate(&state->xsave, buf_fx);
+			ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
 		} else {
-			ret = __copy_from_user(&state->xsave, buf_fx, state_size);
+			ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
 
 			if (!ret && state_size > offsetof(struct xregs_state, header))
-				ret = validate_xstate_header(&state->xsave.header);
+				ret = validate_xstate_header(&fpu->state.xsave.header);
 		}
 		if (ret)
 			goto err_out;
 
-		sanitize_restored_xstate(state, envp, xfeatures, fx_only);
+		sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only);
 
+		fpregs_lock();
 		if (unlikely(init_bv))
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
-		ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures);
+		ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures);
 
 	} else if (use_fxsr()) {
-		ret = __copy_from_user(&state->fxsave, buf_fx, state_size);
-		if (ret)
+		ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size);
+		if (ret) {
+			ret = -EFAULT;
 			goto err_out;
+		}
 
-		sanitize_restored_xstate(state, envp, xfeatures, fx_only);
+		sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only);
+
+		fpregs_lock();
 		if (use_xsave()) {
 			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 		}
 
-		ret = copy_kernel_to_fxregs_err(&state->fxsave);
+		ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave);
 	} else {
-		ret = __copy_from_user(&state->fsave, buf_fx, state_size);
+		ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size);
 		if (ret)
 			goto err_out;
-		ret = copy_kernel_to_fregs_err(&state->fsave);
+		fpregs_lock();
+		ret = copy_kernel_to_fregs_err(&fpu->state.fsave);
 	}
+	if (!ret)
+		fpregs_mark_activate();
+	fpregs_unlock();
 
 err_out:
-	kfree(tmp);
 	if (ret)
 		fpu__clear(fpu);
 	return ret;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 58ac7be52c7a6..b9b8e6eb74f27 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -101,7 +101,7 @@  int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	dst->thread.vm86 = NULL;
 #endif
 
-	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
+	return fpu__copy(dst, src);
 }
 
 /*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 77d9eb43ccac8..1bc47f3a48854 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,8 @@  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	switch_fpu_prepare(prev_fpu, cpu);
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+		switch_fpu_prepare(prev_fpu, cpu);
 
 	/*
 	 * Save away %gs. No need to save %fs, as it was saved on the
@@ -290,7 +291,7 @@  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	this_cpu_write(current_task, next_p);
 
-	switch_fpu_finish(next_fpu, cpu);
+	switch_fpu_finish(next_fpu);
 
 	/* Load the Intel cache allocation PQR MSR. */
 	resctrl_sched_in();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ffea7c557963a..37b2ecef041e6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -520,7 +520,8 @@  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 		     this_cpu_read(irq_count) != -1);
 
-	switch_fpu_prepare(prev_fpu, cpu);
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+		switch_fpu_prepare(prev_fpu, cpu);
 
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
@@ -572,7 +573,7 @@  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	this_cpu_write(current_task, next_p);
 	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 
-	switch_fpu_finish(next_fpu, cpu);
+	switch_fpu_finish(next_fpu);
 
 	/* Reload sp0. */
 	update_task_stack(next_p);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8022e7769b3a1..e340c3c0cba32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7877,6 +7877,10 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		wait_lapic_expire(vcpu);
 	guest_enter_irqoff();
 
+	fpregs_assert_state_consistent();
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		switch_fpu_return();
+
 	if (unlikely(vcpu->arch.switch_db_regs)) {
 		set_debugreg(0, 7);
 		set_debugreg(vcpu->arch.eff_db[0], 0);
@@ -8137,22 +8141,30 @@  static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	preempt_disable();
+	fpregs_lock();
+
 	copy_fpregs_to_fpstate(&current->thread.fpu);
 	/* PKRU is separately restored in kvm_x86_ops->run.  */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
 				~XFEATURE_MASK_PKRU);
-	preempt_enable();
+
+	fpregs_mark_activate();
+	fpregs_unlock();
+
 	trace_kvm_fpu(1);
 }
 
 /* When vcpu_run ends, restore user space FPU context. */
 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	preempt_disable();
+	fpregs_lock();
+
 	copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
 	copy_kernel_to_fpregs(&current->thread.fpu.state);
-	preempt_enable();
+
+	fpregs_mark_activate();
+	fpregs_unlock();
+
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }