diff mbox

[3/3] arm64/sve: KVM: Avoid dereference of dead task during guest entry

Message ID 1512141582-17474-4-git-send-email-Dave.Martin@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dave Martin Dec. 1, 2017, 3:19 p.m. UTC
When deciding whether to invalidate FPSIMD state cached in the cpu,
the backend function sve_flush_cpu_state() attempts to dereference
__this_cpu_read(fpsimd_last_state).  However, this is not safe:
there is no guarantee that the pointer is still valid, because the
task could have exited in the meantime.  For this reason, this
percpu pointer should only be assigned or compared, never
dereferenced.

This means that we need another means to get the appropriate value
of TIF_SVE for the associated task.

This patch solves this issue by adding a cached copy of the TIF_SVE
flag in fpsimd_last_state, which we can check without dereferencing
the task pointer.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm64/kernel/fpsimd.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

Comments

Ard Biesheuvel Dec. 4, 2017, 1:53 p.m. UTC | #1
On 1 December 2017 at 15:19, Dave Martin <Dave.Martin@arm.com> wrote:
> When deciding whether to invalidate FPSIMD state cached in the cpu,
> the backend function sve_flush_cpu_state() attempts to dereference
> __this_cpu_read(fpsimd_last_state).  However, this is not safe:
> there is no guarantee that the pointer is still valid, because the
> task could have exited in the meantime.  For this reason, this
> percpu pointer should only be assigned or compared, never
> dereferenced.
>

Doesn't that mean the pointer could also be pointing to the
fpsimd_state of a newly created task that is completely unrelated?
IOW, are you sure comparison is safe?

> This means that we need another means to get the appropriate value
> of TIF_SVE for the associated task.
>
> This patch solves this issue by adding a cached copy of the TIF_SVE
> flag in fpsimd_last_state, which we can check without dereferencing
> the task pointer.
>
> Signed-off-by: Dave Martin <Dave.Martin@arm.com>
> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  arch/arm64/kernel/fpsimd.c | 28 ++++++++++++++++------------
>  1 file changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 007140b..3dc8058 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -114,7 +114,12 @@
>   *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
>   *   whatever is in the FPSIMD registers is not saved to memory, but discarded.
>   */
> -static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
> +struct fpsimd_last_state_struct {
> +       struct fpsimd_state *st;
> +       bool sve_in_use;
> +};
> +
> +static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);
>
>  /* Default VL for tasks that don't set it explicitly: */
>  static int sve_default_vl = -1;
> @@ -905,7 +910,7 @@ void fpsimd_thread_switch(struct task_struct *next)
>                  */
>                 struct fpsimd_state *st = &next->thread.fpsimd_state;
>
> -               if (__this_cpu_read(fpsimd_last_state) == st
> +               if (__this_cpu_read(fpsimd_last_state.st) == st
>                     && st->cpu == smp_processor_id())
>                         clear_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE);
>                 else
> @@ -997,9 +1002,12 @@ void fpsimd_signal_preserve_current_state(void)
>   */
>  static void fpsimd_bind_to_cpu(void)
>  {
> +       struct fpsimd_last_state_struct *last =
> +               this_cpu_ptr(&fpsimd_last_state);
>         struct fpsimd_state *st = &current->thread.fpsimd_state;
>
> -       __this_cpu_write(fpsimd_last_state, st);
> +       last->st = st;
> +       last->sve_in_use = test_thread_flag(TIF_SVE);
>         st->cpu = smp_processor_id();
>  }
>
> @@ -1057,7 +1065,7 @@ void fpsimd_flush_task_state(struct task_struct *t)
>
>  static inline void fpsimd_flush_cpu_state(void)
>  {
> -       __this_cpu_write(fpsimd_last_state, NULL);
> +       __this_cpu_write(fpsimd_last_state.st, NULL);
>  }
>
>  /*
> @@ -1070,14 +1078,10 @@ static inline void fpsimd_flush_cpu_state(void)
>  #ifdef CONFIG_ARM64_SVE
>  void sve_flush_cpu_state(void)
>  {
> -       struct fpsimd_state *const fpstate = __this_cpu_read(fpsimd_last_state);
> -       struct task_struct *tsk;
> -
> -       if (!fpstate)
> -               return;
> +       struct fpsimd_last_state_struct const *last =
> +               this_cpu_ptr(&fpsimd_last_state);
>
> -       tsk = container_of(fpstate, struct task_struct, thread.fpsimd_state);
> -       if (test_tsk_thread_flag(tsk, TIF_SVE))
> +       if (last->st && last->sve_in_use)
>                 fpsimd_flush_cpu_state();
>  }
>  #endif /* CONFIG_ARM64_SVE */
> @@ -1272,7 +1276,7 @@ static inline void fpsimd_pm_init(void) { }
>  #ifdef CONFIG_HOTPLUG_CPU
>  static int fpsimd_cpu_dead(unsigned int cpu)
>  {
> -       per_cpu(fpsimd_last_state, cpu) = NULL;
> +       per_cpu(fpsimd_last_state.st, cpu) = NULL;
>         return 0;
>  }
>
> --
> 2.1.4
>
Dave Martin Dec. 4, 2017, 3:36 p.m. UTC | #2
On Mon, Dec 04, 2017 at 01:53:21PM +0000, Ard Biesheuvel wrote:
> On 1 December 2017 at 15:19, Dave Martin <Dave.Martin@arm.com> wrote:
> > When deciding whether to invalidate FPSIMD state cached in the cpu,
> > the backend function sve_flush_cpu_state() attempts to dereference
> > __this_cpu_read(fpsimd_last_state).  However, this is not safe:
> > there is no guarantee that the pointer is still valid, because the
> > task could have exited in the meantime.  For this reason, this
> > percpu pointer should only be assigned or compared, never
> > dereferenced.
> >
> 
> Doesn't that mean the pointer could also be pointing to the
> fpsimd_state of a newly created task that is completely unrelated?
> IOW, are you sure comparison is safe?

There are more conditions: the only place the determination is
made is for next, in fpsimd_thread_switch(next).


However, I can see your concern and I'm not sure how/if it is
resolved.

For the worst case, let's assume that some child forks off but
doesn't enter userspace yet, while another task round-robins
across all CPUs, interspersed with tasks that don't enter userspace.

So, we end up with

For all cpu < NR_CPUS: per_cpu(fpsimd_last_state, cpu) == T.

Now, if T dies and a new task is allocated the same task_struct pointer,
then the _new_ T is guaranteed to get scheduled in on a CPU whose
per_cpu(fpsimd_last_state) == T.

Thus, new T can pick up old T's regs _unless_ new T's fpsimd_state.cpu
is invalid (i.e., NR_CPUS).

This is a separate bug from the one addressed by this patch though.
We can't go and harvest the bad pointers when old T exits, because
this might race with new T being scheduled for real -- in any case it
would involve iterating over all CPUs which sounds racy and
inefficient.


So, I'd say we _must_ call fpsimd_flush_task_state() for every new
task.  This may result in a redundant reload of the state, but this
is what would happen anyway if the pointers did not alias.

Does this sound real to you?  If so, I'll try to write something.

And does this patch look reasonable to fix what it's trying to fix?


I wonder whether arch/arm has the same bug actually, since the kernel-
mode NEON logic was modelled from there IIUC (?)

Cheers
---Dave
Christoffer Dall Dec. 5, 2017, 9:43 a.m. UTC | #3
Hi Dave,

On Mon, Dec 04, 2017 at 03:36:50PM +0000, Dave Martin wrote:
> On Mon, Dec 04, 2017 at 01:53:21PM +0000, Ard Biesheuvel wrote:
> > On 1 December 2017 at 15:19, Dave Martin <Dave.Martin@arm.com> wrote:
> > > When deciding whether to invalidate FPSIMD state cached in the cpu,
> > > the backend function sve_flush_cpu_state() attempts to dereference
> > > __this_cpu_read(fpsimd_last_state).  However, this is not safe:
> > > there is no guarantee that the pointer is still valid, because the
> > > task could have exited in the meantime.  For this reason, this
> > > percpu pointer should only be assigned or compared, never
> > > dereferenced.
> > >
> > 
> > Doesn't that mean the pointer could also be pointing to the
> > fpsimd_state of a newly created task that is completely unrelated?
> > IOW, are you sure comparison is safe?
> 
> There are more conditions: the only place the determination is
> made is for next, in fpsimd_thread_switch(next).
> 
> 
> However, I can see your concern and I'm not sure how/if it is
> resolved.
> 
> For the worst case, let's assume that some child forks off but
> doesn't enter userspace yet, while another task round-robins
> across all CPUs, interspersed with tasks that don't enter userspace.
> 
> So, we end up with
> 
> All cpu < NR_CPUS . per_cpu(fpsimd_last_state, cpu) == T.
> 
> Now, if T dies and a new task is allocated the same task_struct pointer,
> then the _new_ T is guaranteed to get scheduled in on a CPU whose
> per_cpu(fpsimd_last_state) == T.
> 
> Thus, new T can pick up old T's regs _unless_ new T's fpsimd_state.cpu
> is invalid (i.e., NR_CPUS).
> 
> This is a separate bug from the one addressed by this patch though.
> We can't go and harvest the bad pointers when old T exits, because
> this might race new T being scheduled for real -- in any case it
> would involve iterating over all CPUs which sounds racy and
> inefficient.
> 
> 
> So, I'd say we _must_ call fpsimd_flush_task_state() for every new
> task.  This may result in a redundant reload of the state, but this
> is what would happen anyway if the pointers did not alias.
> 
> Does this sound real to you?  If so, I'll try to write something.
> 
> And does this patch look reasonable to fix what it's trying to fix?
> 
> 
> I wonder whether arch/arm has the same bug actually, since the kernel-
> mode NEON logic was modelled from there IIUC (?)
> 
Isn't this the common kernel problem of pid reuse?

It seems holding a reference to a struct pid would solve your problems.
See include/linux/pid.h.

That might also make the code more intuitive and prevent future attempts
of dereferencing potentially dead data structures.

Thanks,
-Christoffer
Dave Martin Dec. 5, 2017, 12:40 p.m. UTC | #4
On Tue, Dec 05, 2017 at 10:43:50AM +0100, Christoffer Dall wrote:
> Hi Dave,
> 
> On Mon, Dec 04, 2017 at 03:36:50PM +0000, Dave Martin wrote:
> > On Mon, Dec 04, 2017 at 01:53:21PM +0000, Ard Biesheuvel wrote:
> > > On 1 December 2017 at 15:19, Dave Martin <Dave.Martin@arm.com> wrote:
> > > > When deciding whether to invalidate FPSIMD state cached in the cpu,
> > > > the backend function sve_flush_cpu_state() attempts to dereference
> > > > __this_cpu_read(fpsimd_last_state).  However, this is not safe:
> > > > there is no guarantee that the pointer is still valid, because the
> > > > task could have exited in the meantime.  For this reason, this
> > > > percpu pointer should only be assigned or compared, never
> > > > dereferenced.
> > > >
> > > 
> > > Doesn't that mean the pointer could also be pointing to the
> > > fpsimd_state of a newly created task that is completely unrelated?
> > > IOW, are you sure comparison is safe?
> > 
> > There are more conditions: the only place the determination is
> > made is for next, in fpsimd_thread_switch(next).
> > 
> > 
> > However, I can see your concern and I'm not sure how/if it is
> > resolved.
> > 
> > For the worst case, let's assume that some child forks off but
> > doesn't enter userspace yet, while another task round-robins
> > across all CPUs, interspersed with tasks that don't enter userspace.
> > 
> > So, we end up with
> > 
> > All cpu < NR_CPUS . per_cpu(fpsimd_last_state, cpu) == T.
> > 
> > Now, if T dies and a new task is allocated the same task_struct pointer,
> > then the _new_ T is guaranteed to get scheduled in on a CPU whose
> > per_cpu(fpsimd_last_state) == T.
> > 
> > Thus, new T can pick up old T's regs _unless_ new T's fpsimd_state.cpu
> > is invalid (i.e., NR_CPUS).
> > 
> > This is a separate bug from the one addressed by this patch though.
> > We can't go and harvest the bad pointers when old T exits, because
> > this might race new T being scheduled for real -- in any case it
> > would involve iterating over all CPUs which sounds racy and
> > inefficient.
> > 
> > 
> > So, I'd say we _must_ call fpsimd_flush_task_state() for every new
> > task.  This may result in a redundant reload of the state, but this
> > is what would happen anyway if the pointers did not alias.
> > 
> > Does this sound real to you?  If so, I'll try to write something.
> > 
> > And does this patch look reasonable to fix what it's trying to fix?
> > 
> > 
> > I wonder whether arch/arm has the same bug actually, since the kernel-
> > mode NEON logic was modelled from there IIUC (?)
> > 
> Isn't this the common kernel problem of pid reuse?

It's a problem of <task identifier> reuse, so sort of.

> It seems holding a reference to a struct pid would solve your problems.
> See include/linux/pid.h.
> 
> That might also make the code more intuitive and prevent future attempts
> of dereferencing potentially dead data structures.

If we want to use the same mechanism to track fpsimd contexts that are not
user task contexts in the future, then that wouldn't work.

In particular, I'd like to track vcpu fpsimd contexts in the same way
as task fpsimd contexts in the future: having two different mechanisms
adds cruft and inefficiency and/or bugs.


I'll look at the struct pid thing and have a think.

Cheers
---Dave
diff mbox

Patch

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 007140b..3dc8058 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -114,7 +114,12 @@ 
  *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
  *   whatever is in the FPSIMD registers is not saved to memory, but discarded.
  */
-static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
+struct fpsimd_last_state_struct {
+	struct fpsimd_state *st;
+	bool sve_in_use;
+};
+
+static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);
 
 /* Default VL for tasks that don't set it explicitly: */
 static int sve_default_vl = -1;
@@ -905,7 +910,7 @@  void fpsimd_thread_switch(struct task_struct *next)
 		 */
 		struct fpsimd_state *st = &next->thread.fpsimd_state;
 
-		if (__this_cpu_read(fpsimd_last_state) == st
+		if (__this_cpu_read(fpsimd_last_state.st) == st
 		    && st->cpu == smp_processor_id())
 			clear_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE);
 		else
@@ -997,9 +1002,12 @@  void fpsimd_signal_preserve_current_state(void)
  */
 static void fpsimd_bind_to_cpu(void)
 {
+	struct fpsimd_last_state_struct *last =
+		this_cpu_ptr(&fpsimd_last_state);
 	struct fpsimd_state *st = &current->thread.fpsimd_state;
 
-	__this_cpu_write(fpsimd_last_state, st);
+	last->st = st;
+	last->sve_in_use = test_thread_flag(TIF_SVE);
 	st->cpu = smp_processor_id();
 }
 
@@ -1057,7 +1065,7 @@  void fpsimd_flush_task_state(struct task_struct *t)
 
 static inline void fpsimd_flush_cpu_state(void)
 {
-	__this_cpu_write(fpsimd_last_state, NULL);
+	__this_cpu_write(fpsimd_last_state.st, NULL);
 }
 
 /*
@@ -1070,14 +1078,10 @@  static inline void fpsimd_flush_cpu_state(void)
 #ifdef CONFIG_ARM64_SVE
 void sve_flush_cpu_state(void)
 {
-	struct fpsimd_state *const fpstate = __this_cpu_read(fpsimd_last_state);
-	struct task_struct *tsk;
-
-	if (!fpstate)
-		return;
+	struct fpsimd_last_state_struct const *last =
+		this_cpu_ptr(&fpsimd_last_state);
 
-	tsk = container_of(fpstate, struct task_struct, thread.fpsimd_state);
-	if (test_tsk_thread_flag(tsk, TIF_SVE))
+	if (last->st && last->sve_in_use)
 		fpsimd_flush_cpu_state();
 }
 #endif /* CONFIG_ARM64_SVE */
@@ -1272,7 +1276,7 @@  static inline void fpsimd_pm_init(void) { }
 #ifdef CONFIG_HOTPLUG_CPU
 static int fpsimd_cpu_dead(unsigned int cpu)
 {
-	per_cpu(fpsimd_last_state, cpu) = NULL;
+	per_cpu(fpsimd_last_state.st, cpu) = NULL;
 	return 0;
 }