diff mbox

MIPS: usercopy: Implement stack frame object validation

Message ID 1502195022-18161-1-git-send-email-matt.redfearn@imgtec.com (mailing list archive)
State New, archived
Headers show

Commit Message

Matt Redfearn Aug. 8, 2017, 12:23 p.m. UTC
This implements arch_within_stack_frames() for MIPS that validates if an
object is wholly contained by a kernel stack frame.

With CONFIG_HARDENED_USERCOPY enabled, MIPS now passes the LKDTM tests
USERCOPY_STACK_FRAME_TO, USERCOPY_STACK_FRAME_FROM and
USERCOPY_STACK_BEYOND on a Creator Ci40.

Since the MIPS kernel does not use frame pointers, we re-use the MIPS
kernels stack frame unwinder which uses instruction inspection to deduce
the stack frame size. As such it introduces a larger performance penalty
than on arches which use the frame pointer.

On qemu, before this patch, hackbench gives:
Running with 10*40 (== 400) tasks.
Time: 5.484
Running with 10*40 (== 400) tasks.
Time: 4.039
Running with 10*40 (== 400) tasks.
Time: 3.908
Running with 10*40 (== 400) tasks.
Time: 3.955
Running with 10*40 (== 400) tasks.
Time: 4.185
Running with 10*40 (== 400) tasks.
Time: 4.497
Running with 10*40 (== 400) tasks.
Time: 3.980
Running with 10*40 (== 400) tasks.
Time: 4.078
Running with 10*40 (== 400) tasks.
Time: 4.219
Running with 10*40 (== 400) tasks.
Time: 4.026

Giving an average of 4.2371

With this patch, hackbench gives:
Running with 10*40 (== 400) tasks.
Time: 5.671
Running with 10*40 (== 400) tasks.
Time: 4.282
Running with 10*40 (== 400) tasks.
Time: 4.101
Running with 10*40 (== 400) tasks.
Time: 4.040
Running with 10*40 (== 400) tasks.
Time: 4.683
Running with 10*40 (== 400) tasks.
Time: 4.387
Running with 10*40 (== 400) tasks.
Time: 4.289
Running with 10*40 (== 400) tasks.
Time: 4.027
Running with 10*40 (== 400) tasks.
Time: 4.048
Running with 10*40 (== 400) tasks.
Time: 4.079

Giving an average of 4.3607

This indicates an additional 3% overhead for inspecting the kernel stack
when CONFIG_HARDENED_USERCOPY is enabled.

This patch is based on Linux v4.13-rc4, and for correct operation on
microMIPS depends on my series "MIPS: Further microMIPS stack unwinding
fixes"

Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Reviewed-by: James Hogan <james.hogan@imgtec.com>
---

 arch/mips/Kconfig                   |  1 +
 arch/mips/include/asm/thread_info.h | 74 +++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

Comments

Kees Cook Aug. 8, 2017, 7:11 p.m. UTC | #1
On Tue, Aug 8, 2017 at 5:23 AM, Matt Redfearn <matt.redfearn@imgtec.com> wrote:
> This implements arch_within_stack_frames() for MIPS that validates if an
> object is wholly contained by a kernel stack frame.
>
> With CONFIG_HARDENED_USERCOPY enabled, MIPS now passes the LKDTM tests
> USERCOPY_STACK_FRAME_TO, USERCOPY_STACK_FRAME_FROM and
> USERCOPY_STACK_BEYOND on a Creator Ci40.
>
> Since the MIPS kernel does not use frame pointers, we re-use the MIPS
> kernels stack frame unwinder which uses instruction inspection to deduce
> the stack frame size. As such it introduces a larger performance penalty
> than on arches which use the frame pointer.

Hmm, given x86's plans to drop the frame pointer, I wonder if the
inter-frame checking code should be gated by a CONFIG. This (3%) is a
rather high performance hit to take for a relatively small protection
(it's mainly about catching too-large-reads, since most
too-large-writes will be caught by the stack canary).

What do you think?

-Kees

>
> On qemu, before this patch, hackbench gives:
> Running with 10*40 (== 400) tasks.
> Time: 5.484
> Running with 10*40 (== 400) tasks.
> Time: 4.039
> Running with 10*40 (== 400) tasks.
> Time: 3.908
> Running with 10*40 (== 400) tasks.
> Time: 3.955
> Running with 10*40 (== 400) tasks.
> Time: 4.185
> Running with 10*40 (== 400) tasks.
> Time: 4.497
> Running with 10*40 (== 400) tasks.
> Time: 3.980
> Running with 10*40 (== 400) tasks.
> Time: 4.078
> Running with 10*40 (== 400) tasks.
> Time: 4.219
> Running with 10*40 (== 400) tasks.
> Time: 4.026
>
> Giving an average of 4.2371
>
> With this patch, hackbench gives:
> Running with 10*40 (== 400) tasks.
> Time: 5.671
> Running with 10*40 (== 400) tasks.
> Time: 4.282
> Running with 10*40 (== 400) tasks.
> Time: 4.101
> Running with 10*40 (== 400) tasks.
> Time: 4.040
> Running with 10*40 (== 400) tasks.
> Time: 4.683
> Running with 10*40 (== 400) tasks.
> Time: 4.387
> Running with 10*40 (== 400) tasks.
> Time: 4.289
> Running with 10*40 (== 400) tasks.
> Time: 4.027
> Running with 10*40 (== 400) tasks.
> Time: 4.048
> Running with 10*40 (== 400) tasks.
> Time: 4.079
>
> Giving an average of 4.3607
>
> This indicates an additional 3% overhead for inspecting the kernel stack
> when CONFIG_HARDENED_USERCOPY is enabled.
>
> This patch is based on Linux v4.13-rc4, and for correct operation on
> microMIPS depends on my series "MIPS: Further microMIPS stack unwinding
> fixes"
>
> Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
> Reviewed-by: James Hogan <james.hogan@imgtec.com>
> ---
>
>  arch/mips/Kconfig                   |  1 +
>  arch/mips/include/asm/thread_info.h | 74 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 75 insertions(+)
>
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 8dd20358464f..6cbf2d525c8d 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -35,6 +35,7 @@ config MIPS
>         select HAVE_ARCH_SECCOMP_FILTER
>         select HAVE_ARCH_TRACEHOOK
>         select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
> +       select HAVE_ARCH_WITHIN_STACK_FRAMES if KALLSYMS
>         select HAVE_CBPF_JIT if (!64BIT && !CPU_MICROMIPS)
>         select HAVE_EBPF_JIT if (64BIT && !CPU_MICROMIPS)
>         select HAVE_CC_STACKPROTECTOR
> diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
> index b439e512792b..931652460393 100644
> --- a/arch/mips/include/asm/thread_info.h
> +++ b/arch/mips/include/asm/thread_info.h
> @@ -14,6 +14,80 @@
>
>  #include <asm/processor.h>
>
> +#ifdef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
> +
> +/*
> + * Walks up the stack frames to make sure that the specified object is
> + * entirely contained by a single stack frame.
> + *
> + * Returns:
> + *     GOOD_FRAME      if within a frame
> + *     BAD_STACK       if placed across a frame boundary (or outside stack)
> + *     NOT_STACK       unable to determine
> + */
> +static inline int arch_within_stack_frames(const void *const stack,
> +                                          const void *const stackend,
> +                                          const void *obj, unsigned long len)
> +{
> +       /* Avoid header recursion by just declaring this here */
> +       extern unsigned long unwind_stack_by_address(
> +                                               unsigned long stack_page,
> +                                               unsigned long *sp,
> +                                               unsigned long pc,
> +                                               unsigned long *ra);
> +       unsigned long sp, lastsp, ra, pc;
> +       int skip_frames;
> +
> +       /* Get this frame's details */
> +       sp = (unsigned long)__builtin_frame_address(0);
> +       pc = (unsigned long)current_text_addr();
> +
> +       /*
> +        * Skip initial frames to get back the function requesting the copy.
> +        * Unwind the frames of:
> +        *   arch_within_stack_frames (inlined into check_stack_object)
> +        *   __check_object_size
> +        * This leaves sp & pc in the frame associated with
> +        *   copy_{to,from}_user() (inlined into do_usercopy_stack)
> +        */
> +       for (skip_frames = 0; skip_frames < 2; skip_frames++) {
> +               pc = unwind_stack_by_address((unsigned long)stack, &sp, pc, &ra);
> +               if (!pc)
> +                       return BAD_STACK;
> +       }
> +
> +       if ((unsigned long)obj < sp) {
> +               /* obj is not in the frame of the requestor or it's callers */
> +               return BAD_STACK;
> +       }
> +
> +       /*
> +        * low ---------------------------------------> high
> +        * [local vars][saved regs][ra][local vars']
> +        * ^                           ^
> +        * lastsp                      sp
> +        * ^----------------------^
> +        *  allow copies only within here
> +        */
> +       do {
> +               lastsp = sp;
> +               pc = unwind_stack_by_address((unsigned long)stack, &sp, pc, &ra);
> +               if ((((unsigned long)obj) >= lastsp) &&
> +                   (((unsigned long)obj + len) <= (sp - sizeof(void *)))) {
> +                       /* obj is entirely within this stack frame */
> +                       return GOOD_FRAME;
> +               }
> +       } while (pc);
> +
> +       /*
> +        * We can't unwind any further. If we haven't found the object entirely
> +        * within one of our callers frames, it must be a bad object.
> +        */
> +       return BAD_STACK;
> +}
> +
> +#endif /* CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES */
> +
>  /*
>   * low level task data that entry.S needs immediate access to
>   * - this struct should fit entirely inside of one cache line
> --
> 2.7.4
>
Matt Redfearn Aug. 10, 2017, 8:24 a.m. UTC | #2
Hi Kees,


On 08/08/17 20:11, Kees Cook wrote:
> On Tue, Aug 8, 2017 at 5:23 AM, Matt Redfearn <matt.redfearn@imgtec.com> wrote:
>> This implements arch_within_stack_frames() for MIPS that validates if an
>> object is wholly contained by a kernel stack frame.
>>
>> With CONFIG_HARDENED_USERCOPY enabled, MIPS now passes the LKDTM tests
>> USERCOPY_STACK_FRAME_TO, USERCOPY_STACK_FRAME_FROM and
>> USERCOPY_STACK_BEYOND on a Creator Ci40.
>>
>> Since the MIPS kernel does not use frame pointers, we re-use the MIPS
>> kernels stack frame unwinder which uses instruction inspection to deduce
>> the stack frame size. As such it introduces a larger performance penalty
>> than on arches which use the frame pointer.
> Hmm, given x86's plans to drop the frame pointer, I wonder if the
> inter-frame checking code should be gated by a CONFIG. This (3%) is a
> rather high performance hit to take for a relatively small protection
> (it's mainly about catching too-large-reads, since most
> too-large-writes will be caught by the stack canary).
>
> What do you think?

If x86 is going to move to a more expensive stack unwinding method than 
the frame pointer then I guess it may end up seeing a similar 
performance hit to what we see on MIPS. In that case it might make sense 
to add a CONFIG for this such that only those who wish to make the trade 
off of performance for the added protection need enable it.

Thanks,
Matt

>
> -Kees
>
>> On qemu, before this patch, hackbench gives:
>> Running with 10*40 (== 400) tasks.
>> Time: 5.484
>> Running with 10*40 (== 400) tasks.
>> Time: 4.039
>> Running with 10*40 (== 400) tasks.
>> Time: 3.908
>> Running with 10*40 (== 400) tasks.
>> Time: 3.955
>> Running with 10*40 (== 400) tasks.
>> Time: 4.185
>> Running with 10*40 (== 400) tasks.
>> Time: 4.497
>> Running with 10*40 (== 400) tasks.
>> Time: 3.980
>> Running with 10*40 (== 400) tasks.
>> Time: 4.078
>> Running with 10*40 (== 400) tasks.
>> Time: 4.219
>> Running with 10*40 (== 400) tasks.
>> Time: 4.026
>>
>> Giving an average of 4.2371
>>
>> With this patch, hackbench gives:
>> Running with 10*40 (== 400) tasks.
>> Time: 5.671
>> Running with 10*40 (== 400) tasks.
>> Time: 4.282
>> Running with 10*40 (== 400) tasks.
>> Time: 4.101
>> Running with 10*40 (== 400) tasks.
>> Time: 4.040
>> Running with 10*40 (== 400) tasks.
>> Time: 4.683
>> Running with 10*40 (== 400) tasks.
>> Time: 4.387
>> Running with 10*40 (== 400) tasks.
>> Time: 4.289
>> Running with 10*40 (== 400) tasks.
>> Time: 4.027
>> Running with 10*40 (== 400) tasks.
>> Time: 4.048
>> Running with 10*40 (== 400) tasks.
>> Time: 4.079
>>
>> Giving an average of 4.3607
>>
>> This indicates an additional 3% overhead for inspecting the kernel stack
>> when CONFIG_HARDENED_USERCOPY is enabled.
>>
>> This patch is based on Linux v4.13-rc4, and for correct operation on
>> microMIPS depends on my series "MIPS: Further microMIPS stack unwinding
>> fixes"
>>
>> Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
>> Reviewed-by: James Hogan <james.hogan@imgtec.com>
>> ---
>>
>>   arch/mips/Kconfig                   |  1 +
>>   arch/mips/include/asm/thread_info.h | 74 +++++++++++++++++++++++++++++++++++++
>>   2 files changed, 75 insertions(+)
>>
>> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
>> index 8dd20358464f..6cbf2d525c8d 100644
>> --- a/arch/mips/Kconfig
>> +++ b/arch/mips/Kconfig
>> @@ -35,6 +35,7 @@ config MIPS
>>          select HAVE_ARCH_SECCOMP_FILTER
>>          select HAVE_ARCH_TRACEHOOK
>>          select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
>> +       select HAVE_ARCH_WITHIN_STACK_FRAMES if KALLSYMS
>>          select HAVE_CBPF_JIT if (!64BIT && !CPU_MICROMIPS)
>>          select HAVE_EBPF_JIT if (64BIT && !CPU_MICROMIPS)
>>          select HAVE_CC_STACKPROTECTOR
>> diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
>> index b439e512792b..931652460393 100644
>> --- a/arch/mips/include/asm/thread_info.h
>> +++ b/arch/mips/include/asm/thread_info.h
>> @@ -14,6 +14,80 @@
>>
>>   #include <asm/processor.h>
>>
>> +#ifdef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
>> +
>> +/*
>> + * Walks up the stack frames to make sure that the specified object is
>> + * entirely contained by a single stack frame.
>> + *
>> + * Returns:
>> + *     GOOD_FRAME      if within a frame
>> + *     BAD_STACK       if placed across a frame boundary (or outside stack)
>> + *     NOT_STACK       unable to determine
>> + */
>> +static inline int arch_within_stack_frames(const void *const stack,
>> +                                          const void *const stackend,
>> +                                          const void *obj, unsigned long len)
>> +{
>> +       /* Avoid header recursion by just declaring this here */
>> +       extern unsigned long unwind_stack_by_address(
>> +                                               unsigned long stack_page,
>> +                                               unsigned long *sp,
>> +                                               unsigned long pc,
>> +                                               unsigned long *ra);
>> +       unsigned long sp, lastsp, ra, pc;
>> +       int skip_frames;
>> +
>> +       /* Get this frame's details */
>> +       sp = (unsigned long)__builtin_frame_address(0);
>> +       pc = (unsigned long)current_text_addr();
>> +
>> +       /*
>> +        * Skip initial frames to get back the function requesting the copy.
>> +        * Unwind the frames of:
>> +        *   arch_within_stack_frames (inlined into check_stack_object)
>> +        *   __check_object_size
>> +        * This leaves sp & pc in the frame associated with
>> +        *   copy_{to,from}_user() (inlined into do_usercopy_stack)
>> +        */
>> +       for (skip_frames = 0; skip_frames < 2; skip_frames++) {
>> +               pc = unwind_stack_by_address((unsigned long)stack, &sp, pc, &ra);
>> +               if (!pc)
>> +                       return BAD_STACK;
>> +       }
>> +
>> +       if ((unsigned long)obj < sp) {
>> +               /* obj is not in the frame of the requestor or it's callers */
>> +               return BAD_STACK;
>> +       }
>> +
>> +       /*
>> +        * low ---------------------------------------> high
>> +        * [local vars][saved regs][ra][local vars']
>> +        * ^                           ^
>> +        * lastsp                      sp
>> +        * ^----------------------^
>> +        *  allow copies only within here
>> +        */
>> +       do {
>> +               lastsp = sp;
>> +               pc = unwind_stack_by_address((unsigned long)stack, &sp, pc, &ra);
>> +               if ((((unsigned long)obj) >= lastsp) &&
>> +                   (((unsigned long)obj + len) <= (sp - sizeof(void *)))) {
>> +                       /* obj is entirely within this stack frame */
>> +                       return GOOD_FRAME;
>> +               }
>> +       } while (pc);
>> +
>> +       /*
>> +        * We can't unwind any further. If we haven't found the object entirely
>> +        * within one of our callers frames, it must be a bad object.
>> +        */
>> +       return BAD_STACK;
>> +}
>> +
>> +#endif /* CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES */
>> +
>>   /*
>>    * low level task data that entry.S needs immediate access to
>>    * - this struct should fit entirely inside of one cache line
>> --
>> 2.7.4
>>
>
>
Kees Cook Aug. 10, 2017, 5:32 p.m. UTC | #3
On Thu, Aug 10, 2017 at 1:24 AM, Matt Redfearn <matt.redfearn@imgtec.com> wrote:
> Hi Kees,
>
>
> On 08/08/17 20:11, Kees Cook wrote:
>>
>> On Tue, Aug 8, 2017 at 5:23 AM, Matt Redfearn <matt.redfearn@imgtec.com>
>> wrote:
>>>
>>> This implements arch_within_stack_frames() for MIPS that validates if an
>>> object is wholly contained by a kernel stack frame.
>>>
>>> With CONFIG_HARDENED_USERCOPY enabled, MIPS now passes the LKDTM tests
>>> USERCOPY_STACK_FRAME_TO, USERCOPY_STACK_FRAME_FROM and
>>> USERCOPY_STACK_BEYOND on a Creator Ci40.
>>>
>>> Since the MIPS kernel does not use frame pointers, we re-use the MIPS
>>> kernels stack frame unwinder which uses instruction inspection to deduce
>>> the stack frame size. As such it introduces a larger performance penalty
>>> than on arches which use the frame pointer.
>>
>> Hmm, given x86's plans to drop the frame pointer, I wonder if the
>> inter-frame checking code should be gated by a CONFIG. This (3%) is a
>> rather high performance hit to take for a relatively small protection
>> (it's mainly about catching too-large-reads, since most
>> too-large-writes will be caught by the stack canary).
>>
>> What do you think?
>
>
> If x86 is going to move to a more expensive stack unwinding method than the
> frame pointer then I guess it may end up seeing a similar performance hit to
> what we see on MIPS. In that case it might make sense to add a CONFIG for
> this such that only those who wish to make the trade off of performance for
> the added protection need enable it.

Sounds good. Can you send a v2 that adds a CONFIG, maybe something
like CONFIG_HARDENED_USERCOPY_UNWINDER with a description of the
trade-offs? Then x86 can do this too when it drops frame pointers.

Thanks!

-Kees
diff mbox

Patch

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8dd20358464f..6cbf2d525c8d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -35,6 +35,7 @@  config MIPS
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
+	select HAVE_ARCH_WITHIN_STACK_FRAMES if KALLSYMS
 	select HAVE_CBPF_JIT if (!64BIT && !CPU_MICROMIPS)
 	select HAVE_EBPF_JIT if (64BIT && !CPU_MICROMIPS)
 	select HAVE_CC_STACKPROTECTOR
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index b439e512792b..931652460393 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -14,6 +14,80 @@ 
 
 #include <asm/processor.h>
 
+#ifdef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
+
+/*
+ * Walks up the stack frames to make sure that the specified object is
+ * entirely contained by a single stack frame.
+ *
+ * Returns:
+ *	GOOD_FRAME	if within a frame
+ *	BAD_STACK	if placed across a frame boundary (or outside stack)
+ *	NOT_STACK	unable to determine
+ */
+static inline int arch_within_stack_frames(const void *const stack,
+					   const void *const stackend,
+					   const void *obj, unsigned long len)
+{
+	/* Avoid header recursion by just declaring this here */
+	extern unsigned long unwind_stack_by_address(
+						unsigned long stack_page,
+						unsigned long *sp,
+						unsigned long pc,
+						unsigned long *ra);
+	unsigned long sp, lastsp, ra, pc;
+	int skip_frames;
+
+	/* Get this frame's details */
+	sp = (unsigned long)__builtin_frame_address(0);
+	pc = (unsigned long)current_text_addr();
+
+	/*
+	 * Skip initial frames to get back the function requesting the copy.
+	 * Unwind the frames of:
+	 *   arch_within_stack_frames (inlined into check_stack_object)
+	 *   __check_object_size
+	 * This leaves sp & pc in the frame associated with
+	 *   copy_{to,from}_user() (inlined into do_usercopy_stack)
+	 */
+	for (skip_frames = 0; skip_frames < 2; skip_frames++) {
+		pc = unwind_stack_by_address((unsigned long)stack, &sp, pc, &ra);
+		if (!pc)
+			return BAD_STACK;
+	}
+
+	if ((unsigned long)obj < sp) {
+		/* obj is not in the frame of the requestor or it's callers */
+		return BAD_STACK;
+	}
+
+	/*
+	 * low ---------------------------------------> high
+	 * [local vars][saved regs][ra][local vars']
+	 * ^                           ^
+	 * lastsp                      sp
+	 * ^----------------------^
+	 *  allow copies only within here
+	 */
+	do {
+		lastsp = sp;
+		pc = unwind_stack_by_address((unsigned long)stack, &sp, pc, &ra);
+		if ((((unsigned long)obj) >= lastsp) &&
+		    (((unsigned long)obj + len) <= (sp - sizeof(void *)))) {
+			/* obj is entirely within this stack frame */
+			return GOOD_FRAME;
+		}
+	} while (pc);
+
+	/*
+	 * We can't unwind any further. If we haven't found the object entirely
+	 * within one of our callers frames, it must be a bad object.
+	 */
+	return BAD_STACK;
+}
+
+#endif /* CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES */
+
 /*
  * low level task data that entry.S needs immediate access to
  * - this struct should fit entirely inside of one cache line