
[v2,02/11] mm: Hardened usercopy

Message ID 1468446964-22213-3-git-send-email-keescook@chromium.org (mailing list archive)
State New, archived

Commit Message

Kees Cook July 13, 2016, 9:55 p.m. UTC
This is the start of porting PAX_USERCOPY into the mainline kernel. This
is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The
work is based on code by PaX Team and Brad Spengler, and an earlier port
from Casey Schaufler. Additional non-slab page tests are from Rik van Riel.

This patch contains the logic for validating several conditions when
performing copy_to_user() and copy_from_user() on the kernel object
being copied to/from:
- address range doesn't wrap around
- address range isn't NULL or zero-allocated (with a non-zero copy size)
- if on the slab allocator:
  - copy size must be less than or equal to the object size (when the
    check is implemented in the allocator, which appears in subsequent patches)
- otherwise, object must not span page allocations
- if on the stack
  - object must not extend before/after the current process stack
  - object must be contained by the current stack frame (when there is
    arch/build support for identifying stack frames)
- object must not overlap with kernel text
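
For context, an architecture that selects HAVE_ARCH_HARDENED_USERCOPY (the
arch hooks appear in later patches of this series) calls check_object_size()
on the kernel-side buffer just before the low-level copy. A minimal sketch of
such a hook follows; raw_arch_copy_from_user() is a hypothetical stand-in for
the architecture's raw copy routine, not an interface added by this patch:

        static inline unsigned long
        copy_from_user(void *to, const void __user *from, unsigned long n)
        {
                if (likely(access_ok(VERIFY_READ, from, n))) {
                        /* Validate the kernel destination object first. */
                        check_object_size(to, n, false);
                        n = raw_arch_copy_from_user(to, from, n);
                }
                return n;
        }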

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/Kconfig                |   7 ++
 include/linux/slab.h        |  12 +++
 include/linux/thread_info.h |  15 +++
 mm/Makefile                 |   4 +
 mm/usercopy.c               | 219 ++++++++++++++++++++++++++++++++++++++++++++
 security/Kconfig            |  27 ++++++
 6 files changed, 284 insertions(+)
 create mode 100644 mm/usercopy.c

Comments

Education Directorate July 14, 2016, 11:20 p.m. UTC | #1
On Wed, Jul 13, 2016 at 02:55:55PM -0700, Kees Cook wrote:
> This is the start of porting PAX_USERCOPY into the mainline kernel. This
> is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The
> work is based on code by PaX Team and Brad Spengler, and an earlier port
> from Casey Schaufler. Additional non-slab page tests are from Rik van Riel.
> 
> This patch contains the logic for validating several conditions when
> performing copy_to_user() and copy_from_user() on the kernel object
> being copied to/from:
> - address range doesn't wrap around
> - address range isn't NULL or zero-allocated (with a non-zero copy size)
> - if on the slab allocator:
>   - copy size must be less than or equal to the object size (when the
>     check is implemented in the allocator, which appears in subsequent patches)
> - otherwise, object must not span page allocations
> - if on the stack
>   - object must not extend before/after the current process stack
>   - object must be contained by the current stack frame (when there is
>     arch/build support for identifying stack frames)
> - object must not overlap with kernel text
> 
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
>  arch/Kconfig                |   7 ++
>  include/linux/slab.h        |  12 +++
>  include/linux/thread_info.h |  15 +++
>  mm/Makefile                 |   4 +
>  mm/usercopy.c               | 219 ++++++++++++++++++++++++++++++++++++++++++++
>  security/Kconfig            |  27 ++++++
>  6 files changed, 284 insertions(+)
>  create mode 100644 mm/usercopy.c
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 5e2776562035..195ee4cc939a 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -433,6 +433,13 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES
>  	  and similar) by implementing an inline arch_within_stack_frames(),
>  	  which is used by CONFIG_HARDENED_USERCOPY.
>  
> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING
> +	bool
> +	help
> +	  An architecture should select this if it has a secondary linear
> +	  mapping of the kernel text. This is used to verify that kernel
> +	  text exposures are not visible under CONFIG_HARDENED_USERCOPY.
> +
>  config HAVE_CONTEXT_TRACKING
>  	bool
>  	help
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index aeb3e6d00a66..96a16a3fb7cb 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -155,6 +155,18 @@ void kfree(const void *);
>  void kzfree(const void *);
>  size_t ksize(const void *);
>  
> +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
> +const char *__check_heap_object(const void *ptr, unsigned long n,
> +				struct page *page);
> +#else
> +static inline const char *__check_heap_object(const void *ptr,
> +					      unsigned long n,
> +					      struct page *page)
> +{
> +	return NULL;
> +}
> +#endif
> +
>  /*
>   * Some archs want to perform DMA into kmalloc caches and need a guaranteed
>   * alignment larger than the alignment of a 64-bit integer.
> diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
> index 3d5c80b4391d..f24b99eac969 100644
> --- a/include/linux/thread_info.h
> +++ b/include/linux/thread_info.h
> @@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void * const stack,
>  }
>  #endif
>  
> +#ifdef CONFIG_HARDENED_USERCOPY
> +extern void __check_object_size(const void *ptr, unsigned long n,
> +					bool to_user);
> +
> +static inline void check_object_size(const void *ptr, unsigned long n,
> +				     bool to_user)
> +{
> +	__check_object_size(ptr, n, to_user);
> +}
> +#else
> +static inline void check_object_size(const void *ptr, unsigned long n,
> +				     bool to_user)
> +{ }
> +#endif /* CONFIG_HARDENED_USERCOPY */
> +
>  #endif	/* __KERNEL__ */
>  
>  #endif /* _LINUX_THREAD_INFO_H */
> diff --git a/mm/Makefile b/mm/Makefile
> index 78c6f7dedb83..32d37247c7e5 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
>  KCOV_INSTRUMENT_mmzone.o := n
>  KCOV_INSTRUMENT_vmstat.o := n
>  
> +# Since __builtin_frame_address does work as used, disable the warning.
> +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
> +
>  mmu-y			:= nommu.o
>  mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
>  			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
> @@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
>  obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
>  obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
>  obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
> +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
> diff --git a/mm/usercopy.c b/mm/usercopy.c
> new file mode 100644
> index 000000000000..4161a1fb1909
> --- /dev/null
> +++ b/mm/usercopy.c
> @@ -0,0 +1,219 @@
> +/*
> + * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
> + * which are designed to protect kernel memory from needless exposure
> + * and overwrite under many unintended conditions. This code is based
> + * on PAX_USERCOPY, which is:
> + *
> + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
> + * Security Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/mm.h>
> +#include <linux/slab.h>
> +#include <asm/sections.h>
> +
> +/*
> + * Checks if a given pointer and length is contained by the current
> + * stack frame (if possible).
> + *
> + *	0: not at all on the stack
> + *	1: fully within a valid stack frame
> + *	2: fully on the stack (when can't do frame-checking)
> + *	-1: error condition (invalid stack position or bad stack frame)

Can we use enums? Makes it easier to read/debug

> + */
> +static noinline int check_stack_object(const void *obj, unsigned long len)
> +{
> +	const void * const stack = task_stack_page(current);
> +	const void * const stackend = stack + THREAD_SIZE;
> +	int ret;
> +
> +	/* Object is not on the stack at all. */
> +	if (obj + len <= stack || stackend <= obj)
> +		return 0;
> +
> +	/*
> +	 * Reject: object partially overlaps the stack (passing the
> +	 * check above means at least one end is within the stack,
> +	 * so if this check fails, the other end is outside the stack).
> +	 */
> +	if (obj < stack || stackend < obj + len)
> +		return -1;
> +
> +	/* Check if object is safely within a valid frame. */
> +	ret = arch_within_stack_frames(stack, stackend, obj, len);
> +	if (ret)
> +		return ret;
> +
> +	return 2;
> +}
> +
> +static void report_usercopy(const void *ptr, unsigned long len,
> +			    bool to_user, const char *type)
> +{
> +	pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
> +		to_user ? "exposure" : "overwrite",
> +		to_user ? "from" : "to", ptr, type ? : "unknown", len);
> +	dump_stack();
> +	do_group_exit(SIGKILL);

SIGKILL -- SIGBUS?

> +}
> +
> +/* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */
> +static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
> +		     unsigned long high)
> +{
> +	unsigned long check_low = (uintptr_t)ptr;
> +	unsigned long check_high = check_low + n;
> +
> +	/* Does not overlap if entirely above or entirely below. */
> +	if (check_low >= high || check_high < low)
> +		return false;
> +
> +	return true;
> +}
> +
> +/* Is this address range in the kernel text area? */
> +static inline const char *check_kernel_text_object(const void *ptr,
> +						   unsigned long n)
> +{
> +	unsigned long textlow = (unsigned long)_stext;
> +	unsigned long texthigh = (unsigned long)_etext;
> +
> +	if (overlaps(ptr, n, textlow, texthigh))
> +		return "<kernel text>";
> +
> +#ifdef CONFIG_HAVE_ARCH_LINEAR_KERNEL_MAPPING
> +	/* Check against linear mapping as well. */
> +	if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)),
> +		     (unsigned long)__va(__pa(texthigh))))
> +		return "<linear kernel text>";
> +#endif
> +
> +	return NULL;
> +}
> +
> +static inline const char *check_bogus_address(const void *ptr, unsigned long n)
> +{
> +	/* Reject if object wraps past end of memory. */
> +	if (ptr + n < ptr)
> +		return "<wrapped address>";
> +
> +	/* Reject if NULL or ZERO-allocation. */
> +	if (ZERO_OR_NULL_PTR(ptr))
> +		return "<null>";
> +
> +	return NULL;
> +}
> +
> +static inline const char *check_heap_object(const void *ptr, unsigned long n,
> +					    bool to_user)
> +{
> +	struct page *page, *endpage;
> +	const void *end = ptr + n - 1;
> +
> +	if (!virt_addr_valid(ptr))
> +		return NULL;
> +
> +	page = virt_to_head_page(ptr);
> +
> +	/* Check slab allocator for flags and size. */
> +	if (PageSlab(page))
> +		return __check_heap_object(ptr, n, page);
> +
> +	/*
> +	 * Sometimes the kernel data regions are not marked Reserved (see
> +	 * check below). And sometimes [_sdata,_edata) does not cover
> +	 * rodata and/or bss, so check each range explicitly.
> +	 */
> +
> +	/* Allow reads of kernel rodata region (if not marked as Reserved). */
> +	if (ptr >= (const void *)__start_rodata &&
> +	    end <= (const void *)__end_rodata) {
> +		if (!to_user)
> +			return "<rodata>";
> +		return NULL;
> +	}
> +
> +	/* Allow kernel data region (if not marked as Reserved). */
> +	if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
> +		return NULL;
> +
> +	/* Allow kernel bss region (if not marked as Reserved). */
> +	if (ptr >= (const void *)__bss_start &&
> +	    end <= (const void *)__bss_stop)
> +		return NULL;
> +
> +	/* Is the object wholly within one base page? */
> +	if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
> +		   ((unsigned long)end & (unsigned long)PAGE_MASK)))
> +		return NULL;
> +
> +	/* Allow if start and end are inside the same compound page. */
> +	endpage = virt_to_head_page(end);
> +	if (likely(endpage == page))
> +		return NULL;
> +
> +	/* Allow special areas, device memory, and sometimes kernel data. */
> +	if (PageReserved(page) && PageReserved(endpage))
> +		return NULL;

If we came here, it's likely that endpage > page, do we need to check
that only the first and last pages are reserved? What about the ones in
the middle?


> +
> +	/* Uh oh. The "object" spans several independently allocated pages. */
> +	return "<spans multiple pages>";
> +}
> +
> +/*
> + * Validates that the given object is one of:
> + * - known safe heap object
> + * - known safe stack object
> + * - not in kernel text
> + */
> +void __check_object_size(const void *ptr, unsigned long n, bool to_user)
> +{
> +	const char *err;
> +
> +	/* Skip all tests if size is zero. */
> +	if (!n)
> +		return;
> +
> +	/* Check for invalid addresses. */
> +	err = check_bogus_address(ptr, n);
> +	if (err)
> +		goto report;
> +
> +	/* Check for bad heap object. */
> +	err = check_heap_object(ptr, n, to_user);
> +	if (err)
> +		goto report;
> +
> +	/* Check for bad stack object. */
> +	switch (check_stack_object(ptr, n)) {
> +	case 0:
> +		/* Object is not touching the current process stack. */
> +		break;
> +	case 1:
> +	case 2:
> +		/*
> +		 * Object is either in the correct frame (when it
> +		 * is possible to check) or just generally on the
> +		 * process stack (when frame checking not available).
> +		 */
> +		return;
> +	default:
> +		err = "<process stack>";
> +		goto report;
> +	}
> +
> +	/* Check for object in kernel to avoid text exposure. */
> +	err = check_kernel_text_object(ptr, n);
> +	if (!err)
> +		return;
> +
> +report:
> +	report_usercopy(ptr, n, to_user, err);
> +}

Looks good otherwise

Balbir Singh
Rik van Riel July 15, 2016, 1:04 a.m. UTC | #2
On Fri, 2016-07-15 at 09:20 +1000, Balbir Singh wrote:

> > ==
> > +		   ((unsigned long)end & (unsigned
> > long)PAGE_MASK)))
> > +		return NULL;
> > +
> > +	/* Allow if start and end are inside the same compound
> > page. */
> > +	endpage = virt_to_head_page(end);
> > +	if (likely(endpage == page))
> > +		return NULL;
> > +
> > +	/* Allow special areas, device memory, and sometimes
> > kernel data. */
> > +	if (PageReserved(page) && PageReserved(endpage))
> > +		return NULL;
> 
> If we came here, it's likely that endpage > page, do we need to check
> that only the first and last pages are reserved? What about the ones
> in
> the middle?

I think this will be so rare, we can get away with just
checking the beginning and the end.
Education Directorate July 15, 2016, 1:41 a.m. UTC | #3
On Thu, Jul 14, 2016 at 09:04:18PM -0400, Rik van Riel wrote:
> On Fri, 2016-07-15 at 09:20 +1000, Balbir Singh wrote:
> 
> > > ==
> > > +		   ((unsigned long)end & (unsigned
> > > long)PAGE_MASK)))
> > > +		return NULL;
> > > +
> > > +	/* Allow if start and end are inside the same compound
> > > page. */
> > > +	endpage = virt_to_head_page(end);
> > > +	if (likely(endpage == page))
> > > +		return NULL;
> > > +
> > > +	/* Allow special areas, device memory, and sometimes
> > > kernel data. */
> > > +	if (PageReserved(page) && PageReserved(endpage))
> > > +		return NULL;
> > 
> > If we came here, it's likely that endpage > page, do we need to check
> > that only the first and last pages are reserved? What about the ones
> > in
> > the middle?
> 
> I think this will be so rare, we can get away with just
> checking the beginning and the end.
>

But do we want to leave a hole where an aware user space
can try a longer copy_* to avoid this check? If it is unlikely
should we just bite the bullet and do the check for the entire
range?

Balbir Singh.
Kees Cook July 15, 2016, 4:05 a.m. UTC | #4
On Thu, Jul 14, 2016 at 6:41 PM, Balbir Singh <bsingharora@gmail.com> wrote:
> On Thu, Jul 14, 2016 at 09:04:18PM -0400, Rik van Riel wrote:
>> On Fri, 2016-07-15 at 09:20 +1000, Balbir Singh wrote:
>>
>> > > ==
>> > > +            ((unsigned long)end & (unsigned
>> > > long)PAGE_MASK)))
>> > > +         return NULL;
>> > > +
>> > > + /* Allow if start and end are inside the same compound
>> > > page. */
>> > > + endpage = virt_to_head_page(end);
>> > > + if (likely(endpage == page))
>> > > +         return NULL;
>> > > +
>> > > + /* Allow special areas, device memory, and sometimes
>> > > kernel data. */
>> > > + if (PageReserved(page) && PageReserved(endpage))
>> > > +         return NULL;
>> >
>> > If we came here, it's likely that endpage > page, do we need to check
>> > that only the first and last pages are reserved? What about the ones
>> > in
>> > the middle?
>>
>> I think this will be so rare, we can get away with just
>> checking the beginning and the end.
>>
>
> But do we want to leave a hole where an aware user space
> can try a longer copy_* to avoid this check? If it is unlikely
> should we just bite the bullet and do the check for the entire
> range?

I'd be okay with expanding the test -- it should be an extremely rare
situation already since the common Reserved areas (kernel data) will
have already been explicitly tested.

What's the best way to do "next page"? Should it just be:

for ( ; page <= endpage ; ptr += PAGE_SIZE, page = virt_to_head_page(ptr) ) {
    if (!PageReserved(page))
        return "<spans multiple pages>";
}

return NULL;

?
Kees Cook July 15, 2016, 4:25 a.m. UTC | #5
On Thu, Jul 14, 2016 at 4:20 PM, Balbir Singh <bsingharora@gmail.com> wrote:
> On Wed, Jul 13, 2016 at 02:55:55PM -0700, Kees Cook wrote:
>> [...]
>> +++ b/mm/usercopy.c
>> @@ -0,0 +1,219 @@
>> [...]
>> +/*
>> + * Checks if a given pointer and length is contained by the current
>> + * stack frame (if possible).
>> + *
>> + *   0: not at all on the stack
>> + *   1: fully within a valid stack frame
>> + *   2: fully on the stack (when can't do frame-checking)
>> + *   -1: error condition (invalid stack position or bad stack frame)
>
> Can we use enums? Makes it easier to read/debug

Sure, I will update this.
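
For example, the return values could become an enum along these lines (just a
sketch of one possible naming, not necessarily what the next version will use):

        enum {
                BAD_STACK = -1, /* error: bad stack position or frame */
                NOT_STACK = 0,  /* not on the stack at all */
                GOOD_FRAME,     /* fully within a valid stack frame */
                GOOD_STACK,     /* on the stack, frame checking unavailable */
        };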

>> [...]
>> +static void report_usercopy(const void *ptr, unsigned long len,
>> +                         bool to_user, const char *type)
>> +{
>> +     pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
>> +             to_user ? "exposure" : "overwrite",
>> +             to_user ? "from" : "to", ptr, type ? : "unknown", len);
>> +     dump_stack();
>> +     do_group_exit(SIGKILL);
>
> SIGKILL -- SIGBUS?

I'd like to keep SIGKILL since it indicates a process fiddling with a
kernel bug. The real problem here is that there doesn't seem to be an
arch-independent way to Oops the kernel and kill a process ("die()" is
closest, but it's defined on a per-arch basis with varying arguments).
This could be a BUG, but I'd rather not panic the entire kernel.
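
For reference, the BUG() variant would only change the tail of
report_usercopy() (a sketch; the patch as posted keeps do_group_exit(SIGKILL)):

        /* Instead of dump_stack() + do_group_exit(SIGKILL): */
        BUG();  /* Oopses and kills the caller; panics only with panic_on_oops. */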

-Kees
Kees Cook July 15, 2016, 4:53 a.m. UTC | #6
On Thu, Jul 14, 2016 at 9:05 PM, Kees Cook <keescook@chromium.org> wrote:
> On Thu, Jul 14, 2016 at 6:41 PM, Balbir Singh <bsingharora@gmail.com> wrote:
>> On Thu, Jul 14, 2016 at 09:04:18PM -0400, Rik van Riel wrote:
>>> On Fri, 2016-07-15 at 09:20 +1000, Balbir Singh wrote:
>>>
>>> > > ==
>>> > > +            ((unsigned long)end & (unsigned
>>> > > long)PAGE_MASK)))
>>> > > +         return NULL;
>>> > > +
>>> > > + /* Allow if start and end are inside the same compound
>>> > > page. */
>>> > > + endpage = virt_to_head_page(end);
>>> > > + if (likely(endpage == page))
>>> > > +         return NULL;
>>> > > +
>>> > > + /* Allow special areas, device memory, and sometimes
>>> > > kernel data. */
>>> > > + if (PageReserved(page) && PageReserved(endpage))
>>> > > +         return NULL;
>>> >
>>> > If we came here, it's likely that endpage > page, do we need to check
>>> > that only the first and last pages are reserved? What about the ones
>>> > in
>>> > the middle?
>>>
>>> I think this will be so rare, we can get away with just
>>> checking the beginning and the end.
>>>
>>
>> But do we want to leave a hole where an aware user space
>> can try a longer copy_* to avoid this check? If it is unlikely
>> should we just bite the bullet and do the check for the entire
>> range?
>
> I'd be okay with expanding the test -- it should be an extremely rare
> situation already since the common Reserved areas (kernel data) will
> have already been explicitly tested.
>
> What's the best way to do "next page"? Should it just be:
>
> for ( ; page <= endpage ; ptr += PAGE_SIZE, page = virt_to_head_page(ptr) ) {
>     if (!PageReserved(page))
>         return "<spans multiple pages>";
> }
>
> return NULL;
>
> ?

Er, I was testing the wrong thing. How about:

        /*
         * Reject if range is not Reserved (i.e. special or device memory),
         * since then the object spans several independently allocated pages.
         */
        for (; ptr <= end ; ptr += PAGE_SIZE, page = virt_to_head_page(ptr)) {
                if (!PageReserved(page))
                        return "<spans multiple pages>";
        }

        return NULL;
Education Directorate July 15, 2016, 12:55 p.m. UTC | #7
On Thu, Jul 14, 2016 at 09:53:31PM -0700, Kees Cook wrote:
> On Thu, Jul 14, 2016 at 9:05 PM, Kees Cook <keescook@chromium.org> wrote:
> > On Thu, Jul 14, 2016 at 6:41 PM, Balbir Singh <bsingharora@gmail.com> wrote:
> >> On Thu, Jul 14, 2016 at 09:04:18PM -0400, Rik van Riel wrote:
> >>> On Fri, 2016-07-15 at 09:20 +1000, Balbir Singh wrote:
> >>>
> >>> > > ==
> >>> > > +            ((unsigned long)end & (unsigned
> >>> > > long)PAGE_MASK)))
> >>> > > +         return NULL;
> >>> > > +
> >>> > > + /* Allow if start and end are inside the same compound
> >>> > > page. */
> >>> > > + endpage = virt_to_head_page(end);
> >>> > > + if (likely(endpage == page))
> >>> > > +         return NULL;
> >>> > > +
> >>> > > + /* Allow special areas, device memory, and sometimes
> >>> > > kernel data. */
> >>> > > + if (PageReserved(page) && PageReserved(endpage))
> >>> > > +         return NULL;
> >>> >
> >>> > If we came here, it's likely that endpage > page, do we need to check
> >>> > that only the first and last pages are reserved? What about the ones
> >>> > in
> >>> > the middle?
> >>>
> >>> I think this will be so rare, we can get away with just
> >>> checking the beginning and the end.
> >>>
> >>
> >> But do we want to leave a hole where an aware user space
> >> can try a longer copy_* to avoid this check? If it is unlikely
> >> should we just bite the bullet and do the check for the entire
> >> range?
> >
> > I'd be okay with expanding the test -- it should be an extremely rare
> > situation already since the common Reserved areas (kernel data) will
> > have already been explicitly tested.
> >
> > What's the best way to do "next page"? Should it just be:
> >
> > for ( ; page <= endpage ; ptr += PAGE_SIZE, page = virt_to_head_page(ptr) ) {
> >     if (!PageReserved(page))
> >         return "<spans multiple pages>";
> > }
> >
> > return NULL;
> >
> > ?
> 
> Er, I was testing the wrong thing. How about:
> 
>         /*
>          * Reject if range is not Reserved (i.e. special or device memory),
>          * since then the object spans several independently allocated pages.
>          */
>         for (; ptr <= end ; ptr += PAGE_SIZE, page = virt_to_head_page(ptr)) {
>                 if (!PageReserved(page))
>                         return "<spans multiple pages>";
>         }
> 
>         return NULL;

That looks reasonable to me

Balbir
Daniel Micay July 15, 2016, 7 p.m. UTC | #8
> This could be a BUG, but I'd rather not panic the entire kernel.

It seems unlikely that it will panic without panic_on_oops and that's
an explicit opt-in to taking down the system on kernel logic errors
exactly like this. In grsecurity, it calls the kernel exploit handling
logic (panic if root, otherwise kill all process of that user and ban
them until reboot) but that same logic is also called for BUG via oops
handling so there's only really a distinction with panic_on_oops=1.

Does it make sense to be less fatal for a fatal assertion that's more
likely to be security-related? Maybe you're worried about having some
false positives for the whitelisting portion, but I don't think those
will lurk around very long with the way this works.
Kees Cook July 15, 2016, 7:14 p.m. UTC | #9
On Fri, Jul 15, 2016 at 12:00 PM, Daniel Micay <danielmicay@gmail.com> wrote:
>> This could be a BUG, but I'd rather not panic the entire kernel.
>
> It seems unlikely that it will panic without panic_on_oops and that's
> an explicit opt-in to taking down the system on kernel logic errors
> exactly like this. In grsecurity, it calls the kernel exploit handling
> logic (panic if root, otherwise kill all process of that user and ban
> them until reboot) but that same logic is also called for BUG via oops
> handling so there's only really a distinction with panic_on_oops=1.
>
> Does it make sense to be less fatal for a fatal assertion that's more
> likely to be security-related? Maybe you're worried about having some
> false positives for the whitelisting portion, but I don't think those
> will lurk around very long with the way this works.

I'd like it to dump stack and be fatal to the process involved, but
yeah, I guess BUG() would work. Creating an infrastructure for
handling security-related Oopses can be done separately from this (and
I'd like to see that added, since it's a nice bit of configurable
reactivity to possible attacks).

-Kees
Daniel Micay July 15, 2016, 7:19 p.m. UTC | #10
> I'd like it to dump stack and be fatal to the process involved, but
> yeah, I guess BUG() would work. Creating an infrastructure for
> handling security-related Oopses can be done separately from this
> (and
> I'd like to see that added, since it's a nice bit of configurable
> reactivity to possible attacks).

In grsecurity, the oops handling also uses do_group_exit instead of
do_exit but both that change (or at least the option to do it) and the
exploit handling could be done separately from this without actually
needing special treatment for USERCOPY. Could expose is as something
like panic_on_oops=2 as a balance between the existing options.
Kees Cook July 15, 2016, 7:23 p.m. UTC | #11
On Fri, Jul 15, 2016 at 12:19 PM, Daniel Micay <danielmicay@gmail.com> wrote:
>> I'd like it to dump stack and be fatal to the process involved, but
>> yeah, I guess BUG() would work. Creating an infrastructure for
>> handling security-related Oopses can be done separately from this
>> (and
>> I'd like to see that added, since it's a nice bit of configurable
>> reactivity to possible attacks).
>
> In grsecurity, the oops handling also uses do_group_exit instead of
> do_exit but both that change (or at least the option to do it) and the
> exploit handling could be done separately from this without actually
> needing special treatment for USERCOPY. Could expose it as something
> like panic_on_oops=2 as a balance between the existing options.

I'm also uncomfortable about BUG() being removed by unsetting
CONFIG_BUG, but that seems unlikely. :)

-Kees

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 5e2776562035..195ee4cc939a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -433,6 +433,13 @@  config HAVE_ARCH_WITHIN_STACK_FRAMES
 	  and similar) by implementing an inline arch_within_stack_frames(),
 	  which is used by CONFIG_HARDENED_USERCOPY.
 
+config HAVE_ARCH_LINEAR_KERNEL_MAPPING
+	bool
+	help
+	  An architecture should select this if it has a secondary linear
+	  mapping of the kernel text. This is used to verify that kernel
+	  text exposures are not visible under CONFIG_HARDENED_USERCOPY.
+
 config HAVE_CONTEXT_TRACKING
 	bool
 	help
diff --git a/include/linux/slab.h b/include/linux/slab.h
index aeb3e6d00a66..96a16a3fb7cb 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -155,6 +155,18 @@  void kfree(const void *);
 void kzfree(const void *);
 size_t ksize(const void *);
 
+#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
+const char *__check_heap_object(const void *ptr, unsigned long n,
+				struct page *page);
+#else
+static inline const char *__check_heap_object(const void *ptr,
+					      unsigned long n,
+					      struct page *page)
+{
+	return NULL;
+}
+#endif
+
 /*
  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
  * alignment larger than the alignment of a 64-bit integer.
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 3d5c80b4391d..f24b99eac969 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -155,6 +155,21 @@  static inline int arch_within_stack_frames(const void * const stack,
 }
 #endif
 
+#ifdef CONFIG_HARDENED_USERCOPY
+extern void __check_object_size(const void *ptr, unsigned long n,
+					bool to_user);
+
+static inline void check_object_size(const void *ptr, unsigned long n,
+				     bool to_user)
+{
+	__check_object_size(ptr, n, to_user);
+}
+#else
+static inline void check_object_size(const void *ptr, unsigned long n,
+				     bool to_user)
+{ }
+#endif /* CONFIG_HARDENED_USERCOPY */
+
 #endif	/* __KERNEL__ */
 
 #endif /* _LINUX_THREAD_INFO_H */
diff --git a/mm/Makefile b/mm/Makefile
index 78c6f7dedb83..32d37247c7e5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,6 +21,9 @@  KCOV_INSTRUMENT_memcontrol.o := n
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
 
+# Since __builtin_frame_address does work as used, disable the warning.
+CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
+
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -99,3 +102,4 @@  obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
 obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
+obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/usercopy.c b/mm/usercopy.c
new file mode 100644
index 000000000000..4161a1fb1909
--- /dev/null
+++ b/mm/usercopy.c
@@ -0,0 +1,219 @@ 
+/*
+ * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
+ * which are designed to protect kernel memory from needless exposure
+ * and overwrite under many unintended conditions. This code is based
+ * on PAX_USERCOPY, which is:
+ *
+ * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
+ * Security Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+
+/*
+ * Checks if a given pointer and length is contained by the current
+ * stack frame (if possible).
+ *
+ *	0: not at all on the stack
+ *	1: fully within a valid stack frame
+ *	2: fully on the stack (when can't do frame-checking)
+ *	-1: error condition (invalid stack position or bad stack frame)
+ */
+static noinline int check_stack_object(const void *obj, unsigned long len)
+{
+	const void * const stack = task_stack_page(current);
+	const void * const stackend = stack + THREAD_SIZE;
+	int ret;
+
+	/* Object is not on the stack at all. */
+	if (obj + len <= stack || stackend <= obj)
+		return 0;
+
+	/*
+	 * Reject: object partially overlaps the stack (passing the
+	 * check above means at least one end is within the stack,
+	 * so if this check fails, the other end is outside the stack).
+	 */
+	if (obj < stack || stackend < obj + len)
+		return -1;
+
+	/* Check if object is safely within a valid frame. */
+	ret = arch_within_stack_frames(stack, stackend, obj, len);
+	if (ret)
+		return ret;
+
+	return 2;
+}
+
+static void report_usercopy(const void *ptr, unsigned long len,
+			    bool to_user, const char *type)
+{
+	pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
+		to_user ? "exposure" : "overwrite",
+		to_user ? "from" : "to", ptr, type ? : "unknown", len);
+	dump_stack();
+	do_group_exit(SIGKILL);
+}
+
+/* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */
+static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
+		     unsigned long high)
+{
+	unsigned long check_low = (uintptr_t)ptr;
+	unsigned long check_high = check_low + n;
+
+	/* Does not overlap if entirely above or entirely below. */
+	if (check_low >= high || check_high < low)
+		return false;
+
+	return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline const char *check_kernel_text_object(const void *ptr,
+						   unsigned long n)
+{
+	unsigned long textlow = (unsigned long)_stext;
+	unsigned long texthigh = (unsigned long)_etext;
+
+	if (overlaps(ptr, n, textlow, texthigh))
+		return "<kernel text>";
+
+#ifdef CONFIG_HAVE_ARCH_LINEAR_KERNEL_MAPPING
+	/* Check against linear mapping as well. */
+	if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)),
+		     (unsigned long)__va(__pa(texthigh))))
+		return "<linear kernel text>";
+#endif
+
+	return NULL;
+}
+
+static inline const char *check_bogus_address(const void *ptr, unsigned long n)
+{
+	/* Reject if object wraps past end of memory. */
+	if (ptr + n < ptr)
+		return "<wrapped address>";
+
+	/* Reject if NULL or ZERO-allocation. */
+	if (ZERO_OR_NULL_PTR(ptr))
+		return "<null>";
+
+	return NULL;
+}
+
+static inline const char *check_heap_object(const void *ptr, unsigned long n,
+					    bool to_user)
+{
+	struct page *page, *endpage;
+	const void *end = ptr + n - 1;
+
+	if (!virt_addr_valid(ptr))
+		return NULL;
+
+	page = virt_to_head_page(ptr);
+
+	/* Check slab allocator for flags and size. */
+	if (PageSlab(page))
+		return __check_heap_object(ptr, n, page);
+
+	/*
+	 * Sometimes the kernel data regions are not marked Reserved (see
+	 * check below). And sometimes [_sdata,_edata) does not cover
+	 * rodata and/or bss, so check each range explicitly.
+	 */
+
+	/* Allow reads of kernel rodata region (if not marked as Reserved). */
+	if (ptr >= (const void *)__start_rodata &&
+	    end <= (const void *)__end_rodata) {
+		if (!to_user)
+			return "<rodata>";
+		return NULL;
+	}
+
+	/* Allow kernel data region (if not marked as Reserved). */
+	if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
+		return NULL;
+
+	/* Allow kernel bss region (if not marked as Reserved). */
+	if (ptr >= (const void *)__bss_start &&
+	    end <= (const void *)__bss_stop)
+		return NULL;
+
+	/* Is the object wholly within one base page? */
+	if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
+		   ((unsigned long)end & (unsigned long)PAGE_MASK)))
+		return NULL;
+
+	/* Allow if start and end are inside the same compound page. */
+	endpage = virt_to_head_page(end);
+	if (likely(endpage == page))
+		return NULL;
+
+	/* Allow special areas, device memory, and sometimes kernel data. */
+	if (PageReserved(page) && PageReserved(endpage))
+		return NULL;
+
+	/* Uh oh. The "object" spans several independently allocated pages. */
+	return "<spans multiple pages>";
+}
+
+/*
+ * Validates that the given object is one of:
+ * - known safe heap object
+ * - known safe stack object
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+	const char *err;
+
+	/* Skip all tests if size is zero. */
+	if (!n)
+		return;
+
+	/* Check for invalid addresses. */
+	err = check_bogus_address(ptr, n);
+	if (err)
+		goto report;
+
+	/* Check for bad heap object. */
+	err = check_heap_object(ptr, n, to_user);
+	if (err)
+		goto report;
+
+	/* Check for bad stack object. */
+	switch (check_stack_object(ptr, n)) {
+	case 0:
+		/* Object is not touching the current process stack. */
+		break;
+	case 1:
+	case 2:
+		/*
+		 * Object is either in the correct frame (when it
+		 * is possible to check) or just generally on the
+		 * process stack (when frame checking not available).
+		 */
+		return;
+	default:
+		err = "<process stack>";
+		goto report;
+	}
+
+	/* Check for object in kernel to avoid text exposure. */
+	err = check_kernel_text_object(ptr, n);
+	if (!err)
+		return;
+
+report:
+	report_usercopy(ptr, n, to_user, err);
+}
+EXPORT_SYMBOL(__check_object_size);
diff --git a/security/Kconfig b/security/Kconfig
index 176758cdfa57..63340ad0b9f9 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -118,6 +118,33 @@  config LSM_MMAP_MIN_ADDR
 	  this low address space will need the permission specific to the
 	  systems running LSM.
 
+config HAVE_HARDENED_USERCOPY_ALLOCATOR
+	bool
+	help
+	  The heap allocator implements __check_heap_object() for
+	  validating memory ranges against heap object sizes in
+	  support of CONFIG_HARDENED_USERCOPY.
+
+config HAVE_ARCH_HARDENED_USERCOPY
+	bool
+	help
+	  The architecture supports CONFIG_HARDENED_USERCOPY by
+	  calling check_object_size() just before performing the
+	  userspace copies in the low level implementation of
+	  copy_to_user() and copy_from_user().
+
+config HARDENED_USERCOPY
+	bool "Harden memory copies between kernel and userspace"
+	depends on HAVE_ARCH_HARDENED_USERCOPY
+	help
+	  This option checks for obviously wrong memory regions when
+	  copying memory to/from the kernel (via copy_to_user() and
+	  copy_from_user() functions) by rejecting memory ranges that
+	  are larger than the specified heap object, span multiple
	  separately allocated pages, are not on the process stack,
+	  or are part of the kernel text. This kills entire classes
+	  of heap overflow exploits and similar kernel memory exposures.
+
 source security/selinux/Kconfig
 source security/smack/Kconfig
 source security/tomoyo/Kconfig