[RFC,1/8] Introduce rare_write() infrastructure

Message ID 1488228186-110679-2-git-send-email-keescook@chromium.org (mailing list archive)
State New, archived

Commit Message

Kees Cook Feb. 27, 2017, 8:42 p.m. UTC
Several types of data storage exist in the kernel: read-write data (.data,
.bss), read-only data (.rodata), and RO-after-init. This introduces the
infrastructure for another type: write-rarely, which is intended for data
that is either only rarely modified or especially security-sensitive. The
intent is to further reduce the internal attack surface of the kernel by
making this storage read-only when "at rest". This makes it much harder
for attackers who have a kernel-write flaw to subvert, since they cannot
directly change the memory contents.

Variables declared __wr_rare will be made const when an architecture
supports HAVE_ARCH_RARE_WRITE. To change these variables, either the
rare_write() macro can be used, or multiple uses of __rare_write() can
be wrapped in the rare_write_enable()/rare_write_disable() macros. These
macros are backed by arch-specific functions that perform the actions
needed to write to otherwise read-only memory.

The arch-specific helpers must not allow non-current CPUs to write the
memory area, must run non-preemptibly to avoid accidentally leaving
memory writable, and must be defined inline to avoid making them
desirable ROP targets for attackers.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/Kconfig             | 15 +++++++++++++++
 include/linux/compiler.h | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/preempt.h  |  6 ++++--
 3 files changed, 57 insertions(+), 2 deletions(-)
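
For illustration, here is a minimal usage sketch of the API this patch
introduces. The widget variables and functions are hypothetical, not part
of the series:

#include <linux/compiler.h>
#include <linux/types.h>

/* Hypothetical wr-rare data: read-only at rest once the architecture
 * selects HAVE_ARCH_RARE_WRITE. */
static unsigned long widget_limit __wr_rare = 16;
static bool widgets_enabled __wr_rare = true;

/* Single update: rare_write() opens and closes the window itself. */
void set_widget_limit(unsigned long limit)
{
	rare_write(widget_limit, limit);
}

/* Batched updates: hold the window open across several __rare_write()
 * calls so the mapping is only flipped once. */
void disable_widgets(void)
{
	rare_write_enable();
	__rare_write(widget_limit, 0);
	__rare_write(widgets_enabled, false);
	rare_write_disable();
}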

Comments

Hoeun Ryu Feb. 28, 2017, 8:22 a.m. UTC | #1
On Mon, 2017-02-27 at 12:42 -0800, Kees Cook wrote:
> Several types of data storage exist in the kernel: read-write data
> (.data,
> .bss), read-only data (.rodata), and RO-after-init. This introduces
> the
> infrastructure for another type: write-rarely, which intended for
> data
> that is either only rarely modified or especially security-sensitive. 
> The
> intent is to further reduce the internal attack surface of the kernel
> by
> making this storage read-only when "at rest". This makes it much
> harder
> to be subverted by attackers who have a kernel-write flaw, since they
> cannot directly change the memory contents.
> 
> Variables declared __wr_rare will be made const when an architecture
> supports HAVE_ARCH_WRITE_RARE. To change these variables, either the
> rare_write() macro can be used, or multiple uses of __rare_write(),
> wrapped in rare_write_enable()/rare_write_disable() macros. These
> macros
> are handled by the arch-specific functions that perform the actions
> needed
> to write to otherwise read-only memory.
> 
> The arch-specific helpers must be not allow non-current CPUs to write
> the memory area, run non-preemptible to avoid accidentally leaving
> memory writable, and defined as inline to avoid making them desirable
> ROP targets for attackers.
> 
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
>  arch/Kconfig             | 15 +++++++++++++++
>  include/linux/compiler.h | 38 ++++++++++++++++++++++++++++++++++++++
>  include/linux/preempt.h  |  6 ++++--
>  3 files changed, 57 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 99839c23d453..2446de19f66d 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -781,4 +781,19 @@ config VMAP_STACK
>  	  the stack to map directly to the KASAN shadow map using a
> formula
>  	  that is incorrect if the stack is in vmalloc space.
>  
> +config HAVE_ARCH_RARE_WRITE
> +	def_bool n
> +	help
> +	  An arch should select this option if it has defined the
> functions
> +	  __arch_rare_write_map() and __arch_rare_write_unmap() to
> +	  respectively enable and disable writing to read-only
> memory.The
> +	  routines must meet the following requirements:
> +	  - read-only memory writing must only be available on the
> current
> +	    CPU (to make sure other CPUs can't race to make changes
> too).
> +	  - the routines must be declared inline (to discourage ROP
> use).
> +	  - the routines must not be preemptible (likely they will
> call
> +	    preempt_disable() and preempt_enable_no_resched()
> respectively).
> +	  - the routines must validate expected state (e.g. when
> enabling
> +	    writes, BUG() if writes are already be enabled).
> +
>  source "kernel/gcov/Kconfig"
> diff --git a/include/linux/compiler.h b/include/linux/compiler.h
> index cf0fa5d86059..f95603a8ee72 100644
> --- a/include/linux/compiler.h
> +++ b/include/linux/compiler.h
> @@ -325,6 +325,44 @@ static __always_inline void
> __write_once_size(volatile void *p, void *res, int s
>  	__u.__val;					\
>  })
>  
> +/*
> + * Build "write rarely" infrastructure for flipping memory r/w
> + * on a per-CPU basis.
> + */
> +#ifndef CONFIG_HAVE_ARCH_RARE_WRITE
> +# define __wr_rare
> +# define __wr_rare_type
> +# define __rare_write_type(v)	typeof(v)
> +# define __rare_write_ptr(v)	(&(v))
> +# define __rare_write(__var, __val)	({		\
> +	__var = __val;					\
> +	__var;						\
> +})
> +# define rare_write_enable()	do { } while (0)
> +# define rare_write_disable()	do { } while (0)
> +#else
> +# define __wr_rare		__ro_after_init
> +# define __wr_rare_type		const
> +# define __rare_write_type(v)	typeof((typeof(v))0)
> +# define __rare_write_ptr(v)	((__rare_write_type(v) *)&(v))

Should we make __rare_write_ptr arch-specific so architectures can
convert the pointer to an RW alias of the RO address, like this? [1]

#ifndef __arch_rare_write_ptr
# define __rare_write_ptr(v)	((__rare_write_type(v) *)&(v))
#else
# define __rare_write_ptr(v)	__arch_rare_write_ptr(v)
#endif

[1] https://lkml.org/lkml/2017/2/22/254

> +# define __rare_write(__var, __val) ({			\
> +	__rare_write_type(__var) *__rw_var;		\
> +							\
> +	__rw_var = __rare_write_ptr(__var);		\
> +	*__rw_var = (__val);				\
> +	__var;						\
> +})
> +# define rare_write_enable()	__arch_rare_write_map()
> +# define rare_write_disable()	__arch_rare_write_unmap()
> +#endif
> +
> +#define rare_write(__var, __val) ({			\
> +	rare_write_enable();				\
> +	__rare_write(__var, __val);			\
> +	rare_write_disable();				\
> +	__var;						\
> +})
> +
>  #endif /* __KERNEL__ */
>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index 7eeceac52dea..183c1d7a8594 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -237,10 +237,12 @@ do { \
>  /*
>   * Modules have no business playing preemption tricks.
>   */
> -#undef sched_preempt_enable_no_resched
> -#undef preempt_enable_no_resched
>  #undef preempt_enable_no_resched_notrace
>  #undef preempt_check_resched
> +#ifndef CONFIG_HAVE_ARCH_RARE_WRITE
> +#undef sched_preempt_enable_no_resched
> +#undef preempt_enable_no_resched
> +#endif
>  #endif
>  
>  #define preempt_set_need_resched() \
Kees Cook Feb. 28, 2017, 3:05 p.m. UTC | #2
On Tue, Feb 28, 2017 at 12:22 AM, Hoeun Ryu <hoeun.ryu@gmail.com> wrote:
> On Mon, 2017-02-27 at 12:42 -0800, Kees Cook wrote:
>> Several types of data storage exist in the kernel: read-write data
>> (.data,
>> .bss), read-only data (.rodata), and RO-after-init. This introduces
>> the
>> infrastructure for another type: write-rarely, which intended for
>> data
>> that is either only rarely modified or especially security-sensitive.
>> The
>> intent is to further reduce the internal attack surface of the kernel
>> by
>> making this storage read-only when "at rest". This makes it much
>> harder
>> to be subverted by attackers who have a kernel-write flaw, since they
>> cannot directly change the memory contents.
>>
>> Variables declared __wr_rare will be made const when an architecture
>> supports HAVE_ARCH_WRITE_RARE. To change these variables, either the
>> rare_write() macro can be used, or multiple uses of __rare_write(),
>> wrapped in rare_write_enable()/rare_write_disable() macros. These
>> macros
>> are handled by the arch-specific functions that perform the actions
>> needed
>> to write to otherwise read-only memory.
>>
>> The arch-specific helpers must be not allow non-current CPUs to write
>> the memory area, run non-preemptible to avoid accidentally leaving
>> memory writable, and defined as inline to avoid making them desirable
>> ROP targets for attackers.
>>
>> Signed-off-by: Kees Cook <keescook@chromium.org>
>> ---
>>  arch/Kconfig             | 15 +++++++++++++++
>>  include/linux/compiler.h | 38 ++++++++++++++++++++++++++++++++++++++
>>  include/linux/preempt.h  |  6 ++++--
>>  3 files changed, 57 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 99839c23d453..2446de19f66d 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -781,4 +781,19 @@ config VMAP_STACK
>>         the stack to map directly to the KASAN shadow map using a
>> formula
>>         that is incorrect if the stack is in vmalloc space.
>>
>> +config HAVE_ARCH_RARE_WRITE
>> +     def_bool n
>> +     help
>> +       An arch should select this option if it has defined the
>> functions
>> +       __arch_rare_write_map() and __arch_rare_write_unmap() to
>> +       respectively enable and disable writing to read-only
>> memory.The
>> +       routines must meet the following requirements:
>> +       - read-only memory writing must only be available on the
>> current
>> +         CPU (to make sure other CPUs can't race to make changes
>> too).
>> +       - the routines must be declared inline (to discourage ROP
>> use).
>> +       - the routines must not be preemptible (likely they will
>> call
>> +         preempt_disable() and preempt_enable_no_resched()
>> respectively).
>> +       - the routines must validate expected state (e.g. when
>> enabling
>> +         writes, BUG() if writes are already be enabled).
>> +
>>  source "kernel/gcov/Kconfig"
>> diff --git a/include/linux/compiler.h b/include/linux/compiler.h
>> index cf0fa5d86059..f95603a8ee72 100644
>> --- a/include/linux/compiler.h
>> +++ b/include/linux/compiler.h
>> @@ -325,6 +325,44 @@ static __always_inline void
>> __write_once_size(volatile void *p, void *res, int s
>>       __u.__val;                                      \
>>  })
>>
>> +/*
>> + * Build "write rarely" infrastructure for flipping memory r/w
>> + * on a per-CPU basis.
>> + */
>> +#ifndef CONFIG_HAVE_ARCH_RARE_WRITE
>> +# define __wr_rare
>> +# define __wr_rare_type
>> +# define __rare_write_type(v)        typeof(v)
>> +# define __rare_write_ptr(v) (&(v))
>> +# define __rare_write(__var, __val)  ({              \
>> +     __var = __val;                                  \
>> +     __var;                                          \
>> +})
>> +# define rare_write_enable() do { } while (0)
>> +# define rare_write_disable()        do { } while (0)
>> +#else
>> +# define __wr_rare           __ro_after_init
>> +# define __wr_rare_type              const
>> +# define __rare_write_type(v)        typeof((typeof(v))0)
>> +# define __rare_write_ptr(v) ((__rare_write_type(v) *)&(v))
>
> Should we make __rare_write_ptr arch-specific so architectures can
> convert the pointer to rw alias from ro address like this ? [1]
>
> #ifndef __arch_rare_write_ptr
> # define __rare_write_ptr(v)    ((__rare_write_type(v) *)&(v))
> #else
> # define __rate_write_ptr(v)    __arch_rare_write_ptr
> #endif

I think that was Mark's idea too, but I'm not sure how to handle the
complex cases of doing multiple updates at once (e.g. linked list
updates, etc). I think we're better off doing a full disabling of RO
protection, but there have been objections to that idea in the past,
which is why I included the "complex" example; I don't see a way
around it.

-Kees
Mark Rutland March 1, 2017, 10:43 a.m. UTC | #3
Hi,

On Tue, Feb 28, 2017 at 07:05:32AM -0800, Kees Cook wrote:
> On Tue, Feb 28, 2017 at 12:22 AM, Hoeun Ryu <hoeun.ryu@gmail.com> wrote:
> > On Mon, 2017-02-27 at 12:42 -0800, Kees Cook wrote:

> >> +/*
> >> + * Build "write rarely" infrastructure for flipping memory r/w
> >> + * on a per-CPU basis.
> >> + */
> >> +#ifndef CONFIG_HAVE_ARCH_RARE_WRITE
> >> +# define __wr_rare
> >> +# define __wr_rare_type
> >> +# define __rare_write_type(v)        typeof(v)
> >> +# define __rare_write_ptr(v) (&(v))
> >> +# define __rare_write(__var, __val)  ({              \
> >> +     __var = __val;                                  \
> >> +     __var;                                          \
> >> +})
> >> +# define rare_write_enable() do { } while (0)
> >> +# define rare_write_disable()        do { } while (0)
> >> +#else
> >> +# define __wr_rare           __ro_after_init
> >> +# define __wr_rare_type              const
> >> +# define __rare_write_type(v)        typeof((typeof(v))0)
> >> +# define __rare_write_ptr(v) ((__rare_write_type(v) *)&(v))
> >
> > Should we make __rare_write_ptr arch-specific so architectures can
> > convert the pointer to rw alias from ro address like this ? [1]
> >
> > #ifndef __arch_rare_write_ptr
> > # define __rare_write_ptr(v)    ((__rare_write_type(v) *)&(v))
> > #else
> > # define __rate_write_ptr(v)    __arch_rare_write_ptr
> > #endif
> 
> I think that was Mark's idea too, but I'm not sure to handle the
> complex cases of doing multiple updates at once (e.g. linked list
> updates, etc). 

Assuming there aren't that many places with complex updates, we could
allow code to explicitly do the map/unmap and use __rare_write_ptr
directly. That does mean that the list code has to be specialised for
__wr_rare, which is less than ideal.

That would also depend on what is __wr_rare in those cases, too, i.e.
is it just the list_head for the list itself, the list_head of all
elements, or only some of them?

For the latter mixed case, the __rare_write_ptr() approach probably
doesn't work, and that rules out the mm-based approach, too, I guess. :/

> I think we're better doing a full disabling of RO protection, but
> there had been objections to this idea in the past, which is why I
> included the "complex" example, so I don't see a way around it.

My objection to that was that we can't implement CPU-local full
disabling of RO protection for the kernel page tables on some
architectures and configurations, e.g. arm64, or 32-bit arm with LPAE.

The RW alias rare_write(var, val) approach only relies on what arches
already have to implement for userspace to work, so if we can figure out
how to make that work API-wise, we can probably implement that
generically with switch_mm() and {get,put}_user().

The only other way I can think to make this work would be to make a copy
of the whole swapper page tables, with the write-rarely data marked RW.
For arm64 at least, that'll be incredibly painful to keep in-sync with
the usual tables, in addition to being very expensive to switch to/from.

Thanks,
Mark.
Kees Cook March 1, 2017, 8:13 p.m. UTC | #4
On Wed, Mar 1, 2017 at 2:43 AM, Mark Rutland <mark.rutland@arm.com> wrote:
> Hi,
>
> On Tue, Feb 28, 2017 at 07:05:32AM -0800, Kees Cook wrote:
>> On Tue, Feb 28, 2017 at 12:22 AM, Hoeun Ryu <hoeun.ryu@gmail.com> wrote:
>> > On Mon, 2017-02-27 at 12:42 -0800, Kees Cook wrote:
>
>> >> +/*
>> >> + * Build "write rarely" infrastructure for flipping memory r/w
>> >> + * on a per-CPU basis.
>> >> + */
>> >> +#ifndef CONFIG_HAVE_ARCH_RARE_WRITE
>> >> +# define __wr_rare
>> >> +# define __wr_rare_type
>> >> +# define __rare_write_type(v)        typeof(v)
>> >> +# define __rare_write_ptr(v) (&(v))
>> >> +# define __rare_write(__var, __val)  ({              \
>> >> +     __var = __val;                                  \
>> >> +     __var;                                          \
>> >> +})
>> >> +# define rare_write_enable() do { } while (0)
>> >> +# define rare_write_disable()        do { } while (0)
>> >> +#else
>> >> +# define __wr_rare           __ro_after_init
>> >> +# define __wr_rare_type              const
>> >> +# define __rare_write_type(v)        typeof((typeof(v))0)
>> >> +# define __rare_write_ptr(v) ((__rare_write_type(v) *)&(v))
>> >
>> > Should we make __rare_write_ptr arch-specific so architectures can
>> > convert the pointer to rw alias from ro address like this ? [1]
>> >
>> > #ifndef __arch_rare_write_ptr
>> > # define __rare_write_ptr(v)    ((__rare_write_type(v) *)&(v))
>> > #else
>> > # define __rate_write_ptr(v)    __arch_rare_write_ptr
>> > #endif
>>
>> I think that was Mark's idea too, but I'm not sure to handle the
>> complex cases of doing multiple updates at once (e.g. linked list
>> updates, etc).
>
> Assuming there aren't that many places with complex updates, we could
> allow code to explicitly do the map/unmap and use __rare_write_ptr
> directly. That does mean that the list code has to be specialised for
> __wr_rare, which is less than ideal.

Here's some sed output: http://paste.ubuntu.com/24092015/

grsecurity currently has 314 instances of pax_open/close_kernel()
pairs. About half of them wrap a single line of code, but there is a
lot of variation in how they're used:

count  lines-of-code
    164 1
     72 2
     21 3
     20 4
      2 5
      8 6
      3 7
      2 8
      1 9
     18 10+

The obvious helpers are in the list code (for double, rcu, and single
lists), with most header-file uses operating on page tables.

The list code is already specialized (see the rare_list_add()/del()
functions), and could be made even more explicit if needed.
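
As an aside, a doubly-linked insert shows why these helpers need a
multi-write window: one operation updates several protected locations.
Here is a hedged sketch under this patch's API (assuming the list_head
fields live in wr-rare memory; this is not grsecurity's actual code):

#include <linux/list.h>
#include <linux/compiler.h>

/* Sketch only: insert @new after @head when both nodes' list_head
 * fields sit in write-rarely memory. Four protected pointers change,
 * so the write window must span all four updates. */
static inline void rare_list_add(struct list_head *new,
				 struct list_head *head)
{
	struct list_head *next = head->next;

	rare_write_enable();
	__rare_write(next->prev, new);
	__rare_write(new->next, next);
	__rare_write(new->prev, head);
	__rare_write(head->next, new);
	rare_write_disable();
}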

> That would also depend on what is __rw_rare in those cases, too.  i.e.
> is it just the list_head for the list itself, the list_head of all
> elements, or only some of them?

lib/list_debug.c:EXPORT_SYMBOL(__pax_list_add);
lib/list_debug.c:EXPORT_SYMBOL(pax_list_del);
lib/list_debug.c:EXPORT_SYMBOL(pax_list_del_init);
lib/list_debug.c:EXPORT_SYMBOL(__pax_list_add_rcu);
lib/list_debug.c:EXPORT_SYMBOL(pax_list_del_rcu);
lib/llist.c:EXPORT_SYMBOL_GPL(pax_llist_add_batch);

I haven't exhaustively checked, but I assume it could touch any of the
list pieces? I suspect there are mixtures of read-only and
non-read-only elements, but I haven't dug that far yet. From what I
can see in the cgroup case I showed, all the struct cftype variables
were in .data (static struct cftype foo[] = { ... }), so no kmalloc'd
or vmalloc'd ones are mixed in. But I doubt we can entirely depend on
that...

> For the latter mixed case, the __rare_write_ptr() approach probably
> doesn't work, and that rules out the mm-based approach, too, I guess. :/
>
>> I think we're better doing a full disabling of RO protection, but
>> there had been objections to this idea in the past, which is why I
>> included the "complex" example, so I don't see a way around it.
>
> My objection to that was that we can't implement CPU-local full
> disabling of RO protection for the kernel page tables on some
> architectures and configurations, e.g. arm64, or 32-bit arm with LPAE.

CPU-locality is a pretty important requirement... Hrmm

> The RW alias write_write(var, val) approach only relies on what arches
> already have to implement for userspace to work, so if we can figure out
> how to make that work API-wise, we can probably implement that
> generically with switch_mm() and {get,put}_user().

With a third mm? Maybe I misunderstood what you meant about userspace...

> The only other way I can think to make this work would be to make a copy
> of the whole swapper page tables, with the write-rarely data marked RW.
> For arm64 at least, that'll be incredibly painful to keep in-sync with
> the usual tables, in addition to being very expensive to switch to/from.

Well, I think sync shouldn't be hard since it'd just be a mirror of
the .rodata section, which is configured once. (Well, and for
modules... oof.)

-Kees
Kees Cook March 1, 2017, 8:31 p.m. UTC | #5
On Wed, Mar 1, 2017 at 12:13 PM, Kees Cook <keescook@chromium.org> wrote:

> Here's some sed output: http://paste.ubuntu.com/24092015/
>
> grsecurity currently has 314 instances of using
> pax_open/close_kernel(). The number of lines of code between them is
> about half a single line, but there is a lot of variation on how it's
> used:
>
> count  lines-of-code
>     164 1
>      72 2
>      21 3
>      20 4
>       2 5
>       8 6
>       3 7
>       2 8
>       1 9
>      18 10+

Oops, a bug in grsecurity fooled my scripts and evaded detection.
There's another 3-line use. If you search the pastebin for
pax_open_kernel, you can see a giant bit in the middle that isn't
supposed to be there:

drivers/pci/hotplug/cpcihp_zt5550.c
       pax_open_kernel();
       const_cast(zt5550_hpc_ops.enable_irq) = zt5550_hc_enable_irq;
       const_cast(zt5550_hpc_ops.disable_irq) = zt5550_hc_disable_irq;
       const_cast(zt5550_hpc_ops.check_irq) = zt5550_hc_check_irq;
       pax_open_kernel();

I'll send a patch...

-Kees
Andy Lutomirski March 1, 2017, 9 p.m. UTC | #6
On Wed, Mar 1, 2017 at 12:13 PM, Kees Cook <keescook@chromium.org> wrote:
> On Wed, Mar 1, 2017 at 2:43 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>> My objection to that was that we can't implement CPU-local full
>> disabling of RO protection for the kernel page tables on some
>> architectures and configurations, e.g. arm64, or 32-bit arm with LPAE.
>
> The CPU-local is a pretty important requirement... Hrmm

Why?  Presumably we'd do pretty well if we made an alias somewhere at
a random address and got rid of it quickly.

>
>> The RW alias write_write(var, val) approach only relies on what arches
>> already have to implement for userspace to work, so if we can figure out
>> how to make that work API-wise, we can probably implement that
>> generically with switch_mm() and {get,put}_user().
>
> With a third mm? I maybe misunderstood what you meant about userspace...
>

I think I'd like this series a lot better if the arch hooks were
designed in such a way that a generic implementation could be backed
by kmap, use_mm, or similar.  This would *require* that the arch hooks
be able to return a different address.  Also, I see no reason that the
list manipulation needs to be particularly special.  If the arch hook
sets up an alias, couldn't the generic code just call it twice?

So here's a new proposal for how the hooks could look:

void __arch_rare_write_begin(void);
void *__arch_rare_write_map(void *addr, size_t len);
void *__arch_rare_write_unmap(void *addr, size_t len);
void __arch_rare_write_end(void);

or an even simpler variant:

void __arch_rare_write_begin(void);
void __arch_rare_write(void *dest, const void *source, size_t len);
void __arch_rare_write_end(void);

Now a generic implementation could work by allocating a percpu
mm_struct that contains a single giant VMA.  __arch_rare_write_begin()
switches to that mm.  __arch_rare_write pokes some PTEs into the mm
and calls copy_to_user().  __arch_rare_write_end() switches back to
the original mm.  An x86 implementation could just fiddle with CR0.WP.
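
For concreteness, a hedged sketch of what the simpler variant might look
like on x86, toggling CR0.WP so supervisor-mode writes bypass page-level
write protection. This is illustrative only, not code from the series:

#include <linux/compiler.h>
#include <linux/preempt.h>
#include <linux/string.h>
#include <asm/special_insns.h>
#include <asm/processor-flags.h>

static __always_inline void __arch_rare_write_begin(void)
{
	preempt_disable();
	barrier();
	/* Clear CR0.WP: supervisor writes now ignore R/O page bits. */
	write_cr0(read_cr0() & ~X86_CR0_WP);
	barrier();
}

static __always_inline void __arch_rare_write(void *dest,
					      const void *source, size_t len)
{
	memcpy(dest, source, len);
}

static __always_inline void __arch_rare_write_end(void)
{
	barrier();
	/* Restore CR0.WP before allowing preemption again. */
	write_cr0(read_cr0() | X86_CR0_WP);
	barrier();
	preempt_enable_no_resched();
}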
Kees Cook March 1, 2017, 11:14 p.m. UTC | #7
On Wed, Mar 1, 2017 at 1:00 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Mar 1, 2017 at 12:13 PM, Kees Cook <keescook@chromium.org> wrote:
>> On Wed, Mar 1, 2017 at 2:43 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>>> My objection to that was that we can't implement CPU-local full
>>> disabling of RO protection for the kernel page tables on some
>>> architectures and configurations, e.g. arm64, or 32-bit arm with LPAE.
>>
>> The CPU-local is a pretty important requirement... Hrmm
>
> Why?  Presumably we'd do pretty well if we made an alias somewhere at
> a random address and got rid of it quickly.

I'd much rather avoid any risk like this, just so the protection could
be safely reasoned about...

>>> The RW alias write_write(var, val) approach only relies on what arches
>>> already have to implement for userspace to work, so if we can figure out
>>> how to make that work API-wise, we can probably implement that
>>> generically with switch_mm() and {get,put}_user().
>>
>> With a third mm? I maybe misunderstood what you meant about userspace...
>
> I think I'd like this series a lot better if the arch hooks were
> designed in such a way that a generic implementation could be backed
> by kmap, use_mm, or similar.  This would *require* that the arch hooks
> be able to return a different address.  Also, I see no reason that the
> list manipulation needs to be particularly special.  If the arch hook

Yeah, there's nothing special about the list manipulation except that
it already dropped the const casts, so adding them back is trivial.

> sets up an alias, couldn't the generic code just call it twice.
>
> So here's a new proposal for how the hooks could look:
>
> void __arch_rare_write_begin(void);
> void *__arch_rare_write_map(void *addr, size_t len);
> void *__arch_rare_write_unmap(void *addr, size_t len);
> void __arch_rare_write_end(void);
>
> or an even simpler variant:
>
> void __arch_rare_write_begin(void);
> void __arch_rare_write(void *dest, const void *source, size_t len);
> void __arch_rare_write_end(void);

Based on your and Mark's feedback from last year, I'd probably create
rare_write() as a macro that examines types and calls another macro
that takes the above parameters but enforces that "len" is a
compile-time constant, before ultimately resolving to
__arch_rare_write() as above, just to be as type-safe as we can
manage...

#define __rare_write_n(dst, src, len)	({			\
	BUILD_BUG_ON(!__builtin_constant_p(len));	\
	__arch_rare_write((dst), (src), (len));		\
})
#define __rare_write(var, val)	__rare_write_n(&(var), &(val), sizeof(var))

> Now a generic implementation could work by allocating a percpu
> mm_struct that contains a single giant VMA.  __arch_rare_write_begin()
> switches to that mm.  __arch_rare_write pokes some PTEs into the mm
> and calls copy_to_user().  __arch_rare_write_end() switches back to
> the original mm.  An x86 implementation could just fiddle with CR0.WP.

If that works for Russell and Mark, cool by me. :) I'll reorganize my
series with the new API...

-Kees
Mark Rutland March 2, 2017, 11:19 a.m. UTC | #8
On Wed, Mar 01, 2017 at 01:00:05PM -0800, Andy Lutomirski wrote:
> On Wed, Mar 1, 2017 at 12:13 PM, Kees Cook <keescook@chromium.org> wrote:
> > On Wed, Mar 1, 2017 at 2:43 AM, Mark Rutland <mark.rutland@arm.com> wrote:
> >> The RW alias write_write(var, val) approach only relies on what arches
> >> already have to implement for userspace to work, so if we can figure out
> >> how to make that work API-wise, we can probably implement that
> >> generically with switch_mm() and {get,put}_user().
> >
> > With a third mm? I maybe misunderstood what you meant about userspace...
> 
> I think I'd like this series a lot better if the arch hooks were
> designed in such a way that a generic implementation could be backed
> by kmap, use_mm, or similar.  This would *require* that the arch hooks
> be able to return a different address.  Also, I see no reason that the
> list manipulation needs to be particularly special.  If the arch hook
> sets up an alias, couldn't the generic code just call it twice.

I completely agree.

There's some fun to be had with switch_mm/use_mm (e.g. with arm64's
TTBR0_SW_PAN), but I think we can solve that generically.

> So here's a new proposal for how the hooks could look:

> void __arch_rare_write_begin(void);
> void __arch_rare_write(void *dest, const void *source, size_t len);
> void __arch_rare_write_end(void);

I think we're on the same page, API-wise.

Modulo naming, and the len argument to the write function, this is
exactly the same as my original proposal.

I had assumed that we could derive the len argument implicitly from the
object being assigned to, but it doesn't really matter either way.

> Now a generic implementation could work by allocating a percpu
> mm_struct that contains a single giant VMA.  __arch_rare_write_begin()
> switches to that mm.  __arch_rare_write pokes some PTEs into the mm
> and calls copy_to_user().  __arch_rare_write_end() switches back to
> the original mm.  An x86 implementation could just fiddle with CR0.WP.

I'd expected that we'd know where the write-rarely data was up-front, so
we could set up the mapping statically, and just map it in at map/begin,
but otherwise this sounds like what I had in mind.

Thanks,
Mark.
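
Pulling the thread's pieces together, here is a hedged sketch of the
mm-based generic approach, with the alias set up statically as Mark
suggests. Both rare_write_mm and rare_alias_offset are hypothetical
names, and this is only one way the API above might be backed:

#include <linux/mmu_context.h>
#include <linux/uaccess.h>
#include <linux/bug.h>

/* Hypothetical: a dedicated mm, built at boot, that maps the wr-rare
 * region writable at a fixed offset from its kernel address. */
extern struct mm_struct *rare_write_mm;
extern unsigned long rare_alias_offset;

static void generic_rare_write(void *dest, const void *source, size_t len)
{
	void __user *alias = (void __user *)((unsigned long)dest +
					     rare_alias_offset);

	use_mm(rare_write_mm);
	/* Write through the writable alias, as with copy_to_user(). */
	BUG_ON(copy_to_user(alias, source, len));
	unuse_mm(rare_write_mm);
}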
Andy Lutomirski March 2, 2017, 4:33 p.m. UTC | #9
On Thu, Mar 2, 2017 at 3:19 AM, Mark Rutland <mark.rutland@arm.com> wrote:
> On Wed, Mar 01, 2017 at 01:00:05PM -0800, Andy Lutomirski wrote:
>> On Wed, Mar 1, 2017 at 12:13 PM, Kees Cook <keescook@chromium.org> wrote:
>> > On Wed, Mar 1, 2017 at 2:43 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>> >> The RW alias write_write(var, val) approach only relies on what arches
>> >> already have to implement for userspace to work, so if we can figure out
>> >> how to make that work API-wise, we can probably implement that
>> >> generically with switch_mm() and {get,put}_user().
>> >
>> > With a third mm? I maybe misunderstood what you meant about userspace...
>>
>> I think I'd like this series a lot better if the arch hooks were
>> designed in such a way that a generic implementation could be backed
>> by kmap, use_mm, or similar.  This would *require* that the arch hooks
>> be able to return a different address.  Also, I see no reason that the
>> list manipulation needs to be particularly special.  If the arch hook
>> sets up an alias, couldn't the generic code just call it twice.
>
> I completely agree.
>
> There's some fun to be had with switch_mm/use_mm (e.g. with arm64's
> TTBR0_SW_PAN), but I think we can solve that generically.
>
>> So here's a new proposal for how the hooks could look:
>
>> void __arch_rare_write_begin(void);
>> void __arch_rare_write(void *dest, const void *source, size_t len);
>> void __arch_rare_write_end(void);
>
> I think we're on the same page, API-wise.
>
> Modulo naming, and the len argument to the write function, this is
> exactly the same as my original proposal.
>
> I had assumed that we could derive the len argument implicitly from the
> object being assigned to, but it doesn't really matter either way.

I think we can, but I was imagining that this kind of logic would live
in generic code that called __arch_rare_write (or whatever it's
called).

>
>> Now a generic implementation could work by allocating a percpu
>> mm_struct that contains a single giant VMA.  __arch_rare_write_begin()
>> switches to that mm.  __arch_rare_write pokes some PTEs into the mm
>> and calls copy_to_user().  __arch_rare_write_end() switches back to
>> the original mm.  An x86 implementation could just fiddle with CR0.WP.
>
> I'd expected that we'd know where the write_rarely data was up-front, so
> we could set up the mapping statically, and just map it in at map/begin,
> but otherwise this sounds like what I had in mind.
>

Duh.  That works too and would be considerably simpler :)

--Andy
Kees Cook March 2, 2017, 7:48 p.m. UTC | #10
On Thu, Mar 2, 2017 at 8:33 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Mar 2, 2017 at 3:19 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>> On Wed, Mar 01, 2017 at 01:00:05PM -0800, Andy Lutomirski wrote:
>>> On Wed, Mar 1, 2017 at 12:13 PM, Kees Cook <keescook@chromium.org> wrote:
>>> > On Wed, Mar 1, 2017 at 2:43 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>>> Now a generic implementation could work by allocating a percpu
>>> mm_struct that contains a single giant VMA.  __arch_rare_write_begin()
>>> switches to that mm.  __arch_rare_write pokes some PTEs into the mm
>>> and calls copy_to_user().  __arch_rare_write_end() switches back to
>>> the original mm.  An x86 implementation could just fiddle with CR0.WP.
>>
>> I'd expected that we'd know where the write_rarely data was up-front, so
>> we could set up the mapping statically, and just map it in at map/begin,
>> but otherwise this sounds like what I had in mind.
>>
>
> Duh.  That works too and would be considerably simpler :)

It may be slightly complicated by needing to track the module .rodata
areas, but ultimately, yeah, it seems like that could work.

-Kees

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 99839c23d453..2446de19f66d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -781,4 +781,19 @@  config VMAP_STACK
 	  the stack to map directly to the KASAN shadow map using a formula
 	  that is incorrect if the stack is in vmalloc space.
 
+config HAVE_ARCH_RARE_WRITE
+	def_bool n
+	help
+	  An arch should select this option if it has defined the functions
+	  __arch_rare_write_map() and __arch_rare_write_unmap() to
+	  respectively enable and disable writing to read-only memory. The
+	  routines must meet the following requirements:
+	  - read-only memory writing must only be available on the current
+	    CPU (to make sure other CPUs can't race to make changes too).
+	  - the routines must be declared inline (to discourage ROP use).
+	  - the routines must not be preemptible (likely they will call
+	    preempt_disable() and preempt_enable_no_resched() respectively).
+	  - the routines must validate expected state (e.g. when enabling
+	    writes, BUG() if writes are already enabled).
+
 source "kernel/gcov/Kconfig"
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index cf0fa5d86059..f95603a8ee72 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -325,6 +325,44 @@  static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	__u.__val;					\
 })
 
+/*
+ * Build "write rarely" infrastructure for flipping memory r/w
+ * on a per-CPU basis.
+ */
+#ifndef CONFIG_HAVE_ARCH_RARE_WRITE
+# define __wr_rare
+# define __wr_rare_type
+# define __rare_write_type(v)	typeof(v)
+# define __rare_write_ptr(v)	(&(v))
+# define __rare_write(__var, __val)	({		\
+	__var = __val;					\
+	__var;						\
+})
+# define rare_write_enable()	do { } while (0)
+# define rare_write_disable()	do { } while (0)
+#else
+# define __wr_rare		__ro_after_init
+# define __wr_rare_type		const
+# define __rare_write_type(v)	typeof((typeof(v))0)
+# define __rare_write_ptr(v)	((__rare_write_type(v) *)&(v))
+# define __rare_write(__var, __val) ({			\
+	__rare_write_type(__var) *__rw_var;		\
+							\
+	__rw_var = __rare_write_ptr(__var);		\
+	*__rw_var = (__val);				\
+	__var;						\
+})
+# define rare_write_enable()	__arch_rare_write_map()
+# define rare_write_disable()	__arch_rare_write_unmap()
+#endif
+
+#define rare_write(__var, __val) ({			\
+	rare_write_enable();				\
+	__rare_write(__var, __val);			\
+	rare_write_disable();				\
+	__var;						\
+})
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7eeceac52dea..183c1d7a8594 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -237,10 +237,12 @@  do { \
 /*
  * Modules have no business playing preemption tricks.
  */
-#undef sched_preempt_enable_no_resched
-#undef preempt_enable_no_resched
 #undef preempt_enable_no_resched_notrace
 #undef preempt_check_resched
+#ifndef CONFIG_HAVE_ARCH_RARE_WRITE
+#undef sched_preempt_enable_no_resched
+#undef preempt_enable_no_resched
+#endif
 #endif
 
 #define preempt_set_need_resched() \