diff mbox series

[2/6] __wr_after_init: write rare for static allocation

Message ID 20181204121805.4621-3-igor.stoppa@huawei.com (mailing list archive)
State New, archived
Headers show
Series hardening: statically allocated protected memory | expand

Commit Message

Igor Stoppa Dec. 4, 2018, 12:18 p.m. UTC
Implementation of write rare for statically allocated data, located in a
specific memory section through the use of the __write_rare label.

The basic functions are:
- wr_memset(): write rare counterpart of memset()
- wr_memcpy(): write rare counterpart of memcpy()
- wr_assign(): write rare counterpart of the assignment ('=') operator
- wr_rcu_assign_pointer(): write rare counterpart of rcu_assign_pointer()

The implementation is based on code from Andy Lutomirski and Nadav Amit
for patching the text on x86 [here goes reference to commits, once merged]

The modification of write protected data is done through an alternate
mapping of the same pages, as writable.
This mapping is local to each core and is active only for the duration
of each write operation.
Local interrupts are disabled, while the alternate mapping is active.

In theory, it could introduce a non-predictable delay, in a preemptible
system, however the amount of data to be altered is likely to be far
smaller than a page.

Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>

CC: Andy Lutomirski <luto@amacapital.net>
CC: Nadav Amit <nadav.amit@gmail.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Kees Cook <keescook@chromium.org>
CC: Dave Hansen <dave.hansen@linux.intel.com>
CC: linux-integrity@vger.kernel.org
CC: kernel-hardening@lists.openwall.com
CC: linux-mm@kvack.org
CC: linux-kernel@vger.kernel.org
---
 include/linux/prmem.h | 133 ++++++++++++++++++++++++++++++++++++++++++
 init/main.c           |   2 +
 mm/Kconfig            |   4 ++
 mm/Makefile           |   1 +
 mm/prmem.c            | 124 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 264 insertions(+)
 create mode 100644 include/linux/prmem.h
 create mode 100644 mm/prmem.c

Comments

Andy Lutomirski Dec. 5, 2018, 11:13 p.m. UTC | #1
I added some s390 and powerpc people.

On Tue, Dec 4, 2018 at 4:18 AM Igor Stoppa <igor.stoppa@gmail.com> wrote:
>
> Implementation of write rare for statically allocated data, located in a
> specific memory section through the use of the __write_rare label.
>
> The basic functions are:
> - wr_memset(): write rare counterpart of memset()
> - wr_memcpy(): write rare counterpart of memcpy()
> - wr_assign(): write rare counterpart of the assignment ('=') operator
> - wr_rcu_assign_pointer(): write rare counterpart of rcu_assign_pointer()
>
> The implementation is based on code from Andy Lutomirski and Nadav Amit
> for patching the text on x86 [here goes reference to commits, once merged]
>
> The modification of write protected data is done through an alternate
> mapping of the same pages, as writable.
> This mapping is local to each core and is active only for the duration
> of each write operation.
> Local interrupts are disabled, while the alternate mapping is active.
>
> In theory, it could introduce a non-predictable delay, in a preemptible
> system, however the amount of data to be altered is likely to be far
> smaller than a page.
>
> Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>
>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Nadav Amit <nadav.amit@gmail.com>
> CC: Matthew Wilcox <willy@infradead.org>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Kees Cook <keescook@chromium.org>
> CC: Dave Hansen <dave.hansen@linux.intel.com>
> CC: linux-integrity@vger.kernel.org
> CC: kernel-hardening@lists.openwall.com
> CC: linux-mm@kvack.org
> CC: linux-kernel@vger.kernel.org
> ---
>  include/linux/prmem.h | 133 ++++++++++++++++++++++++++++++++++++++++++
>  init/main.c           |   2 +
>  mm/Kconfig            |   4 ++
>  mm/Makefile           |   1 +
>  mm/prmem.c            | 124 +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 264 insertions(+)
>  create mode 100644 include/linux/prmem.h
>  create mode 100644 mm/prmem.c
>
> diff --git a/include/linux/prmem.h b/include/linux/prmem.h
> new file mode 100644
> index 000000000000..b0131c1f5dc0
> --- /dev/null
> +++ b/include/linux/prmem.h
> @@ -0,0 +1,133 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * prmem.h: Header for memory protection library
> + *
> + * (C) Copyright 2018 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@huawei.com>
> + *
> + * Support for:
> + * - statically allocated write rare data
> + */
> +
> +#ifndef _LINUX_PRMEM_H
> +#define _LINUX_PRMEM_H
> +
> +#include <linux/set_memory.h>
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> +#include <linux/string.h>
> +#include <linux/slab.h>
> +#include <linux/mutex.h>
> +#include <linux/compiler.h>
> +#include <linux/irqflags.h>
> +
> +/**
> + * memtst() - test n bytes of the source to match the c value
> + * @p: beginning of the memory to test
> + * @c: byte to compare against
> + * @len: amount of bytes to test
> + *
> + * Returns 0 on success, non-zero otherwise.
> + */
> +static inline int memtst(void *p, int c, __kernel_size_t len)
> +{
> +       __kernel_size_t i;
> +
> +       for (i = 0; i < len; i++) {
> +               u8 d =  *(i + (u8 *)p) - (u8)c;
> +
> +               if (unlikely(d))
> +                       return d;
> +       }
> +       return 0;
> +}
> +
> +
> +#ifndef CONFIG_PRMEM
> +
> +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> +{
> +       return memset(p, c, len);
> +}
> +
> +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> +{
> +       return memcpy(p, q, size);
> +}
> +
> +#define wr_assign(var, val)    ((var) = (val))
> +
> +#define wr_rcu_assign_pointer(p, v)    \
> +       rcu_assign_pointer(p, v)
> +
> +#else
> +
> +enum wr_op_type {
> +       WR_MEMCPY,
> +       WR_MEMSET,
> +       WR_RCU_ASSIGN_PTR,
> +       WR_OPS_NUMBER,
> +};
> +
> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> +             enum wr_op_type op);
> +
> +/**
> + * wr_memset() - sets n bytes of the destination to the c value
> + * @p: beginning of the memory to write to
> + * @c: byte to replicate
> + * @len: amount of bytes to copy
> + *
> + * Returns true on success, false otherwise.
> + */
> +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> +{
> +       return __wr_op((unsigned long)p, (unsigned long)c, len, WR_MEMSET);
> +}
> +
> +/**
> + * wr_memcpy() - copyes n bytes from source to destination
> + * @dst: beginning of the memory to write to
> + * @src: beginning of the memory to read from
> + * @n_bytes: amount of bytes to copy
> + *
> + * Returns pointer to the destination
> + */
> +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> +{
> +       return __wr_op((unsigned long)p, (unsigned long)q, size, WR_MEMCPY);
> +}
> +
> +/**
> + * wr_assign() - sets a write-rare variable to a specified value
> + * @var: the variable to set
> + * @val: the new value
> + *
> + * Returns: the variable
> + *
> + * Note: it might be possible to optimize this, to use wr_memset in some
> + * cases (maybe with NULL?).
> + */
> +
> +#define wr_assign(var, val) ({                 \
> +       typeof(var) tmp = (typeof(var))val;     \
> +                                               \
> +       wr_memcpy(&var, &tmp, sizeof(var));     \
> +       var;                                    \
> +})
> +
> +/**
> + * wr_rcu_assign_pointer() - initialize a pointer in rcu mode
> + * @p: the rcu pointer
> + * @v: the new value
> + *
> + * Returns the value assigned to the rcu pointer.
> + *
> + * It is provided as macro, to match rcu_assign_pointer()
> + */
> +#define wr_rcu_assign_pointer(p, v) ({                                 \
> +       __wr_op((unsigned long)&p, v, sizeof(p), WR_RCU_ASSIGN_PTR);    \
> +       p;                                                              \
> +})
> +#endif
> +#endif
> diff --git a/init/main.c b/init/main.c
> index a461150adfb1..a36f2e54f937 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -498,6 +498,7 @@ void __init __weak thread_stack_cache_init(void)
>  void __init __weak mem_encrypt_init(void) { }
>
>  void __init __weak poking_init(void) { }
> +void __init __weak wr_poking_init(void) { }
>
>  bool initcall_debug;
>  core_param(initcall_debug, initcall_debug, bool, 0644);
> @@ -734,6 +735,7 @@ asmlinkage __visible void __init start_kernel(void)
>         delayacct_init();
>
>         poking_init();
> +       wr_poking_init();
>         check_bugs();
>
>         acpi_subsystem_init();
> diff --git a/mm/Kconfig b/mm/Kconfig
> index d85e39da47ae..9b09339c027f 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -142,6 +142,10 @@ config ARCH_DISCARD_MEMBLOCK
>  config MEMORY_ISOLATION
>         bool
>
> +config PRMEM
> +       def_bool n
> +       depends on STRICT_KERNEL_RWX && X86_64
> +
>  #
>  # Only be set on architectures that have completely implemented memory hotplug
>  # feature. If you are not sure, don't touch it.
> diff --git a/mm/Makefile b/mm/Makefile
> index d210cc9d6f80..ef3867c16ce0 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -58,6 +58,7 @@ obj-$(CONFIG_SPARSEMEM)       += sparse.o
>  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
>  obj-$(CONFIG_SLOB) += slob.o
>  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> +obj-$(CONFIG_PRMEM) += prmem.o
>  obj-$(CONFIG_KSM) += ksm.o
>  obj-$(CONFIG_PAGE_POISONING) += page_poison.o
>  obj-$(CONFIG_SLAB) += slab.o
> diff --git a/mm/prmem.c b/mm/prmem.c
> new file mode 100644
> index 000000000000..e8ab76701831
> --- /dev/null
> +++ b/mm/prmem.c
> @@ -0,0 +1,124 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * prmem.c: Memory Protection Library
> + *
> + * (C) Copyright 2017-2018 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@huawei.com>
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/string.h>
> +#include <linux/compiler.h>
> +#include <linux/slab.h>
> +#include <linux/mmu_context.h>
> +#include <linux/rcupdate.h>
> +#include <linux/prmem.h>
> +
> +static __ro_after_init bool wr_ready;
> +static __ro_after_init struct mm_struct *wr_poking_mm;
> +static __ro_after_init unsigned long wr_poking_base;
> +
> +/*
> + * The following two variables are statically allocated by the linker
> + * script at the the boundaries of the memory region (rounded up to
> + * multiples of PAGE_SIZE) reserved for __wr_after_init.
> + */
> +extern long __start_wr_after_init;
> +extern long __end_wr_after_init;
> +
> +static inline bool is_wr_after_init(unsigned long ptr, __kernel_size_t size)
> +{
> +       unsigned long start = (unsigned long)&__start_wr_after_init;
> +       unsigned long end = (unsigned long)&__end_wr_after_init;
> +       unsigned long low = ptr;
> +       unsigned long high = ptr + size;
> +
> +       return likely(start <= low && low <= high && high <= end);
> +}
> +
> +
> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> +             enum wr_op_type op)
> +{

You might end up wanting something like:

#ifdef __arch_wr_op
return __arch_wr_op(...);
#endif

if an arch (s390? powerpc?) decides to have a totally different
implementation of this.

Hi s390 and powerpc people: it would be nice if this generic
implementation *worked* on your architectures and that it will allow
you to add some straightforward way to add a better arch-specific
implementation if you think that would be better.

--Andy

> +       temporary_mm_state_t prev;
> +       unsigned long flags;
> +       unsigned long offset;
> +       unsigned long wr_poking_addr;
> +
> +       /* Confirm that the writable mapping exists. */
> +       BUG_ON(!wr_ready);
> +
> +       if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
> +           WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
> +               return (void *)dst;
> +
> +       offset = dst - (unsigned long)&__start_wr_after_init;
> +       wr_poking_addr = wr_poking_base + offset;
> +       local_irq_save(flags);
> +       prev = use_temporary_mm(wr_poking_mm);
> +
> +       kasan_disable_current();
> +       if (op == WR_MEMCPY)
> +               memcpy((void *)wr_poking_addr, (void *)src, len);
> +       else if (op == WR_MEMSET)
> +               memset((u8 *)wr_poking_addr, (u8)src, len);
> +       else if (op == WR_RCU_ASSIGN_PTR)
> +               /* generic version of rcu_assign_pointer */
> +               smp_store_release((void **)wr_poking_addr,
> +                                 RCU_INITIALIZER((void **)src));
> +       kasan_enable_current();

Hmm.  I suspect this will explode quite badly on sane architectures
like s390.  (In my book, despite how weird s390 is, it has a vastly
nicer model of "user" memory than any other architecture I know
of...).  I think you should use copy_to_user(), etc, instead.  I'm not
entirely sure what the best smp_store_release() replacement is.
Making this change may also mean you can get rid of the
kasan_disable_current().

> +
> +       barrier(); /* XXX redundant? */

I think it's redundant.  If unuse_temporary_mm() allows earlier stores
to hit the wrong address space, then something is very very wrong, and
something is also very very wrong if the optimizer starts moving
stores across a function call that is most definitely a barrier.

> +
> +       unuse_temporary_mm(prev);
> +       /* XXX make the verification optional? */
> +       if (op == WR_MEMCPY)
> +               BUG_ON(memcmp((void *)dst, (void *)src, len));
> +       else if (op == WR_MEMSET)
> +               BUG_ON(memtst((void *)dst, (u8)src, len));
> +       else if (op == WR_RCU_ASSIGN_PTR)
> +               BUG_ON(*(unsigned long *)dst != src);

Hmm.  If you allowed cmpxchg or even plain xchg, then these bug_ons
would be thoroughly buggy, but maybe they're okay.  But they should,
at most, be WARN_ON_ONCE(), given that you can trigger them by writing
the same addresses from two threads at once, and this isn't even
entirely obviously bogus given the presence of smp_store_release().

> +       local_irq_restore(flags);
> +       return (void *)dst;
> +}
> +
> +struct mm_struct *copy_init_mm(void);
> +void __init wr_poking_init(void)
> +{
> +       unsigned long start = (unsigned long)&__start_wr_after_init;
> +       unsigned long end = (unsigned long)&__end_wr_after_init;
> +       unsigned long i;
> +       unsigned long wr_range;
> +
> +       wr_poking_mm = copy_init_mm();
> +       BUG_ON(!wr_poking_mm);
> +
> +       /* XXX What if it's too large to fit in the task unmapped mem? */
> +       wr_range = round_up(end - start, PAGE_SIZE);
> +
> +       /* Randomize the poking address base*/
> +       wr_poking_base = TASK_UNMAPPED_BASE +
> +               (kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
> +               (TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));
> +
> +       /* Create alternate mapping for the entire wr_after_init range. */
> +       for (i = start; i < end; i += PAGE_SIZE) {
> +               struct page *page;
> +               spinlock_t *ptl;
> +               pte_t pte;
> +               pte_t *ptep;
> +               unsigned long wr_poking_addr;
> +
> +               BUG_ON(!(page = virt_to_page(i)));
> +               wr_poking_addr = i - start + wr_poking_base;
> +
> +               /* The lock is not needed, but avoids open-coding. */
> +               ptep = get_locked_pte(wr_poking_mm, wr_poking_addr, &ptl);
> +               VM_BUG_ON(!ptep);
> +
> +               pte = mk_pte(page, PAGE_KERNEL);
> +               set_pte_at(wr_poking_mm, wr_poking_addr, ptep, pte);
> +               spin_unlock(ptl);
> +       }
> +       wr_ready = true;
> +}
> --
> 2.19.1
>
Matthew Wilcox Dec. 6, 2018, 4:44 a.m. UTC | #2
On Tue, Dec 04, 2018 at 02:18:01PM +0200, Igor Stoppa wrote:
> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> +	      enum wr_op_type op)
> +{
> +	temporary_mm_state_t prev;
> +	unsigned long flags;
> +	unsigned long offset;
> +	unsigned long wr_poking_addr;
> +
> +	/* Confirm that the writable mapping exists. */
> +	BUG_ON(!wr_ready);
> +
> +	if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
> +	    WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
> +		return (void *)dst;
> +
> +	offset = dst - (unsigned long)&__start_wr_after_init;
> +	wr_poking_addr = wr_poking_base + offset;
> +	local_irq_save(flags);

Why not local_irq_disable()?  Do we have a use-case for wanting to access
this from interrupt context?

> +	/* XXX make the verification optional? */

Well, yes.  It seems like debug code to me.

> +	/* Randomize the poking address base*/
> +	wr_poking_base = TASK_UNMAPPED_BASE +
> +		(kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
> +		(TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));

I don't think this is a great idea.  We want to use the same mm for both
static and dynamic wr memory, yes?  So we should have enough space for
all of ram, not splatter the static section all over the address space.

On x86-64 (4 level page tables), we have a 64TB space for all of physmem
and 128TB of user space, so we can place the base anywhere in a 64TB
range.
Peter Zijlstra Dec. 6, 2018, 9:44 a.m. UTC | #3
On Wed, Dec 05, 2018 at 03:13:56PM -0800, Andy Lutomirski wrote:

> > +       if (op == WR_MEMCPY)
> > +               memcpy((void *)wr_poking_addr, (void *)src, len);
> > +       else if (op == WR_MEMSET)
> > +               memset((u8 *)wr_poking_addr, (u8)src, len);
> > +       else if (op == WR_RCU_ASSIGN_PTR)
> > +               /* generic version of rcu_assign_pointer */
> > +               smp_store_release((void **)wr_poking_addr,
> > +                                 RCU_INITIALIZER((void **)src));
> > +       kasan_enable_current();
> 
> Hmm.  I suspect this will explode quite badly on sane architectures
> like s390.  (In my book, despite how weird s390 is, it has a vastly
> nicer model of "user" memory than any other architecture I know
> of...).  I think you should use copy_to_user(), etc, instead.  I'm not
> entirely sure what the best smp_store_release() replacement is.
> Making this change may also mean you can get rid of the
> kasan_disable_current().

If you make the MEMCPY one guarantee single-copy atomicity for native
words then you're basically done.

smp_store_release() can be implemented with:

	smp_mb();
	WRITE_ONCE();

So if we make MEMCPY provide the WRITE_ONCE(), all we need is that
barrier, which we can easily place at the call site and not overly
complicate our interface with this.

Because performance is down the drain already, an additional full
memory barrier is peanuts here (and in fact already implied by the x86
CR3 munging).
Igor Stoppa Dec. 9, 2018, 10:09 p.m. UTC | #4
On 06/12/2018 01:13, Andy Lutomirski wrote:

>> +       kasan_disable_current();
>> +       if (op == WR_MEMCPY)
>> +               memcpy((void *)wr_poking_addr, (void *)src, len);
>> +       else if (op == WR_MEMSET)
>> +               memset((u8 *)wr_poking_addr, (u8)src, len);
>> +       else if (op == WR_RCU_ASSIGN_PTR)
>> +               /* generic version of rcu_assign_pointer */
>> +               smp_store_release((void **)wr_poking_addr,
>> +                                 RCU_INITIALIZER((void **)src));
>> +       kasan_enable_current();
> 
> Hmm.  I suspect this will explode quite badly on sane architectures
> like s390.  (In my book, despite how weird s390 is, it has a vastly
> nicer model of "user" memory than any other architecture I know
> of...).

I see. I can try to setup also a qemu target for s390, for my tests.
There seems to be a Debian image, to have a fully bootable system.

> I think you should use copy_to_user(), etc, instead.

I'm having troubles with the "etc" part: as far as I can see, there are 
both generic and specific support for both copying and clearing 
user-space memory from kernel, however I couldn't find something that 
looks like a memset_user().

I can of course roll my own, for example iterating copy_to_user() with 
the support of a pre-allocated static buffer (1 page should be enough).

But, before I go down this path, I wanted to confirm that there's really 
nothing better that I could use.

If that's really the case, the static buffer instance should be 
replicated for each core, I think, since each core could be performing 
its own memset_user() at the same time.

Alternatively, I could do a loop of WRITE_ONCE(), however I'm not sure 
how that would work with (lack-of) alignment and might require also a 
preamble/epilogue to deal with unaligned data?

>  I'm not
> entirely sure what the best smp_store_release() replacement is.
> Making this change may also mean you can get rid of the
> kasan_disable_current().
> 
>> +
>> +       barrier(); /* XXX redundant? */
> 
> I think it's redundant.  If unuse_temporary_mm() allows earlier stores
> to hit the wrong address space, then something is very very wrong, and
> something is also very very wrong if the optimizer starts moving
> stores across a function call that is most definitely a barrier.

ok, thanks

>> +
>> +       unuse_temporary_mm(prev);
>> +       /* XXX make the verification optional? */
>> +       if (op == WR_MEMCPY)
>> +               BUG_ON(memcmp((void *)dst, (void *)src, len));
>> +       else if (op == WR_MEMSET)
>> +               BUG_ON(memtst((void *)dst, (u8)src, len));
>> +       else if (op == WR_RCU_ASSIGN_PTR)
>> +               BUG_ON(*(unsigned long *)dst != src);
> 
> Hmm.  If you allowed cmpxchg or even plain xchg, then these bug_ons
> would be thoroughly buggy, but maybe they're okay.  But they should,
> at most, be WARN_ON_ONCE(), 

I have to confess that I do not understand why Nadav's patchset was 
required to use BUG_ON(), while here it's not correct, not even for 
memcopy or memset .

Is it because it is single-threaded?
Or is it because text_poke() is patching code, instead of data?
I can turn to WARN_ON_ONCE(), but I'd like to understand the reason.

> given that you can trigger them by writing
> the same addresses from two threads at once, and this isn't even
> entirely obviously bogus given the presence of smp_store_release().

True, however would it be reasonable to require the use of an explicit 
writer lock, from the user?

This operation is not exactly fast and should happen seldom; I'm not 
sure if it's worth supporting cmpxchg. The speedup would be minimal.

I'd rather not implement the locking implicitly, even if it would be 
possible to detect simultaneous writes, because it might lead to overall 
inconsistent data.

--
igor
Igor Stoppa Dec. 9, 2018, 10:22 p.m. UTC | #5
On 06/12/2018 06:44, Matthew Wilcox wrote:
> On Tue, Dec 04, 2018 at 02:18:01PM +0200, Igor Stoppa wrote:
>> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
>> +	      enum wr_op_type op)
>> +{
>> +	temporary_mm_state_t prev;
>> +	unsigned long flags;
>> +	unsigned long offset;
>> +	unsigned long wr_poking_addr;
>> +
>> +	/* Confirm that the writable mapping exists. */
>> +	BUG_ON(!wr_ready);
>> +
>> +	if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
>> +	    WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
>> +		return (void *)dst;
>> +
>> +	offset = dst - (unsigned long)&__start_wr_after_init;
>> +	wr_poking_addr = wr_poking_base + offset;
>> +	local_irq_save(flags);
> 
> Why not local_irq_disable()?  Do we have a use-case for wanting to access
> this from interrupt context?

No, not that I can think of. It was "just in case", but I can remove it.

>> +	/* XXX make the verification optional? */
> 
> Well, yes.  It seems like debug code to me.

Ok, I was not sure about this, because text_poke() does it as part of 
its normal operations.

>> +	/* Randomize the poking address base*/
>> +	wr_poking_base = TASK_UNMAPPED_BASE +
>> +		(kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
>> +		(TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));
> 
> I don't think this is a great idea.  We want to use the same mm for both
> static and dynamic wr memory, yes?  So we should have enough space for
> all of ram, not splatter the static section all over the address space.
> 
> On x86-64 (4 level page tables), we have a 64TB space for all of physmem
> and 128TB of user space, so we can place the base anywhere in a 64TB
> range.

I was actually wondering about the dynamic part.
It's still not clear to me if it's possible to write the code in a 
sufficiently generic way that it could work on all 64 bit architectures.
I'll start with x86-64 as you suggest.

--
igor
Igor Stoppa Dec. 9, 2018, 10:32 p.m. UTC | #6
On 06/12/2018 11:44, Peter Zijlstra wrote:
> On Wed, Dec 05, 2018 at 03:13:56PM -0800, Andy Lutomirski wrote:
> 
>>> +       if (op == WR_MEMCPY)
>>> +               memcpy((void *)wr_poking_addr, (void *)src, len);
>>> +       else if (op == WR_MEMSET)
>>> +               memset((u8 *)wr_poking_addr, (u8)src, len);
>>> +       else if (op == WR_RCU_ASSIGN_PTR)
>>> +               /* generic version of rcu_assign_pointer */
>>> +               smp_store_release((void **)wr_poking_addr,
>>> +                                 RCU_INITIALIZER((void **)src));
>>> +       kasan_enable_current();
>>
>> Hmm.  I suspect this will explode quite badly on sane architectures
>> like s390.  (In my book, despite how weird s390 is, it has a vastly
>> nicer model of "user" memory than any other architecture I know
>> of...).  I think you should use copy_to_user(), etc, instead.  I'm not
>> entirely sure what the best smp_store_release() replacement is.
>> Making this change may also mean you can get rid of the
>> kasan_disable_current().
> 
> If you make the MEMCPY one guarantee single-copy atomicity for native
> words then you're basically done.
> 
> smp_store_release() can be implemented with:
> 
> 	smp_mb();
> 	WRITE_ONCE();
> 
> So if we make MEMCPY provide the WRITE_ONCE(), all we need is that
> barrier, which we can easily place at the call site and not overly
> complicate our interface with this.

Ok, so the 3rd case (WR_RCU_ASSIGN_PTR) could be handled outside of this 
function.
But, since now memcpy() will be replaced by copy_to_user(), can I assume 
that also copy_to_user() will be atomic, if the destination is properly 
aligned? On x86_64 it seems yes, however it's not clear to me if this is 
the outcome of an optimization or if I can expect it to be always true.


--
igor
Peter Zijlstra Dec. 10, 2018, 9:59 a.m. UTC | #7
On Mon, Dec 10, 2018 at 12:32:21AM +0200, Igor Stoppa wrote:
> 
> 
> On 06/12/2018 11:44, Peter Zijlstra wrote:
> > On Wed, Dec 05, 2018 at 03:13:56PM -0800, Andy Lutomirski wrote:
> > 
> > > > +       if (op == WR_MEMCPY)
> > > > +               memcpy((void *)wr_poking_addr, (void *)src, len);
> > > > +       else if (op == WR_MEMSET)
> > > > +               memset((u8 *)wr_poking_addr, (u8)src, len);
> > > > +       else if (op == WR_RCU_ASSIGN_PTR)
> > > > +               /* generic version of rcu_assign_pointer */
> > > > +               smp_store_release((void **)wr_poking_addr,
> > > > +                                 RCU_INITIALIZER((void **)src));
> > > > +       kasan_enable_current();
> > > 
> > > Hmm.  I suspect this will explode quite badly on sane architectures
> > > like s390.  (In my book, despite how weird s390 is, it has a vastly
> > > nicer model of "user" memory than any other architecture I know
> > > of...).  I think you should use copy_to_user(), etc, instead.  I'm not
> > > entirely sure what the best smp_store_release() replacement is.
> > > Making this change may also mean you can get rid of the
> > > kasan_disable_current().
> > 
> > If you make the MEMCPY one guarantee single-copy atomicity for native
> > words then you're basically done.
> > 
> > smp_store_release() can be implemented with:
> > 
> > 	smp_mb();
> > 	WRITE_ONCE();
> > 
> > So if we make MEMCPY provide the WRITE_ONCE(), all we need is that
> > barrier, which we can easily place at the call site and not overly
> > complicate our interface with this.
> 
> Ok, so the 3rd case (WR_RCU_ASSIGN_PTR) could be handled outside of this
> function.
> But, since now memcpy() will be replaced by copy_to_user(), can I assume
> that also copy_to_user() will be atomic, if the destination is properly
> aligned? On x86_64 it seems yes, however it's not clear to me if this is the
> outcome of an optimization or if I can expect it to be always true.

This would be a new contraint; one that needs to be documented and
verified by the various arch maintainers as they enable this feature on
their platform.
Martin Schwidefsky Dec. 12, 2018, 9:49 a.m. UTC | #8
On Wed, 5 Dec 2018 15:13:56 -0800
Andy Lutomirski <luto@kernel.org> wrote:

> I added some s390 and powerpc people.
> 
> On Tue, Dec 4, 2018 at 4:18 AM Igor Stoppa <igor.stoppa@gmail.com> wrote:
> >
> > Implementation of write rare for statically allocated data, located in a
> > specific memory section through the use of the __write_rare label.
> >
> > The basic functions are:
> > - wr_memset(): write rare counterpart of memset()
> > - wr_memcpy(): write rare counterpart of memcpy()
> > - wr_assign(): write rare counterpart of the assignment ('=') operator
> > - wr_rcu_assign_pointer(): write rare counterpart of rcu_assign_pointer()
> >
> > The implementation is based on code from Andy Lutomirski and Nadav Amit
> > for patching the text on x86 [here goes reference to commits, once merged]
> >
> > The modification of write protected data is done through an alternate
> > mapping of the same pages, as writable.
> > This mapping is local to each core and is active only for the duration
> > of each write operation.
> > Local interrupts are disabled, while the alternate mapping is active.
> >
> > In theory, it could introduce a non-predictable delay, in a preemptible
> > system, however the amount of data to be altered is likely to be far
> > smaller than a page.
> >
> > Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>
> >
> > CC: Andy Lutomirski <luto@amacapital.net>
> > CC: Nadav Amit <nadav.amit@gmail.com>
> > CC: Matthew Wilcox <willy@infradead.org>
> > CC: Peter Zijlstra <peterz@infradead.org>
> > CC: Kees Cook <keescook@chromium.org>
> > CC: Dave Hansen <dave.hansen@linux.intel.com>
> > CC: linux-integrity@vger.kernel.org
> > CC: kernel-hardening@lists.openwall.com
> > CC: linux-mm@kvack.org
> > CC: linux-kernel@vger.kernel.org
> > ---
> >  include/linux/prmem.h | 133 ++++++++++++++++++++++++++++++++++++++++++
> >  init/main.c           |   2 +
> >  mm/Kconfig            |   4 ++
> >  mm/Makefile           |   1 +
> >  mm/prmem.c            | 124 +++++++++++++++++++++++++++++++++++++++
> >  5 files changed, 264 insertions(+)
> >  create mode 100644 include/linux/prmem.h
> >  create mode 100644 mm/prmem.c
> >
> > diff --git a/include/linux/prmem.h b/include/linux/prmem.h
> > new file mode 100644
> > index 000000000000..b0131c1f5dc0
> > --- /dev/null
> > +++ b/include/linux/prmem.h
> > @@ -0,0 +1,133 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * prmem.h: Header for memory protection library
> > + *
> > + * (C) Copyright 2018 Huawei Technologies Co. Ltd.
> > + * Author: Igor Stoppa <igor.stoppa@huawei.com>
> > + *
> > + * Support for:
> > + * - statically allocated write rare data
> > + */
> > +
> > +#ifndef _LINUX_PRMEM_H
> > +#define _LINUX_PRMEM_H
> > +
> > +#include <linux/set_memory.h>
> > +#include <linux/mm.h>
> > +#include <linux/vmalloc.h>
> > +#include <linux/string.h>
> > +#include <linux/slab.h>
> > +#include <linux/mutex.h>
> > +#include <linux/compiler.h>
> > +#include <linux/irqflags.h>
> > +
> > +/**
> > + * memtst() - test n bytes of the source to match the c value
> > + * @p: beginning of the memory to test
> > + * @c: byte to compare against
> > + * @len: amount of bytes to test
> > + *
> > + * Returns 0 on success, non-zero otherwise.
> > + */
> > +static inline int memtst(void *p, int c, __kernel_size_t len)
> > +{
> > +       __kernel_size_t i;
> > +
> > +       for (i = 0; i < len; i++) {
> > +               u8 d =  *(i + (u8 *)p) - (u8)c;
> > +
> > +               if (unlikely(d))
> > +                       return d;
> > +       }
> > +       return 0;
> > +}
> > +
> > +
> > +#ifndef CONFIG_PRMEM
> > +
> > +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> > +{
> > +       return memset(p, c, len);
> > +}
> > +
> > +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> > +{
> > +       return memcpy(p, q, size);
> > +}
> > +
> > +#define wr_assign(var, val)    ((var) = (val))
> > +
> > +#define wr_rcu_assign_pointer(p, v)    \
> > +       rcu_assign_pointer(p, v)
> > +
> > +#else
> > +
> > +enum wr_op_type {
> > +       WR_MEMCPY,
> > +       WR_MEMSET,
> > +       WR_RCU_ASSIGN_PTR,
> > +       WR_OPS_NUMBER,
> > +};
> > +
> > +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> > +             enum wr_op_type op);
> > +
> > +/**
> > + * wr_memset() - sets n bytes of the destination to the c value
> > + * @p: beginning of the memory to write to
> > + * @c: byte to replicate
> > + * @len: amount of bytes to copy
> > + *
> > + * Returns true on success, false otherwise.
> > + */
> > +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> > +{
> > +       return __wr_op((unsigned long)p, (unsigned long)c, len, WR_MEMSET);
> > +}
> > +
> > +/**
> > + * wr_memcpy() - copyes n bytes from source to destination
> > + * @dst: beginning of the memory to write to
> > + * @src: beginning of the memory to read from
> > + * @n_bytes: amount of bytes to copy
> > + *
> > + * Returns pointer to the destination
> > + */
> > +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> > +{
> > +       return __wr_op((unsigned long)p, (unsigned long)q, size, WR_MEMCPY);
> > +}
> > +
> > +/**
> > + * wr_assign() - sets a write-rare variable to a specified value
> > + * @var: the variable to set
> > + * @val: the new value
> > + *
> > + * Returns: the variable
> > + *
> > + * Note: it might be possible to optimize this, to use wr_memset in some
> > + * cases (maybe with NULL?).
> > + */
> > +
> > +#define wr_assign(var, val) ({                 \
> > +       typeof(var) tmp = (typeof(var))val;     \
> > +                                               \
> > +       wr_memcpy(&var, &tmp, sizeof(var));     \
> > +       var;                                    \
> > +})
> > +
> > +/**
> > + * wr_rcu_assign_pointer() - initialize a pointer in rcu mode
> > + * @p: the rcu pointer
> > + * @v: the new value
> > + *
> > + * Returns the value assigned to the rcu pointer.
> > + *
> > + * It is provided as macro, to match rcu_assign_pointer()
> > + */
> > +#define wr_rcu_assign_pointer(p, v) ({                                 \
> > +       __wr_op((unsigned long)&p, v, sizeof(p), WR_RCU_ASSIGN_PTR);    \
> > +       p;                                                              \
> > +})
> > +#endif
> > +#endif
> > diff --git a/init/main.c b/init/main.c
> > index a461150adfb1..a36f2e54f937 100644
> > --- a/init/main.c
> > +++ b/init/main.c
> > @@ -498,6 +498,7 @@ void __init __weak thread_stack_cache_init(void)
> >  void __init __weak mem_encrypt_init(void) { }
> >
> >  void __init __weak poking_init(void) { }
> > +void __init __weak wr_poking_init(void) { }
> >
> >  bool initcall_debug;
> >  core_param(initcall_debug, initcall_debug, bool, 0644);
> > @@ -734,6 +735,7 @@ asmlinkage __visible void __init start_kernel(void)
> >         delayacct_init();
> >
> >         poking_init();
> > +       wr_poking_init();
> >         check_bugs();
> >
> >         acpi_subsystem_init();
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index d85e39da47ae..9b09339c027f 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -142,6 +142,10 @@ config ARCH_DISCARD_MEMBLOCK
> >  config MEMORY_ISOLATION
> >         bool
> >
> > +config PRMEM
> > +       def_bool n
> > +       depends on STRICT_KERNEL_RWX && X86_64
> > +
> >  #
> >  # Only be set on architectures that have completely implemented memory hotplug
> >  # feature. If you are not sure, don't touch it.
> > diff --git a/mm/Makefile b/mm/Makefile
> > index d210cc9d6f80..ef3867c16ce0 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -58,6 +58,7 @@ obj-$(CONFIG_SPARSEMEM)       += sparse.o
> >  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> >  obj-$(CONFIG_SLOB) += slob.o
> >  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> > +obj-$(CONFIG_PRMEM) += prmem.o
> >  obj-$(CONFIG_KSM) += ksm.o
> >  obj-$(CONFIG_PAGE_POISONING) += page_poison.o
> >  obj-$(CONFIG_SLAB) += slab.o
> > diff --git a/mm/prmem.c b/mm/prmem.c
> > new file mode 100644
> > index 000000000000..e8ab76701831
> > --- /dev/null
> > +++ b/mm/prmem.c
> > @@ -0,0 +1,124 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * prmem.c: Memory Protection Library
> > + *
> > + * (C) Copyright 2017-2018 Huawei Technologies Co. Ltd.
> > + * Author: Igor Stoppa <igor.stoppa@huawei.com>
> > + */
> > +
> > +#include <linux/mm.h>
> > +#include <linux/string.h>
> > +#include <linux/compiler.h>
> > +#include <linux/slab.h>
> > +#include <linux/mmu_context.h>
> > +#include <linux/rcupdate.h>
> > +#include <linux/prmem.h>
> > +
> > +static __ro_after_init bool wr_ready;
> > +static __ro_after_init struct mm_struct *wr_poking_mm;
> > +static __ro_after_init unsigned long wr_poking_base;
> > +
> > +/*
> > + * The following two variables are statically allocated by the linker
> > + * script at the the boundaries of the memory region (rounded up to
> > + * multiples of PAGE_SIZE) reserved for __wr_after_init.
> > + */
> > +extern long __start_wr_after_init;
> > +extern long __end_wr_after_init;
> > +
> > +static inline bool is_wr_after_init(unsigned long ptr, __kernel_size_t size)
> > +{
> > +       unsigned long start = (unsigned long)&__start_wr_after_init;
> > +       unsigned long end = (unsigned long)&__end_wr_after_init;
> > +       unsigned long low = ptr;
> > +       unsigned long high = ptr + size;
> > +
> > +       return likely(start <= low && low <= high && high <= end);
> > +}
> > +
> > +
> > +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> > +             enum wr_op_type op)
> > +{  
> 
> You might end up wanting something like:
> 
> #ifdef __arch_wr_op
> return __arch_wr_op(...);
> #endif
> 
> if an arch (s390? powerpc?) decides to have a totally different
> implementation of this.
> 
> Hi s390 and powerpc people: it would be nice if this generic
> implementation *worked* on your architectures and that it will allow
> you to add some straightforward way to add a better arch-specific
> implementation if you think that would be better.

As the code is right now I can guarantee that it will not work on s390.

> > +       temporary_mm_state_t prev;
> > +       unsigned long flags;
> > +       unsigned long offset;
> > +       unsigned long wr_poking_addr;
> > +
> > +       /* Confirm that the writable mapping exists. */
> > +       BUG_ON(!wr_ready);
> > +
> > +       if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
> > +           WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
> > +               return (void *)dst;
> > +
> > +       offset = dst - (unsigned long)&__start_wr_after_init;
> > +       wr_poking_addr = wr_poking_base + offset;
> > +       local_irq_save(flags);
> > +       prev = use_temporary_mm(wr_poking_mm);
> > +
> > +       kasan_disable_current();
> > +       if (op == WR_MEMCPY)
> > +               memcpy((void *)wr_poking_addr, (void *)src, len);
> > +       else if (op == WR_MEMSET)
> > +               memset((u8 *)wr_poking_addr, (u8)src, len);
> > +       else if (op == WR_RCU_ASSIGN_PTR)
> > +               /* generic version of rcu_assign_pointer */
> > +               smp_store_release((void **)wr_poking_addr,
> > +                                 RCU_INITIALIZER((void **)src));
> > +       kasan_enable_current();  
> 
> Hmm.  I suspect this will explode quite badly on sane architectures
> like s390.  (In my book, despite how weird s390 is, it has a vastly
> nicer model of "user" memory than any other architecture I know
> of...).  I think you should use copy_to_user(), etc, instead.  I'm not
> entirely sure what the best smp_store_release() replacement is.
> Making this change may also mean you can get rid of the
> kasan_disable_current().

use_temporary_mm() replaces the *user* mm, for s390 this is completely
independent from the kernel mm (different control register cr1 vs cr7).
The wr_poking_addr is not available when running in the kernel address
space.

> > +
> > +       barrier(); /* XXX redundant? */  
> 
> I think it's redundant.  If unuse_temporary_mm() allows earlier stores
> to hit the wrong address space, then something is very very wrong, and
> something is also very very wrong if the optimizer starts moving
> stores across a function call that is most definitely a barrier.
> 
> > +
> > +       unuse_temporary_mm(prev);
> > +       /* XXX make the verification optional? */
> > +       if (op == WR_MEMCPY)
> > +               BUG_ON(memcmp((void *)dst, (void *)src, len));
> > +       else if (op == WR_MEMSET)
> > +               BUG_ON(memtst((void *)dst, (u8)src, len));
> > +       else if (op == WR_RCU_ASSIGN_PTR)
> > +               BUG_ON(*(unsigned long *)dst != src);  
> 
> Hmm.  If you allowed cmpxchg or even plain xchg, then these bug_ons
> would be thoroughly buggy, but maybe they're okay.  But they should,
> at most, be WARN_ON_ONCE(), given that you can trigger them by writing
> the same addresses from two threads at once, and this isn't even
> entirely obviously bogus given the presence of smp_store_release().
> 
> > +       local_irq_restore(flags);
> > +       return (void *)dst;
> > +}
> > +
> > +struct mm_struct *copy_init_mm(void);
> > +void __init wr_poking_init(void)
> > +{
> > +       unsigned long start = (unsigned long)&__start_wr_after_init;
> > +       unsigned long end = (unsigned long)&__end_wr_after_init;
> > +       unsigned long i;
> > +       unsigned long wr_range;
> > +
> > +       wr_poking_mm = copy_init_mm();
> > +       BUG_ON(!wr_poking_mm);
> > +
> > +       /* XXX What if it's too large to fit in the task unmapped mem? */
> > +       wr_range = round_up(end - start, PAGE_SIZE);
> > +
> > +       /* Randomize the poking address base*/
> > +       wr_poking_base = TASK_UNMAPPED_BASE +
> > +               (kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
> > +               (TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));

The wr_poking_base address is a user space address, no? This will be usable
only with the uaccess primitives on s390.

> > +
> > +       /* Create alternate mapping for the entire wr_after_init range. */
> > +       for (i = start; i < end; i += PAGE_SIZE) {
> > +               struct page *page;
> > +               spinlock_t *ptl;
> > +               pte_t pte;
> > +               pte_t *ptep;
> > +               unsigned long wr_poking_addr;
> > +
> > +               BUG_ON(!(page = virt_to_page(i)));
> > +               wr_poking_addr = i - start + wr_poking_base;
> > +
> > +               /* The lock is not needed, but avoids open-coding. */
> > +               ptep = get_locked_pte(wr_poking_mm, wr_poking_addr, &ptl);
> > +               VM_BUG_ON(!ptep);
> > +
> > +               pte = mk_pte(page, PAGE_KERNEL);
> > +               set_pte_at(wr_poking_mm, wr_poking_addr, ptep, pte);
> > +               spin_unlock(ptl);
> > +       }
> > +       wr_ready = true;
> > +}
> > --
> > 2.19.1
> >  
>
Igor Stoppa Dec. 19, 2018, 10:50 p.m. UTC | #9
On 12/12/2018 11:49, Martin Schwidefsky wrote:
> On Wed, 5 Dec 2018 15:13:56 -0800
> Andy Lutomirski <luto@kernel.org> wrote:

>> Hi s390 and powerpc people: it would be nice if this generic
>> implementation *worked* on your architectures and that it will allow
>> you to add some straightforward way to add a better arch-specific
>> implementation if you think that would be better.
> 
> As the code is right now I can guarantee that it will not work on s390.

OK, I have thrown the towel wrt developing at the same time for multiple 
architectures.

ATM I'm oriented toward getting support for one (x86_64), leaving the 
actual mechanism as architecture specific.

Then I can add another one or two and see what makes sense to refactor.
This approach should minimize the churning, overall.


--
igor
diff mbox series

Patch

diff --git a/include/linux/prmem.h b/include/linux/prmem.h
new file mode 100644
index 000000000000..b0131c1f5dc0
--- /dev/null
+++ b/include/linux/prmem.h
@@ -0,0 +1,133 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * prmem.h: Header for memory protection library
+ *
+ * (C) Copyright 2018 Huawei Technologies Co. Ltd.
+ * Author: Igor Stoppa <igor.stoppa@huawei.com>
+ *
+ * Support for:
+ * - statically allocated write rare data
+ */
+
+#ifndef _LINUX_PRMEM_H
+#define _LINUX_PRMEM_H
+
+#include <linux/set_memory.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/compiler.h>
+#include <linux/irqflags.h>
+
+/**
+ * memtst() - test n bytes of the source to match the c value
+ * @p: beginning of the memory to test
+ * @c: byte to compare against
+ * @len: amount of bytes to test
+ *
+ * Returns 0 on success, non-zero otherwise.
+ */
+static inline int memtst(void *p, int c, __kernel_size_t len)
+{
+	__kernel_size_t i;
+
+	for (i = 0; i < len; i++) {
+		u8 d =  *(i + (u8 *)p) - (u8)c;
+
+		if (unlikely(d))
+			return d;
+	}
+	return 0;
+}
+
+
+#ifndef CONFIG_PRMEM
+
+static inline void *wr_memset(void *p, int c, __kernel_size_t len)
+{
+	return memset(p, c, len);
+}
+
+static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
+{
+	return memcpy(p, q, size);
+}
+
+#define wr_assign(var, val)	((var) = (val))
+
+#define wr_rcu_assign_pointer(p, v)	\
+	rcu_assign_pointer(p, v)
+
+#else
+
+enum wr_op_type {
+	WR_MEMCPY,
+	WR_MEMSET,
+	WR_RCU_ASSIGN_PTR,
+	WR_OPS_NUMBER,
+};
+
+void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
+	      enum wr_op_type op);
+
+/**
+ * wr_memset() - sets n bytes of the destination to the c value
+ * @p: beginning of the memory to write to
+ * @c: byte to replicate
+ * @len: amount of bytes to copy
+ *
+ * Returns true on success, false otherwise.
+ */
+static inline void *wr_memset(void *p, int c, __kernel_size_t len)
+{
+	return __wr_op((unsigned long)p, (unsigned long)c, len, WR_MEMSET);
+}
+
+/**
+ * wr_memcpy() - copyes n bytes from source to destination
+ * @dst: beginning of the memory to write to
+ * @src: beginning of the memory to read from
+ * @n_bytes: amount of bytes to copy
+ *
+ * Returns pointer to the destination
+ */
+static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
+{
+	return __wr_op((unsigned long)p, (unsigned long)q, size, WR_MEMCPY);
+}
+
+/**
+ * wr_assign() - sets a write-rare variable to a specified value
+ * @var: the variable to set
+ * @val: the new value
+ *
+ * Returns: the variable
+ *
+ * Note: it might be possible to optimize this, to use wr_memset in some
+ * cases (maybe with NULL?).
+ */
+
+#define wr_assign(var, val) ({			\
+	typeof(var) tmp = (typeof(var))val;	\
+						\
+	wr_memcpy(&var, &tmp, sizeof(var));	\
+	var;					\
+})
+
+/**
+ * wr_rcu_assign_pointer() - initialize a pointer in rcu mode
+ * @p: the rcu pointer
+ * @v: the new value
+ *
+ * Returns the value assigned to the rcu pointer.
+ *
+ * It is provided as macro, to match rcu_assign_pointer()
+ */
+#define wr_rcu_assign_pointer(p, v) ({					\
+	__wr_op((unsigned long)&p, v, sizeof(p), WR_RCU_ASSIGN_PTR);	\
+	p;								\
+})
+#endif
+#endif
diff --git a/init/main.c b/init/main.c
index a461150adfb1..a36f2e54f937 100644
--- a/init/main.c
+++ b/init/main.c
@@ -498,6 +498,7 @@  void __init __weak thread_stack_cache_init(void)
 void __init __weak mem_encrypt_init(void) { }
 
 void __init __weak poking_init(void) { }
+void __init __weak wr_poking_init(void) { }
 
 bool initcall_debug;
 core_param(initcall_debug, initcall_debug, bool, 0644);
@@ -734,6 +735,7 @@  asmlinkage __visible void __init start_kernel(void)
 	delayacct_init();
 
 	poking_init();
+	wr_poking_init();
 	check_bugs();
 
 	acpi_subsystem_init();
diff --git a/mm/Kconfig b/mm/Kconfig
index d85e39da47ae..9b09339c027f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -142,6 +142,10 @@  config ARCH_DISCARD_MEMBLOCK
 config MEMORY_ISOLATION
 	bool
 
+config PRMEM
+	def_bool n
+	depends on STRICT_KERNEL_RWX && X86_64
+
 #
 # Only be set on architectures that have completely implemented memory hotplug
 # feature. If you are not sure, don't touch it.
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..ef3867c16ce0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -58,6 +58,7 @@  obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_PRMEM) += prmem.o
 obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/prmem.c b/mm/prmem.c
new file mode 100644
index 000000000000..e8ab76701831
--- /dev/null
+++ b/mm/prmem.c
@@ -0,0 +1,124 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * prmem.c: Memory Protection Library
+ *
+ * (C) Copyright 2017-2018 Huawei Technologies Co. Ltd.
+ * Author: Igor Stoppa <igor.stoppa@huawei.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <linux/slab.h>
+#include <linux/mmu_context.h>
+#include <linux/rcupdate.h>
+#include <linux/prmem.h>
+
+static __ro_after_init bool wr_ready;
+static __ro_after_init struct mm_struct *wr_poking_mm;
+static __ro_after_init unsigned long wr_poking_base;
+
+/*
+ * The following two variables are statically allocated by the linker
+ * script at the the boundaries of the memory region (rounded up to
+ * multiples of PAGE_SIZE) reserved for __wr_after_init.
+ */
+extern long __start_wr_after_init;
+extern long __end_wr_after_init;
+
+static inline bool is_wr_after_init(unsigned long ptr, __kernel_size_t size)
+{
+	unsigned long start = (unsigned long)&__start_wr_after_init;
+	unsigned long end = (unsigned long)&__end_wr_after_init;
+	unsigned long low = ptr;
+	unsigned long high = ptr + size;
+
+	return likely(start <= low && low <= high && high <= end);
+}
+
+
+void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
+	      enum wr_op_type op)
+{
+	temporary_mm_state_t prev;
+	unsigned long flags;
+	unsigned long offset;
+	unsigned long wr_poking_addr;
+
+	/* Confirm that the writable mapping exists. */
+	BUG_ON(!wr_ready);
+
+	if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
+	    WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
+		return (void *)dst;
+
+	offset = dst - (unsigned long)&__start_wr_after_init;
+	wr_poking_addr = wr_poking_base + offset;
+	local_irq_save(flags);
+	prev = use_temporary_mm(wr_poking_mm);
+
+	kasan_disable_current();
+	if (op == WR_MEMCPY)
+		memcpy((void *)wr_poking_addr, (void *)src, len);
+	else if (op == WR_MEMSET)
+		memset((u8 *)wr_poking_addr, (u8)src, len);
+	else if (op == WR_RCU_ASSIGN_PTR)
+		/* generic version of rcu_assign_pointer */
+		smp_store_release((void **)wr_poking_addr,
+				  RCU_INITIALIZER((void **)src));
+	kasan_enable_current();
+
+	barrier(); /* XXX redundant? */
+
+	unuse_temporary_mm(prev);
+	/* XXX make the verification optional? */
+	if (op == WR_MEMCPY)
+		BUG_ON(memcmp((void *)dst, (void *)src, len));
+	else if (op == WR_MEMSET)
+		BUG_ON(memtst((void *)dst, (u8)src, len));
+	else if (op == WR_RCU_ASSIGN_PTR)
+		BUG_ON(*(unsigned long *)dst != src);
+	local_irq_restore(flags);
+	return (void *)dst;
+}
+
+struct mm_struct *copy_init_mm(void);
+void __init wr_poking_init(void)
+{
+	unsigned long start = (unsigned long)&__start_wr_after_init;
+	unsigned long end = (unsigned long)&__end_wr_after_init;
+	unsigned long i;
+	unsigned long wr_range;
+
+	wr_poking_mm = copy_init_mm();
+	BUG_ON(!wr_poking_mm);
+
+	/* XXX What if it's too large to fit in the task unmapped mem? */
+	wr_range = round_up(end - start, PAGE_SIZE);
+
+	/* Randomize the poking address base*/
+	wr_poking_base = TASK_UNMAPPED_BASE +
+		(kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
+		(TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));
+
+	/* Create alternate mapping for the entire wr_after_init range. */
+	for (i = start; i < end; i += PAGE_SIZE) {
+		struct page *page;
+		spinlock_t *ptl;
+		pte_t pte;
+		pte_t *ptep;
+		unsigned long wr_poking_addr;
+
+		BUG_ON(!(page = virt_to_page(i)));
+		wr_poking_addr = i - start + wr_poking_base;
+
+		/* The lock is not needed, but avoids open-coding. */
+		ptep = get_locked_pte(wr_poking_mm, wr_poking_addr, &ptl);
+		VM_BUG_ON(!ptep);
+
+		pte = mk_pte(page, PAGE_KERNEL);
+		set_pte_at(wr_poking_mm, wr_poking_addr, ptep, pte);
+		spin_unlock(ptl);
+	}
+	wr_ready = true;
+}