diff mbox

[1/1] arm64/mm: move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata section

Message ID 20180620085755.20045-2-yaojun8558363@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jun Yao June 20, 2018, 8:57 a.m. UTC
Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
section. And update the swapper_pg_dir by fixmap.

Signed-off-by: Jun Yao <yaojun8558363@gmail.com>
---
 arch/arm64/include/asm/pgalloc.h | 19 +++++++++++++++++++
 arch/arm64/kernel/vmlinux.lds.S  | 32 ++++++++++++++++++--------------
 arch/arm64/mm/mmu.c              | 23 +++++++++++++++++++----
 3 files changed, 56 insertions(+), 18 deletions(-)

Comments

Ard Biesheuvel June 20, 2018, 10:09 a.m. UTC | #1
On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
> Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
> section. And update the swapper_pg_dir by fixmap.
>

I think we may be able to get away with not mapping idmap_pg_dir and
tramp_pg_dir at all.

As for swapper_pg_dir, it would indeed be nice if we could keep those
mappings read-only most of the time, but I'm not sure how useful this
is if we apply it to the root level only.

> Signed-off-by: Jun Yao <yaojun8558363@gmail.com>
> ---
>  arch/arm64/include/asm/pgalloc.h | 19 +++++++++++++++++++
>  arch/arm64/kernel/vmlinux.lds.S  | 32 ++++++++++++++++++--------------
>  arch/arm64/mm/mmu.c              | 23 +++++++++++++++++++----
>  3 files changed, 56 insertions(+), 18 deletions(-)
>
> diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
> index 2e05bcd944c8..cc96a7e6957d 100644
> --- a/arch/arm64/include/asm/pgalloc.h
> +++ b/arch/arm64/include/asm/pgalloc.h
> @@ -29,6 +29,10 @@
>  #define PGALLOC_GFP    (GFP_KERNEL | __GFP_ZERO)
>  #define PGD_SIZE       (PTRS_PER_PGD * sizeof(pgd_t))
>
> +#if CONFIG_STRICT_KERNEL_RWX
> +extern spinlock_t pgdir_lock;
> +#endif
> +
>  #if CONFIG_PGTABLE_LEVELS > 2
>
>  static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
> @@ -78,6 +82,21 @@ static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
>
>  static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp)
>  {
> +#if CONFIG_STRICT_KERNEL_RWX
> +       if (mm == &init_mm) {
> +               pgd_t *pgd;
> +
> +               spin_lock(&pgdir_lock);
> +               pgd = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
> +
> +               pgd = (pgd_t *)((unsigned long)pgd + pgdp - swapper_pg_dir);
> +               __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE);
> +

This only works for 4-level paging, but we support 2 and 3 level paging as well.

> +               pgd_clear_fixmap();
> +               spin_unlock(&pgdir_lock);
> +               return;
> +       }
> +#endif
>         __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE);
>  }
>  #else
> diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
> index 605d1b60469c..86532c57206a 100644
> --- a/arch/arm64/kernel/vmlinux.lds.S
> +++ b/arch/arm64/kernel/vmlinux.lds.S
> @@ -216,21 +216,25 @@ SECTIONS
>         BSS_SECTION(0, 0, 0)
>
>         . = ALIGN(PAGE_SIZE);
> -       idmap_pg_dir = .;
> -       . += IDMAP_DIR_SIZE;
>
> -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> -       tramp_pg_dir = .;
> -       . += PAGE_SIZE;
> -#endif
> -
> -#ifdef CONFIG_ARM64_SW_TTBR0_PAN
> -       reserved_ttbr0 = .;
> -       . += RESERVED_TTBR0_SIZE;
> -#endif
> -       swapper_pg_dir = .;
> -       . += SWAPPER_DIR_SIZE;
> -       swapper_pg_end = .;
> +       .rodata : {
> +               . = ALIGN(PAGE_SIZE);
> +               idmap_pg_dir = .;
> +               . += IDMAP_DIR_SIZE;
> +
> +               #ifdef CONFIG_UNMAP_KERNEL_AT_EL0

CPP directives should start in the first column

> +               tramp_pg_dir = .;
> +               . += PAGE_SIZE;
> +               #endif
> +
> +               #ifdef CONFIG_ARM64_SW_TTBR0_PAN
> +               reserved_ttbr0 = .;
> +               . += RESERVED_TTBR0_SIZE;
> +               #endif
> +               swapper_pg_dir = .;
> +               . += SWAPPER_DIR_SIZE;
> +               swapper_pg_end = .;
> +       }
>
>         __pecoff_data_size = ABSOLUTE(. - __initdata_begin);
>         _end = .;
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 2dbb2c9f1ec1..c1aa85a6ada5 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -66,6 +66,10 @@ static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
>  static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
>  static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
>
> +#ifdef CONFIG_STRICT_KERNEL_RWX
> +DEFINE_SPINLOCK(pgdir_lock);
> +#endif
> +
>  pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>                               unsigned long size, pgprot_t vma_prot)
>  {
> @@ -417,12 +421,22 @@ static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
>
>  void __init mark_linear_text_alias_ro(void)
>  {
> +       unsigned long size;
> +
>         /*
>          * Remove the write permissions from the linear alias of .text/.rodata
> +        *
> +        * We free some pages in .rodata at paging_init(), which generates a
> +        * hole. And the hole splits .rodata into two pieces.
>          */
> +       size = (unsigned long)swapper_pg_dir + PAGE_SIZE - (unsigned long)_text;
>         update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
> -                           (unsigned long)__init_begin - (unsigned long)_text,
> -                           PAGE_KERNEL_RO);
> +                           size, PAGE_KERNEL_RO);
> +
> +       size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
> +       update_mapping_prot(__pa_symbol(swapper_pg_end),
> +                           (unsigned long)lm_alias(swapper_pg_end),
> +                           size, PAGE_KERNEL_RO);

I don't think this is necessary. Even if some pages are freed, it
doesn't harm to keep a read-only alias of them here since the new
owner won't access them via this mapping anyway. So we can keep
.rodata as a single region.

>  }
>
>  static void __init map_mem(pgd_t *pgdp)
> @@ -587,8 +601,9 @@ static void __init map_kernel(pgd_t *pgdp)
>          */
>         map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
>                            VM_NO_GUARD);
> -       map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
> -                          &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
> +       map_kernel_segment(pgdp, __start_rodata, __inittext_begin,
> +                          PAGE_KERNEL, &vmlinux_rodata,
> +                          NO_CONT_MAPPINGS | NO_BLOCK_MAPPINGS, VM_NO_GUARD);

Given the above, you should be able to drop this as well.

>         map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
>                            &vmlinux_inittext, 0, VM_NO_GUARD);
>         map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
> --
> 2.17.1
>
Jun Yao June 21, 2018, 2:51 a.m. UTC | #2
Hi Ard,

On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
> On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
> > Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
> > section. And update the swapper_pg_dir by fixmap.
> >
> 
> I think we may be able to get away with not mapping idmap_pg_dir and
> tramp_pg_dir at all.

I think we need to move tramp_pg_dir to .rodata. The attacker can write
a block-mapping (AP=01) to tramp_pg_dir and then he can access kernel
memory.

> As for swapper_pg_dir, it would indeed be nice if we could keep those
> mappings read-only most of the time, but I'm not sure how useful this
> is if we apply it to the root level only.

The purpose of it is to make 'KSMA' harder, where a single arbitrary
write is used to add a block mapping to the page-tables, giving the
attacker full access to kernel memory. That's why we just apply it to
the root level only. If the attacker can perform arbitrary writes
multiple times, I think it's hard to defend.

> > @@ -417,12 +421,22 @@ static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
> >
> >  void __init mark_linear_text_alias_ro(void)
> >  {
> > +       unsigned long size;
> > +
> >         /*
> >          * Remove the write permissions from the linear alias of .text/.rodata
> > +        *
> > +        * We free some pages in .rodata at paging_init(), which generates a
> > +        * hole. And the hole splits .rodata into two pieces.
> >          */
> > +       size = (unsigned long)swapper_pg_dir + PAGE_SIZE - (unsigned long)_text;
> >         update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
> > -                           (unsigned long)__init_begin - (unsigned long)_text,
> > -                           PAGE_KERNEL_RO);
> > +                           size, PAGE_KERNEL_RO);
> > +
> > +       size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
> > +       update_mapping_prot(__pa_symbol(swapper_pg_end),
> > +                           (unsigned long)lm_alias(swapper_pg_end),
> > +                           size, PAGE_KERNEL_RO);
> 
> I don't think this is necessary. Even if some pages are freed, it
> doesn't harm to keep a read-only alias of them here since the new
> owner won't access them via this mapping anyway. So we can keep
> .rodata as a single region.

To be honest, I didn't think of this issue at first. I later found a
problem when testing the code on qemu:

[    7.027935] Unable to handle kernel write to read-only memory at virtual address ffff800000f42c00
[    7.028388] Mem abort info:
[    7.028495]   ESR = 0x9600004f
[    7.028602]   Exception class = DABT (current EL), IL = 32 bits
[    7.028749]   SET = 0, FnV = 0
[    7.028837]   EA = 0, S1PTW = 0
[    7.028930] Data abort info:
[    7.029017]   ISV = 0, ISS = 0x0000004f
[    7.029120]   CM = 0, WnR = 1
[    7.029253] swapper pgtable: 4k pages, 48-bit VAs, pgdp =         (ptrval)
[    7.029418] [ffff800000f42c00] pgd=00000000beff6803, pud=00000000beff5803, pmd=00000000beff3803, pte=00e0000040f42f93
[    7.029807] Internal error: Oops: 9600004f [#1] PREEMPT SMP
[    7.030027] Modules linked in:
[    7.030256] CPU: 0 PID: 1321 Comm: jbd2/vda-8 Not tainted 4.17.0-rc4-02908-g0fe42512b2f0-dirty #71
[    7.030486] Hardware name: linux,dummy-virt (DT)
[    7.030708] pstate: 40400005 (nZcv daif +PAN -UAO)
[    7.030880] pc : __memset+0x16c/0x1c0
[    7.030993] lr : jbd2_journal_get_descriptor_buffer+0x7c/0xfc
[    7.031134] sp : ffff00000a8ebbe0
[    7.031264] x29: ffff00000a8ebbe0 x28: ffff80007c104800
[    7.031430] x27: ffff00000a8ebd98 x26: ffff80007c4410d0
[    7.031567] x25: ffff80007c441118 x24: 00000000ffffffff
[    7.031704] x23: ffff80007c41b000 x22: ffff0000090d9000
[    7.031838] x21: 0000000002000000 x20: ffff80007bcee800
[    7.031973] x19: ffff80007c4413a8 x18: 0000000000000727
[    7.032107] x17: 0000ffff89eba028 x16: ffff0000080e2c38
[    7.032286] x15: ffff7e0000000000 x14: 0000000000048018
[    7.032424] x13: 0000000048018c00 x12: ffff80007bc65788
[    7.032558] x11: ffff00000a8eba68 x10: 0000000000000040
[    7.032709] x9 : 0000000000000000 x8 : ffff800000f42c00
[    7.032849] x7 : 0000000000000000 x6 : 000000000000003f
[    7.032984] x5 : 0000000000000040 x4 : 0000000000000000
[    7.033119] x3 : 0000000000000004 x2 : 00000000000003c0
[    7.033254] x1 : 0000000000000000 x0 : ffff800000f42c00
[    7.033414] Process jbd2/vda-8 (pid: 1321, stack limit = 0x        (ptrval))
[    7.033633] Call trace:
[    7.033757]  __memset+0x16c/0x1c0
[    7.033858]  journal_submit_commit_record+0x60/0x174
[    7.033985]  jbd2_journal_commit_transaction+0xf38/0x1330
[    7.034115]  kjournald2+0xcc/0x250
[    7.034207]  kthread+0xfc/0x128
[    7.034295]  ret_from_fork+0x10/0x18
[    7.034718] Code: 91010108 54ffff4a 8b040108 cb050042 (d50b7428)
[    7.035104] ---[ end trace 26d65a14ae983167 ]---

/sys/kernel/debug/kernel_page_tables shows that:

---[ Linear Mapping ]---
0xffff800000000000-0xffff800000080000         512K PTE       RW NX SHD AF NG CON     UXN MEM/NORMAL
0xffff800000080000-0xffff800000200000        1536K PTE       ro NX SHD AF NG         UXN MEM/NORMAL
0xffff800000200000-0xffff800000e00000          12M PMD       RW NX SHD AF NG     BLK UXN MEM/NORMAL
0xffff800000e00000-0xffff800000fb0000        1728K PTE       ro NX SHD AF NG         UXN MEM/NORMAL

So I split it into pieces.

Thanks,

Jun
Ard Biesheuvel June 21, 2018, 6:39 a.m. UTC | #3
On 21 June 2018 at 04:51, Jun Yao <yaojun8558363@gmail.com> wrote:
> Hi Ard,
>
> On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
>> On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
>> > Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
>> > section. And update the swapper_pg_dir by fixmap.
>> >
>>
>> I think we may be able to get away with not mapping idmap_pg_dir and
>> tramp_pg_dir at all.
>
> I think we need to move tramp_pg_dir to .rodata. The attacker can write
> a block-mapping(AP=01) to tramp_pg_dir and then he can access kernel
> memory.
>

Why does it need to be mapped at all? When do we ever access it from the code?


>> As for swapper_pg_dir, it would indeed be nice if we could keep those
>> mappings read-only most of the time, but I'm not sure how useful this
>> is if we apply it to the root level only.
>
> The purpose of it is to make 'KSMA' harder, where an single arbitrary
> write is used to add a block mapping to the page-tables, giving the
> attacker full access to kernel memory. That's why we just apply it to
> the root level only. If the attacker can arbitrary write multiple times,
> I think it's hard to defend.
>

So the assumption is that the root level is easier to find?
Otherwise, I'm not sure I understand why being able to write a level 0
entry is so harmful, given that we don't have block mappings at that
level.

>> > @@ -417,12 +421,22 @@ static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
>> >
>> >  void __init mark_linear_text_alias_ro(void)
>> >  {
>> > +       unsigned long size;
>> > +
>> >         /*
>> >          * Remove the write permissions from the linear alias of .text/.rodata
>> > +        *
>> > +        * We free some pages in .rodata at paging_init(), which generates a
>> > +        * hole. And the hole splits .rodata into two pieces.
>> >          */
>> > +       size = (unsigned long)swapper_pg_dir + PAGE_SIZE - (unsigned long)_text;
>> >         update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
>> > -                           (unsigned long)__init_begin - (unsigned long)_text,
>> > -                           PAGE_KERNEL_RO);
>> > +                           size, PAGE_KERNEL_RO);
>> > +
>> > +       size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
>> > +       update_mapping_prot(__pa_symbol(swapper_pg_end),
>> > +                           (unsigned long)lm_alias(swapper_pg_end),
>> > +                           size, PAGE_KERNEL_RO);
>>
>> I don't think this is necessary. Even if some pages are freed, it
>> doesn't harm to keep a read-only alias of them here since the new
>> owner won't access them via this mapping anyway. So we can keep
>> .rodata as a single region.
>
> To be honest, I didn't think of this issue at first. I later found a
> problem when testing the code on qemu:
>

OK, you're right. I missed the fact that this operates on the linear
alias, not the kernel mapping itself.

What I don't like is that we lose the ability to use block mappings
for the entire .rodata section this way. Isn't it possible to move
these pgdirs to the end of the .rodata segment, perhaps by using a
separate input section name and placing that explicitly? We could even
simply forget about freeing those pages, given that [on 4k pages] the
benefit of freeing 12 KB of space is likely to get lost in the
rounding noise anyway [segments are rounded up to 64 KB in size]


> [    7.027935] Unable to handle kernel write to read-only memory at virtual address ffff800000f42c00
> [    7.028388] Mem abort info:
> [    7.028495]   ESR = 0x9600004f
> [    7.028602]   Exception class = DABT (current EL), IL = 32 bits
> [    7.028749]   SET = 0, FnV = 0
> [    7.028837]   EA = 0, S1PTW = 0
> [    7.028930] Data abort info:
> [    7.029017]   ISV = 0, ISS = 0x0000004f
> [    7.029120]   CM = 0, WnR = 1
> [    7.029253] swapper pgtable: 4k pages, 48-bit VAs, pgdp =         (ptrval)
> [    7.029418] [ffff800000f42c00] pgd=00000000beff6803, pud=00000000beff5803, pmd=00000000beff3803, pte=00e0000040f42f93
> [    7.029807] Internal error: Oops: 9600004f [#1] PREEMPT SMP
> [    7.030027] Modules linked in:
> [    7.030256] CPU: 0 PID: 1321 Comm: jbd2/vda-8 Not tainted 4.17.0-rc4-02908-g0fe42512b2f0-dirty #71
> [    7.030486] Hardware name: linux,dummy-virt (DT)
> [    7.030708] pstate: 40400005 (nZcv daif +PAN -UAO)
> [    7.030880] pc : __memset+0x16c/0x1c0
> [    7.030993] lr : jbd2_journal_get_descriptor_buffer+0x7c/0xfc
> [    7.031134] sp : ffff00000a8ebbe0
> [    7.031264] x29: ffff00000a8ebbe0 x28: ffff80007c104800
> [    7.031430] x27: ffff00000a8ebd98 x26: ffff80007c4410d0
> [    7.031567] x25: ffff80007c441118 x24: 00000000ffffffff
> [    7.031704] x23: ffff80007c41b000 x22: ffff0000090d9000
> [    7.031838] x21: 0000000002000000 x20: ffff80007bcee800
> [    7.031973] x19: ffff80007c4413a8 x18: 0000000000000727
> [    7.032107] x17: 0000ffff89eba028 x16: ffff0000080e2c38
> [    7.032286] x15: ffff7e0000000000 x14: 0000000000048018
> [    7.032424] x13: 0000000048018c00 x12: ffff80007bc65788
> [    7.032558] x11: ffff00000a8eba68 x10: 0000000000000040
> [    7.032709] x9 : 0000000000000000 x8 : ffff800000f42c00
> [    7.032849] x7 : 0000000000000000 x6 : 000000000000003f
> [    7.032984] x5 : 0000000000000040 x4 : 0000000000000000
> [    7.033119] x3 : 0000000000000004 x2 : 00000000000003c0
> [    7.033254] x1 : 0000000000000000 x0 : ffff800000f42c00
> [    7.033414] Process jbd2/vda-8 (pid: 1321, stack limit = 0x        (ptrval))
> [    7.033633] Call trace:
> [    7.033757]  __memset+0x16c/0x1c0
> [    7.033858]  journal_submit_commit_record+0x60/0x174
> [    7.033985]  jbd2_journal_commit_transaction+0xf38/0x1330
> [    7.034115]  kjournald2+0xcc/0x250
> [    7.034207]  kthread+0xfc/0x128
> [    7.034295]  ret_from_fork+0x10/0x18
> [    7.034718] Code: 91010108 54ffff4a 8b040108 cb050042 (d50b7428)
> [    7.035104] ---[ end trace 26d65a14ae983167 ]---
>
> /sys/kernel/debug/kernel_page_tables shows that:
>
> ---[ Linear Mapping ]---
> 0xffff800000000000-0xffff800000080000         512K PTE       RW NX SHD AF NG CON     UXN MEM/NORMAL
> 0xffff800000080000-0xffff800000200000        1536K PTE       ro NX SHD AF NG         UXN MEM/NORMAL
> 0xffff800000200000-0xffff800000e00000          12M PMD       RW NX SHD AF NG     BLK UXN MEM/NORMAL
> 0xffff800000e00000-0xffff800000fb0000        1728K PTE       ro NX SHD AF NG         UXN MEM/NORMAL
>
> So I split it into pieces.
>
> Thanks,
>
> Jun
James Morse June 21, 2018, 8:59 a.m. UTC | #4
Hi guys,

On 21/06/18 07:39, Ard Biesheuvel wrote:
> On 21 June 2018 at 04:51, Jun Yao <yaojun8558363@gmail.com> wrote:
>> On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
>>> On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
>>>> Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
>>>> section. And update the swapper_pg_dir by fixmap.
>>>>
>>>
>>> I think we may be able to get away with not mapping idmap_pg_dir and
>>> tramp_pg_dir at all.
>>
>> I think we need to move tramp_pg_dir to .rodata. The attacker can write
>> a block-mapping(AP=01) to tramp_pg_dir and then he can access kernel
>> memory.

> Why does it need to be mapped at all? When do we ever access it from the code?

(We would want to make its fixmap entry read-only too)


>>> As for swapper_pg_dir, it would indeed be nice if we could keep those
>>> mappings read-only most of the time, but I'm not sure how useful this
>>> is if we apply it to the root level only.
>>
>> The purpose of it is to make 'KSMA' harder, where an single arbitrary
>> write is used to add a block mapping to the page-tables, giving the
>> attacker full access to kernel memory. That's why we just apply it to
>> the root level only. If the attacker can arbitrary write multiple times,
>> I think it's hard to defend.
>>
> 
> So the assumption is that the root level is more easy to find?
> Otherwise, I'm not sure I understand why being able to write a level 0
> entry is so harmful, given that we don't have block mappings at that
> level.

I think this thing assumes 3-level page tables with 39bit VA.


>>>> @@ -417,12 +421,22 @@ static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
>>>>
>>>>  void __init mark_linear_text_alias_ro(void)
>>>>  {

>>>> +       size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
>>>> +       update_mapping_prot(__pa_symbol(swapper_pg_end),
>>>> +                           (unsigned long)lm_alias(swapper_pg_end),
>>>> +                           size, PAGE_KERNEL_RO);
>>>
>>> I don't think this is necessary. Even if some pages are freed, it
>>> doesn't harm to keep a read-only alias of them here since the new
>>> owner won't access them via this mapping anyway. So we can keep
>>> .rodata as a single region.
>>
>> To be honest, I didn't think of this issue at first. I later found a
>> problem when testing the code on qemu:
> 
> OK, you're right. I missed the fact that this operates on the linear
> alias, not the kernel mapping itself.
> 
> What I don't like is that we lose the ability to use block mappings
> for the entire .rodata section this way. Isn't it possible to move
> these pgdirs to the end of the .rodata segment, perhaps by using a
> separate input section name and placing that explicitly? We could even
> simply forget about freeing those pages, given that [on 4k pages] the
> benefit of freeing 12 KB of space is likely to get lost in the
> rounding noise anyway [segments are rounded up to 64 KB in size]

I assumed that to move swapper_pg_dir into the .rodata section we would need to
break it up. Today it's ~3 levels, which we set up in head.S, then do a dance in
paging_init() so that swapper_pg_dir is always the top level.

We could generate all levels of the 'init_pg_dir' in the __initdata section, then
copy only the top level into swapper_pg_dir in the rodata section during
paging_init().


Thanks,

James
Ard Biesheuvel June 21, 2018, 9:29 a.m. UTC | #5
On 21 June 2018 at 10:59, James Morse <james.morse@arm.com> wrote:
> Hi guys,
>
> On 21/06/18 07:39, Ard Biesheuvel wrote:
>> On 21 June 2018 at 04:51, Jun Yao <yaojun8558363@gmail.com> wrote:
>>> On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
>>>> On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
>>>>> Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
>>>>> section. And update the swapper_pg_dir by fixmap.
>>>>>
>>>>
>>>> I think we may be able to get away with not mapping idmap_pg_dir and
>>>> tramp_pg_dir at all.
>>>
>>> I think we need to move tramp_pg_dir to .rodata. The attacker can write
>>> a block-mapping(AP=01) to tramp_pg_dir and then he can access kernel
>>> memory.
>
>> Why does it need to be mapped at all? When do we ever access it from the code?
>
> (We would want to make its fixmap entry read-only too)
>

It already is.

>
>>>> As for swapper_pg_dir, it would indeed be nice if we could keep those
>>>> mappings read-only most of the time, but I'm not sure how useful this
>>>> is if we apply it to the root level only.
>>>
>>> The purpose of it is to make 'KSMA' harder, where an single arbitrary
>>> write is used to add a block mapping to the page-tables, giving the
>>> attacker full access to kernel memory. That's why we just apply it to
>>> the root level only. If the attacker can arbitrary write multiple times,
>>> I think it's hard to defend.
>>>
>>
>> So the assumption is that the root level is more easy to find?
>> Otherwise, I'm not sure I understand why being able to write a level 0
>> entry is so harmful, given that we don't have block mappings at that
>> level.
>
> I think this thing assumes 3-level page tables with 39bit VA.
>

The attack, you mean? Because this code is unlikely to build with that
configuration, given that __pgd_populate() BUILD_BUG()s in that case.

>
>>>>> @@ -417,12 +421,22 @@ static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
>>>>>
>>>>>  void __init mark_linear_text_alias_ro(void)
>>>>>  {
>
>>>>> +       size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
>>>>> +       update_mapping_prot(__pa_symbol(swapper_pg_end),
>>>>> +                           (unsigned long)lm_alias(swapper_pg_end),
>>>>> +                           size, PAGE_KERNEL_RO);
>>>>
>>>> I don't think this is necessary. Even if some pages are freed, it
>>>> doesn't harm to keep a read-only alias of them here since the new
>>>> owner won't access them via this mapping anyway. So we can keep
>>>> .rodata as a single region.
>>>
>>> To be honest, I didn't think of this issue at first. I later found a
>>> problem when testing the code on qemu:
>>
>> OK, you're right. I missed the fact that this operates on the linear
>> alias, not the kernel mapping itself.
>>
>> What I don't like is that we lose the ability to use block mappings
>> for the entire .rodata section this way. Isn't it possible to move
>> these pgdirs to the end of the .rodata segment, perhaps by using a
>> separate input section name and placing that explicitly? We could even
>> simply forget about freeing those pages, given that [on 4k pages] the
>> benefit of freeing 12 KB of space is likely to get lost in the
>> rounding noise anyway [segments are rounded up to 64 KB in size]
>
> I assumed that to move swapper_pg_dir into the .rodata section we would need to
> break it up. Today its ~3 levels, which we setup in head.S, then do a dance in
> paging_init() so that swapper_pg_dir is always the top level.
>
> We could generate all leves of the 'init_pg_dir' in the __initdata section, then
> copy only the top level into swapper_pg_dir into the rodata section during
> paging_init().
>

Is that complexity truly justified for a security sensitive piece of
code? Can't we just drop the memblock_free() and be done with it?
Jun Yao June 21, 2018, 12:24 p.m. UTC | #6
On Thu, Jun 21, 2018 at 11:29:52AM +0200, Ard Biesheuvel wrote:
> On 21 June 2018 at 10:59, James Morse <james.morse@arm.com> wrote:
> > On 21/06/18 07:39, Ard Biesheuvel wrote:
> >> On 21 June 2018 at 04:51, Jun Yao <yaojun8558363@gmail.com> wrote:
> >>> On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
> >>>> On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
> >>>> As for swapper_pg_dir, it would indeed be nice if we could keep those
> >>>> mappings read-only most of the time, but I'm not sure how useful this
> >>>> is if we apply it to the root level only.
> >>>
> >>> The purpose of it is to make 'KSMA' harder, where an single arbitrary
> >>> write is used to add a block mapping to the page-tables, giving the
> >>> attacker full access to kernel memory. That's why we just apply it to
> >>> the root level only. If the attacker can arbitrary write multiple times,
> >>> I think it's hard to defend.
> >>>
> >>
> >> So the assumption is that the root level is more easy to find?
> >> Otherwise, I'm not sure I understand why being able to write a level 0
> >> entry is so harmful, given that we don't have block mappings at that
> >> level.
> >
> > I think this thing assumes 3-level page tables with 39bit VA.
> >
> 
> The attack, you mean? Because this code is unlikely to build with that
> configuration, given that __pgd_populate() BUILD_BUG()s in that case.

I think this configuration may be ok. I found that the kernel on the Google
Pixel 2 XL is built with 3-level page tables and a 39-bit VA.
James Morse June 21, 2018, 5:04 p.m. UTC | #7
Hi Ard,

On 21/06/18 10:29, Ard Biesheuvel wrote:
> On 21 June 2018 at 10:59, James Morse <james.morse@arm.com> wrote:
>> On 21/06/18 07:39, Ard Biesheuvel wrote:
>>> On 21 June 2018 at 04:51, Jun Yao <yaojun8558363@gmail.com> wrote:
>>>> On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
>>>>> On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
>>>>>> Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
>>>>>> section. And update the swapper_pg_dir by fixmap.
>>>>>
>>>>> I think we may be able to get away with not mapping idmap_pg_dir and
>>>>> tramp_pg_dir at all.
>>>>
>>>> I think we need to move tramp_pg_dir to .rodata. The attacker can write
>>>> a block-mapping(AP=01) to tramp_pg_dir and then he can access kernel
>>>> memory.
>>
>>> Why does it need to be mapped at all? When do we ever access it from the code?
>>
>> (We would want to make its fixmap entry read-only too)
> 
> It already is.

Sorry, I missed that,


>>>>> As for swapper_pg_dir, it would indeed be nice if we could keep those
>>>>> mappings read-only most of the time, but I'm not sure how useful this
>>>>> is if we apply it to the root level only.
>>>>
>>>> The purpose of it is to make 'KSMA' harder, where an single arbitrary
>>>> write is used to add a block mapping to the page-tables, giving the
>>>> attacker full access to kernel memory. That's why we just apply it to
>>>> the root level only. If the attacker can arbitrary write multiple times,
>>>> I think it's hard to defend.
>>>
>>> So the assumption is that the root level is more easy to find?
>>> Otherwise, I'm not sure I understand why being able to write a level 0
>>> entry is so harmful, given that we don't have block mappings at that
>>> level.
>>
>> I think this thing assumes 3-level page tables with 39bit VA.

> The attack, you mean? Because this code is unlikely to build with that
> configuration, given that __pgd_populate() BUILD_BUG()s in that case.

Yes, the attack. (I struggle to think of it as an 'attack' because you already
have arbitrary write...)


>>>>> @@ -417,12 +421,22 @@ static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
>>>>>>
>>>>>>  void __init mark_linear_text_alias_ro(void)
>>>>>>  {
>>
>>>>>> +       size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
>>>>>> +       update_mapping_prot(__pa_symbol(swapper_pg_end),
>>>>>> +                           (unsigned long)lm_alias(swapper_pg_end),
>>>>>> +                           size, PAGE_KERNEL_RO);
>>>>>
>>>>> I don't think this is necessary. Even if some pages are freed, it
>>>>> doesn't harm to keep a read-only alias of them here since the new
>>>>> owner won't access them via this mapping anyway. So we can keep
>>>>> .rodata as a single region.
>>>>
>>>> To be honest, I didn't think of this issue at first. I later found a
>>>> problem when testing the code on qemu:
>>>
>>> OK, you're right. I missed the fact that this operates on the linear
>>> alias, not the kernel mapping itself.
>>>
>>> What I don't like is that we lose the ability to use block mappings
>>> for the entire .rodata section this way. Isn't it possible to move
>>> these pgdirs to the end of the .rodata segment, perhaps by using a
>>> separate input section name and placing that explicitly? We could even
>>> simply forget about freeing those pages, given that [on 4k pages] the
>>> benefit of freeing 12 KB of space is likely to get lost in the
>>> rounding noise anyway [segments are rounded up to 64 KB in size]
>>
>> I assumed that to move swapper_pg_dir into the .rodata section we would need to
>> break it up. Today its ~3 levels, which we setup in head.S, then do a dance in
>> paging_init() so that swapper_pg_dir is always the top level.
>>
>> We could generate all leves of the 'init_pg_dir' in the __initdata section, then
>> copy only the top level into swapper_pg_dir into the rodata section during
>> paging_init().

> Is that complexity truly justified for a security sensitive piece of
> code?

Wouldn't this be less complex? (I've probably explained it badly.)

Today head.S builds the initial page tables in ~3 levels of swapper_pg_dir, then
during paging_init() build new tables with a temporary top level.
We switch to the temporary top level, then copy over the first level of
swapper_pg_dir, then switch back to swapper_pg_dir. Finally we free the
no-longer-used levels of swapper_pg_dir.

This looks like re-inventing __initdata for the bits of page table we eventually
free.

What I tried to describe is building the head.S/initial-page-tables in a
> reserved area of the __initdata section. We no longer need a temporary
top-level, we can build the final page tables directly in swapper_pg_dir, which
> means one fewer round of cpu_replace_ttbr1().


> Can't we just drop the memblock_free() and be done with it?

That works, I assumed it would be at least frowned on!


Thanks,

James
Ard Biesheuvel June 21, 2018, 5:27 p.m. UTC | #8
On 21 June 2018 at 19:04, James Morse <james.morse@arm.com> wrote:
> Hi Ard,
>
> On 21/06/18 10:29, Ard Biesheuvel wrote:
>> On 21 June 2018 at 10:59, James Morse <james.morse@arm.com> wrote:
>>> On 21/06/18 07:39, Ard Biesheuvel wrote:
>>>> On 21 June 2018 at 04:51, Jun Yao <yaojun8558363@gmail.com> wrote:
>>>>> On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
>>>>>> On 20 June 2018 at 10:57, Jun Yao <yaojun8558363@gmail.com> wrote:
>>>>>>> Move {idmap_pg_dir,tramp_pg_dir,swapper_pg_dir} to .rodata
>>>>>>> section. And update the swapper_pg_dir by fixmap.
>>>>>>
>>>>>> I think we may be able to get away with not mapping idmap_pg_dir and
>>>>>> tramp_pg_dir at all.
>>>>>
>>>>> I think we need to move tramp_pg_dir to .rodata. The attacker can write
>>>>> a block-mapping(AP=01) to tramp_pg_dir and then he can access kernel
>>>>> memory.
>>>
>>>> Why does it need to be mapped at all? When do we ever access it from the code?
>>>
>>> (We would want to make its fixmap entry read-only too)
>>
>> It already is.
>
> Sorry, I missed that,
>
>
>>>>>> As for swapper_pg_dir, it would indeed be nice if we could keep those
>>>>>> mappings read-only most of the time, but I'm not sure how useful this
>>>>>> is if we apply it to the root level only.
>>>>>
>>>>> The purpose of it is to make 'KSMA' harder, where a single arbitrary
>>>>> write is used to add a block mapping to the page-tables, giving the
>>>>> attacker full access to kernel memory. That's why we just apply it to
>>>>> the root level only. If the attacker can perform arbitrary writes multiple times,
>>>>> I think it's hard to defend.
>>>>
>>>> So the assumption is that the root level is more easy to find?
>>>> Otherwise, I'm not sure I understand why being able to write a level 0
>>>> entry is so harmful, given that we don't have block mappings at that
>>>> level.
>>>
>>> I think this thing assumes 3-level page tables with 39bit VA.
>
>> The attack, you mean? Because this code is unlikely to build with that
>> configuration, given that __pgd_populate() BUILD_BUG()s in that case.
>
> Yes, the attack. (I struggle to think of it as an 'attack' because you already
> have arbitrary write...)
>

OK, so in that case, you can abuse your single arbitrary write to map
an entire 1 GB block of memory with arbitrary permissions, allowing
userland to take control of the contents, right? And if you know the
virtual and physical addresses of swapper_pg_dir, you can make sure
this block covers the entire kernel, allowing the attacker to
manipulate all core kernel code and statically allocated data
structures.

What I don't understand about this patch is how it is sufficient to
only remap swapper_pg_dir r/w for updates on kernels that use 4 level
paging.

>
>>>>>> @@ -417,12 +421,22 @@ static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
>>>>>>>
>>>>>>>  void __init mark_linear_text_alias_ro(void)
>>>>>>>  {
>>>
>>>>>>> +       size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
>>>>>>> +       update_mapping_prot(__pa_symbol(swapper_pg_end),
>>>>>>> +                           (unsigned long)lm_alias(swapper_pg_end),
>>>>>>> +                           size, PAGE_KERNEL_RO);
>>>>>>
>>>>>> I don't think this is necessary. Even if some pages are freed, it
>>>>>> doesn't harm to keep a read-only alias of them here since the new
>>>>>> owner won't access them via this mapping anyway. So we can keep
>>>>>> .rodata as a single region.
>>>>>
>>>>> To be honest, I didn't think of this issue at first. I later found a
>>>>> problem when testing the code on qemu:
>>>>
>>>> OK, you're right. I missed the fact that this operates on the linear
>>>> alias, not the kernel mapping itself.
>>>>
>>>> What I don't like is that we lose the ability to use block mappings
>>>> for the entire .rodata section this way. Isn't it possible to move
>>>> these pgdirs to the end of the .rodata segment, perhaps by using a
>>>> separate input section name and placing that explicitly? We could even
>>>> simply forget about freeing those pages, given that [on 4k pages] the
>>>> benefit of freeing 12 KB of space is likely to get lost in the
>>>> rounding noise anyway [segments are rounded up to 64 KB in size]
>>>
>>> I assumed that to move swapper_pg_dir into the .rodata section we would need to
> >>> break it up. Today it's ~3 levels, which we set up in head.S, then do a dance in
>>> paging_init() so that swapper_pg_dir is always the top level.
>>>
> >>> We could generate all levels of the 'init_pg_dir' in the __initdata section, then
> >>> copy only the top level into swapper_pg_dir in the rodata section during
>>> paging_init().
>
>> Is that complexity truly justified for a security sensitive piece of
>> code?
>
> Wouldn't this be less complex? (I've probably explained it badly.)
>
> Today head.S builds the initial page tables in ~3 levels of swapper_pg_dir, then
> during paging_init() build new tables with a temporary top level.
> We switch to the temporary top level, then copy over the first level of
> swapper_pg_dir, then switch back to swapper_pg_dir. Finally we free the
> no-longer-used levels of swapper_pg_dir.
>
> This looks like re-inventing __initdata for the bits of page table we eventually
> free.
>
> What I tried to describe is building the head.S/initial-page-tables in a
> reserved area of the __initdata section. We no longer need a temporary
> top-level, we can build the final page tables directly in swapper_pg_dir, which
> means one fewer round of cpu_replace_ttbr1().
>

Ah fair enough. So either the initial page tables are never referred
to via swapper_pg_dir in the first place, or we copy the first level
over from __initdata after setting it up (which is probably easier
than teaching the asm code about non-consecutive page ranges).

So indeed, that would be an improvement in its own right.

>
>> Can't we just drop the memblock_free() and be done with it?
>
> That works, I assumed it would be at least frowned on!
>

I think I prefer your suggestion above.

But we do need to teach this code to deal with folded page table levels.
Jun Yao June 22, 2018, 8:15 a.m. UTC | #9
Hi Ard,

On Thu, Jun 21, 2018 at 07:27:01PM +0200, Ard Biesheuvel wrote:
> On 21 June 2018 at 19:04, James Morse <james.morse@arm.com> wrote:
> > On 21/06/18 10:29, Ard Biesheuvel wrote:
> >> On 21 June 2018 at 10:59, James Morse <james.morse@arm.com> wrote:
> >>> On 21/06/18 07:39, Ard Biesheuvel wrote:
> >>>> On 21 June 2018 at 04:51, Jun Yao <yaojun8558363@gmail.com> wrote:
> >>>>> On Wed, Jun 20, 2018 at 12:09:49PM +0200, Ard Biesheuvel wrote:
> >>>>>> As for swapper_pg_dir, it would indeed be nice if we could keep those
> >>>>>> mappings read-only most of the time, but I'm not sure how useful this
> >>>>>> is if we apply it to the root level only.
> >>>>>
> >>>>> The purpose of it is to make 'KSMA' harder, where a single arbitrary
> >>>>> write is used to add a block mapping to the page-tables, giving the
> >>>>> attacker full access to kernel memory. That's why we just apply it to
> >>>>> the root level only. If the attacker can perform arbitrary writes multiple times,
> >>>>> I think it's hard to defend.
> >>>>
> >>>> So the assumption is that the root level is more easy to find?
> >>>> Otherwise, I'm not sure I understand why being able to write a level 0
> >>>> entry is so harmful, given that we don't have block mappings at that
> >>>> level.
> >>>
> >>> I think this thing assumes 3-level page tables with 39bit VA.
> >
> >> The attack, you mean? Because this code is unlikely to build with that
> >> configuration, given that __pgd_populate() BUILD_BUG()s in that case.
> >
> > Yes, the attack. (I struggle to think of it as an 'attack' because you already
> > have arbitrary write...)
> >
> 
> OK, so in that case, you can abuse your single arbitrary write to map
> an entire 1 GB block of memory with arbitrary permissions, allowing
> userland to take control of the contents, right? And if you know the
> virtual and physical addresses of swapper_pg_dir, you can make sure
> this block covers the entire kernel, allowing the attacker to
> manipulate all core kernel code and statically allocated data
> structures.
> 
> What I don't understand about this patch is how it is sufficient to
> only remap swapper_pg_dir r/w for updates on kernels that use 4 level
> paging.
> 

Sorry, it's my mistake.

To my best knowledge, to defend 'KSMA', we need to handle these
configurations:

1. ARM64_4K_PAGES && ARM64_VA_BITS_39 (PGTABLE_LEVELS = 3, 1GB block)
2. ARM64_16K_PAGES && ARM64_VA_BITS_36 (PGTABLE_LEVELS = 2, 32MB block)
3. ARM64_64K_PAGES && ARM64_VA_BITS_42 (PGTABLE_LEVELS = 2, 512MB block)

If these configurations are selected, we move {idmap_pg_dir,
tramp_pg_dir, reserved_ttbr0, swapper_pg_dir} to .rodata section (avoid
modifying tramp_(un)map_kernel), and remap swapper_pg_dir r/w for updates.
If these configurations are not selected, we just keep them as they are.

Do you think this is okay?
diff mbox

Patch

diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 2e05bcd944c8..cc96a7e6957d 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -29,6 +29,10 @@ 
 #define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
 
+#if CONFIG_STRICT_KERNEL_RWX
+extern spinlock_t pgdir_lock;
+#endif
+
 #if CONFIG_PGTABLE_LEVELS > 2
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -78,6 +82,21 @@  static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
 
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp)
 {
+#if CONFIG_STRICT_KERNEL_RWX
+	if (mm == &init_mm) {
+		pgd_t *pgd;
+
+		spin_lock(&pgdir_lock);
+		pgd = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
+
+		pgd = (pgd_t *)((unsigned long)pgd + pgdp - swapper_pg_dir);
+		__pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE);
+
+		pgd_clear_fixmap();
+		spin_unlock(&pgdir_lock);
+		return;
+	}
+#endif
 	__pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE);
 }
 #else
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 605d1b60469c..86532c57206a 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -216,21 +216,25 @@  SECTIONS
 	BSS_SECTION(0, 0, 0)
 
 	. = ALIGN(PAGE_SIZE);
-	idmap_pg_dir = .;
-	. += IDMAP_DIR_SIZE;
 
-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-	tramp_pg_dir = .;
-	. += PAGE_SIZE;
-#endif
-
-#ifdef CONFIG_ARM64_SW_TTBR0_PAN
-	reserved_ttbr0 = .;
-	. += RESERVED_TTBR0_SIZE;
-#endif
-	swapper_pg_dir = .;
-	. += SWAPPER_DIR_SIZE;
-	swapper_pg_end = .;
+	.rodata : {
+		. = ALIGN(PAGE_SIZE);
+		idmap_pg_dir = .;
+		. += IDMAP_DIR_SIZE;
+
+		#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+		tramp_pg_dir = .;
+		. += PAGE_SIZE;
+		#endif
+
+		#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+		reserved_ttbr0 = .;
+		. += RESERVED_TTBR0_SIZE;
+		#endif
+		swapper_pg_dir = .;
+		. += SWAPPER_DIR_SIZE;
+		swapper_pg_end = .;
+	}
 
 	__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
 	_end = .;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 2dbb2c9f1ec1..c1aa85a6ada5 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -66,6 +66,10 @@  static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
 static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
 static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+DEFINE_SPINLOCK(pgdir_lock);
+#endif
+
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 			      unsigned long size, pgprot_t vma_prot)
 {
@@ -417,12 +421,22 @@  static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
 
 void __init mark_linear_text_alias_ro(void)
 {
+	unsigned long size;
+
 	/*
 	 * Remove the write permissions from the linear alias of .text/.rodata
+	 *
+	 * We free some pages in .rodata at paging_init(), which generates a
+	 * hole. And the hole splits .rodata into two pieces.
 	 */
+	size = (unsigned long)swapper_pg_dir + PAGE_SIZE - (unsigned long)_text;
 	update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
-			    (unsigned long)__init_begin - (unsigned long)_text,
-			    PAGE_KERNEL_RO);
+			    size, PAGE_KERNEL_RO);
+
+	size = (unsigned long)__init_begin - (unsigned long)swapper_pg_end;
+	update_mapping_prot(__pa_symbol(swapper_pg_end),
+			    (unsigned long)lm_alias(swapper_pg_end),
+			    size, PAGE_KERNEL_RO);
 }
 
 static void __init map_mem(pgd_t *pgdp)
@@ -587,8 +601,9 @@  static void __init map_kernel(pgd_t *pgdp)
 	 */
 	map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
 			   VM_NO_GUARD);
-	map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
-			   &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
+	map_kernel_segment(pgdp, __start_rodata, __inittext_begin,
+			   PAGE_KERNEL, &vmlinux_rodata,
+			   NO_CONT_MAPPINGS | NO_BLOCK_MAPPINGS, VM_NO_GUARD);
 	map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
 			   &vmlinux_inittext, 0, VM_NO_GUARD);
 	map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,