diff mbox series

ARM: Mark the FDT_FIXED sections as shareable

Message ID 20220606124858.384-1-thunder.leizhen@huawei.com (mailing list archive)
State New, archived
Headers show
Series ARM: Mark the FDT_FIXED sections as shareable | expand

Commit Message

Zhen Lei June 6, 2022, 12:48 p.m. UTC
commit 7a1be318f579 ("ARM: 9012/1: move device tree mapping out of linear
region") use FDT_FIXED_BASE to map the whole FDT_FIXED_SIZE memory area
which contains fdt. But it only reserves the exact physical memory that
fdt occupied. Unfortunately, this mapping is non-shareable. An illegal or
speculative read access can bring the RAM content from non-fdt zone into
cache, PIPT makes it to be hit by subsequently read access through
shareable mapping(such as linear mapping), and the cache consistency
between cores is lost due to non-shareable property.

|<---------FDT_FIXED_SIZE------>|
|                               |
 -------------------------------
| <non-fdt> | <fdt> | <non-fdt> |
 -------------------------------

1. CoreA read <non-fdt> through MT_ROM mapping, the old data is loaded
   into the cache.
2. CoreB write <non-fdt> to update data through linear mapping. CoreA
   received the notification to invalid the corresponding cachelines, but
   the property non-shareable makes it to be ignored.
3. CoreA read <non-fdt> through linear mapping, cache hit, the old data
   is read.

To eliminate this risk, mark the MT_ROM sections as shareable.

The other user of MT_ROM is XIP_KERNEL. XIP allows the kernel to run from
flash to save RAM space. Not sure if anyone is still using XIP in order to
save a little memory and not care about performance degradation. Add a new
memory type MT_ROM_XIP to be compatible with it.

BTW: Another solution is to memblock_reserve() all the sections that fdt
spans, but this will waste 2-4MiB memory.

Here's an example:
  list_del corruption. prev->next should be c0ecbf74, but was c08410dc
  kernel BUG at lib/list_debug.c:53!
  ... ...
  PC is at __list_del_entry_valid+0x58/0x98
  LR is at __list_del_entry_valid+0x58/0x98
  psr: 60000093
  sp : c0ecbf30  ip : 00000000  fp : 00000001
  r10: c08410d0  r9 : 00000001  r8 : c0825e0c
  r7 : 20000013  r6 : c08410d0  r5 : c0ecbf74  r4 : c0ecbf74
  r3 : c0825d08  r2 : 00000000  r1 : df7ce6f4  r0 : 00000044
  ... ...
  Stack: (0xc0ecbf30 to 0xc0ecc000)
  bf20:                                     c0ecbf74 c0164fd0 c0ecbf70 c0165170
  bf40: c0eca000 c0840c00 c0840c00 c0824500 c0825e0c c0189bbc c088f404 60000013
  bf60: 60000013 c0e85100 000004ec 00000000 c0ebcdc0 c0ecbf74 c0ecbf74 c0825d08
  ... ...                                           <  next     prev  >
  (__list_del_entry_valid) from (__list_del_entry+0xc/0x20)
  (__list_del_entry) from (finish_swait+0x60/0x7c)
  (finish_swait) from (rcu_gp_kthread+0x560/0xa20)
  (rcu_gp_kthread) from (kthread+0x14c/0x15c)
  (kthread) from (ret_from_fork+0x14/0x24)

The faulty list node to be deleted is a local variable, its address is
c0ecbf74. The dumped stack shows that 'prev' = c0ecbf74, but its value
before lib/list_debug.c:53 is c08410dc. A large amount of printing results
in swapping out the cacheline containing the old data(MT_ROM mapping is
read only, so the cacheline cannot be dirty), and the subsequent dump
operation obtains new data from the DDR.

Fixes: 7a1be318f579 ("ARM: 9012/1: move device tree mapping out of linear region")
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 arch/arm/include/asm/mach/map.h | 1 +
 arch/arm/mm/mmu.c               | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

Comments

Ard Biesheuvel June 6, 2022, 3:52 p.m. UTC | #1
Hello Zhen Lei,

On Mon, 6 Jun 2022 at 14:49, Zhen Lei <thunder.leizhen@huawei.com> wrote:
>
> commit 7a1be318f579 ("ARM: 9012/1: move device tree mapping out of linear
> region") use FDT_FIXED_BASE to map the whole FDT_FIXED_SIZE memory area
> which contains fdt. But it only reserves the exact physical memory that
> fdt occupied. Unfortunately, this mapping is non-shareable. An illegal or
> speculative read access can bring the RAM content from non-fdt zone into
> cache, PIPT makes it to be hit by subsequently read access through
> shareable mapping(such as linear mapping), and the cache consistency
> between cores is lost due to non-shareable property.
>
> |<---------FDT_FIXED_SIZE------>|
> |                               |
>  -------------------------------
> | <non-fdt> | <fdt> | <non-fdt> |
>  -------------------------------
>
> 1. CoreA read <non-fdt> through MT_ROM mapping, the old data is loaded
>    into the cache.
> 2. CoreB write <non-fdt> to update data through linear mapping. CoreA
>    received the notification to invalid the corresponding cachelines, but
>    the property non-shareable makes it to be ignored.
> 3. CoreA read <non-fdt> through linear mapping, cache hit, the old data
>    is read.
>

Thanks for the excellent write-up, and for what must have been a lot
of work to narrow down and diagnose!

> To eliminate this risk, mark the MT_ROM sections as shareable.
>
> The other user of MT_ROM is XIP_KERNEL. XIP allows the kernel to run from
> flash to save RAM space. Not sure if anyone is still using XIP in order to
> save a little memory and not care about performance degradation. Add a new
> memory type MT_ROM_XIP to be compatible with it.
>
> BTW: Another solution is to memblock_reserve() all the sections that fdt
> spans, but this will waste 2-4MiB memory.
>

I agree that we should not add shareable attributes to the memory type
used by XIP kernels for code regions: NOR flash is not usually
integrated in a way that allows it to participate in the coherency
protocol, so that will likely break things.

I think, though, that it would be better to leave MT_ROM alone, and
introduce a new type MT_MEMORY_RO instead, which is wired up in the
right way (see below), so that we get NX attributes, and can use it to
create non-section mappings as well.

Then, as a followup which does not need to go into -stable, we can
reduce the size of the mapping: there is really no need for the
permanent mapping to be section granular - this is only for the early
asm code that is not able to create 2 levels of page tables.


--------------->8-----------------
diff --git a/arch/arm/include/asm/mach/map.h b/arch/arm/include/asm/mach/map.h
index 92282558caf7..2b8970d8e5a2 100644
--- a/arch/arm/include/asm/mach/map.h
+++ b/arch/arm/include/asm/mach/map.h
@@ -27,6 +27,7 @@ enum {
        MT_HIGH_VECTORS,
        MT_MEMORY_RWX,
        MT_MEMORY_RW,
+       MT_MEMORY_RO,
        MT_ROM,
        MT_MEMORY_RWX_NONCACHED,
        MT_MEMORY_RW_DTCM,
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 5e2be37a198e..cd17e324aa51 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -296,6 +296,13 @@ static struct mem_type mem_types[] __ro_after_init = {
                .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
                .domain    = DOMAIN_KERNEL,
        },
+       [MT_MEMORY_RO] = {
+               .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
+                            L_PTE_XN | L_PTE_RDONLY,
+               .prot_l1   = PMD_TYPE_TABLE,
+               .prot_sect = PMD_TYPE_SECT,
+               .domain    = DOMAIN_KERNEL,
+       },
        [MT_ROM] = {
                .prot_sect = PMD_TYPE_SECT,
                .domain    = DOMAIN_KERNEL,
@@ -489,6 +496,7 @@ static void __init build_mem_type_table(void)

                        /* Also setup NX memory mapping */
                        mem_types[MT_MEMORY_RW].prot_sect |= PMD_SECT_XN;
+                       mem_types[MT_MEMORY_RO].prot_sect |= PMD_SECT_XN;
                }
                if (cpu_arch >= CPU_ARCH_ARMv7 && (cr & CR_TRE)) {
                        /*
@@ -568,6 +576,7 @@ static void __init build_mem_type_table(void)
                mem_types[MT_ROM].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
                mem_types[MT_MINICLEAN].prot_sect |=
PMD_SECT_APX|PMD_SECT_AP_WRITE;
                mem_types[MT_CACHECLEAN].prot_sect |=
PMD_SECT_APX|PMD_SECT_AP_WRITE;
+               mem_types[MT_MEMORY_RO].prot_sect |=
PMD_SECT_APX|PMD_SECT_AP_WRITE;
 #endif

                /*
@@ -587,6 +596,8 @@ static void __init build_mem_type_table(void)
                        mem_types[MT_MEMORY_RWX].prot_pte |= L_PTE_SHARED;
                        mem_types[MT_MEMORY_RW].prot_sect |= PMD_SECT_S;
                        mem_types[MT_MEMORY_RW].prot_pte |= L_PTE_SHARED;
+                       mem_types[MT_MEMORY_RO].prot_sect |= PMD_SECT_S;
+                       mem_types[MT_MEMORY_RO].prot_pte |= L_PTE_SHARED;
                        mem_types[MT_MEMORY_DMA_READY].prot_pte |= L_PTE_SHARED;
                        mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect
|= PMD_SECT_S;
                        mem_types[MT_MEMORY_RWX_NONCACHED].prot_pte |=
L_PTE_SHARED;
@@ -647,6 +658,8 @@ static void __init build_mem_type_table(void)
        mem_types[MT_MEMORY_RWX].prot_pte |= kern_pgprot;
        mem_types[MT_MEMORY_RW].prot_sect |= ecc_mask | cp->pmd;
        mem_types[MT_MEMORY_RW].prot_pte |= kern_pgprot;
+       mem_types[MT_MEMORY_RO].prot_sect |= ecc_mask | cp->pmd;
+       mem_types[MT_MEMORY_RO].prot_pte |= kern_pgprot;
        mem_types[MT_MEMORY_DMA_READY].prot_pte |= kern_pgprot;
        mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |= ecc_mask;
        mem_types[MT_ROM].prot_sect |= cp->pmd;
@@ -1360,7 +1373,7 @@ static void __init devicemaps_init(const struct
machine_desc *mdesc)
                map.pfn = __phys_to_pfn(__atags_pointer & SECTION_MASK);
                map.virtual = FDT_FIXED_BASE;
                map.length = FDT_FIXED_SIZE;
-               map.type = MT_ROM;
+               map.type = MT_MEMORY_RO;
                create_mapping(&map);
        }
Ard Biesheuvel June 6, 2022, 5:12 p.m. UTC | #2
On Mon, 6 Jun 2022 at 17:52, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> Hello Zhen Lei,
>
> On Mon, 6 Jun 2022 at 14:49, Zhen Lei <thunder.leizhen@huawei.com> wrote:
> >
> > commit 7a1be318f579 ("ARM: 9012/1: move device tree mapping out of linear
> > region") use FDT_FIXED_BASE to map the whole FDT_FIXED_SIZE memory area
> > which contains fdt. But it only reserves the exact physical memory that
> > fdt occupied. Unfortunately, this mapping is non-shareable. An illegal or
> > speculative read access can bring the RAM content from non-fdt zone into
> > cache, PIPT makes it to be hit by subsequently read access through
> > shareable mapping(such as linear mapping), and the cache consistency
> > between cores is lost due to non-shareable property.
> >
> > |<---------FDT_FIXED_SIZE------>|
> > |                               |
> >  -------------------------------
> > | <non-fdt> | <fdt> | <non-fdt> |
> >  -------------------------------
> >
> > 1. CoreA read <non-fdt> through MT_ROM mapping, the old data is loaded
> >    into the cache.
> > 2. CoreB write <non-fdt> to update data through linear mapping. CoreA
> >    received the notification to invalid the corresponding cachelines, but
> >    the property non-shareable makes it to be ignored.
> > 3. CoreA read <non-fdt> through linear mapping, cache hit, the old data
> >    is read.
> >
>
> Thanks for the excellent write-up, and for what must have been a lot
> of work to narrow down and diagnose!
>
> > To eliminate this risk, mark the MT_ROM sections as shareable.
> >
> > The other user of MT_ROM is XIP_KERNEL. XIP allows the kernel to run from
> > flash to save RAM space. Not sure if anyone is still using XIP in order to
> > save a little memory and not care about performance degradation. Add a new
> > memory type MT_ROM_XIP to be compatible with it.
> >
> > BTW: Another solution is to memblock_reserve() all the sections that fdt
> > spans, but this will waste 2-4MiB memory.
> >
>
> I agree that we should not add shareable attributes to the memory type
> used by XIP kernels for code regions: NOR flash is not usually
> integrated in a way that allows it to participate in the coherency
> protocol, so that will likely break things.
>
> I think, though, that it would be better to leave MT_ROM alone, and
> introduce a new type MT_MEMORY_RO instead, which is wired up in the
> right way (see below), so that we get NX attributes, and can use it to
> create non-section mappings as well.
>
> Then, as a followup which does not need to go into -stable, we can
> reduce the size of the mapping: there is really no need for the
> permanent mapping to be section granular - this is only for the early
> asm code that is not able to create 2 levels of page tables.
>

Actually, on second thought, I think reducing the size of the FDT
mapping is also needed for correctness, as the non-fdt regions could
potentially be covered by a no-map memory reservation, or get mapped
non-cacheable for things like non-coherent DMA.
Zhen Lei June 7, 2022, 7:16 a.m. UTC | #3
On 2022/6/7 1:12, Ard Biesheuvel wrote:
> On Mon, 6 Jun 2022 at 17:52, Ard Biesheuvel <ardb@kernel.org> wrote:
>>
>> Hello Zhen Lei,
>>
>> On Mon, 6 Jun 2022 at 14:49, Zhen Lei <thunder.leizhen@huawei.com> wrote:
>>>
>>> commit 7a1be318f579 ("ARM: 9012/1: move device tree mapping out of linear
>>> region") use FDT_FIXED_BASE to map the whole FDT_FIXED_SIZE memory area
>>> which contains fdt. But it only reserves the exact physical memory that
>>> fdt occupied. Unfortunately, this mapping is non-shareable. An illegal or
>>> speculative read access can bring the RAM content from non-fdt zone into
>>> cache, PIPT makes it to be hit by subsequently read access through
>>> shareable mapping(such as linear mapping), and the cache consistency
>>> between cores is lost due to non-shareable property.
>>>
>>> |<---------FDT_FIXED_SIZE------>|
>>> |                               |
>>>  -------------------------------
>>> | <non-fdt> | <fdt> | <non-fdt> |
>>>  -------------------------------
>>>
>>> 1. CoreA read <non-fdt> through MT_ROM mapping, the old data is loaded
>>>    into the cache.
>>> 2. CoreB write <non-fdt> to update data through linear mapping. CoreA
>>>    received the notification to invalid the corresponding cachelines, but
>>>    the property non-shareable makes it to be ignored.
>>> 3. CoreA read <non-fdt> through linear mapping, cache hit, the old data
>>>    is read.
>>>
>>
>> Thanks for the excellent write-up, and for what must have been a lot
>> of work to narrow down and diagnose!

Yes, it took a lot of time, a lot of boards.

>>
>>> To eliminate this risk, mark the MT_ROM sections as shareable.
>>>
>>> The other user of MT_ROM is XIP_KERNEL. XIP allows the kernel to run from
>>> flash to save RAM space. Not sure if anyone is still using XIP in order to
>>> save a little memory and not care about performance degradation. Add a new
>>> memory type MT_ROM_XIP to be compatible with it.
>>>
>>> BTW: Another solution is to memblock_reserve() all the sections that fdt
>>> spans, but this will waste 2-4MiB memory.
>>>
>>
>> I agree that we should not add shareable attributes to the memory type
>> used by XIP kernels for code regions: NOR flash is not usually
>> integrated in a way that allows it to participate in the coherency
>> protocol, so that will likely break things.
>>
>> I think, though, that it would be better to leave MT_ROM alone, and
>> introduce a new type MT_MEMORY_RO instead, which is wired up in the
>> right way (see below), so that we get NX attributes, and can use it to
>> create non-section mappings as well.

Right, NX should also be set. I will try MT_MEMORY_RO.

>>
>> Then, as a followup which does not need to go into -stable, we can
>> reduce the size of the mapping: there is really no need for the
>> permanent mapping to be section granular - this is only for the early
>> asm code that is not able to create 2 levels of page tables.
>>
> 
> Actually, on second thought, I think reducing the size of the FDT
> mapping is also needed for correctness, as the non-fdt regions could
> potentially be covered by a no-map memory reservation, or get mapped
> non-cacheable for things like non-coherent DMA.

I'll keep the section mapping first, because the fix for adding the
shareable attribute is explicit.

> .
>
diff mbox series

Patch

diff --git a/arch/arm/include/asm/mach/map.h b/arch/arm/include/asm/mach/map.h
index 92282558caf7cdb..0100f95c8104b3c 100644
--- a/arch/arm/include/asm/mach/map.h
+++ b/arch/arm/include/asm/mach/map.h
@@ -28,6 +28,7 @@  enum {
 	MT_MEMORY_RWX,
 	MT_MEMORY_RW,
 	MT_ROM,
+	MT_ROM_XIP,
 	MT_MEMORY_RWX_NONCACHED,
 	MT_MEMORY_RW_DTCM,
 	MT_MEMORY_RWX_ITCM,
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 5e2be37a198e29e..a707eac189b1d6d 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -300,6 +300,10 @@  static struct mem_type mem_types[] __ro_after_init = {
 		.prot_sect = PMD_TYPE_SECT,
 		.domain    = DOMAIN_KERNEL,
 	},
+	[MT_ROM_XIP] = {
+		.prot_sect = PMD_TYPE_SECT,
+		.domain    = DOMAIN_KERNEL,
+	},
 	[MT_MEMORY_RWX_NONCACHED] = {
 		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
 				L_PTE_MT_BUFFERABLE,
@@ -566,6 +570,7 @@  static void __init build_mem_type_table(void)
 		 * from SVC mode and no access from userspace.
 		 */
 		mem_types[MT_ROM].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
+		mem_types[MT_ROM_XIP].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
 		mem_types[MT_MINICLEAN].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
 		mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
 #endif
@@ -590,6 +595,7 @@  static void __init build_mem_type_table(void)
 			mem_types[MT_MEMORY_DMA_READY].prot_pte |= L_PTE_SHARED;
 			mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |= PMD_SECT_S;
 			mem_types[MT_MEMORY_RWX_NONCACHED].prot_pte |= L_PTE_SHARED;
+			mem_types[MT_ROM].prot_sect |= PMD_SECT_S;
 		}
 	}
 
@@ -650,6 +656,7 @@  static void __init build_mem_type_table(void)
 	mem_types[MT_MEMORY_DMA_READY].prot_pte |= kern_pgprot;
 	mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |= ecc_mask;
 	mem_types[MT_ROM].prot_sect |= cp->pmd;
+	mem_types[MT_ROM_XIP].prot_sect |= cp->pmd;
 
 	switch (cp->pmd) {
 	case PMD_SECT_WT:
@@ -1372,7 +1379,7 @@  static void __init devicemaps_init(const struct machine_desc *mdesc)
 	map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
 	map.virtual = MODULES_VADDR;
 	map.length = ((unsigned long)_exiprom - map.virtual + ~SECTION_MASK) & SECTION_MASK;
-	map.type = MT_ROM;
+	map.type = MT_ROM_XIP;
 	create_mapping(&map);
 #endif