diff mbox series

efi/arm: fix allocation failure when reserving the kernel base

Message ID 20190802053744.5519-1-clin@suse.com (mailing list archive)
State New, archived
Headers show
Series efi/arm: fix allocation failure when reserving the kernel base | expand

Commit Message

Chester Lin Aug. 2, 2019, 5:38 a.m. UTC
In some cases the arm32 efistub could fail to allocate memory for
uncompressed kernel. For example, we got the following error message when
verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :

  EFI stub: Booting Linux Kernel...
  EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
  EFI stub: ERROR: Failed to relocate kernel

After checking the EFI memory map we found that the first page [0 - 0xfff]
had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
set the dram base at 0, which was actually in a reserved region.

  grub> lsefimmap
  Type      Physical start  - end             #Pages        Size Attributes
  reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
  conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
  RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
  conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
  .....

To avoid a reserved address, we have to ignore the memory regions which are
marked as EFI_RESERVED_TYPE, and only conventional memory regions can be
chosen. If the region before the kernel base is unaligned, it will be
marked as EFI_RESERVED_TYPE and let kernel ignore it so that memblock_limit
will not be sticked with a very low address such as 0x1000.

Signed-off-by: Chester Lin <clin@suse.com>
---
 arch/arm/mm/mmu.c                         |  3 ++
 drivers/firmware/efi/libstub/arm32-stub.c | 43 ++++++++++++++++++-----
 2 files changed, 37 insertions(+), 9 deletions(-)

Comments

Ard Biesheuvel Aug. 4, 2019, 7:57 a.m. UTC | #1
Hello Chester,

On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
>
> In some cases the arm32 efistub could fail to allocate memory for
> uncompressed kernel. For example, we got the following error message when
> verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
>
>   EFI stub: Booting Linux Kernel...
>   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
>   EFI stub: ERROR: Failed to relocate kernel
>
> After checking the EFI memory map we found that the first page [0 - 0xfff]
> had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> set the dram base at 0, which was actually in a reserved region.
>

This by itself is a violation of the Linux boot protocol for 32-bit
ARM when using the decompressor. The decompressor rounds down its own
base address to a multiple of 128 MB, and assumes the whole area is
available for the decompressed kernel and related data structures.
(The first TEXT_OFFSET bytes are no longer used in practice, which is
why putting a reserved region of 4 KB bytes works at the moment, but
this is fragile). Note that the decompressor does not look at any DT
or EFI provided memory maps *at all*.

So unfortunately, this is not something we can fix in the kernel, but
we should fix it in the bootloader or in GRUB, so it does not put any
reserved regions in the first 128 MB of memory,


>   grub> lsefimmap
>   Type      Physical start  - end             #Pages        Size Attributes
>   reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
>   conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
>   RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
>   conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
>   .....
>
> To avoid a reserved address, we have to ignore the memory regions which are
> marked as EFI_RESERVED_TYPE, and only conventional memory regions can be
> chosen. If the region before the kernel base is unaligned, it will be
> marked as EFI_RESERVED_TYPE and let kernel ignore it so that memblock_limit
> will not be sticked with a very low address such as 0x1000.
>
> Signed-off-by: Chester Lin <clin@suse.com>
> ---
>  arch/arm/mm/mmu.c                         |  3 ++
>  drivers/firmware/efi/libstub/arm32-stub.c | 43 ++++++++++++++++++-----
>  2 files changed, 37 insertions(+), 9 deletions(-)
>
> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> index f3ce34113f89..909b11ba48d8 100644
> --- a/arch/arm/mm/mmu.c
> +++ b/arch/arm/mm/mmu.c
> @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
>                 phys_addr_t block_start = reg->base;
>                 phys_addr_t block_end = reg->base + reg->size;
>
> +               if (memblock_is_nomap(reg))
> +                       continue;
> +
>                 if (reg->base < vmalloc_limit) {
>                         if (block_end > lowmem_limit)
>                                 /*
> diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
> index e8f7aefb6813..10d33d36df00 100644
> --- a/drivers/firmware/efi/libstub/arm32-stub.c
> +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> @@ -128,7 +128,7 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
>
>         for (l = 0; l < map_size; l += desc_size) {
>                 efi_memory_desc_t *desc;
> -               u64 start, end;
> +               u64 start, end, spare, kernel_base;
>
>                 desc = (void *)memory_map + l;
>                 start = desc->phys_addr;
> @@ -144,27 +144,52 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
>                 case EFI_BOOT_SERVICES_DATA:
>                         /* Ignore types that are released to the OS anyway */
>                         continue;
> -
> +               case EFI_RESERVED_TYPE:
> +                       /* Ignore reserved regions */
> +                       continue;
>                 case EFI_CONVENTIONAL_MEMORY:
>                         /*
>                          * Reserve the intersection between this entry and the
>                          * region.
>                          */
>                         start = max(start, (u64)dram_base);
> -                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
> +                       kernel_base = round_up(start, PMD_SIZE);
> +                       spare = kernel_base - start;
> +                       end = min(end, kernel_base + MAX_UNCOMP_KERNEL_SIZE);
> +
> +                       status = efi_call_early(allocate_pages,
> +                                       EFI_ALLOCATE_ADDRESS,
> +                                       EFI_LOADER_DATA,
> +                                       MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
> +                                       &kernel_base);
> +                       if (status != EFI_SUCCESS) {
> +                               pr_efi_err(sys_table_arg,
> +                                       "reserve_kernel_base: alloc failed.\n");
> +                               goto out;
> +                       }
> +                       *reserve_addr = kernel_base;
>
> +                       if (!spare)
> +                               break;
> +                       /*
> +                        * If there's a gap between start and kernel_base,
> +                        * it needs be reserved so that the memblock_limit
> +                        * will not fall on a very low address when running
> +                        * adjust_lowmem_bounds(), wchich could eventually
> +                        * cause CMA reservation issue.
> +                        */
>                         status = efi_call_early(allocate_pages,
>                                                 EFI_ALLOCATE_ADDRESS,
> -                                               EFI_LOADER_DATA,
> -                                               (end - start) / EFI_PAGE_SIZE,
> +                                               EFI_RESERVED_TYPE,
> +                                               spare / EFI_PAGE_SIZE,
>                                                 &start);
>                         if (status != EFI_SUCCESS) {
>                                 pr_efi_err(sys_table_arg,
> -                                       "reserve_kernel_base(): alloc failed.\n");
> +                                       "reserve spare-region failed\n");
>                                 goto out;
>                         }
> -                       break;
>
> +                       break;
>                 case EFI_LOADER_CODE:
>                 case EFI_LOADER_DATA:
>                         /*
> @@ -220,7 +245,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
>         *image_size = image->image_size;
>         status = efi_relocate_kernel(sys_table, image_addr, *image_size,
>                                      *image_size,
> -                                    dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
> +                                    *reserve_addr + MAX_UNCOMP_KERNEL_SIZE, 0);
>         if (status != EFI_SUCCESS) {
>                 pr_efi_err(sys_table, "Failed to relocate kernel.\n");
>                 efi_free(sys_table, *reserve_size, *reserve_addr);
> @@ -233,7 +258,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
>          * in memory. The kernel determines the base of DRAM from the
>          * address at which the zImage is loaded.
>          */
> -       if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
> +       if (*image_addr + *image_size > *reserve_addr + ZIMAGE_OFFSET_LIMIT) {
>                 pr_efi_err(sys_table, "Failed to relocate kernel, no low memory available.\n");
>                 efi_free(sys_table, *reserve_size, *reserve_addr);
>                 *reserve_size = 0;
> --
> 2.22.0
>
Guillaume Gardet Aug. 14, 2019, 7:58 a.m. UTC | #2
> -----Original Message-----
> From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Sent: 04 August 2019 09:57
> To: Chester Lin <clin@suse.com>; linux@armlinux.org.uk
> Cc: akpm@linux-foundation.org; rppt@linux.ibm.com; ren_guo@c-sky.com;
> Juergen Gross <JGross@suse.com>; geert@linux-m68k.org; mingo@kernel.org;
> linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org; linux-
> efi@vger.kernel.org; Guillaume Gardet <Guillaume.Gardet@arm.com>; Joey Lee
> <JLee@suse.com>; Gary Lin <GLin@suse.com>
> Subject: Re: [PATCH] efi/arm: fix allocation failure when reserving the kernel
> base
>
> Hello Chester,
>
> On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> >
> > In some cases the arm32 efistub could fail to allocate memory for
> > uncompressed kernel. For example, we got the following error message
> > when verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> >
> >   EFI stub: Booting Linux Kernel...
> >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> >   EFI stub: ERROR: Failed to relocate kernel
> >
> > After checking the EFI memory map we found that the first page [0 -
> > 0xfff] had been reserved by Raspberry Pi-2's firmware, and the efistub
> > tried to set the dram base at 0, which was actually in a reserved region.
> >
>
> This by itself is a violation of the Linux boot protocol for 32-bit ARM when using
> the decompressor. The decompressor rounds down its own base address to a
> multiple of 128 MB, and assumes the whole area is available for the
> decompressed kernel and related data structures.
> (The first TEXT_OFFSET bytes are no longer used in practice, which is why putting
> a reserved region of 4 KB bytes works at the moment, but this is fragile). Note
> that the decompressor does not look at any DT or EFI provided memory maps
> *at all*.
>
> So unfortunately, this is not something we can fix in the kernel, but we should fix
> it in the bootloader or in GRUB, so it does not put any reserved regions in the
> first 128 MB of memory,

FYI, this is in Raspberry Pi firmware: https://github.com/raspberrypi/firmware/issues/1199


>
>
> >   grub> lsefimmap
> >   Type      Physical start  - end             #Pages        Size Attributes
> >   reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
> >   conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
> >   RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
> >   conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
> >   .....
> >
> > To avoid a reserved address, we have to ignore the memory regions
> > which are marked as EFI_RESERVED_TYPE, and only conventional memory
> > regions can be chosen. If the region before the kernel base is
> > unaligned, it will be marked as EFI_RESERVED_TYPE and let kernel
> > ignore it so that memblock_limit will not be sticked with a very low address
> such as 0x1000.
> >
> > Signed-off-by: Chester Lin <clin@suse.com>
> > ---
> >  arch/arm/mm/mmu.c                         |  3 ++
> >  drivers/firmware/efi/libstub/arm32-stub.c | 43
> > ++++++++++++++++++-----
> >  2 files changed, 37 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index
> > f3ce34113f89..909b11ba48d8 100644
> > --- a/arch/arm/mm/mmu.c
> > +++ b/arch/arm/mm/mmu.c
> > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> >                 phys_addr_t block_start = reg->base;
> >                 phys_addr_t block_end = reg->base + reg->size;
> >
> > +               if (memblock_is_nomap(reg))
> > +                       continue;
> > +
> >                 if (reg->base < vmalloc_limit) {
> >                         if (block_end > lowmem_limit)
> >                                 /*
> > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
> > b/drivers/firmware/efi/libstub/arm32-stub.c
> > index e8f7aefb6813..10d33d36df00 100644
> > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > @@ -128,7 +128,7 @@ static efi_status_t
> > reserve_kernel_base(efi_system_table_t *sys_table_arg,
> >
> >         for (l = 0; l < map_size; l += desc_size) {
> >                 efi_memory_desc_t *desc;
> > -               u64 start, end;
> > +               u64 start, end, spare, kernel_base;
> >
> >                 desc = (void *)memory_map + l;
> >                 start = desc->phys_addr; @@ -144,27 +144,52 @@ static
> > efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> >                 case EFI_BOOT_SERVICES_DATA:
> >                         /* Ignore types that are released to the OS anyway */
> >                         continue;
> > -
> > +               case EFI_RESERVED_TYPE:
> > +                       /* Ignore reserved regions */
> > +                       continue;
> >                 case EFI_CONVENTIONAL_MEMORY:
> >                         /*
> >                          * Reserve the intersection between this entry and the
> >                          * region.
> >                          */
> >                         start = max(start, (u64)dram_base);
> > -                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
> > +                       kernel_base = round_up(start, PMD_SIZE);
> > +                       spare = kernel_base - start;
> > +                       end = min(end, kernel_base +
> > + MAX_UNCOMP_KERNEL_SIZE);
> > +
> > +                       status = efi_call_early(allocate_pages,
> > +                                       EFI_ALLOCATE_ADDRESS,
> > +                                       EFI_LOADER_DATA,
> > +                                       MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
> > +                                       &kernel_base);
> > +                       if (status != EFI_SUCCESS) {
> > +                               pr_efi_err(sys_table_arg,
> > +                                       "reserve_kernel_base: alloc failed.\n");
> > +                               goto out;
> > +                       }
> > +                       *reserve_addr = kernel_base;
> >
> > +                       if (!spare)
> > +                               break;
> > +                       /*
> > +                        * If there's a gap between start and kernel_base,
> > +                        * it needs be reserved so that the memblock_limit
> > +                        * will not fall on a very low address when running
> > +                        * adjust_lowmem_bounds(), wchich could eventually
> > +                        * cause CMA reservation issue.
> > +                        */
> >                         status = efi_call_early(allocate_pages,
> >                                                 EFI_ALLOCATE_ADDRESS,
> > -                                               EFI_LOADER_DATA,
> > -                                               (end - start) / EFI_PAGE_SIZE,
> > +                                               EFI_RESERVED_TYPE,
> > +                                               spare / EFI_PAGE_SIZE,
> >                                                 &start);
> >                         if (status != EFI_SUCCESS) {
> >                                 pr_efi_err(sys_table_arg,
> > -                                       "reserve_kernel_base(): alloc failed.\n");
> > +                                       "reserve spare-region
> > + failed\n");
> >                                 goto out;
> >                         }
> > -                       break;
> >
> > +                       break;
> >                 case EFI_LOADER_CODE:
> >                 case EFI_LOADER_DATA:
> >                         /*
> > @@ -220,7 +245,7 @@ efi_status_t handle_kernel_image(efi_system_table_t
> *sys_table,
> >         *image_size = image->image_size;
> >         status = efi_relocate_kernel(sys_table, image_addr, *image_size,
> >                                      *image_size,
> > -                                    dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
> > +                                    *reserve_addr +
> > + MAX_UNCOMP_KERNEL_SIZE, 0);
> >         if (status != EFI_SUCCESS) {
> >                 pr_efi_err(sys_table, "Failed to relocate kernel.\n");
> >                 efi_free(sys_table, *reserve_size, *reserve_addr); @@
> > -233,7 +258,7 @@ efi_status_t handle_kernel_image(efi_system_table_t
> *sys_table,
> >          * in memory. The kernel determines the base of DRAM from the
> >          * address at which the zImage is loaded.
> >          */
> > -       if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
> > +       if (*image_addr + *image_size > *reserve_addr +
> > + ZIMAGE_OFFSET_LIMIT) {
> >                 pr_efi_err(sys_table, "Failed to relocate kernel, no low memory
> available.\n");
> >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> >                 *reserve_size = 0;
> > --
> > 2.22.0
> >
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Ard Biesheuvel Aug. 15, 2019, 7:59 a.m. UTC | #3
On Sun, 4 Aug 2019 at 10:57, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> Hello Chester,
>
> On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> >
> > In some cases the arm32 efistub could fail to allocate memory for
> > uncompressed kernel. For example, we got the following error message when
> > verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> >
> >   EFI stub: Booting Linux Kernel...
> >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> >   EFI stub: ERROR: Failed to relocate kernel
> >
> > After checking the EFI memory map we found that the first page [0 - 0xfff]
> > had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> > set the dram base at 0, which was actually in a reserved region.
> >
>
> This by itself is a violation of the Linux boot protocol for 32-bit
> ARM when using the decompressor. The decompressor rounds down its own
> base address to a multiple of 128 MB, and assumes the whole area is
> available for the decompressed kernel and related data structures.
> (The first TEXT_OFFSET bytes are no longer used in practice, which is
> why putting a reserved region of 4 KB bytes works at the moment, but
> this is fragile). Note that the decompressor does not look at any DT
> or EFI provided memory maps *at all*.
>
> So unfortunately, this is not something we can fix in the kernel, but
> we should fix it in the bootloader or in GRUB, so it does not put any
> reserved regions in the first 128 MB of memory,
>

OK, perhaps we can fix this by taking TEXT_OFFSET into account. The
ARM boot protocol docs are unclear about whether this memory should be
used or not, but it is no longer used for its original purpose (page
tables), and the RPi loader already keeps data there.

Can you check whether the following patch works for you?

diff --git a/drivers/firmware/efi/libstub/Makefile
b/drivers/firmware/efi/libstub/Makefile
index 0460c7581220..ee0661ddb25b 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -52,6 +52,7 @@ lib-$(CONFIG_EFI_ARMSTUB)     += arm-stub.o fdt.o
string.o random.o \

 lib-$(CONFIG_ARM)              += arm32-stub.o
 lib-$(CONFIG_ARM64)            += arm64-stub.o
+CFLAGS_arm32-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
 CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)

 #
diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
b/drivers/firmware/efi/libstub/arm32-stub.c
index e8f7aefb6813..66ff0c8ec269 100644
--- a/drivers/firmware/efi/libstub/arm32-stub.c
+++ b/drivers/firmware/efi/libstub/arm32-stub.c
@@ -204,7 +204,7 @@ efi_status_t
handle_kernel_image(efi_system_table_t *sys_table,
         * loaded. These assumptions are made by the decompressor,
         * before any memory map is available.
         */
-       dram_base = round_up(dram_base, SZ_128M);
+       dram_base = round_up(dram_base, SZ_128M) + TEXT_OFFSET;

        status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
                                     reserve_size);

>
> >   grub> lsefimmap
> >   Type      Physical start  - end             #Pages        Size Attributes
> >   reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
> >   conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
> >   RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
> >   conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
> >   .....
> >
> > To avoid a reserved address, we have to ignore the memory regions which are
> > marked as EFI_RESERVED_TYPE, and only conventional memory regions can be
> > chosen. If the region before the kernel base is unaligned, it will be
> > marked as EFI_RESERVED_TYPE and let kernel ignore it so that memblock_limit
> > will not be sticked with a very low address such as 0x1000.
> >

This is a separate issue, so it should be handled in a separate patch.

> > Signed-off-by: Chester Lin <clin@suse.com>
> > ---
> >  arch/arm/mm/mmu.c                         |  3 ++
> >  drivers/firmware/efi/libstub/arm32-stub.c | 43 ++++++++++++++++++-----
> >  2 files changed, 37 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > index f3ce34113f89..909b11ba48d8 100644
> > --- a/arch/arm/mm/mmu.c
> > +++ b/arch/arm/mm/mmu.c
> > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> >                 phys_addr_t block_start = reg->base;
> >                 phys_addr_t block_end = reg->base + reg->size;
> >
> > +               if (memblock_is_nomap(reg))
> > +                       continue;
> > +
> >                 if (reg->base < vmalloc_limit) {
> >                         if (block_end > lowmem_limit)
> >                                 /*
> > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
> > index e8f7aefb6813..10d33d36df00 100644
> > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > @@ -128,7 +128,7 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> >
> >         for (l = 0; l < map_size; l += desc_size) {
> >                 efi_memory_desc_t *desc;
> > -               u64 start, end;
> > +               u64 start, end, spare, kernel_base;
> >
> >                 desc = (void *)memory_map + l;
> >                 start = desc->phys_addr;
> > @@ -144,27 +144,52 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> >                 case EFI_BOOT_SERVICES_DATA:
> >                         /* Ignore types that are released to the OS anyway */
> >                         continue;
> > -
> > +               case EFI_RESERVED_TYPE:
> > +                       /* Ignore reserved regions */
> > +                       continue;
> >                 case EFI_CONVENTIONAL_MEMORY:
> >                         /*
> >                          * Reserve the intersection between this entry and the
> >                          * region.
> >                          */
> >                         start = max(start, (u64)dram_base);
> > -                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
> > +                       kernel_base = round_up(start, PMD_SIZE);
> > +                       spare = kernel_base - start;
> > +                       end = min(end, kernel_base + MAX_UNCOMP_KERNEL_SIZE);
> > +
> > +                       status = efi_call_early(allocate_pages,
> > +                                       EFI_ALLOCATE_ADDRESS,
> > +                                       EFI_LOADER_DATA,
> > +                                       MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
> > +                                       &kernel_base);
> > +                       if (status != EFI_SUCCESS) {
> > +                               pr_efi_err(sys_table_arg,
> > +                                       "reserve_kernel_base: alloc failed.\n");
> > +                               goto out;
> > +                       }
> > +                       *reserve_addr = kernel_base;
> >
> > +                       if (!spare)
> > +                               break;
> > +                       /*
> > +                        * If there's a gap between start and kernel_base,
> > +                        * it needs be reserved so that the memblock_limit
> > +                        * will not fall on a very low address when running
> > +                        * adjust_lowmem_bounds(), wchich could eventually
> > +                        * cause CMA reservation issue.
> > +                        */
> >                         status = efi_call_early(allocate_pages,
> >                                                 EFI_ALLOCATE_ADDRESS,
> > -                                               EFI_LOADER_DATA,
> > -                                               (end - start) / EFI_PAGE_SIZE,
> > +                                               EFI_RESERVED_TYPE,
> > +                                               spare / EFI_PAGE_SIZE,
> >                                                 &start);
> >                         if (status != EFI_SUCCESS) {
> >                                 pr_efi_err(sys_table_arg,
> > -                                       "reserve_kernel_base(): alloc failed.\n");
> > +                                       "reserve spare-region failed\n");
> >                                 goto out;
> >                         }
> > -                       break;
> >
> > +                       break;
> >                 case EFI_LOADER_CODE:
> >                 case EFI_LOADER_DATA:
> >                         /*
> > @@ -220,7 +245,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> >         *image_size = image->image_size;
> >         status = efi_relocate_kernel(sys_table, image_addr, *image_size,
> >                                      *image_size,
> > -                                    dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
> > +                                    *reserve_addr + MAX_UNCOMP_KERNEL_SIZE, 0);
> >         if (status != EFI_SUCCESS) {
> >                 pr_efi_err(sys_table, "Failed to relocate kernel.\n");
> >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > @@ -233,7 +258,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> >          * in memory. The kernel determines the base of DRAM from the
> >          * address at which the zImage is loaded.
> >          */
> > -       if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
> > +       if (*image_addr + *image_size > *reserve_addr + ZIMAGE_OFFSET_LIMIT) {
> >                 pr_efi_err(sys_table, "Failed to relocate kernel, no low memory available.\n");
> >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> >                 *reserve_size = 0;
> > --
> > 2.22.0
> >
Chester Lin Aug. 15, 2019, 11:16 a.m. UTC | #4
Hi Ard,

On Thu, Aug 15, 2019 at 10:59:43AM +0300, Ard Biesheuvel wrote:
> On Sun, 4 Aug 2019 at 10:57, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> >
> > Hello Chester,
> >
> > On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> > >
> > > In some cases the arm32 efistub could fail to allocate memory for
> > > uncompressed kernel. For example, we got the following error message when
> > > verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> > >
> > >   EFI stub: Booting Linux Kernel...
> > >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> > >   EFI stub: ERROR: Failed to relocate kernel
> > >
> > > After checking the EFI memory map we found that the first page [0 - 0xfff]
> > > had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> > > set the dram base at 0, which was actually in a reserved region.
> > >
> >
> > This by itself is a violation of the Linux boot protocol for 32-bit
> > ARM when using the decompressor. The decompressor rounds down its own
> > base address to a multiple of 128 MB, and assumes the whole area is
> > available for the decompressed kernel and related data structures.
> > (The first TEXT_OFFSET bytes are no longer used in practice, which is
> > why putting a reserved region of 4 KB bytes works at the moment, but
> > this is fragile). Note that the decompressor does not look at any DT
> > or EFI provided memory maps *at all*.
> >
> > So unfortunately, this is not something we can fix in the kernel, but
> > we should fix it in the bootloader or in GRUB, so it does not put any
> > reserved regions in the first 128 MB of memory,
> >
> 
> OK, perhaps we can fix this by taking TEXT_OFFSET into account. The
> ARM boot protocol docs are unclear about whether this memory should be
> used or not, but it is no longer used for its original purpose (page
> tables), and the RPi loader already keeps data there.
> 
> Can you check whether the following patch works for you?
> 
> diff --git a/drivers/firmware/efi/libstub/Makefile
> b/drivers/firmware/efi/libstub/Makefile
> index 0460c7581220..ee0661ddb25b 100644
> --- a/drivers/firmware/efi/libstub/Makefile
> +++ b/drivers/firmware/efi/libstub/Makefile
> @@ -52,6 +52,7 @@ lib-$(CONFIG_EFI_ARMSTUB)     += arm-stub.o fdt.o
> string.o random.o \
> 
>  lib-$(CONFIG_ARM)              += arm32-stub.o
>  lib-$(CONFIG_ARM64)            += arm64-stub.o
> +CFLAGS_arm32-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
>  CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> 
>  #
> diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
> b/drivers/firmware/efi/libstub/arm32-stub.c
> index e8f7aefb6813..66ff0c8ec269 100644
> --- a/drivers/firmware/efi/libstub/arm32-stub.c
> +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> @@ -204,7 +204,7 @@ efi_status_t
> handle_kernel_image(efi_system_table_t *sys_table,
>          * loaded. These assumptions are made by the decompressor,
>          * before any memory map is available.
>          */
> -       dram_base = round_up(dram_base, SZ_128M);
> +       dram_base = round_up(dram_base, SZ_128M) + TEXT_OFFSET;
> 
>         status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
>                                      reserve_size);
> 

I tried your patch on rpi2 and got the following panic. Just a reminder that I
have replaced some log messages with "......" since it might be too long to
post all.

In this case the kernel failed to reserve cma, which should hit the issue of
memblock_limit=0x1000 as I had mentioned in my patch description. The first
block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
with PMD_SIZE so the cma reservation failed because the memblock.current_limit
was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
if any suggestion, thank you.

boot-log:
--------

Loading Linux test ...
EFI stub: Booting Linux Kernel...
EFI stub: Using DTB from configuration table
EFI stub: Exiting boot services and installing virtual address map...
Uncompressing Linux... done, booting the kernel.
[    0.000000] Booting Linux on physical CPU 0xf00
[    0.000000] Linux version 5.2.1-lpae (chester@linux-8mug) (......)
[    0.000000] CPU: ARMv7 Processor [410fc075] revision 5 (ARMv7), cr=30c5387d
[    0.000000] CPU: div instructions available: patching division code
[    0.000000] CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing instruction cache
[    0.000000] OF: fdt: Machine model: Raspberry Pi 2 Model B Rev 1.1
[    0.000000] printk: bootconsole [earlycon0] enabled
[    0.000000] Memory policy: Data cache writealloc
[    0.000000] efi: Getting EFI parameters from FDT:
[    0.000000] efi:   System Table: 0x000000003df757c0
[    0.000000] efi:   MemMap Address: 0x000000002c1c5040
[    0.000000] efi:   MemMap Size: 0x000003c0
[    0.000000] efi:   MemMap Desc. Size: 0x00000028
[    0.000000] efi:   MemMap Desc. Version: 0x00000001
[    0.000000] efi: EFI v2.70 by Das U-Boot
[    0.000000] efi:  SMBIOS=0x3cb62000  MEMRESERVE=0x3cb3d040
[    0.000000] memblock_reserve: [0x000000003cb3d040-0x000000003cb3d04f] efi_config_parse_tables+0x25c/0x2d8
[    0.000000] efi: Processing EFI memory map:
[    0.000000] MEMBLOCK configuration:
[    0.000000]  memory size = 0x000000003e000000 reserved size = 0x0000000000000010
[    0.000000]  memory.cnt  = 0x1
[    0.000000]  memory[0x0]     [0x0000000000000000-0x000000003dffffff], 0x000000003e000000 bytes flags: 0x0
[    0.000000]  reserved.cnt  = 0x1
[    0.000000]  reserved[0x0]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
[    0.000000] memblock_remove: [0x0000000000000000-0xfffffffffffffffe] reserve_regions+0x68/0x23c
[    0.000000] efi:   0x000000000000-0x000000000fff [Reserved           |   |  |  |  |  |  |  |   |WB|  |  |  ]
[    0.000000] memblock_add: [0x0000000000000000-0x0000000000000fff] early_init_dt_add_memory_arch+0x164/0x178
[    0.000000] efi:   0x000000001000-0x000000307fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
[    0.000000] memblock_add: [0x0000000000001000-0x0000000000307fff] early_init_dt_add_memory_arch+0x164/0x178
[    0.000000] efi:   0x000000308000-0x000002307fff [Boot Data          |   |  |  |  |  |  |  |   |WB|  |  |  ]
[    0.000000] memblock_add: [0x0000000000308000-0x0000000002307fff] early_init_dt_add_memory_arch+0x164/0x178
[    0.000000] efi:   0x000002308000-0x000002a93fff [Loader Data        |   |  |  |  |  |  |  |   |WB|  |  |  ]
[    0.000000] memblock_add: [0x0000000002308000-0x0000000002a93fff] early_init_dt_add_memory_arch+0x164/0x178
[    0.000000] efi:   0x000002a94000-0x000007cf5fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
[    0.000000] memblock_add: [0x0000000002a94000-0x0000000007cf5fff] early_init_dt_add_memory_arch+0x164/0x178
......
......
[    0.000000] memblock_add: [0x000000003df76000-0x000000003dffffff] early_init_dt_add_memory_arch+0x164/0x178
[    0.000000] efi:   0x00003f100000-0x00003f100fff [Memory Mapped I/O  |RUN|  |  |  |  |  |  |   |  |  |  |  ]
[    0.000000] memblock_reserve: [0x000000002c1c5000-0x000000002c1c5fff] efi_init+0xd8/0x1c8
[    0.000000] memblock_reserve: [0x0000000000400000-0x0000000001df2cef] arm_memblock_init+0x44/0x19c
[    0.000000] memblock_reserve: [0x0000000000303000-0x0000000000307fff] arm_mm_memblock_reserve+0x30/0x38
[    0.000000] memblock_reserve: [0x0000000007cf6000-0x0000000007cfc5c4] early_init_dt_reserve_memory_arch+0x2c/0x30
[    0.000000] cma: Failed to reserve 64 MiB
[    0.000000] MEMBLOCK configuration:
[    0.000000]  memory size = 0x000000003e000000 reserved size = 0x00000000019ff2c5
[    0.000000]  memory.cnt  = 0xa
[    0.000000]  memory[0x0]     [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
[    0.000000]  memory[0x1]     [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
[    0.000000]  memory[0x2]     [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
[    0.000000]  memory[0x3]     [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
[    0.000000]  memory[0x4]     [0x000000003cb3f000-0x000000003cb3ffff], 0x0000000000001000 bytes flags: 0x4
[    0.000000]  memory[0x5]     [0x000000003cb40000-0x000000003cb5ffff], 0x0000000000020000 bytes flags: 0x0
[    0.000000]  memory[0x6]     [0x000000003cb60000-0x000000003cb68fff], 0x0000000000009000 bytes flags: 0x4
[    0.000000]  memory[0x7]     [0x000000003cb69000-0x000000003df74fff], 0x000000000140c000 bytes flags: 0x0
[    0.000000]  memory[0x8]     [0x000000003df75000-0x000000003df75fff], 0x0000000000001000 bytes flags: 0x4
[    0.000000]  memory[0x9]     [0x000000003df76000-0x000000003dffffff], 0x000000000008a000 bytes flags: 0x0
[    0.000000]  reserved.cnt  = 0x5
[    0.000000]  reserved[0x0]   [0x0000000000303000-0x0000000000307fff], 0x0000000000005000 bytes flags: 0x0
[    0.000000]  reserved[0x1]   [0x0000000000400000-0x0000000001df2cef], 0x00000000019f2cf0 bytes flags: 0x0
[    0.000000]  reserved[0x2]   [0x0000000007cf6000-0x0000000007cfc5c4], 0x00000000000065c5 bytes flags: 0x0
[    0.000000]  reserved[0x3]   [0x000000002c1c5000-0x000000002c1c5fff], 0x0000000000001000 bytes flags: 0x0
[    0.000000]  reserved[0x4]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
[    0.000000] memblock_alloc_try_nid: 4096 bytes align=0x1000 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_alloc+0x44/0x70
[    0.000000] Kernel panic - not syncing: early_alloc: Failed to allocate 4096 bytes align=0x1000
[    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.1-lpae #1 openSUSE Tumbleweed (unreleased)
[    0.000000] Hardware name: BCM2835
[    0.000000] Backtrace:
[    0.000000] [<c043fafc>] (dump_backtrace) from [<c043fd84>] (show_stack+0x20/0x24)
[    0.000000]  r7:c1800000 r6:00000000 r5:600001d3 r4:c1901ba0
[    0.000000] [<c043fd64>] (show_stack) from [<c0df9400>] (dump_stack+0xd0/0x104)
[    0.000000] [<c0df9330>] (dump_stack) from [<c048061c>] (panic+0xf8/0x32c)
[    0.000000]  r10:c0307000 r9:c0001000 r8:00000003 r7:00000000 r6:00000000 r5:c181df04
[    0.000000]  r4:c192b8d8 r3:00000001
[    0.000000] [<c0480528>] (panic) from [<c1609728>] (early_alloc+0x60/0x70)
[    0.000000]  r3:00001000 r2:00001000 r1:c10037e8 r0:c12fe64c
[    0.000000]  r7:00000000
[    0.000000] [<c16096c8>] (early_alloc) from [<c1609114>] (arm_pte_alloc+0x34/0x94)
[    0.000000]  r7:00000000 r6:00000000 r4:c0307000
[    0.000000] [<c16090e0>] (arm_pte_alloc) from [<c1609384>] (__create_mapping+0x210/0x2c0)
[    0.000000]  r9:c0001000 r8:c0001000 r7:00000001 r6:c13f22e0 r5:c0200000 r4:c0400000
[    0.000000] [<c1609174>] (__create_mapping) from [<c160951c>] (create_mapping+0xe8/0x108)
[    0.000000]  r10:c0400000 r9:c16a2110 r8:c19c7a80 r7:00000000 r6:00400000 r5:c13f2000
[    0.000000]  r4:c1801ef0
[    0.000000] [<c1609434>] (create_mapping) from [<c1609f50>] (paging_init+0x350/0x75c)
[    0.000000]  r4:c1842d40


> >
> > >   grub> lsefimmap
> > >   Type      Physical start  - end             #Pages        Size Attributes
> > >   reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
> > >   conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
> > >   RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
> > >   conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
> > >   .....
> > >
> > > To avoid a reserved address, we have to ignore the memory regions which are
> > > marked as EFI_RESERVED_TYPE, and only conventional memory regions can be
> > > chosen. If the region before the kernel base is unaligned, it will be
> > > marked as EFI_RESERVED_TYPE and let kernel ignore it so that memblock_limit
> > > will not be sticked with a very low address such as 0x1000.
> > >
> 
> This is a separate issue, so it should be handled in a separate patch.
> 
> > > Signed-off-by: Chester Lin <clin@suse.com>
> > > ---
> > >  arch/arm/mm/mmu.c                         |  3 ++
> > >  drivers/firmware/efi/libstub/arm32-stub.c | 43 ++++++++++++++++++-----
> > >  2 files changed, 37 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > index f3ce34113f89..909b11ba48d8 100644
> > > --- a/arch/arm/mm/mmu.c
> > > +++ b/arch/arm/mm/mmu.c
> > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > >                 phys_addr_t block_start = reg->base;
> > >                 phys_addr_t block_end = reg->base + reg->size;
> > >
> > > +               if (memblock_is_nomap(reg))
> > > +                       continue;
> > > +
> > >                 if (reg->base < vmalloc_limit) {
> > >                         if (block_end > lowmem_limit)
> > >                                 /*
> > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
> > > index e8f7aefb6813..10d33d36df00 100644
> > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > @@ -128,7 +128,7 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > >
> > >         for (l = 0; l < map_size; l += desc_size) {
> > >                 efi_memory_desc_t *desc;
> > > -               u64 start, end;
> > > +               u64 start, end, spare, kernel_base;
> > >
> > >                 desc = (void *)memory_map + l;
> > >                 start = desc->phys_addr;
> > > @@ -144,27 +144,52 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > >                 case EFI_BOOT_SERVICES_DATA:
> > >                         /* Ignore types that are released to the OS anyway */
> > >                         continue;
> > > -
> > > +               case EFI_RESERVED_TYPE:
> > > +                       /* Ignore reserved regions */
> > > +                       continue;
> > >                 case EFI_CONVENTIONAL_MEMORY:
> > >                         /*
> > >                          * Reserve the intersection between this entry and the
> > >                          * region.
> > >                          */
> > >                         start = max(start, (u64)dram_base);
> > > -                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
> > > +                       kernel_base = round_up(start, PMD_SIZE);
> > > +                       spare = kernel_base - start;
> > > +                       end = min(end, kernel_base + MAX_UNCOMP_KERNEL_SIZE);
> > > +
> > > +                       status = efi_call_early(allocate_pages,
> > > +                                       EFI_ALLOCATE_ADDRESS,
> > > +                                       EFI_LOADER_DATA,
> > > +                                       MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
> > > +                                       &kernel_base);
> > > +                       if (status != EFI_SUCCESS) {
> > > +                               pr_efi_err(sys_table_arg,
> > > +                                       "reserve_kernel_base: alloc failed.\n");
> > > +                               goto out;
> > > +                       }
> > > +                       *reserve_addr = kernel_base;
> > >
> > > +                       if (!spare)
> > > +                               break;
> > > +                       /*
> > > +                        * If there's a gap between start and kernel_base,
> > > +                        * it needs be reserved so that the memblock_limit
> > > +                        * will not fall on a very low address when running
> > > +                        * adjust_lowmem_bounds(), wchich could eventually
> > > +                        * cause CMA reservation issue.
> > > +                        */
> > >                         status = efi_call_early(allocate_pages,
> > >                                                 EFI_ALLOCATE_ADDRESS,
> > > -                                               EFI_LOADER_DATA,
> > > -                                               (end - start) / EFI_PAGE_SIZE,
> > > +                                               EFI_RESERVED_TYPE,
> > > +                                               spare / EFI_PAGE_SIZE,
> > >                                                 &start);
> > >                         if (status != EFI_SUCCESS) {
> > >                                 pr_efi_err(sys_table_arg,
> > > -                                       "reserve_kernel_base(): alloc failed.\n");
> > > +                                       "reserve spare-region failed\n");
> > >                                 goto out;
> > >                         }
> > > -                       break;
> > >
> > > +                       break;
> > >                 case EFI_LOADER_CODE:
> > >                 case EFI_LOADER_DATA:
> > >                         /*
> > > @@ -220,7 +245,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > >         *image_size = image->image_size;
> > >         status = efi_relocate_kernel(sys_table, image_addr, *image_size,
> > >                                      *image_size,
> > > -                                    dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
> > > +                                    *reserve_addr + MAX_UNCOMP_KERNEL_SIZE, 0);
> > >         if (status != EFI_SUCCESS) {
> > >                 pr_efi_err(sys_table, "Failed to relocate kernel.\n");
> > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > > @@ -233,7 +258,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > >          * in memory. The kernel determines the base of DRAM from the
> > >          * address at which the zImage is loaded.
> > >          */
> > > -       if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
> > > +       if (*image_addr + *image_size > *reserve_addr + ZIMAGE_OFFSET_LIMIT) {
> > >                 pr_efi_err(sys_table, "Failed to relocate kernel, no low memory available.\n");
> > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > >                 *reserve_size = 0;
> > > --
> > > 2.22.0
> > >
>
Ard Biesheuvel Aug. 15, 2019, 11:32 a.m. UTC | #5
(adding Mike)

On Thu, 15 Aug 2019 at 14:28, Chester Lin <clin@suse.com> wrote:
>
> Hi Ard,
>
> On Thu, Aug 15, 2019 at 10:59:43AM +0300, Ard Biesheuvel wrote:
> > On Sun, 4 Aug 2019 at 10:57, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > >
> > > Hello Chester,
> > >
> > > On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> > > >
> > > > In some cases the arm32 efistub could fail to allocate memory for
> > > > uncompressed kernel. For example, we got the following error message when
> > > > verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> > > >
> > > >   EFI stub: Booting Linux Kernel...
> > > >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> > > >   EFI stub: ERROR: Failed to relocate kernel
> > > >
> > > > After checking the EFI memory map we found that the first page [0 - 0xfff]
> > > > had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> > > > set the dram base at 0, which was actually in a reserved region.
> > > >
> > >
> > > This by itself is a violation of the Linux boot protocol for 32-bit
> > > ARM when using the decompressor. The decompressor rounds down its own
> > > base address to a multiple of 128 MB, and assumes the whole area is
> > > available for the decompressed kernel and related data structures.
> > > (The first TEXT_OFFSET bytes are no longer used in practice, which is
> > > why putting a reserved region of 4 KB bytes works at the moment, but
> > > this is fragile). Note that the decompressor does not look at any DT
> > > or EFI provided memory maps *at all*.
> > >
> > > So unfortunately, this is not something we can fix in the kernel, but
> > > we should fix it in the bootloader or in GRUB, so it does not put any
> > > reserved regions in the first 128 MB of memory,
> > >
> >
> > OK, perhaps we can fix this by taking TEXT_OFFSET into account. The
> > ARM boot protocol docs are unclear about whether this memory should be
> > used or not, but it is no longer used for its original purpose (page
> > tables), and the RPi loader already keeps data there.
> >
> > Can you check whether the following patch works for you?
> >
> > diff --git a/drivers/firmware/efi/libstub/Makefile
> > b/drivers/firmware/efi/libstub/Makefile
> > index 0460c7581220..ee0661ddb25b 100644
> > --- a/drivers/firmware/efi/libstub/Makefile
> > +++ b/drivers/firmware/efi/libstub/Makefile
> > @@ -52,6 +52,7 @@ lib-$(CONFIG_EFI_ARMSTUB)     += arm-stub.o fdt.o
> > string.o random.o \
> >
> >  lib-$(CONFIG_ARM)              += arm32-stub.o
> >  lib-$(CONFIG_ARM64)            += arm64-stub.o
> > +CFLAGS_arm32-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> >  CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> >
> >  #
> > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
> > b/drivers/firmware/efi/libstub/arm32-stub.c
> > index e8f7aefb6813..66ff0c8ec269 100644
> > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > @@ -204,7 +204,7 @@ efi_status_t
> > handle_kernel_image(efi_system_table_t *sys_table,
> >          * loaded. These assumptions are made by the decompressor,
> >          * before any memory map is available.
> >          */
> > -       dram_base = round_up(dram_base, SZ_128M);
> > +       dram_base = round_up(dram_base, SZ_128M) + TEXT_OFFSET;
> >
> >         status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
> >                                      reserve_size);
> >
>
> I tried your patch on rpi2 and got the following panic. Just a reminder that I
> have replaced some log messages with "......" since it might be too long to
> post all.
>

OK. Good to know that this change helps you to get past the EFI stub boot issue.

> In this case the kernel failed to reserve cma, which should hit the issue of
> memblock_limit=0x1000 as I had mentioned in my patch description. The first
> block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> if any suggestion, thank you.
>

This looks like it is a separate issue. The memblock/cma code should
not choke on a reserved page of memory at 0x0.

Perhaps Russell or Mike (cc'ed) have an idea how to address this?



> boot-log:
> --------
>
> Loading Linux test ...
> EFI stub: Booting Linux Kernel...
> EFI stub: Using DTB from configuration table
> EFI stub: Exiting boot services and installing virtual address map...
> Uncompressing Linux... done, booting the kernel.
> [    0.000000] Booting Linux on physical CPU 0xf00
> [    0.000000] Linux version 5.2.1-lpae (chester@linux-8mug) (......)
> [    0.000000] CPU: ARMv7 Processor [410fc075] revision 5 (ARMv7), cr=30c5387d
> [    0.000000] CPU: div instructions available: patching division code
> [    0.000000] CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing instruction cache
> [    0.000000] OF: fdt: Machine model: Raspberry Pi 2 Model B Rev 1.1
> [    0.000000] printk: bootconsole [earlycon0] enabled
> [    0.000000] Memory policy: Data cache writealloc
> [    0.000000] efi: Getting EFI parameters from FDT:
> [    0.000000] efi:   System Table: 0x000000003df757c0
> [    0.000000] efi:   MemMap Address: 0x000000002c1c5040
> [    0.000000] efi:   MemMap Size: 0x000003c0
> [    0.000000] efi:   MemMap Desc. Size: 0x00000028
> [    0.000000] efi:   MemMap Desc. Version: 0x00000001
> [    0.000000] efi: EFI v2.70 by Das U-Boot
> [    0.000000] efi:  SMBIOS=0x3cb62000  MEMRESERVE=0x3cb3d040
> [    0.000000] memblock_reserve: [0x000000003cb3d040-0x000000003cb3d04f] efi_config_parse_tables+0x25c/0x2d8
> [    0.000000] efi: Processing EFI memory map:
> [    0.000000] MEMBLOCK configuration:
> [    0.000000]  memory size = 0x000000003e000000 reserved size = 0x0000000000000010
> [    0.000000]  memory.cnt  = 0x1
> [    0.000000]  memory[0x0]     [0x0000000000000000-0x000000003dffffff], 0x000000003e000000 bytes flags: 0x0
> [    0.000000]  reserved.cnt  = 0x1
> [    0.000000]  reserved[0x0]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
> [    0.000000] memblock_remove: [0x0000000000000000-0xfffffffffffffffe] reserve_regions+0x68/0x23c
> [    0.000000] efi:   0x000000000000-0x000000000fff [Reserved           |   |  |  |  |  |  |  |   |WB|  |  |  ]
> [    0.000000] memblock_add: [0x0000000000000000-0x0000000000000fff] early_init_dt_add_memory_arch+0x164/0x178
> [    0.000000] efi:   0x000000001000-0x000000307fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
> [    0.000000] memblock_add: [0x0000000000001000-0x0000000000307fff] early_init_dt_add_memory_arch+0x164/0x178
> [    0.000000] efi:   0x000000308000-0x000002307fff [Boot Data          |   |  |  |  |  |  |  |   |WB|  |  |  ]
> [    0.000000] memblock_add: [0x0000000000308000-0x0000000002307fff] early_init_dt_add_memory_arch+0x164/0x178
> [    0.000000] efi:   0x000002308000-0x000002a93fff [Loader Data        |   |  |  |  |  |  |  |   |WB|  |  |  ]
> [    0.000000] memblock_add: [0x0000000002308000-0x0000000002a93fff] early_init_dt_add_memory_arch+0x164/0x178
> [    0.000000] efi:   0x000002a94000-0x000007cf5fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
> [    0.000000] memblock_add: [0x0000000002a94000-0x0000000007cf5fff] early_init_dt_add_memory_arch+0x164/0x178
> ......
> ......
> [    0.000000] memblock_add: [0x000000003df76000-0x000000003dffffff] early_init_dt_add_memory_arch+0x164/0x178
> [    0.000000] efi:   0x00003f100000-0x00003f100fff [Memory Mapped I/O  |RUN|  |  |  |  |  |  |   |  |  |  |  ]
> [    0.000000] memblock_reserve: [0x000000002c1c5000-0x000000002c1c5fff] efi_init+0xd8/0x1c8
> [    0.000000] memblock_reserve: [0x0000000000400000-0x0000000001df2cef] arm_memblock_init+0x44/0x19c
> [    0.000000] memblock_reserve: [0x0000000000303000-0x0000000000307fff] arm_mm_memblock_reserve+0x30/0x38
> [    0.000000] memblock_reserve: [0x0000000007cf6000-0x0000000007cfc5c4] early_init_dt_reserve_memory_arch+0x2c/0x30
> [    0.000000] cma: Failed to reserve 64 MiB
> [    0.000000] MEMBLOCK configuration:
> [    0.000000]  memory size = 0x000000003e000000 reserved size = 0x00000000019ff2c5
> [    0.000000]  memory.cnt  = 0xa
> [    0.000000]  memory[0x0]     [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> [    0.000000]  memory[0x1]     [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> [    0.000000]  memory[0x2]     [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> [    0.000000]  memory[0x3]     [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> [    0.000000]  memory[0x4]     [0x000000003cb3f000-0x000000003cb3ffff], 0x0000000000001000 bytes flags: 0x4
> [    0.000000]  memory[0x5]     [0x000000003cb40000-0x000000003cb5ffff], 0x0000000000020000 bytes flags: 0x0
> [    0.000000]  memory[0x6]     [0x000000003cb60000-0x000000003cb68fff], 0x0000000000009000 bytes flags: 0x4
> [    0.000000]  memory[0x7]     [0x000000003cb69000-0x000000003df74fff], 0x000000000140c000 bytes flags: 0x0
> [    0.000000]  memory[0x8]     [0x000000003df75000-0x000000003df75fff], 0x0000000000001000 bytes flags: 0x4
> [    0.000000]  memory[0x9]     [0x000000003df76000-0x000000003dffffff], 0x000000000008a000 bytes flags: 0x0
> [    0.000000]  reserved.cnt  = 0x5
> [    0.000000]  reserved[0x0]   [0x0000000000303000-0x0000000000307fff], 0x0000000000005000 bytes flags: 0x0
> [    0.000000]  reserved[0x1]   [0x0000000000400000-0x0000000001df2cef], 0x00000000019f2cf0 bytes flags: 0x0
> [    0.000000]  reserved[0x2]   [0x0000000007cf6000-0x0000000007cfc5c4], 0x00000000000065c5 bytes flags: 0x0
> [    0.000000]  reserved[0x3]   [0x000000002c1c5000-0x000000002c1c5fff], 0x0000000000001000 bytes flags: 0x0
> [    0.000000]  reserved[0x4]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
> [    0.000000] memblock_alloc_try_nid: 4096 bytes align=0x1000 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_alloc+0x44/0x70
> [    0.000000] Kernel panic - not syncing: early_alloc: Failed to allocate 4096 bytes align=0x1000
> [    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.1-lpae #1 openSUSE Tumbleweed (unreleased)
> [    0.000000] Hardware name: BCM2835
> [    0.000000] Backtrace:
> [    0.000000] [<c043fafc>] (dump_backtrace) from [<c043fd84>] (show_stack+0x20/0x24)
> [    0.000000]  r7:c1800000 r6:00000000 r5:600001d3 r4:c1901ba0
> [    0.000000] [<c043fd64>] (show_stack) from [<c0df9400>] (dump_stack+0xd0/0x104)
> [    0.000000] [<c0df9330>] (dump_stack) from [<c048061c>] (panic+0xf8/0x32c)
> [    0.000000]  r10:c0307000 r9:c0001000 r8:00000003 r7:00000000 r6:00000000 r5:c181df04
> [    0.000000]  r4:c192b8d8 r3:00000001
> [    0.000000] [<c0480528>] (panic) from [<c1609728>] (early_alloc+0x60/0x70)
> [    0.000000]  r3:00001000 r2:00001000 r1:c10037e8 r0:c12fe64c
> [    0.000000]  r7:00000000
> [    0.000000] [<c16096c8>] (early_alloc) from [<c1609114>] (arm_pte_alloc+0x34/0x94)
> [    0.000000]  r7:00000000 r6:00000000 r4:c0307000
> [    0.000000] [<c16090e0>] (arm_pte_alloc) from [<c1609384>] (__create_mapping+0x210/0x2c0)
> [    0.000000]  r9:c0001000 r8:c0001000 r7:00000001 r6:c13f22e0 r5:c0200000 r4:c0400000
> [    0.000000] [<c1609174>] (__create_mapping) from [<c160951c>] (create_mapping+0xe8/0x108)
> [    0.000000]  r10:c0400000 r9:c16a2110 r8:c19c7a80 r7:00000000 r6:00400000 r5:c13f2000
> [    0.000000]  r4:c1801ef0
> [    0.000000] [<c1609434>] (create_mapping) from [<c1609f50>] (paging_init+0x350/0x75c)
> [    0.000000]  r4:c1842d40
>
>
> > >
> > > >   grub> lsefimmap
> > > >   Type      Physical start  - end             #Pages        Size Attributes
> > > >   reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
> > > >   conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
> > > >   RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
> > > >   conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
> > > >   .....
> > > >
> > > > To avoid a reserved address, we have to ignore the memory regions which are
> > > > marked as EFI_RESERVED_TYPE, and only conventional memory regions can be
> > > > chosen. If the region before the kernel base is unaligned, it will be
> > > > marked as EFI_RESERVED_TYPE and let kernel ignore it so that memblock_limit
> > > > will not be sticked with a very low address such as 0x1000.
> > > >
> >
> > This is a separate issue, so it should be handled in a separate patch.
> >
> > > > Signed-off-by: Chester Lin <clin@suse.com>
> > > > ---
> > > >  arch/arm/mm/mmu.c                         |  3 ++
> > > >  drivers/firmware/efi/libstub/arm32-stub.c | 43 ++++++++++++++++++-----
> > > >  2 files changed, 37 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > index f3ce34113f89..909b11ba48d8 100644
> > > > --- a/arch/arm/mm/mmu.c
> > > > +++ b/arch/arm/mm/mmu.c
> > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > >                 phys_addr_t block_start = reg->base;
> > > >                 phys_addr_t block_end = reg->base + reg->size;
> > > >
> > > > +               if (memblock_is_nomap(reg))
> > > > +                       continue;
> > > > +
> > > >                 if (reg->base < vmalloc_limit) {
> > > >                         if (block_end > lowmem_limit)
> > > >                                 /*
> > > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > index e8f7aefb6813..10d33d36df00 100644
> > > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > @@ -128,7 +128,7 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > > >
> > > >         for (l = 0; l < map_size; l += desc_size) {
> > > >                 efi_memory_desc_t *desc;
> > > > -               u64 start, end;
> > > > +               u64 start, end, spare, kernel_base;
> > > >
> > > >                 desc = (void *)memory_map + l;
> > > >                 start = desc->phys_addr;
> > > > @@ -144,27 +144,52 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > > >                 case EFI_BOOT_SERVICES_DATA:
> > > >                         /* Ignore types that are released to the OS anyway */
> > > >                         continue;
> > > > -
> > > > +               case EFI_RESERVED_TYPE:
> > > > +                       /* Ignore reserved regions */
> > > > +                       continue;
> > > >                 case EFI_CONVENTIONAL_MEMORY:
> > > >                         /*
> > > >                          * Reserve the intersection between this entry and the
> > > >                          * region.
> > > >                          */
> > > >                         start = max(start, (u64)dram_base);
> > > > -                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
> > > > +                       kernel_base = round_up(start, PMD_SIZE);
> > > > +                       spare = kernel_base - start;
> > > > +                       end = min(end, kernel_base + MAX_UNCOMP_KERNEL_SIZE);
> > > > +
> > > > +                       status = efi_call_early(allocate_pages,
> > > > +                                       EFI_ALLOCATE_ADDRESS,
> > > > +                                       EFI_LOADER_DATA,
> > > > +                                       MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
> > > > +                                       &kernel_base);
> > > > +                       if (status != EFI_SUCCESS) {
> > > > +                               pr_efi_err(sys_table_arg,
> > > > +                                       "reserve_kernel_base: alloc failed.\n");
> > > > +                               goto out;
> > > > +                       }
> > > > +                       *reserve_addr = kernel_base;
> > > >
> > > > +                       if (!spare)
> > > > +                               break;
> > > > +                       /*
> > > > +                        * If there's a gap between start and kernel_base,
> > > > +                        * it needs be reserved so that the memblock_limit
> > > > +                        * will not fall on a very low address when running
> > > > +                        * adjust_lowmem_bounds(), wchich could eventually
> > > > +                        * cause CMA reservation issue.
> > > > +                        */
> > > >                         status = efi_call_early(allocate_pages,
> > > >                                                 EFI_ALLOCATE_ADDRESS,
> > > > -                                               EFI_LOADER_DATA,
> > > > -                                               (end - start) / EFI_PAGE_SIZE,
> > > > +                                               EFI_RESERVED_TYPE,
> > > > +                                               spare / EFI_PAGE_SIZE,
> > > >                                                 &start);
> > > >                         if (status != EFI_SUCCESS) {
> > > >                                 pr_efi_err(sys_table_arg,
> > > > -                                       "reserve_kernel_base(): alloc failed.\n");
> > > > +                                       "reserve spare-region failed\n");
> > > >                                 goto out;
> > > >                         }
> > > > -                       break;
> > > >
> > > > +                       break;
> > > >                 case EFI_LOADER_CODE:
> > > >                 case EFI_LOADER_DATA:
> > > >                         /*
> > > > @@ -220,7 +245,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > > >         *image_size = image->image_size;
> > > >         status = efi_relocate_kernel(sys_table, image_addr, *image_size,
> > > >                                      *image_size,
> > > > -                                    dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
> > > > +                                    *reserve_addr + MAX_UNCOMP_KERNEL_SIZE, 0);
> > > >         if (status != EFI_SUCCESS) {
> > > >                 pr_efi_err(sys_table, "Failed to relocate kernel.\n");
> > > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > > > @@ -233,7 +258,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > > >          * in memory. The kernel determines the base of DRAM from the
> > > >          * address at which the zImage is loaded.
> > > >          */
> > > > -       if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
> > > > +       if (*image_addr + *image_size > *reserve_addr + ZIMAGE_OFFSET_LIMIT) {
> > > >                 pr_efi_err(sys_table, "Failed to relocate kernel, no low memory available.\n");
> > > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > > >                 *reserve_size = 0;
> > > > --
> > > > 2.22.0
> > > >
> >
Mike Rapoport Aug. 15, 2019, 1:37 p.m. UTC | #6
On Thu, Aug 15, 2019 at 02:32:50PM +0300, Ard Biesheuvel wrote:
> (adding Mike)
> 
> On Thu, 15 Aug 2019 at 14:28, Chester Lin <clin@suse.com> wrote:
> >
> > Hi Ard,
> >
> > On Thu, Aug 15, 2019 at 10:59:43AM +0300, Ard Biesheuvel wrote:
> > > On Sun, 4 Aug 2019 at 10:57, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > > >
> > > > Hello Chester,
> > > >
> > > > On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> > > > >
> > > > > In some cases the arm32 efistub could fail to allocate memory for
> > > > > uncompressed kernel. For example, we got the following error message when
> > > > > verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> > > > >
> > > > >   EFI stub: Booting Linux Kernel...
> > > > >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> > > > >   EFI stub: ERROR: Failed to relocate kernel
> > > > >
> > > > > After checking the EFI memory map we found that the first page [0 - 0xfff]
> > > > > had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> > > > > set the dram base at 0, which was actually in a reserved region.
> > > > >
> > > >
> > > > This by itself is a violation of the Linux boot protocol for 32-bit
> > > > ARM when using the decompressor. The decompressor rounds down its own
> > > > base address to a multiple of 128 MB, and assumes the whole area is
> > > > available for the decompressed kernel and related data structures.
> > > > (The first TEXT_OFFSET bytes are no longer used in practice, which is
> > > > why putting a reserved region of 4 KB bytes works at the moment, but
> > > > this is fragile). Note that the decompressor does not look at any DT
> > > > or EFI provided memory maps *at all*.
> > > >
> > > > So unfortunately, this is not something we can fix in the kernel, but
> > > > we should fix it in the bootloader or in GRUB, so it does not put any
> > > > reserved regions in the first 128 MB of memory,
> > > >
> > >
> > > OK, perhaps we can fix this by taking TEXT_OFFSET into account. The
> > > ARM boot protocol docs are unclear about whether this memory should be
> > > used or not, but it is no longer used for its original purpose (page
> > > tables), and the RPi loader already keeps data there.
> > >
> > > Can you check whether the following patch works for you?
> > >
> > > diff --git a/drivers/firmware/efi/libstub/Makefile
> > > b/drivers/firmware/efi/libstub/Makefile
> > > index 0460c7581220..ee0661ddb25b 100644
> > > --- a/drivers/firmware/efi/libstub/Makefile
> > > +++ b/drivers/firmware/efi/libstub/Makefile
> > > @@ -52,6 +52,7 @@ lib-$(CONFIG_EFI_ARMSTUB)     += arm-stub.o fdt.o
> > > string.o random.o \
> > >
> > >  lib-$(CONFIG_ARM)              += arm32-stub.o
> > >  lib-$(CONFIG_ARM64)            += arm64-stub.o
> > > +CFLAGS_arm32-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > >  CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > >
> > >  #
> > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
> > > b/drivers/firmware/efi/libstub/arm32-stub.c
> > > index e8f7aefb6813..66ff0c8ec269 100644
> > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > @@ -204,7 +204,7 @@ efi_status_t
> > > handle_kernel_image(efi_system_table_t *sys_table,
> > >          * loaded. These assumptions are made by the decompressor,
> > >          * before any memory map is available.
> > >          */
> > > -       dram_base = round_up(dram_base, SZ_128M);
> > > +       dram_base = round_up(dram_base, SZ_128M) + TEXT_OFFSET;
> > >
> > >         status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
> > >                                      reserve_size);
> > >
> >
> > I tried your patch on rpi2 and got the following panic. Just a reminder that I
> > have replaced some log messages with "......" since it might be too long to
> > post all.
> >
> 
> OK. Good to know that this change helps you to get past the EFI stub boot issue.
> 
> > In this case the kernel failed to reserve cma, which should hit the issue of
> > memblock_limit=0x1000 as I had mentioned in my patch description. The first
> > block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> > with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> > was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> > 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> > if any suggestion, thank you.

 
> This looks like it is a separate issue. The memblock/cma code should
> not choke on a reserved page of memory at 0x0.
> 
> Perhaps Russell or Mike (cc'ed) have an idea how to address this?

Presuming that the last memblock dump comes from the end of
arm_memblock_init() with the this memory map 
 
memory[0x0] [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
memory[0x1] [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
memory[0x2] [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
memory[0x3] [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0

adjust_lowmem_bounds() will set the memblock_limit (and respectively global
memblock.current_limit) to 0x1000 and any further memblock_alloc*() will
happily fail.

I believe that the assumption for memblock_limit calculations was that the
first bank has several megs at least.

I wonder if this hack would help:

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index d9a0038..948e5b9 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1206,7 +1206,7 @@ void __init adjust_lowmem_bounds(void)
 			 * allocated when mapping the start of bank 0, which
 			 * occurs before any free memory is mapped.
 			 */
-			if (!memblock_limit) {
+			if (memblock_limit < PMD_SIZE) {
 				if (!IS_ALIGNED(block_start, PMD_SIZE))
 					memblock_limit = block_start;
 				else if (!IS_ALIGNED(block_end, PMD_SIZE))
 
> > boot-log:
> > --------
> >
> > Loading Linux test ...
> > EFI stub: Booting Linux Kernel...
> > EFI stub: Using DTB from configuration table
> > EFI stub: Exiting boot services and installing virtual address map...
> > Uncompressing Linux... done, booting the kernel.
> > [    0.000000] Booting Linux on physical CPU 0xf00
> > [    0.000000] Linux version 5.2.1-lpae (chester@linux-8mug) (......)
> > [    0.000000] CPU: ARMv7 Processor [410fc075] revision 5 (ARMv7), cr=30c5387d
> > [    0.000000] CPU: div instructions available: patching division code
> > [    0.000000] CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing instruction cache
> > [    0.000000] OF: fdt: Machine model: Raspberry Pi 2 Model B Rev 1.1
> > [    0.000000] printk: bootconsole [earlycon0] enabled
> > [    0.000000] Memory policy: Data cache writealloc
> > [    0.000000] efi: Getting EFI parameters from FDT:
> > [    0.000000] efi:   System Table: 0x000000003df757c0
> > [    0.000000] efi:   MemMap Address: 0x000000002c1c5040
> > [    0.000000] efi:   MemMap Size: 0x000003c0
> > [    0.000000] efi:   MemMap Desc. Size: 0x00000028
> > [    0.000000] efi:   MemMap Desc. Version: 0x00000001
> > [    0.000000] efi: EFI v2.70 by Das U-Boot
> > [    0.000000] efi:  SMBIOS=0x3cb62000  MEMRESERVE=0x3cb3d040
> > [    0.000000] memblock_reserve: [0x000000003cb3d040-0x000000003cb3d04f] efi_config_parse_tables+0x25c/0x2d8
> > [    0.000000] efi: Processing EFI memory map:
> > [    0.000000] MEMBLOCK configuration:
> > [    0.000000]  memory size = 0x000000003e000000 reserved size = 0x0000000000000010
> > [    0.000000]  memory.cnt  = 0x1
> > [    0.000000]  memory[0x0]     [0x0000000000000000-0x000000003dffffff], 0x000000003e000000 bytes flags: 0x0
> > [    0.000000]  reserved.cnt  = 0x1
> > [    0.000000]  reserved[0x0]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
> > [    0.000000] memblock_remove: [0x0000000000000000-0xfffffffffffffffe] reserve_regions+0x68/0x23c
> > [    0.000000] efi:   0x000000000000-0x000000000fff [Reserved           |   |  |  |  |  |  |  |   |WB|  |  |  ]
> > [    0.000000] memblock_add: [0x0000000000000000-0x0000000000000fff] early_init_dt_add_memory_arch+0x164/0x178
> > [    0.000000] efi:   0x000000001000-0x000000307fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
> > [    0.000000] memblock_add: [0x0000000000001000-0x0000000000307fff] early_init_dt_add_memory_arch+0x164/0x178
> > [    0.000000] efi:   0x000000308000-0x000002307fff [Boot Data          |   |  |  |  |  |  |  |   |WB|  |  |  ]
> > [    0.000000] memblock_add: [0x0000000000308000-0x0000000002307fff] early_init_dt_add_memory_arch+0x164/0x178
> > [    0.000000] efi:   0x000002308000-0x000002a93fff [Loader Data        |   |  |  |  |  |  |  |   |WB|  |  |  ]
> > [    0.000000] memblock_add: [0x0000000002308000-0x0000000002a93fff] early_init_dt_add_memory_arch+0x164/0x178
> > [    0.000000] efi:   0x000002a94000-0x000007cf5fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
> > [    0.000000] memblock_add: [0x0000000002a94000-0x0000000007cf5fff] early_init_dt_add_memory_arch+0x164/0x178
> > ......
> > ......
> > [    0.000000] memblock_add: [0x000000003df76000-0x000000003dffffff] early_init_dt_add_memory_arch+0x164/0x178
> > [    0.000000] efi:   0x00003f100000-0x00003f100fff [Memory Mapped I/O  |RUN|  |  |  |  |  |  |   |  |  |  |  ]
> > [    0.000000] memblock_reserve: [0x000000002c1c5000-0x000000002c1c5fff] efi_init+0xd8/0x1c8
> > [    0.000000] memblock_reserve: [0x0000000000400000-0x0000000001df2cef] arm_memblock_init+0x44/0x19c
> > [    0.000000] memblock_reserve: [0x0000000000303000-0x0000000000307fff] arm_mm_memblock_reserve+0x30/0x38
> > [    0.000000] memblock_reserve: [0x0000000007cf6000-0x0000000007cfc5c4] early_init_dt_reserve_memory_arch+0x2c/0x30
> > [    0.000000] cma: Failed to reserve 64 MiB
> > [    0.000000] MEMBLOCK configuration:
> > [    0.000000]  memory size = 0x000000003e000000 reserved size = 0x00000000019ff2c5
> > [    0.000000]  memory.cnt  = 0xa
> > [    0.000000]  memory[0x0]     [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> > [    0.000000]  memory[0x1]     [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> > [    0.000000]  memory[0x2]     [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> > [    0.000000]  memory[0x3]     [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> > [    0.000000]  memory[0x4]     [0x000000003cb3f000-0x000000003cb3ffff], 0x0000000000001000 bytes flags: 0x4
> > [    0.000000]  memory[0x5]     [0x000000003cb40000-0x000000003cb5ffff], 0x0000000000020000 bytes flags: 0x0
> > [    0.000000]  memory[0x6]     [0x000000003cb60000-0x000000003cb68fff], 0x0000000000009000 bytes flags: 0x4
> > [    0.000000]  memory[0x7]     [0x000000003cb69000-0x000000003df74fff], 0x000000000140c000 bytes flags: 0x0
> > [    0.000000]  memory[0x8]     [0x000000003df75000-0x000000003df75fff], 0x0000000000001000 bytes flags: 0x4
> > [    0.000000]  memory[0x9]     [0x000000003df76000-0x000000003dffffff], 0x000000000008a000 bytes flags: 0x0
> > [    0.000000]  reserved.cnt  = 0x5
> > [    0.000000]  reserved[0x0]   [0x0000000000303000-0x0000000000307fff], 0x0000000000005000 bytes flags: 0x0
> > [    0.000000]  reserved[0x1]   [0x0000000000400000-0x0000000001df2cef], 0x00000000019f2cf0 bytes flags: 0x0
> > [    0.000000]  reserved[0x2]   [0x0000000007cf6000-0x0000000007cfc5c4], 0x00000000000065c5 bytes flags: 0x0
> > [    0.000000]  reserved[0x3]   [0x000000002c1c5000-0x000000002c1c5fff], 0x0000000000001000 bytes flags: 0x0
> > [    0.000000]  reserved[0x4]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
> > [    0.000000] memblock_alloc_try_nid: 4096 bytes align=0x1000 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_alloc+0x44/0x70
> > [    0.000000] Kernel panic - not syncing: early_alloc: Failed to allocate 4096 bytes align=0x1000
> > [    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.1-lpae #1 openSUSE Tumbleweed (unreleased)
> > [    0.000000] Hardware name: BCM2835
> > [    0.000000] Backtrace:
> > [    0.000000] [<c043fafc>] (dump_backtrace) from [<c043fd84>] (show_stack+0x20/0x24)
> > [    0.000000]  r7:c1800000 r6:00000000 r5:600001d3 r4:c1901ba0
> > [    0.000000] [<c043fd64>] (show_stack) from [<c0df9400>] (dump_stack+0xd0/0x104)
> > [    0.000000] [<c0df9330>] (dump_stack) from [<c048061c>] (panic+0xf8/0x32c)
> > [    0.000000]  r10:c0307000 r9:c0001000 r8:00000003 r7:00000000 r6:00000000 r5:c181df04
> > [    0.000000]  r4:c192b8d8 r3:00000001
> > [    0.000000] [<c0480528>] (panic) from [<c1609728>] (early_alloc+0x60/0x70)
> > [    0.000000]  r3:00001000 r2:00001000 r1:c10037e8 r0:c12fe64c
> > [    0.000000]  r7:00000000
> > [    0.000000] [<c16096c8>] (early_alloc) from [<c1609114>] (arm_pte_alloc+0x34/0x94)
> > [    0.000000]  r7:00000000 r6:00000000 r4:c0307000
> > [    0.000000] [<c16090e0>] (arm_pte_alloc) from [<c1609384>] (__create_mapping+0x210/0x2c0)
> > [    0.000000]  r9:c0001000 r8:c0001000 r7:00000001 r6:c13f22e0 r5:c0200000 r4:c0400000
> > [    0.000000] [<c1609174>] (__create_mapping) from [<c160951c>] (create_mapping+0xe8/0x108)
> > [    0.000000]  r10:c0400000 r9:c16a2110 r8:c19c7a80 r7:00000000 r6:00400000 r5:c13f2000
> > [    0.000000]  r4:c1801ef0
> > [    0.000000] [<c1609434>] (create_mapping) from [<c1609f50>] (paging_init+0x350/0x75c)
> > [    0.000000]  r4:c1842d40
> >
> >
> > > >
> > > > >   grub> lsefimmap
> > > > >   Type      Physical start  - end             #Pages        Size Attributes
> > > > >   reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
> > > > >   conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
> > > > >   RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
> > > > >   conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
> > > > >   .....
> > > > >
> > > > > To avoid a reserved address, we have to ignore the memory regions which are
> > > > > marked as EFI_RESERVED_TYPE, and only conventional memory regions can be
> > > > > chosen. If the region before the kernel base is unaligned, it will be
> > > > > marked as EFI_RESERVED_TYPE and let kernel ignore it so that memblock_limit
> > > > > will not be sticked with a very low address such as 0x1000.
> > > > >
> > >
> > > This is a separate issue, so it should be handled in a separate patch.
> > >
> > > > > Signed-off-by: Chester Lin <clin@suse.com>
> > > > > ---
> > > > >  arch/arm/mm/mmu.c                         |  3 ++
> > > > >  drivers/firmware/efi/libstub/arm32-stub.c | 43 ++++++++++++++++++-----
> > > > >  2 files changed, 37 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > > index f3ce34113f89..909b11ba48d8 100644
> > > > > --- a/arch/arm/mm/mmu.c
> > > > > +++ b/arch/arm/mm/mmu.c
> > > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > > >                 phys_addr_t block_start = reg->base;
> > > > >                 phys_addr_t block_end = reg->base + reg->size;
> > > > >
> > > > > +               if (memblock_is_nomap(reg))
> > > > > +                       continue;
> > > > > +
> > > > >                 if (reg->base < vmalloc_limit) {
> > > > >                         if (block_end > lowmem_limit)
> > > > >                                 /*
> > > > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > index e8f7aefb6813..10d33d36df00 100644
> > > > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > @@ -128,7 +128,7 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > > > >
> > > > >         for (l = 0; l < map_size; l += desc_size) {
> > > > >                 efi_memory_desc_t *desc;
> > > > > -               u64 start, end;
> > > > > +               u64 start, end, spare, kernel_base;
> > > > >
> > > > >                 desc = (void *)memory_map + l;
> > > > >                 start = desc->phys_addr;
> > > > > @@ -144,27 +144,52 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > > > >                 case EFI_BOOT_SERVICES_DATA:
> > > > >                         /* Ignore types that are released to the OS anyway */
> > > > >                         continue;
> > > > > -
> > > > > +               case EFI_RESERVED_TYPE:
> > > > > +                       /* Ignore reserved regions */
> > > > > +                       continue;
> > > > >                 case EFI_CONVENTIONAL_MEMORY:
> > > > >                         /*
> > > > >                          * Reserve the intersection between this entry and the
> > > > >                          * region.
> > > > >                          */
> > > > >                         start = max(start, (u64)dram_base);
> > > > > -                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
> > > > > +                       kernel_base = round_up(start, PMD_SIZE);
> > > > > +                       spare = kernel_base - start;
> > > > > +                       end = min(end, kernel_base + MAX_UNCOMP_KERNEL_SIZE);
> > > > > +
> > > > > +                       status = efi_call_early(allocate_pages,
> > > > > +                                       EFI_ALLOCATE_ADDRESS,
> > > > > +                                       EFI_LOADER_DATA,
> > > > > +                                       MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
> > > > > +                                       &kernel_base);
> > > > > +                       if (status != EFI_SUCCESS) {
> > > > > +                               pr_efi_err(sys_table_arg,
> > > > > +                                       "reserve_kernel_base: alloc failed.\n");
> > > > > +                               goto out;
> > > > > +                       }
> > > > > +                       *reserve_addr = kernel_base;
> > > > >
> > > > > +                       if (!spare)
> > > > > +                               break;
> > > > > +                       /*
> > > > > +                        * If there's a gap between start and kernel_base,
> > > > > +                        * it needs be reserved so that the memblock_limit
> > > > > +                        * will not fall on a very low address when running
> > > > > +                        * adjust_lowmem_bounds(), wchich could eventually
> > > > > +                        * cause CMA reservation issue.
> > > > > +                        */
> > > > >                         status = efi_call_early(allocate_pages,
> > > > >                                                 EFI_ALLOCATE_ADDRESS,
> > > > > -                                               EFI_LOADER_DATA,
> > > > > -                                               (end - start) / EFI_PAGE_SIZE,
> > > > > +                                               EFI_RESERVED_TYPE,
> > > > > +                                               spare / EFI_PAGE_SIZE,
> > > > >                                                 &start);
> > > > >                         if (status != EFI_SUCCESS) {
> > > > >                                 pr_efi_err(sys_table_arg,
> > > > > -                                       "reserve_kernel_base(): alloc failed.\n");
> > > > > +                                       "reserve spare-region failed\n");
> > > > >                                 goto out;
> > > > >                         }
> > > > > -                       break;
> > > > >
> > > > > +                       break;
> > > > >                 case EFI_LOADER_CODE:
> > > > >                 case EFI_LOADER_DATA:
> > > > >                         /*
> > > > > @@ -220,7 +245,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > > > >         *image_size = image->image_size;
> > > > >         status = efi_relocate_kernel(sys_table, image_addr, *image_size,
> > > > >                                      *image_size,
> > > > > -                                    dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
> > > > > +                                    *reserve_addr + MAX_UNCOMP_KERNEL_SIZE, 0);
> > > > >         if (status != EFI_SUCCESS) {
> > > > >                 pr_efi_err(sys_table, "Failed to relocate kernel.\n");
> > > > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > > > > @@ -233,7 +258,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > > > >          * in memory. The kernel determines the base of DRAM from the
> > > > >          * address at which the zImage is loaded.
> > > > >          */
> > > > > -       if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
> > > > > +       if (*image_addr + *image_size > *reserve_addr + ZIMAGE_OFFSET_LIMIT) {
> > > > >                 pr_efi_err(sys_table, "Failed to relocate kernel, no low memory available.\n");
> > > > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > > > >                 *reserve_size = 0;
> > > > > --
> > > > > 2.22.0
> > > > >
> > >
>
Chester Lin Aug. 19, 2019, 7:56 a.m. UTC | #7
Hi Mike and Ard,

On Thu, Aug 15, 2019 at 04:37:39PM +0300, Mike Rapoport wrote:
> On Thu, Aug 15, 2019 at 02:32:50PM +0300, Ard Biesheuvel wrote:
> > (adding Mike)
> > 
> > On Thu, 15 Aug 2019 at 14:28, Chester Lin <clin@suse.com> wrote:
> > >
> > > Hi Ard,
> > >
> > > On Thu, Aug 15, 2019 at 10:59:43AM +0300, Ard Biesheuvel wrote:
> > > > On Sun, 4 Aug 2019 at 10:57, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > > > >
> > > > > Hello Chester,
> > > > >
> > > > > On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> > > > > >
> > > > > > In some cases the arm32 efistub could fail to allocate memory for
> > > > > > uncompressed kernel. For example, we got the following error message when
> > > > > > verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> > > > > >
> > > > > >   EFI stub: Booting Linux Kernel...
> > > > > >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> > > > > >   EFI stub: ERROR: Failed to relocate kernel
> > > > > >
> > > > > > After checking the EFI memory map we found that the first page [0 - 0xfff]
> > > > > > had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> > > > > > set the dram base at 0, which was actually in a reserved region.
> > > > > >
> > > > >
> > > > > This by itself is a violation of the Linux boot protocol for 32-bit
> > > > > ARM when using the decompressor. The decompressor rounds down its own
> > > > > base address to a multiple of 128 MB, and assumes the whole area is
> > > > > available for the decompressed kernel and related data structures.
> > > > > (The first TEXT_OFFSET bytes are no longer used in practice, which is
> > > > > why putting a reserved region of 4 KB bytes works at the moment, but
> > > > > this is fragile). Note that the decompressor does not look at any DT
> > > > > or EFI provided memory maps *at all*.
> > > > >
> > > > > So unfortunately, this is not something we can fix in the kernel, but
> > > > > we should fix it in the bootloader or in GRUB, so it does not put any
> > > > > reserved regions in the first 128 MB of memory,
> > > > >
> > > >
> > > > OK, perhaps we can fix this by taking TEXT_OFFSET into account. The
> > > > ARM boot protocol docs are unclear about whether this memory should be
> > > > used or not, but it is no longer used for its original purpose (page
> > > > tables), and the RPi loader already keeps data there.
> > > >
> > > > Can you check whether the following patch works for you?
> > > >
> > > > diff --git a/drivers/firmware/efi/libstub/Makefile
> > > > b/drivers/firmware/efi/libstub/Makefile
> > > > index 0460c7581220..ee0661ddb25b 100644
> > > > --- a/drivers/firmware/efi/libstub/Makefile
> > > > +++ b/drivers/firmware/efi/libstub/Makefile
> > > > @@ -52,6 +52,7 @@ lib-$(CONFIG_EFI_ARMSTUB)     += arm-stub.o fdt.o
> > > > string.o random.o \
> > > >
> > > >  lib-$(CONFIG_ARM)              += arm32-stub.o
> > > >  lib-$(CONFIG_ARM64)            += arm64-stub.o
> > > > +CFLAGS_arm32-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > > >  CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > > >
> > > >  #
> > > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > index e8f7aefb6813..66ff0c8ec269 100644
> > > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > @@ -204,7 +204,7 @@ efi_status_t
> > > > handle_kernel_image(efi_system_table_t *sys_table,
> > > >          * loaded. These assumptions are made by the decompressor,
> > > >          * before any memory map is available.
> > > >          */
> > > > -       dram_base = round_up(dram_base, SZ_128M);
> > > > +       dram_base = round_up(dram_base, SZ_128M) + TEXT_OFFSET;
> > > >
> > > >         status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
> > > >                                      reserve_size);
> > > >
> > >
> > > I tried your patch on rpi2 and got the following panic. Just a reminder that I
> > > have replaced some log messages with "......" since it might be too long to
> > > post all.
> > >
> > 
> > OK. Good to know that this change helps you to get past the EFI stub boot issue.
> > 
> > > In this case the kernel failed to reserve cma, which should hit the issue of
> > > memblock_limit=0x1000 as I had mentioned in my patch description. The first
> > > block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> > > with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> > > was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> > > 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> > > if any suggestion, thank you.
> 
>  
> > This looks like it is a separate issue. The memblock/cma code should
> > not choke on a reserved page of memory at 0x0.
> > 
> > Perhaps Russell or Mike (cc'ed) have an idea how to address this?
> 
> Presuming that the last memblock dump comes from the end of
> arm_memblock_init() with the this memory map 
>  
> memory[0x0] [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> memory[0x1] [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> memory[0x2] [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> memory[0x3] [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> 
> adjust_lowmem_bounds() will set the memblock_limit (and respectively global
> memblock.current_limit) to 0x1000 and any further memblock_alloc*() will
> happily fail.
> 
> I believe that the assumption for memblock_limit calculations was that the
> first bank has several megs at least.
> 
> I wonder if this hack would help:
> 
> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> index d9a0038..948e5b9 100644
> --- a/arch/arm/mm/mmu.c
> +++ b/arch/arm/mm/mmu.c
> @@ -1206,7 +1206,7 @@ void __init adjust_lowmem_bounds(void)
>  			 * allocated when mapping the start of bank 0, which
>  			 * occurs before any free memory is mapped.
>  			 */
> -			if (!memblock_limit) {
> +			if (memblock_limit < PMD_SIZE) {
>  				if (!IS_ALIGNED(block_start, PMD_SIZE))
>  					memblock_limit = block_start;
>  				else if (!IS_ALIGNED(block_end, PMD_SIZE))
>

I applied this patch as well and it works well on rpi-2 model B.

> > > boot-log:
> > > --------
> > >
> > > Loading Linux test ...
> > > EFI stub: Booting Linux Kernel...
> > > EFI stub: Using DTB from configuration table
> > > EFI stub: Exiting boot services and installing virtual address map...
> > > Uncompressing Linux... done, booting the kernel.
> > > [    0.000000] Booting Linux on physical CPU 0xf00
> > > [    0.000000] Linux version 5.2.1-lpae (chester@linux-8mug) (......)
> > > [    0.000000] CPU: ARMv7 Processor [410fc075] revision 5 (ARMv7), cr=30c5387d
> > > [    0.000000] CPU: div instructions available: patching division code
> > > [    0.000000] CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing instruction cache
> > > [    0.000000] OF: fdt: Machine model: Raspberry Pi 2 Model B Rev 1.1
> > > [    0.000000] printk: bootconsole [earlycon0] enabled
> > > [    0.000000] Memory policy: Data cache writealloc
> > > [    0.000000] efi: Getting EFI parameters from FDT:
> > > [    0.000000] efi:   System Table: 0x000000003df757c0
> > > [    0.000000] efi:   MemMap Address: 0x000000002c1c5040
> > > [    0.000000] efi:   MemMap Size: 0x000003c0
> > > [    0.000000] efi:   MemMap Desc. Size: 0x00000028
> > > [    0.000000] efi:   MemMap Desc. Version: 0x00000001
> > > [    0.000000] efi: EFI v2.70 by Das U-Boot
> > > [    0.000000] efi:  SMBIOS=0x3cb62000  MEMRESERVE=0x3cb3d040
> > > [    0.000000] memblock_reserve: [0x000000003cb3d040-0x000000003cb3d04f] efi_config_parse_tables+0x25c/0x2d8
> > > [    0.000000] efi: Processing EFI memory map:
> > > [    0.000000] MEMBLOCK configuration:
> > > [    0.000000]  memory size = 0x000000003e000000 reserved size = 0x0000000000000010
> > > [    0.000000]  memory.cnt  = 0x1
> > > [    0.000000]  memory[0x0]     [0x0000000000000000-0x000000003dffffff], 0x000000003e000000 bytes flags: 0x0
> > > [    0.000000]  reserved.cnt  = 0x1
> > > [    0.000000]  reserved[0x0]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
> > > [    0.000000] memblock_remove: [0x0000000000000000-0xfffffffffffffffe] reserve_regions+0x68/0x23c
> > > [    0.000000] efi:   0x000000000000-0x000000000fff [Reserved           |   |  |  |  |  |  |  |   |WB|  |  |  ]
> > > [    0.000000] memblock_add: [0x0000000000000000-0x0000000000000fff] early_init_dt_add_memory_arch+0x164/0x178
> > > [    0.000000] efi:   0x000000001000-0x000000307fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
> > > [    0.000000] memblock_add: [0x0000000000001000-0x0000000000307fff] early_init_dt_add_memory_arch+0x164/0x178
> > > [    0.000000] efi:   0x000000308000-0x000002307fff [Boot Data          |   |  |  |  |  |  |  |   |WB|  |  |  ]
> > > [    0.000000] memblock_add: [0x0000000000308000-0x0000000002307fff] early_init_dt_add_memory_arch+0x164/0x178
> > > [    0.000000] efi:   0x000002308000-0x000002a93fff [Loader Data        |   |  |  |  |  |  |  |   |WB|  |  |  ]
> > > [    0.000000] memblock_add: [0x0000000002308000-0x0000000002a93fff] early_init_dt_add_memory_arch+0x164/0x178
> > > [    0.000000] efi:   0x000002a94000-0x000007cf5fff [Conventional Memory|   |  |  |  |  |  |  |   |WB|  |  |  ]
> > > [    0.000000] memblock_add: [0x0000000002a94000-0x0000000007cf5fff] early_init_dt_add_memory_arch+0x164/0x178
> > > ......
> > > ......
> > > [    0.000000] memblock_add: [0x000000003df76000-0x000000003dffffff] early_init_dt_add_memory_arch+0x164/0x178
> > > [    0.000000] efi:   0x00003f100000-0x00003f100fff [Memory Mapped I/O  |RUN|  |  |  |  |  |  |   |  |  |  |  ]
> > > [    0.000000] memblock_reserve: [0x000000002c1c5000-0x000000002c1c5fff] efi_init+0xd8/0x1c8
> > > [    0.000000] memblock_reserve: [0x0000000000400000-0x0000000001df2cef] arm_memblock_init+0x44/0x19c
> > > [    0.000000] memblock_reserve: [0x0000000000303000-0x0000000000307fff] arm_mm_memblock_reserve+0x30/0x38
> > > [    0.000000] memblock_reserve: [0x0000000007cf6000-0x0000000007cfc5c4] early_init_dt_reserve_memory_arch+0x2c/0x30
> > > [    0.000000] cma: Failed to reserve 64 MiB
> > > [    0.000000] MEMBLOCK configuration:
> > > [    0.000000]  memory size = 0x000000003e000000 reserved size = 0x00000000019ff2c5
> > > [    0.000000]  memory.cnt  = 0xa
> > > [    0.000000]  memory[0x0]     [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> > > [    0.000000]  memory[0x1]     [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> > > [    0.000000]  memory[0x2]     [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> > > [    0.000000]  memory[0x3]     [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> > > [    0.000000]  memory[0x4]     [0x000000003cb3f000-0x000000003cb3ffff], 0x0000000000001000 bytes flags: 0x4
> > > [    0.000000]  memory[0x5]     [0x000000003cb40000-0x000000003cb5ffff], 0x0000000000020000 bytes flags: 0x0
> > > [    0.000000]  memory[0x6]     [0x000000003cb60000-0x000000003cb68fff], 0x0000000000009000 bytes flags: 0x4
> > > [    0.000000]  memory[0x7]     [0x000000003cb69000-0x000000003df74fff], 0x000000000140c000 bytes flags: 0x0
> > > [    0.000000]  memory[0x8]     [0x000000003df75000-0x000000003df75fff], 0x0000000000001000 bytes flags: 0x4
> > > [    0.000000]  memory[0x9]     [0x000000003df76000-0x000000003dffffff], 0x000000000008a000 bytes flags: 0x0
> > > [    0.000000]  reserved.cnt  = 0x5
> > > [    0.000000]  reserved[0x0]   [0x0000000000303000-0x0000000000307fff], 0x0000000000005000 bytes flags: 0x0
> > > [    0.000000]  reserved[0x1]   [0x0000000000400000-0x0000000001df2cef], 0x00000000019f2cf0 bytes flags: 0x0
> > > [    0.000000]  reserved[0x2]   [0x0000000007cf6000-0x0000000007cfc5c4], 0x00000000000065c5 bytes flags: 0x0
> > > [    0.000000]  reserved[0x3]   [0x000000002c1c5000-0x000000002c1c5fff], 0x0000000000001000 bytes flags: 0x0
> > > [    0.000000]  reserved[0x4]   [0x000000003cb3d040-0x000000003cb3d04f], 0x0000000000000010 bytes flags: 0x0
> > > [    0.000000] memblock_alloc_try_nid: 4096 bytes align=0x1000 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_alloc+0x44/0x70
> > > [    0.000000] Kernel panic - not syncing: early_alloc: Failed to allocate 4096 bytes align=0x1000
> > > [    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.1-lpae #1 openSUSE Tumbleweed (unreleased)
> > > [    0.000000] Hardware name: BCM2835
> > > [    0.000000] Backtrace:
> > > [    0.000000] [<c043fafc>] (dump_backtrace) from [<c043fd84>] (show_stack+0x20/0x24)
> > > [    0.000000]  r7:c1800000 r6:00000000 r5:600001d3 r4:c1901ba0
> > > [    0.000000] [<c043fd64>] (show_stack) from [<c0df9400>] (dump_stack+0xd0/0x104)
> > > [    0.000000] [<c0df9330>] (dump_stack) from [<c048061c>] (panic+0xf8/0x32c)
> > > [    0.000000]  r10:c0307000 r9:c0001000 r8:00000003 r7:00000000 r6:00000000 r5:c181df04
> > > [    0.000000]  r4:c192b8d8 r3:00000001
> > > [    0.000000] [<c0480528>] (panic) from [<c1609728>] (early_alloc+0x60/0x70)
> > > [    0.000000]  r3:00001000 r2:00001000 r1:c10037e8 r0:c12fe64c
> > > [    0.000000]  r7:00000000
> > > [    0.000000] [<c16096c8>] (early_alloc) from [<c1609114>] (arm_pte_alloc+0x34/0x94)
> > > [    0.000000]  r7:00000000 r6:00000000 r4:c0307000
> > > [    0.000000] [<c16090e0>] (arm_pte_alloc) from [<c1609384>] (__create_mapping+0x210/0x2c0)
> > > [    0.000000]  r9:c0001000 r8:c0001000 r7:00000001 r6:c13f22e0 r5:c0200000 r4:c0400000
> > > [    0.000000] [<c1609174>] (__create_mapping) from [<c160951c>] (create_mapping+0xe8/0x108)
> > > [    0.000000]  r10:c0400000 r9:c16a2110 r8:c19c7a80 r7:00000000 r6:00400000 r5:c13f2000
> > > [    0.000000]  r4:c1801ef0
> > > [    0.000000] [<c1609434>] (create_mapping) from [<c1609f50>] (paging_init+0x350/0x75c)
> > > [    0.000000]  r4:c1842d40
> > >
> > >
> > > > >
> > > > > >   grub> lsefimmap
> > > > > >   Type      Physical start  - end             #Pages        Size Attributes
> > > > > >   reserved  0000000000000000-0000000000000fff 00000001      4KiB WB
> > > > > >   conv-mem  0000000000001000-0000000007ef5fff 00007ef5 130004KiB WB
> > > > > >   RT-data   0000000007ef6000-0000000007f09fff 00000014     80KiB RT WB
> > > > > >   conv-mem  0000000007f0a000-000000002d871fff 00025968 615840KiB WB
> > > > > >   .....
> > > > > >
> > > > > > To avoid a reserved address, we have to ignore the memory regions which are
> > > > > > marked as EFI_RESERVED_TYPE, and only conventional memory regions can be
> > > > > > chosen. If the region before the kernel base is unaligned, it will be
> > > > > > marked as EFI_RESERVED_TYPE and let kernel ignore it so that memblock_limit
> > > > > > will not be sticked with a very low address such as 0x1000.
> > > > > >
> > > >
> > > > This is a separate issue, so it should be handled in a separate patch.
> > > >
> > > > > > Signed-off-by: Chester Lin <clin@suse.com>
> > > > > > ---
> > > > > >  arch/arm/mm/mmu.c                         |  3 ++
> > > > > >  drivers/firmware/efi/libstub/arm32-stub.c | 43 ++++++++++++++++++-----
> > > > > >  2 files changed, 37 insertions(+), 9 deletions(-)
> > > > > >
> > > > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > > > index f3ce34113f89..909b11ba48d8 100644
> > > > > > --- a/arch/arm/mm/mmu.c
> > > > > > +++ b/arch/arm/mm/mmu.c
> > > > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > > > >                 phys_addr_t block_start = reg->base;
> > > > > >                 phys_addr_t block_end = reg->base + reg->size;
> > > > > >
> > > > > > +               if (memblock_is_nomap(reg))
> > > > > > +                       continue;
> > > > > > +
> > > > > >                 if (reg->base < vmalloc_limit) {
> > > > > >                         if (block_end > lowmem_limit)
> > > > > >                                 /*
> > > > > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > > index e8f7aefb6813..10d33d36df00 100644
> > > > > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > > @@ -128,7 +128,7 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > > > > >
> > > > > >         for (l = 0; l < map_size; l += desc_size) {
> > > > > >                 efi_memory_desc_t *desc;
> > > > > > -               u64 start, end;
> > > > > > +               u64 start, end, spare, kernel_base;
> > > > > >
> > > > > >                 desc = (void *)memory_map + l;
> > > > > >                 start = desc->phys_addr;
> > > > > > @@ -144,27 +144,52 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
> > > > > >                 case EFI_BOOT_SERVICES_DATA:
> > > > > >                         /* Ignore types that are released to the OS anyway */
> > > > > >                         continue;
> > > > > > -
> > > > > > +               case EFI_RESERVED_TYPE:
> > > > > > +                       /* Ignore reserved regions */
> > > > > > +                       continue;
> > > > > >                 case EFI_CONVENTIONAL_MEMORY:
> > > > > >                         /*
> > > > > >                          * Reserve the intersection between this entry and the
> > > > > >                          * region.
> > > > > >                          */
> > > > > >                         start = max(start, (u64)dram_base);
> > > > > > -                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
> > > > > > +                       kernel_base = round_up(start, PMD_SIZE);
> > > > > > +                       spare = kernel_base - start;
> > > > > > +                       end = min(end, kernel_base + MAX_UNCOMP_KERNEL_SIZE);
> > > > > > +
> > > > > > +                       status = efi_call_early(allocate_pages,
> > > > > > +                                       EFI_ALLOCATE_ADDRESS,
> > > > > > +                                       EFI_LOADER_DATA,
> > > > > > +                                       MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
> > > > > > +                                       &kernel_base);
> > > > > > +                       if (status != EFI_SUCCESS) {
> > > > > > +                               pr_efi_err(sys_table_arg,
> > > > > > +                                       "reserve_kernel_base: alloc failed.\n");
> > > > > > +                               goto out;
> > > > > > +                       }
> > > > > > +                       *reserve_addr = kernel_base;
> > > > > >
> > > > > > +                       if (!spare)
> > > > > > +                               break;
> > > > > > +                       /*
> > > > > > +                        * If there's a gap between start and kernel_base,
> > > > > > +                        * it needs be reserved so that the memblock_limit
> > > > > > +                        * will not fall on a very low address when running
> > > > > > +                        * adjust_lowmem_bounds(), wchich could eventually
> > > > > > +                        * cause CMA reservation issue.
> > > > > > +                        */
> > > > > >                         status = efi_call_early(allocate_pages,
> > > > > >                                                 EFI_ALLOCATE_ADDRESS,
> > > > > > -                                               EFI_LOADER_DATA,
> > > > > > -                                               (end - start) / EFI_PAGE_SIZE,
> > > > > > +                                               EFI_RESERVED_TYPE,
> > > > > > +                                               spare / EFI_PAGE_SIZE,
> > > > > >                                                 &start);
> > > > > >                         if (status != EFI_SUCCESS) {
> > > > > >                                 pr_efi_err(sys_table_arg,
> > > > > > -                                       "reserve_kernel_base(): alloc failed.\n");
> > > > > > +                                       "reserve spare-region failed\n");
> > > > > >                                 goto out;
> > > > > >                         }
> > > > > > -                       break;
> > > > > >
> > > > > > +                       break;
> > > > > >                 case EFI_LOADER_CODE:
> > > > > >                 case EFI_LOADER_DATA:
> > > > > >                         /*
> > > > > > @@ -220,7 +245,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > > > > >         *image_size = image->image_size;
> > > > > >         status = efi_relocate_kernel(sys_table, image_addr, *image_size,
> > > > > >                                      *image_size,
> > > > > > -                                    dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
> > > > > > +                                    *reserve_addr + MAX_UNCOMP_KERNEL_SIZE, 0);
> > > > > >         if (status != EFI_SUCCESS) {
> > > > > >                 pr_efi_err(sys_table, "Failed to relocate kernel.\n");
> > > > > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > > > > > @@ -233,7 +258,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
> > > > > >          * in memory. The kernel determines the base of DRAM from the
> > > > > >          * address at which the zImage is loaded.
> > > > > >          */
> > > > > > -       if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
> > > > > > +       if (*image_addr + *image_size > *reserve_addr + ZIMAGE_OFFSET_LIMIT) {
> > > > > >                 pr_efi_err(sys_table, "Failed to relocate kernel, no low memory available.\n");
> > > > > >                 efi_free(sys_table, *reserve_size, *reserve_addr);
> > > > > >                 *reserve_size = 0;
> > > > > > --
> > > > > > 2.22.0
> > > > > >
> > > >
> > 
> 
> -- 
> Sincerely yours,
> Mike.
> 
>
Ard Biesheuvel Aug. 19, 2019, 2:56 p.m. UTC | #8
On Mon, 19 Aug 2019 at 11:01, Chester Lin <clin@suse.com> wrote:
>
> Hi Mike and Ard,
>
> On Thu, Aug 15, 2019 at 04:37:39PM +0300, Mike Rapoport wrote:
> > On Thu, Aug 15, 2019 at 02:32:50PM +0300, Ard Biesheuvel wrote:
> > > (adding Mike)
> > >
> > > On Thu, 15 Aug 2019 at 14:28, Chester Lin <clin@suse.com> wrote:
> > > >
> > > > Hi Ard,
> > > >
> > > > On Thu, Aug 15, 2019 at 10:59:43AM +0300, Ard Biesheuvel wrote:
> > > > > On Sun, 4 Aug 2019 at 10:57, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > > > > >
> > > > > > Hello Chester,
> > > > > >
> > > > > > On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> > > > > > >
> > > > > > > In some cases the arm32 efistub could fail to allocate memory for
> > > > > > > uncompressed kernel. For example, we got the following error message when
> > > > > > > verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> > > > > > >
> > > > > > >   EFI stub: Booting Linux Kernel...
> > > > > > >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> > > > > > >   EFI stub: ERROR: Failed to relocate kernel
> > > > > > >
> > > > > > > After checking the EFI memory map we found that the first page [0 - 0xfff]
> > > > > > > had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> > > > > > > set the dram base at 0, which was actually in a reserved region.
> > > > > > >
> > > > > >
> > > > > > This by itself is a violation of the Linux boot protocol for 32-bit
> > > > > > ARM when using the decompressor. The decompressor rounds down its own
> > > > > > base address to a multiple of 128 MB, and assumes the whole area is
> > > > > > available for the decompressed kernel and related data structures.
> > > > > > (The first TEXT_OFFSET bytes are no longer used in practice, which is
> > > > > > why putting a reserved region of 4 KB bytes works at the moment, but
> > > > > > this is fragile). Note that the decompressor does not look at any DT
> > > > > > or EFI provided memory maps *at all*.
> > > > > >
> > > > > > So unfortunately, this is not something we can fix in the kernel, but
> > > > > > we should fix it in the bootloader or in GRUB, so it does not put any
> > > > > > reserved regions in the first 128 MB of memory,
> > > > > >
> > > > >
> > > > > OK, perhaps we can fix this by taking TEXT_OFFSET into account. The
> > > > > ARM boot protocol docs are unclear about whether this memory should be
> > > > > used or not, but it is no longer used for its original purpose (page
> > > > > tables), and the RPi loader already keeps data there.
> > > > >
> > > > > Can you check whether the following patch works for you?
> > > > >
> > > > > diff --git a/drivers/firmware/efi/libstub/Makefile
> > > > > b/drivers/firmware/efi/libstub/Makefile
> > > > > index 0460c7581220..ee0661ddb25b 100644
> > > > > --- a/drivers/firmware/efi/libstub/Makefile
> > > > > +++ b/drivers/firmware/efi/libstub/Makefile
> > > > > @@ -52,6 +52,7 @@ lib-$(CONFIG_EFI_ARMSTUB)     += arm-stub.o fdt.o
> > > > > string.o random.o \
> > > > >
> > > > >  lib-$(CONFIG_ARM)              += arm32-stub.o
> > > > >  lib-$(CONFIG_ARM64)            += arm64-stub.o
> > > > > +CFLAGS_arm32-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > > > >  CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > > > >
> > > > >  #
> > > > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > index e8f7aefb6813..66ff0c8ec269 100644
> > > > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > @@ -204,7 +204,7 @@ efi_status_t
> > > > > handle_kernel_image(efi_system_table_t *sys_table,
> > > > >          * loaded. These assumptions are made by the decompressor,
> > > > >          * before any memory map is available.
> > > > >          */
> > > > > -       dram_base = round_up(dram_base, SZ_128M);
> > > > > +       dram_base = round_up(dram_base, SZ_128M) + TEXT_OFFSET;
> > > > >
> > > > >         status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
> > > > >                                      reserve_size);
> > > > >
> > > >
> > > > I tried your patch on rpi2 and got the following panic. Just a reminder that I
> > > > have replaced some log messages with "......" since it might be too long to
> > > > post all.
> > > >
> > >
> > > OK. Good to know that this change helps you to get past the EFI stub boot issue.
> > >
> > > > In this case the kernel failed to reserve cma, which should hit the issue of
> > > > memblock_limit=0x1000 as I had mentioned in my patch description. The first
> > > > block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> > > > with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> > > > was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> > > > 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> > > > if any suggestion, thank you.
> >
> >
> > > This looks like it is a separate issue. The memblock/cma code should
> > > not choke on a reserved page of memory at 0x0.
> > >
> > > Perhaps Russell or Mike (cc'ed) have an idea how to address this?
> >
> > Presuming that the last memblock dump comes from the end of
> > arm_memblock_init() with the this memory map
> >
> > memory[0x0] [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> > memory[0x1] [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> > memory[0x2] [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> > memory[0x3] [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> >
> > adjust_lowmem_bounds() will set the memblock_limit (and respectively global
> > memblock.current_limit) to 0x1000 and any further memblock_alloc*() will
> > happily fail.
> >
> > I believe that the assumption for memblock_limit calculations was that the
> > first bank has several megs at least.
> >
> > I wonder if this hack would help:
> >
> > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > index d9a0038..948e5b9 100644
> > --- a/arch/arm/mm/mmu.c
> > +++ b/arch/arm/mm/mmu.c
> > @@ -1206,7 +1206,7 @@ void __init adjust_lowmem_bounds(void)
> >                        * allocated when mapping the start of bank 0, which
> >                        * occurs before any free memory is mapped.
> >                        */
> > -                     if (!memblock_limit) {
> > +                     if (memblock_limit < PMD_SIZE) {
> >                               if (!IS_ALIGNED(block_start, PMD_SIZE))
> >                                       memblock_limit = block_start;
> >                               else if (!IS_ALIGNED(block_end, PMD_SIZE))
> >
>
> I applied this patch as well and it works well on rpi-2 model B.
>

Thanks, Chester, that is good to know.

However, afaict, this only affects systems where physical memory
starts at address 0x0, so I think we need a better fix.

I know Mike has been looking into the NOMAP stuff lately, and your
original patch contains a hunk that makes this code (?) disregard
nomap memblocks. That might be a better approach.
Chester Lin Aug. 20, 2019, 7:33 a.m. UTC | #9
On Mon, Aug 19, 2019 at 05:56:51PM +0300, Ard Biesheuvel wrote:
> On Mon, 19 Aug 2019 at 11:01, Chester Lin <clin@suse.com> wrote:
> >
> > Hi Mike and Ard,
> >
> > On Thu, Aug 15, 2019 at 04:37:39PM +0300, Mike Rapoport wrote:
> > > On Thu, Aug 15, 2019 at 02:32:50PM +0300, Ard Biesheuvel wrote:
> > > > (adding Mike)
> > > >
> > > > On Thu, 15 Aug 2019 at 14:28, Chester Lin <clin@suse.com> wrote:
> > > > >
> > > > > Hi Ard,
> > > > >
> > > > > On Thu, Aug 15, 2019 at 10:59:43AM +0300, Ard Biesheuvel wrote:
> > > > > > On Sun, 4 Aug 2019 at 10:57, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > > > > > >
> > > > > > > Hello Chester,
> > > > > > >
> > > > > > > On Fri, 2 Aug 2019 at 08:40, Chester Lin <clin@suse.com> wrote:
> > > > > > > >
> > > > > > > > In some cases the arm32 efistub could fail to allocate memory for
> > > > > > > > uncompressed kernel. For example, we got the following error message when
> > > > > > > > verifying EFI stub on Raspberry Pi-2 [kernel-5.2.1 + grub-2.04] :
> > > > > > > >
> > > > > > > >   EFI stub: Booting Linux Kernel...
> > > > > > > >   EFI stub: ERROR: Unable to allocate memory for uncompressed kernel.
> > > > > > > >   EFI stub: ERROR: Failed to relocate kernel
> > > > > > > >
> > > > > > > > After checking the EFI memory map we found that the first page [0 - 0xfff]
> > > > > > > > had been reserved by Raspberry Pi-2's firmware, and the efistub tried to
> > > > > > > > set the dram base at 0, which was actually in a reserved region.
> > > > > > > >
> > > > > > >
> > > > > > > This by itself is a violation of the Linux boot protocol for 32-bit
> > > > > > > ARM when using the decompressor. The decompressor rounds down its own
> > > > > > > base address to a multiple of 128 MB, and assumes the whole area is
> > > > > > > available for the decompressed kernel and related data structures.
> > > > > > > (The first TEXT_OFFSET bytes are no longer used in practice, which is
> > > > > > > why putting a reserved region of 4 KB bytes works at the moment, but
> > > > > > > this is fragile). Note that the decompressor does not look at any DT
> > > > > > > or EFI provided memory maps *at all*.
> > > > > > >
> > > > > > > So unfortunately, this is not something we can fix in the kernel, but
> > > > > > > we should fix it in the bootloader or in GRUB, so it does not put any
> > > > > > > reserved regions in the first 128 MB of memory,
> > > > > > >
> > > > > >
> > > > > > OK, perhaps we can fix this by taking TEXT_OFFSET into account. The
> > > > > > ARM boot protocol docs are unclear about whether this memory should be
> > > > > > used or not, but it is no longer used for its original purpose (page
> > > > > > tables), and the RPi loader already keeps data there.
> > > > > >
> > > > > > Can you check whether the following patch works for you?
> > > > > >
> > > > > > diff --git a/drivers/firmware/efi/libstub/Makefile
> > > > > > b/drivers/firmware/efi/libstub/Makefile
> > > > > > index 0460c7581220..ee0661ddb25b 100644
> > > > > > --- a/drivers/firmware/efi/libstub/Makefile
> > > > > > +++ b/drivers/firmware/efi/libstub/Makefile
> > > > > > @@ -52,6 +52,7 @@ lib-$(CONFIG_EFI_ARMSTUB)     += arm-stub.o fdt.o
> > > > > > string.o random.o \
> > > > > >
> > > > > >  lib-$(CONFIG_ARM)              += arm32-stub.o
> > > > > >  lib-$(CONFIG_ARM64)            += arm64-stub.o
> > > > > > +CFLAGS_arm32-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > > > > >  CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
> > > > > >
> > > > > >  #
> > > > > > diff --git a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > > b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > > index e8f7aefb6813..66ff0c8ec269 100644
> > > > > > --- a/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > > +++ b/drivers/firmware/efi/libstub/arm32-stub.c
> > > > > > @@ -204,7 +204,7 @@ efi_status_t
> > > > > > handle_kernel_image(efi_system_table_t *sys_table,
> > > > > >          * loaded. These assumptions are made by the decompressor,
> > > > > >          * before any memory map is available.
> > > > > >          */
> > > > > > -       dram_base = round_up(dram_base, SZ_128M);
> > > > > > +       dram_base = round_up(dram_base, SZ_128M) + TEXT_OFFSET;
> > > > > >
> > > > > >         status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
> > > > > >                                      reserve_size);
> > > > > >
> > > > >
> > > > > I tried your patch on rpi2 and got the following panic. Just a reminder that I
> > > > > have replaced some log messages with "......" since it might be too long to
> > > > > post all.
> > > > >
> > > >
> > > > OK. Good to know that this change helps you to get past the EFI stub boot issue.
> > > >
> > > > > In this case the kernel failed to reserve cma, which should hit the issue of
> > > > > memblock_limit=0x1000 as I had mentioned in my patch description. The first
> > > > > block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> > > > > with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> > > > > was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> > > > > 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> > > > > if any suggestion, thank you.
> > >
> > >
> > > > This looks like it is a separate issue. The memblock/cma code should
> > > > not choke on a reserved page of memory at 0x0.
> > > >
> > > > Perhaps Russell or Mike (cc'ed) have an idea how to address this?
> > >
> > > Presuming that the last memblock dump comes from the end of
> > > arm_memblock_init() with the this memory map
> > >
> > > memory[0x0] [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> > > memory[0x1] [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> > > memory[0x2] [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> > > memory[0x3] [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> > >
> > > adjust_lowmem_bounds() will set the memblock_limit (and respectively global
> > > memblock.current_limit) to 0x1000 and any further memblock_alloc*() will
> > > happily fail.
> > >
> > > I believe that the assumption for memblock_limit calculations was that the
> > > first bank has several megs at least.
> > >
> > > I wonder if this hack would help:
> > >
> > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > index d9a0038..948e5b9 100644
> > > --- a/arch/arm/mm/mmu.c
> > > +++ b/arch/arm/mm/mmu.c
> > > @@ -1206,7 +1206,7 @@ void __init adjust_lowmem_bounds(void)
> > >                        * allocated when mapping the start of bank 0, which
> > >                        * occurs before any free memory is mapped.
> > >                        */
> > > -                     if (!memblock_limit) {
> > > +                     if (memblock_limit < PMD_SIZE) {
> > >                               if (!IS_ALIGNED(block_start, PMD_SIZE))
> > >                                       memblock_limit = block_start;
> > >                               else if (!IS_ALIGNED(block_end, PMD_SIZE))
> > >
> >
> > I applied this patch as well and it works well on rpi-2 model B.
> >
> 
> Thanks, Chester, that is good to know.
> 
> However, afaict, this only affects systems where physical memory
> starts at address 0x0, so I think we need a better fix.
> 
> I know Mike has been looking into the NOMAP stuff lately, and your
> original patch contains a hunk that makes this code (?) disregard
> nomap memblocks. That might be a better approach.
>
Hi Ard and Mike,

In my original patch, I studied map_lowmem() and found that some blocks might
not be mapped according to the current memory map. Thus I assumed maybe NOMAP
blocks could still be ignored in adjust_lowmem_bounds() since they would not
be allocated afterward. But that change in mmu.c still depends on a condition
that there should be a PMD_SIZE block or consecutive smaller NOMAP blocks which
exacly fit the PM_SIZE alignment at the beginning of memory map otherwise the
memblock_limit could still fall on a very low address. That's why I tried to
allocate pages again in arm32-stub.c in order to fill the gap between the
unaligned block_start and the PMD_SIZE aligned kernel base.

Please feel free to let me know if any idea and I am willing to help with
verification.
Mike Rapoport Aug. 20, 2019, 7:49 a.m. UTC | #10
On Mon, Aug 19, 2019 at 05:56:51PM +0300, Ard Biesheuvel wrote:
> On Mon, 19 Aug 2019 at 11:01, Chester Lin <clin@suse.com> wrote:
> >
> > Hi Mike and Ard,
> >
> > On Thu, Aug 15, 2019 at 04:37:39PM +0300, Mike Rapoport wrote:
> > > On Thu, Aug 15, 2019 at 02:32:50PM +0300, Ard Biesheuvel wrote:
> > > > (adding Mike)
> > > >

...

> > > > > In this case the kernel failed to reserve cma, which should hit the issue of
> > > > > memblock_limit=0x1000 as I had mentioned in my patch description. The first
> > > > > block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> > > > > with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> > > > > was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> > > > > 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> > > > > if any suggestion, thank you.
> > >
> > >
> > > > This looks like it is a separate issue. The memblock/cma code should
> > > > not choke on a reserved page of memory at 0x0.
> > > >
> > > > Perhaps Russell or Mike (cc'ed) have an idea how to address this?
> > >
> > > Presuming that the last memblock dump comes from the end of
> > > arm_memblock_init() with the this memory map
> > >
> > > memory[0x0] [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> > > memory[0x1] [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> > > memory[0x2] [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> > > memory[0x3] [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> > >
> > > adjust_lowmem_bounds() will set the memblock_limit (and respectively global
> > > memblock.current_limit) to 0x1000 and any further memblock_alloc*() will
> > > happily fail.
> > >
> > > I believe that the assumption for memblock_limit calculations was that the
> > > first bank has several megs at least.
> > >
> > > I wonder if this hack would help:
> > >
> > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > index d9a0038..948e5b9 100644
> > > --- a/arch/arm/mm/mmu.c
> > > +++ b/arch/arm/mm/mmu.c
> > > @@ -1206,7 +1206,7 @@ void __init adjust_lowmem_bounds(void)
> > >                        * allocated when mapping the start of bank 0, which
> > >                        * occurs before any free memory is mapped.
> > >                        */
> > > -                     if (!memblock_limit) {
> > > +                     if (memblock_limit < PMD_SIZE) {
> > >                               if (!IS_ALIGNED(block_start, PMD_SIZE))
> > >                                       memblock_limit = block_start;
> > >                               else if (!IS_ALIGNED(block_end, PMD_SIZE))
> > >
> >
> > I applied this patch as well and it works well on rpi-2 model B.
> >
> 
> Thanks, Chester, that is good to know.
> 
> However, afaict, this only affects systems where physical memory
> starts at address 0x0, so I think we need a better fix.

This hack can be easily extended to handle systems with arbitrary start
address, but it's still a hack...

> I know Mike has been looking into the NOMAP stuff lately, and your
> original patch contains a hunk that makes this code (?) disregard
> nomap memblocks. That might be a better approach.

I was actually looking how to replace NOMAP with something else to make
memblock.memory consistent with actual physical memory banks. But this work
is stashed for now.

I'm not sure that skipping NOMAP regions would be good enough.
If I understand corrrectly, with Chester's original patch the reservation
of PMD aligned chunk of 32M for the kernel made the first conv-mem region
PMD aligned and then memblock_limit will be set to the end of this region.

Is there a reason for marking EFI_RESERVED_TYPE as NOMAP rather than simply
reserve them with memblock_reserve()?
Chester Lin Aug. 20, 2019, 10:29 a.m. UTC | #11
On Tue, Aug 20, 2019 at 10:49:30AM +0300, Mike Rapoport wrote:
> On Mon, Aug 19, 2019 at 05:56:51PM +0300, Ard Biesheuvel wrote:
> > On Mon, 19 Aug 2019 at 11:01, Chester Lin <clin@suse.com> wrote:
> > >
> > > Hi Mike and Ard,
> > >
> > > On Thu, Aug 15, 2019 at 04:37:39PM +0300, Mike Rapoport wrote:
> > > > On Thu, Aug 15, 2019 at 02:32:50PM +0300, Ard Biesheuvel wrote:
> > > > > (adding Mike)
> > > > >
> 
> ...
> 
> > > > > > In this case the kernel failed to reserve cma, which should hit the issue of
> > > > > > memblock_limit=0x1000 as I had mentioned in my patch description. The first
> > > > > > block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> > > > > > with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> > > > > > was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> > > > > > 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> > > > > > if any suggestion, thank you.
> > > >
> > > >
> > > > > This looks like it is a separate issue. The memblock/cma code should
> > > > > not choke on a reserved page of memory at 0x0.
> > > > >
> > > > > Perhaps Russell or Mike (cc'ed) have an idea how to address this?
> > > >
> > > > Presuming that the last memblock dump comes from the end of
> > > > arm_memblock_init() with the this memory map
> > > >
> > > > memory[0x0] [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> > > > memory[0x1] [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> > > > memory[0x2] [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> > > > memory[0x3] [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> > > >
> > > > adjust_lowmem_bounds() will set the memblock_limit (and respectively global
> > > > memblock.current_limit) to 0x1000 and any further memblock_alloc*() will
> > > > happily fail.
> > > >
> > > > I believe that the assumption for memblock_limit calculations was that the
> > > > first bank has several megs at least.
> > > >
> > > > I wonder if this hack would help:
> > > >
> > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > index d9a0038..948e5b9 100644
> > > > --- a/arch/arm/mm/mmu.c
> > > > +++ b/arch/arm/mm/mmu.c
> > > > @@ -1206,7 +1206,7 @@ void __init adjust_lowmem_bounds(void)
> > > >                        * allocated when mapping the start of bank 0, which
> > > >                        * occurs before any free memory is mapped.
> > > >                        */
> > > > -                     if (!memblock_limit) {
> > > > +                     if (memblock_limit < PMD_SIZE) {
> > > >                               if (!IS_ALIGNED(block_start, PMD_SIZE))
> > > >                                       memblock_limit = block_start;
> > > >                               else if (!IS_ALIGNED(block_end, PMD_SIZE))
> > > >
> > >
> > > I applied this patch as well and it works well on rpi-2 model B.
> > >
> > 
> > Thanks, Chester, that is good to know.
> > 
> > However, afaict, this only affects systems where physical memory
> > starts at address 0x0, so I think we need a better fix.
> 
> This hack can be easily extended to handle systems with arbitrary start
> address, but it's still a hack...
> 
> > I know Mike has been looking into the NOMAP stuff lately, and your
> > original patch contains a hunk that makes this code (?) disregard
> > nomap memblocks. That might be a better approach.
> 
> I was actually looking how to replace NOMAP with something else to make
> memblock.memory consistent with actual physical memory banks. But this work
> is stashed for now.
> 
> I'm not sure that skipping NOMAP regions would be good enough.
> If I understand corrrectly, with Chester's original patch the reservation
> of PMD aligned chunk of 32M for the kernel made the first conv-mem region
> PMD aligned and then memblock_limit will be set to the end of this region.
> 
> Is there a reason for marking EFI_RESERVED_TYPE as NOMAP rather than simply
> reserve them with memblock_reserve()?
> 

Hi Mike,

I make this change in efistub so I am not sure if memblock_reserve() can be
linked by ld or not. I tried using efi_mem_reserve() but got a linker error of
undefined reference. Is there a better place to call memblock_reserve() after
efistub?

Thanks,
Chester
Ard Biesheuvel Aug. 20, 2019, 11 a.m. UTC | #12
On Tue, 20 Aug 2019 at 10:49, Mike Rapoport <rppt@linux.ibm.com> wrote:
>
> On Mon, Aug 19, 2019 at 05:56:51PM +0300, Ard Biesheuvel wrote:
> > On Mon, 19 Aug 2019 at 11:01, Chester Lin <clin@suse.com> wrote:
> > >
> > > Hi Mike and Ard,
> > >
> > > On Thu, Aug 15, 2019 at 04:37:39PM +0300, Mike Rapoport wrote:
> > > > On Thu, Aug 15, 2019 at 02:32:50PM +0300, Ard Biesheuvel wrote:
> > > > > (adding Mike)
> > > > >
>
> ...
>
> > > > > > In this case the kernel failed to reserve cma, which should hit the issue of
> > > > > > memblock_limit=0x1000 as I had mentioned in my patch description. The first
> > > > > > block [0-0xfff] was scanned in adjust_lowmem_bounds(), but it did not align
> > > > > > with PMD_SIZE so the cma reservation failed because the memblock.current_limit
> > > > > > was extremely low. That's why I expand the first reservation from 1 PAGESIZE to
> > > > > > 1 PMD_SIZE in my patch in order to avoid this issue. Please kindly let me know
> > > > > > if any suggestion, thank you.
> > > >
> > > >
> > > > > This looks like it is a separate issue. The memblock/cma code should
> > > > > not choke on a reserved page of memory at 0x0.
> > > > >
> > > > > Perhaps Russell or Mike (cc'ed) have an idea how to address this?
> > > >
> > > > Presuming that the last memblock dump comes from the end of
> > > > arm_memblock_init() with the this memory map
> > > >
> > > > memory[0x0] [0x0000000000000000-0x0000000000000fff], 0x0000000000001000 bytes flags: 0x4
> > > > memory[0x1] [0x0000000000001000-0x0000000007ef5fff], 0x0000000007ef5000 bytes flags: 0x0
> > > > memory[0x2] [0x0000000007ef6000-0x0000000007f09fff], 0x0000000000014000 bytes flags: 0x4
> > > > memory[0x3] [0x0000000007f0a000-0x000000003cb3efff], 0x0000000034c35000 bytes flags: 0x0
> > > >
> > > > adjust_lowmem_bounds() will set the memblock_limit (and respectively global
> > > > memblock.current_limit) to 0x1000 and any further memblock_alloc*() will
> > > > happily fail.
> > > >
> > > > I believe that the assumption for memblock_limit calculations was that the
> > > > first bank has several megs at least.
> > > >
> > > > I wonder if this hack would help:
> > > >
> > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > index d9a0038..948e5b9 100644
> > > > --- a/arch/arm/mm/mmu.c
> > > > +++ b/arch/arm/mm/mmu.c
> > > > @@ -1206,7 +1206,7 @@ void __init adjust_lowmem_bounds(void)
> > > >                        * allocated when mapping the start of bank 0, which
> > > >                        * occurs before any free memory is mapped.
> > > >                        */
> > > > -                     if (!memblock_limit) {
> > > > +                     if (memblock_limit < PMD_SIZE) {
> > > >                               if (!IS_ALIGNED(block_start, PMD_SIZE))
> > > >                                       memblock_limit = block_start;
> > > >                               else if (!IS_ALIGNED(block_end, PMD_SIZE))
> > > >
> > >
> > > I applied this patch as well and it works well on rpi-2 model B.
> > >
> >
> > Thanks, Chester, that is good to know.
> >
> > However, afaict, this only affects systems where physical memory
> > starts at address 0x0, so I think we need a better fix.
>
> This hack can be easily extended to handle systems with arbitrary start
> address, but it's still a hack...
>
> > I know Mike has been looking into the NOMAP stuff lately, and your
> > original patch contains a hunk that makes this code (?) disregard
> > nomap memblocks. That might be a better approach.
>
> I was actually looking how to replace NOMAP with something else to make
> memblock.memory consistent with actual physical memory banks. But this work
> is stashed for now.
>
> I'm not sure that skipping NOMAP regions would be good enough.
> If I understand corrrectly, with Chester's original patch the reservation
> of PMD aligned chunk of 32M for the kernel made the first conv-mem region
> PMD aligned and then memblock_limit will be set to the end of this region.
>
> Is there a reason for marking EFI_RESERVED_TYPE as NOMAP rather than simply
> reserve them with memblock_reserve()?
>

Yes.

On ARM systems, reserved memory regions should never be mapped by
default, since the cacheable mappings we use in the linear region may
conflict with the mapping attributes used by the firmware or driver
components that are using this memory.

In this particular case, we are talking about things like spin tables
and pens for secondaries that boot up with their caches disabled, and
having a cacheable mapping on the primary CPU might cause a loss of
coherency.
Russell King (Oracle) Aug. 20, 2019, 11:54 a.m. UTC | #13
On Sun, Aug 04, 2019 at 10:57:00AM +0300, Ard Biesheuvel wrote:
> (The first TEXT_OFFSET bytes are no longer used in practice, which is
> why putting a reserved region of 4 KB bytes works at the moment, but
> this is fragile).

That is not correct for 32-bit ARM.  The swapper page table is still
located 16kiB below the kernel.
Russell King (Oracle) Aug. 20, 2019, 11:56 a.m. UTC | #14
On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> index f3ce34113f89..909b11ba48d8 100644
> --- a/arch/arm/mm/mmu.c
> +++ b/arch/arm/mm/mmu.c
> @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
>  		phys_addr_t block_start = reg->base;
>  		phys_addr_t block_end = reg->base + reg->size;
>  
> +		if (memblock_is_nomap(reg))
> +			continue;
> +
>  		if (reg->base < vmalloc_limit) {
>  			if (block_end > lowmem_limit)
>  				/*

I think this hunk is sane - if the memory is marked nomap, then it isn't
available for the kernel's use, so as far as calculating where the
lowmem/highmem boundary is, it effectively doesn't exist and should be
skipped.
Ard Biesheuvel Aug. 20, 2019, 12:27 p.m. UTC | #15
On Tue, 20 Aug 2019 at 14:54, Russell King - ARM Linux admin
<linux@armlinux.org.uk> wrote:
>
> On Sun, Aug 04, 2019 at 10:57:00AM +0300, Ard Biesheuvel wrote:
> > (The first TEXT_OFFSET bytes are no longer used in practice, which is
> > why putting a reserved region of 4 KB bytes works at the moment, but
> > this is fragile).
>
> That is not correct for 32-bit ARM.  The swapper page table is still
> located 16kiB below the kernel.
>

Right. So that means we can only permit reserved regions in the first
(TEXT_OFFSET - 16 kiB) bytes starting at the first 128 MiB aligned
address covered by system RAM, if we want to ensure that the
decompressor or the early kernel don't trample on it. (or TEXT_OFFSET
- 20 kiB for LPAE kernels)
Ard Biesheuvel Aug. 20, 2019, 12:28 p.m. UTC | #16
On Tue, 20 Aug 2019 at 14:56, Russell King - ARM Linux admin
<linux@armlinux.org.uk> wrote:
>
> On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > index f3ce34113f89..909b11ba48d8 100644
> > --- a/arch/arm/mm/mmu.c
> > +++ b/arch/arm/mm/mmu.c
> > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> >               phys_addr_t block_start = reg->base;
> >               phys_addr_t block_end = reg->base + reg->size;
> >
> > +             if (memblock_is_nomap(reg))
> > +                     continue;
> > +
> >               if (reg->base < vmalloc_limit) {
> >                       if (block_end > lowmem_limit)
> >                               /*
>
> I think this hunk is sane - if the memory is marked nomap, then it isn't
> available for the kernel's use, so as far as calculating where the
> lowmem/highmem boundary is, it effectively doesn't exist and should be
> skipped.
>

I agree.

Chester, could you explain what you need beyond this change (and my
EFI stub change involving TEXT_OFFSET) to make things work on the
RPi2?
Chester Lin Aug. 21, 2019, 6:10 a.m. UTC | #17
On Tue, Aug 20, 2019 at 03:28:25PM +0300, Ard Biesheuvel wrote:
> On Tue, 20 Aug 2019 at 14:56, Russell King - ARM Linux admin
> <linux@armlinux.org.uk> wrote:
> >
> > On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > index f3ce34113f89..909b11ba48d8 100644
> > > --- a/arch/arm/mm/mmu.c
> > > +++ b/arch/arm/mm/mmu.c
> > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > >               phys_addr_t block_start = reg->base;
> > >               phys_addr_t block_end = reg->base + reg->size;
> > >
> > > +             if (memblock_is_nomap(reg))
> > > +                     continue;
> > > +
> > >               if (reg->base < vmalloc_limit) {
> > >                       if (block_end > lowmem_limit)
> > >                               /*
> >
> > I think this hunk is sane - if the memory is marked nomap, then it isn't
> > available for the kernel's use, so as far as calculating where the
> > lowmem/highmem boundary is, it effectively doesn't exist and should be
> > skipped.
> >
> 
> I agree.
> 
> Chester, could you explain what you need beyond this change (and my
> EFI stub change involving TEXT_OFFSET) to make things work on the
> RPi2?
>

Hi Ard,

In fact I am working with Guillaume to try booting zImage kernel and openSUSE
from grub2.04 + arm32-efistub so that's why we get this issue on RPi2, which is
one of the test machines we have. However we want a better solution for all
cases but not just RPi2 since we don't want to affect other platforms as well.

Regards,
Chester
Ard Biesheuvel Aug. 21, 2019, 6:35 a.m. UTC | #18
On Wed, 21 Aug 2019 at 09:11, Chester Lin <clin@suse.com> wrote:
>
> On Tue, Aug 20, 2019 at 03:28:25PM +0300, Ard Biesheuvel wrote:
> > On Tue, 20 Aug 2019 at 14:56, Russell King - ARM Linux admin
> > <linux@armlinux.org.uk> wrote:
> > >
> > > On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > index f3ce34113f89..909b11ba48d8 100644
> > > > --- a/arch/arm/mm/mmu.c
> > > > +++ b/arch/arm/mm/mmu.c
> > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > >               phys_addr_t block_start = reg->base;
> > > >               phys_addr_t block_end = reg->base + reg->size;
> > > >
> > > > +             if (memblock_is_nomap(reg))
> > > > +                     continue;
> > > > +
> > > >               if (reg->base < vmalloc_limit) {
> > > >                       if (block_end > lowmem_limit)
> > > >                               /*
> > >
> > > I think this hunk is sane - if the memory is marked nomap, then it isn't
> > > available for the kernel's use, so as far as calculating where the
> > > lowmem/highmem boundary is, it effectively doesn't exist and should be
> > > skipped.
> > >
> >
> > I agree.
> >
> > Chester, could you explain what you need beyond this change (and my
> > EFI stub change involving TEXT_OFFSET) to make things work on the
> > RPi2?
> >
>
> Hi Ard,
>
> In fact I am working with Guillaume to try booting zImage kernel and openSUSE
> from grub2.04 + arm32-efistub so that's why we get this issue on RPi2, which is
> one of the test machines we have. However we want a better solution for all
> cases but not just RPi2 since we don't want to affect other platforms as well.
>

Thanks Chester, but that doesn't answer my question.

Your fix is a single patch that changes various things that are only
vaguely related. We have already identified that we need to take
TEXT_OFFSET (minus some space used by the swapper page tables) into
account into the EFI stub if we want to ensure compatibility with many
different platforms, and as it turns out, this applies not only to
RPi2 but to other platforms as well, most notably the ones that
require a TEXT_OFFSET of 0x208000, since they also have reserved
regions at the base of RAM.

My question was what else we need beyond:
- the EFI stub TEXT_OFFSET fix [0]
- the change to disregard NOMAP memblocks in adjust_lowmem_bounds()
- what else???


[0] https://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git/commit/?h=next&id=0eb7bad595e52666b642a02862ad996a0f9bfcc0
Mike Rapoport Aug. 21, 2019, 7:11 a.m. UTC | #19
On Wed, Aug 21, 2019 at 09:35:16AM +0300, Ard Biesheuvel wrote:
> On Wed, 21 Aug 2019 at 09:11, Chester Lin <clin@suse.com> wrote:
> >
> > On Tue, Aug 20, 2019 at 03:28:25PM +0300, Ard Biesheuvel wrote:
> > > On Tue, 20 Aug 2019 at 14:56, Russell King - ARM Linux admin
> > > <linux@armlinux.org.uk> wrote:
> > > >
> > > > On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> > > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > > index f3ce34113f89..909b11ba48d8 100644
> > > > > --- a/arch/arm/mm/mmu.c
> > > > > +++ b/arch/arm/mm/mmu.c
> > > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > > >               phys_addr_t block_start = reg->base;
> > > > >               phys_addr_t block_end = reg->base + reg->size;
> > > > >
> > > > > +             if (memblock_is_nomap(reg))
> > > > > +                     continue;
> > > > > +
> > > > >               if (reg->base < vmalloc_limit) {
> > > > >                       if (block_end > lowmem_limit)
> > > > >                               /*
> > > >
> > > > I think this hunk is sane - if the memory is marked nomap, then it isn't
> > > > available for the kernel's use, so as far as calculating where the
> > > > lowmem/highmem boundary is, it effectively doesn't exist and should be
> > > > skipped.
> > > >
> > >
> > > I agree.
> > >
> > > Chester, could you explain what you need beyond this change (and my
> > > EFI stub change involving TEXT_OFFSET) to make things work on the
> > > RPi2?
> > >
> >
> > Hi Ard,
> >
> > In fact I am working with Guillaume to try booting zImage kernel and openSUSE
> > from grub2.04 + arm32-efistub so that's why we get this issue on RPi2, which is
> > one of the test machines we have. However we want a better solution for all
> > cases but not just RPi2 since we don't want to affect other platforms as well.
> >
> 
> Thanks Chester, but that doesn't answer my question.
> 
> Your fix is a single patch that changes various things that are only
> vaguely related. We have already identified that we need to take
> TEXT_OFFSET (minus some space used by the swapper page tables) into
> account into the EFI stub if we want to ensure compatibility with many
> different platforms, and as it turns out, this applies not only to
> RPi2 but to other platforms as well, most notably the ones that
> require a TEXT_OFFSET of 0x208000, since they also have reserved
> regions at the base of RAM.
> 
> My question was what else we need beyond:
> - the EFI stub TEXT_OFFSET fix [0]
> - the change to disregard NOMAP memblocks in adjust_lowmem_bounds()
> - what else???

I think the only missing part here is to ensure that non-reserved memory in
bank 0 starts from a PMD-aligned address. I believe this could be done if
EFI stub, but I'm not really familiar with it so this just a semi-educated
guess :)
 
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git/commit/?h=next&id=0eb7bad595e52666b642a02862ad996a0f9bfcc0
Chester Lin Aug. 21, 2019, 7:22 a.m. UTC | #20
On Wed, Aug 21, 2019 at 10:11:01AM +0300, Mike Rapoport wrote:
> On Wed, Aug 21, 2019 at 09:35:16AM +0300, Ard Biesheuvel wrote:
> > On Wed, 21 Aug 2019 at 09:11, Chester Lin <clin@suse.com> wrote:
> > >
> > > On Tue, Aug 20, 2019 at 03:28:25PM +0300, Ard Biesheuvel wrote:
> > > > On Tue, 20 Aug 2019 at 14:56, Russell King - ARM Linux admin
> > > > <linux@armlinux.org.uk> wrote:
> > > > >
> > > > > On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> > > > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > > > index f3ce34113f89..909b11ba48d8 100644
> > > > > > --- a/arch/arm/mm/mmu.c
> > > > > > +++ b/arch/arm/mm/mmu.c
> > > > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > > > >               phys_addr_t block_start = reg->base;
> > > > > >               phys_addr_t block_end = reg->base + reg->size;
> > > > > >
> > > > > > +             if (memblock_is_nomap(reg))
> > > > > > +                     continue;
> > > > > > +
> > > > > >               if (reg->base < vmalloc_limit) {
> > > > > >                       if (block_end > lowmem_limit)
> > > > > >                               /*
> > > > >
> > > > > I think this hunk is sane - if the memory is marked nomap, then it isn't
> > > > > available for the kernel's use, so as far as calculating where the
> > > > > lowmem/highmem boundary is, it effectively doesn't exist and should be
> > > > > skipped.
> > > > >
> > > >
> > > > I agree.
> > > >
> > > > Chester, could you explain what you need beyond this change (and my
> > > > EFI stub change involving TEXT_OFFSET) to make things work on the
> > > > RPi2?
> > > >
> > >
> > > Hi Ard,
> > >
> > > In fact I am working with Guillaume to try booting zImage kernel and openSUSE
> > > from grub2.04 + arm32-efistub so that's why we get this issue on RPi2, which is
> > > one of the test machines we have. However we want a better solution for all
> > > cases but not just RPi2 since we don't want to affect other platforms as well.
> > >
> > 
> > Thanks Chester, but that doesn't answer my question.
> > 
> > Your fix is a single patch that changes various things that are only
> > vaguely related. We have already identified that we need to take
> > TEXT_OFFSET (minus some space used by the swapper page tables) into
> > account into the EFI stub if we want to ensure compatibility with many
> > different platforms, and as it turns out, this applies not only to
> > RPi2 but to other platforms as well, most notably the ones that
> > require a TEXT_OFFSET of 0x208000, since they also have reserved
> > regions at the base of RAM.
> > 
> > My question was what else we need beyond:
> > - the EFI stub TEXT_OFFSET fix [0]
> > - the change to disregard NOMAP memblocks in adjust_lowmem_bounds()
> > - what else???
> 
> I think the only missing part here is to ensure that non-reserved memory in
> bank 0 starts from a PMD-aligned address. I believe this could be done if
> EFI stub, but I'm not really familiar with it so this just a semi-educated
> guess :)
>  
> > [0] https://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git/commit/?h=next&id=0eb7bad595e52666b642a02862ad996a0f9bfcc0
>

Hi Ard and Mike,

Sorry for my misunderstanding and I agree with Mike. We could still meet the
memblock_limit issue if there's a non-reserved memory in bank0 starts from an
unaligned address.

Regards,
Chester
Ard Biesheuvel Aug. 21, 2019, 7:29 a.m. UTC | #21
On Wed, 21 Aug 2019 at 10:11, Mike Rapoport <rppt@linux.ibm.com> wrote:
>
> On Wed, Aug 21, 2019 at 09:35:16AM +0300, Ard Biesheuvel wrote:
> > On Wed, 21 Aug 2019 at 09:11, Chester Lin <clin@suse.com> wrote:
> > >
> > > On Tue, Aug 20, 2019 at 03:28:25PM +0300, Ard Biesheuvel wrote:
> > > > On Tue, 20 Aug 2019 at 14:56, Russell King - ARM Linux admin
> > > > <linux@armlinux.org.uk> wrote:
> > > > >
> > > > > On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> > > > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > > > index f3ce34113f89..909b11ba48d8 100644
> > > > > > --- a/arch/arm/mm/mmu.c
> > > > > > +++ b/arch/arm/mm/mmu.c
> > > > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > > > >               phys_addr_t block_start = reg->base;
> > > > > >               phys_addr_t block_end = reg->base + reg->size;
> > > > > >
> > > > > > +             if (memblock_is_nomap(reg))
> > > > > > +                     continue;
> > > > > > +
> > > > > >               if (reg->base < vmalloc_limit) {
> > > > > >                       if (block_end > lowmem_limit)
> > > > > >                               /*
> > > > >
> > > > > I think this hunk is sane - if the memory is marked nomap, then it isn't
> > > > > available for the kernel's use, so as far as calculating where the
> > > > > lowmem/highmem boundary is, it effectively doesn't exist and should be
> > > > > skipped.
> > > > >
> > > >
> > > > I agree.
> > > >
> > > > Chester, could you explain what you need beyond this change (and my
> > > > EFI stub change involving TEXT_OFFSET) to make things work on the
> > > > RPi2?
> > > >
> > >
> > > Hi Ard,
> > >
> > > In fact I am working with Guillaume to try booting zImage kernel and openSUSE
> > > from grub2.04 + arm32-efistub so that's why we get this issue on RPi2, which is
> > > one of the test machines we have. However we want a better solution for all
> > > cases but not just RPi2 since we don't want to affect other platforms as well.
> > >
> >
> > Thanks Chester, but that doesn't answer my question.
> >
> > Your fix is a single patch that changes various things that are only
> > vaguely related. We have already identified that we need to take
> > TEXT_OFFSET (minus some space used by the swapper page tables) into
> > account into the EFI stub if we want to ensure compatibility with many
> > different platforms, and as it turns out, this applies not only to
> > RPi2 but to other platforms as well, most notably the ones that
> > require a TEXT_OFFSET of 0x208000, since they also have reserved
> > regions at the base of RAM.
> >
> > My question was what else we need beyond:
> > - the EFI stub TEXT_OFFSET fix [0]
> > - the change to disregard NOMAP memblocks in adjust_lowmem_bounds()
> > - what else???
>
> I think the only missing part here is to ensure that non-reserved memory in
> bank 0 starts from a PMD-aligned address. I believe this could be done if
> EFI stub, but I'm not really familiar with it so this just a semi-educated
> guess :)
>

Given that it is the ARM arch code that imposes this requirement, how
about adding something like this to adjust_lowmem_bounds():

if (memblock_start_of_DRAM() % PMD_SIZE)
    memblock_mark_nomap(memblock_start_of_DRAM(),
        PMD_SIZE - (memblock_start_of_DRAM() % PMD_SIZE));

(and introduce the nomap check into the loop)
Mike Rapoport Aug. 21, 2019, 8:29 a.m. UTC | #22
On Wed, Aug 21, 2019 at 10:29:37AM +0300, Ard Biesheuvel wrote:
> On Wed, 21 Aug 2019 at 10:11, Mike Rapoport <rppt@linux.ibm.com> wrote:
> >
> > On Wed, Aug 21, 2019 at 09:35:16AM +0300, Ard Biesheuvel wrote:
> > > On Wed, 21 Aug 2019 at 09:11, Chester Lin <clin@suse.com> wrote:
> > > >
> > > > On Tue, Aug 20, 2019 at 03:28:25PM +0300, Ard Biesheuvel wrote:
> > > > > On Tue, 20 Aug 2019 at 14:56, Russell King - ARM Linux admin
> > > > > <linux@armlinux.org.uk> wrote:
> > > > > >
> > > > > > On Fri, Aug 02, 2019 at 05:38:54AM +0000, Chester Lin wrote:
> > > > > > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > > > > > > index f3ce34113f89..909b11ba48d8 100644
> > > > > > > --- a/arch/arm/mm/mmu.c
> > > > > > > +++ b/arch/arm/mm/mmu.c
> > > > > > > @@ -1184,6 +1184,9 @@ void __init adjust_lowmem_bounds(void)
> > > > > > >               phys_addr_t block_start = reg->base;
> > > > > > >               phys_addr_t block_end = reg->base + reg->size;
> > > > > > >
> > > > > > > +             if (memblock_is_nomap(reg))
> > > > > > > +                     continue;
> > > > > > > +
> > > > > > >               if (reg->base < vmalloc_limit) {
> > > > > > >                       if (block_end > lowmem_limit)
> > > > > > >                               /*
> > > > > >
> > > > > > I think this hunk is sane - if the memory is marked nomap, then it isn't
> > > > > > available for the kernel's use, so as far as calculating where the
> > > > > > lowmem/highmem boundary is, it effectively doesn't exist and should be
> > > > > > skipped.
> > > > > >
> > > > >
> > > > > I agree.
> > > > >
> > > > > Chester, could you explain what you need beyond this change (and my
> > > > > EFI stub change involving TEXT_OFFSET) to make things work on the
> > > > > RPi2?
> > > > >
> > > >
> > > > Hi Ard,
> > > >
> > > > In fact I am working with Guillaume to try booting zImage kernel and openSUSE
> > > > from grub2.04 + arm32-efistub so that's why we get this issue on RPi2, which is
> > > > one of the test machines we have. However we want a better solution for all
> > > > cases but not just RPi2 since we don't want to affect other platforms as well.
> > > >
> > >
> > > Thanks Chester, but that doesn't answer my question.
> > >
> > > Your fix is a single patch that changes various things that are only
> > > vaguely related. We have already identified that we need to take
> > > TEXT_OFFSET (minus some space used by the swapper page tables) into
> > > account into the EFI stub if we want to ensure compatibility with many
> > > different platforms, and as it turns out, this applies not only to
> > > RPi2 but to other platforms as well, most notably the ones that
> > > require a TEXT_OFFSET of 0x208000, since they also have reserved
> > > regions at the base of RAM.
> > >
> > > My question was what else we need beyond:
> > > - the EFI stub TEXT_OFFSET fix [0]
> > > - the change to disregard NOMAP memblocks in adjust_lowmem_bounds()
> > > - what else???
> >
> > I think the only missing part here is to ensure that non-reserved memory in
> > bank 0 starts from a PMD-aligned address. I believe this could be done if
> > EFI stub, but I'm not really familiar with it so this just a semi-educated
> > guess :)
> >
> 
> Given that it is the ARM arch code that imposes this requirement, how
> about adding something like this to adjust_lowmem_bounds():
> 
> if (memblock_start_of_DRAM() % PMD_SIZE)
>     memblock_mark_nomap(memblock_start_of_DRAM(),
>         PMD_SIZE - (memblock_start_of_DRAM() % PMD_SIZE));

memblock_start_of_DRAM() won't work here, as it returns the actual start of
the DRAM including NOMAP regions. Moreover, as we cannot mark a region
NOMAP inside for_each_memblock() this should be done beforehand.

I think something like this could work:

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 2f0f07e..f2b635b 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1178,6 +1178,19 @@ void __init adjust_lowmem_bounds(void)
 	 */
 	vmalloc_limit = (u64)(uintptr_t)vmalloc_min - PAGE_OFFSET + PHYS_OFFSET;
 
+	/*
+	 * The first usable region must be PMD aligned. Mark its start
+	 * as MEMBLOCK_NOMAP if it isn't
+	 */
+	for_each_memblock(memory, reg) {
+		if (!memblock_is_nomap(reg) && (reg->base % PMD_SIZE)) {
+			phys_addr_t size = PMD_SIZE - (reg->base % PMD_SIZE);
+
+			memblock_mark_nomap(reg->base, size);
+			break;
+		}
+	}
+
 	for_each_memblock(memory, reg) {
 		phys_addr_t block_start = reg->base;
 		phys_addr_t block_end = reg->base + reg->size;



 
> (and introduce the nomap check into the loop)
Ard Biesheuvel Aug. 21, 2019, 9:42 a.m. UTC | #23
On Wed, 21 Aug 2019 at 11:29, Mike Rapoport <rppt@linux.ibm.com> wrote:
>
> On Wed, Aug 21, 2019 at 10:29:37AM +0300, Ard Biesheuvel wrote:
> > On Wed, 21 Aug 2019 at 10:11, Mike Rapoport <rppt@linux.ibm.com> wrote:
> > >
...
> > > I think the only missing part here is to ensure that non-reserved memory in
> > > bank 0 starts from a PMD-aligned address. I believe this could be done if
> > > EFI stub, but I'm not really familiar with it so this just a semi-educated
> > > guess :)
> > >
> >
> > Given that it is the ARM arch code that imposes this requirement, how
> > about adding something like this to adjust_lowmem_bounds():
> >
> > if (memblock_start_of_DRAM() % PMD_SIZE)
> >     memblock_mark_nomap(memblock_start_of_DRAM(),
> >         PMD_SIZE - (memblock_start_of_DRAM() % PMD_SIZE));
>
> memblock_start_of_DRAM() won't work here, as it returns the actual start of
> the DRAM including NOMAP regions. Moreover, as we cannot mark a region
> NOMAP inside for_each_memblock() this should be done beforehand.
>
> I think something like this could work:
>
> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> index 2f0f07e..f2b635b 100644
> --- a/arch/arm/mm/mmu.c
> +++ b/arch/arm/mm/mmu.c
> @@ -1178,6 +1178,19 @@ void __init adjust_lowmem_bounds(void)
>          */
>         vmalloc_limit = (u64)(uintptr_t)vmalloc_min - PAGE_OFFSET + PHYS_OFFSET;
>
> +       /*
> +        * The first usable region must be PMD aligned. Mark its start
> +        * as MEMBLOCK_NOMAP if it isn't
> +        */
> +       for_each_memblock(memory, reg) {
> +               if (!memblock_is_nomap(reg) && (reg->base % PMD_SIZE)) {
> +                       phys_addr_t size = PMD_SIZE - (reg->base % PMD_SIZE);
> +
> +                       memblock_mark_nomap(reg->base, size);
> +                       break;

We should break on the first !NOMAP memblock, even if it is already
PMD aligned, but beyond that, this looks ok to me.
diff mbox series

Patch

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index f3ce34113f89..909b11ba48d8 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1184,6 +1184,9 @@  void __init adjust_lowmem_bounds(void)
 		phys_addr_t block_start = reg->base;
 		phys_addr_t block_end = reg->base + reg->size;
 
+		if (memblock_is_nomap(reg))
+			continue;
+
 		if (reg->base < vmalloc_limit) {
 			if (block_end > lowmem_limit)
 				/*
diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
index e8f7aefb6813..10d33d36df00 100644
--- a/drivers/firmware/efi/libstub/arm32-stub.c
+++ b/drivers/firmware/efi/libstub/arm32-stub.c
@@ -128,7 +128,7 @@  static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
 
 	for (l = 0; l < map_size; l += desc_size) {
 		efi_memory_desc_t *desc;
-		u64 start, end;
+		u64 start, end, spare, kernel_base;
 
 		desc = (void *)memory_map + l;
 		start = desc->phys_addr;
@@ -144,27 +144,52 @@  static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
 		case EFI_BOOT_SERVICES_DATA:
 			/* Ignore types that are released to the OS anyway */
 			continue;
-
+		case EFI_RESERVED_TYPE:
+			/* Ignore reserved regions */
+			continue;
 		case EFI_CONVENTIONAL_MEMORY:
 			/*
 			 * Reserve the intersection between this entry and the
 			 * region.
 			 */
 			start = max(start, (u64)dram_base);
-			end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
+			kernel_base = round_up(start, PMD_SIZE);
+			spare = kernel_base - start;
+			end = min(end, kernel_base + MAX_UNCOMP_KERNEL_SIZE);
+
+			status = efi_call_early(allocate_pages,
+					EFI_ALLOCATE_ADDRESS,
+					EFI_LOADER_DATA,
+					MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE,
+					&kernel_base);
+			if (status != EFI_SUCCESS) {
+				pr_efi_err(sys_table_arg,
+					"reserve_kernel_base: alloc failed.\n");
+				goto out;
+			}
+			*reserve_addr = kernel_base;
 
+			if (!spare)
+				break;
+			/*
+			 * If there's a gap between start and kernel_base,
+			 * it needs be reserved so that the memblock_limit
+			 * will not fall on a very low address when running
+			 * adjust_lowmem_bounds(), wchich could eventually
+			 * cause CMA reservation issue.
+			 */
 			status = efi_call_early(allocate_pages,
 						EFI_ALLOCATE_ADDRESS,
-						EFI_LOADER_DATA,
-						(end - start) / EFI_PAGE_SIZE,
+						EFI_RESERVED_TYPE,
+						spare / EFI_PAGE_SIZE,
 						&start);
 			if (status != EFI_SUCCESS) {
 				pr_efi_err(sys_table_arg,
-					"reserve_kernel_base(): alloc failed.\n");
+					"reserve spare-region failed\n");
 				goto out;
 			}
-			break;
 
+			break;
 		case EFI_LOADER_CODE:
 		case EFI_LOADER_DATA:
 			/*
@@ -220,7 +245,7 @@  efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
 	*image_size = image->image_size;
 	status = efi_relocate_kernel(sys_table, image_addr, *image_size,
 				     *image_size,
-				     dram_base + MAX_UNCOMP_KERNEL_SIZE, 0);
+				     *reserve_addr + MAX_UNCOMP_KERNEL_SIZE, 0);
 	if (status != EFI_SUCCESS) {
 		pr_efi_err(sys_table, "Failed to relocate kernel.\n");
 		efi_free(sys_table, *reserve_size, *reserve_addr);
@@ -233,7 +258,7 @@  efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
 	 * in memory. The kernel determines the base of DRAM from the
 	 * address at which the zImage is loaded.
 	 */
-	if (*image_addr + *image_size > dram_base + ZIMAGE_OFFSET_LIMIT) {
+	if (*image_addr + *image_size > *reserve_addr + ZIMAGE_OFFSET_LIMIT) {
 		pr_efi_err(sys_table, "Failed to relocate kernel, no low memory available.\n");
 		efi_free(sys_table, *reserve_size, *reserve_addr);
 		*reserve_size = 0;