diff mbox series

[v4,2/2] RISC-V: Implement sparsemem

Message ID 20190109203911.7887-3-logang@deltatee.com (mailing list archive)
State New, archived
Headers show
Series sparsemem support for RISC-V | expand

Commit Message

Logan Gunthorpe Jan. 9, 2019, 8:39 p.m. UTC
This patch implements sparsemem support for risc-v which helps pave the
way for memory hotplug and eventually P2P support.

We introduce Kconfig options for virtual and physical address bits which
are used to calculate the size of the vmemmap and set the
MAX_PHYSMEM_BITS.

The vmemmap is located directly before the VMALLOC region and sized
such that we can allocate enough pages to populate all the virtual
address space in the system (similar to the way it's done in arm64).

During initialization, call memblocks_present() and sparse_init(),
and provide a stub for vmemmap_populate() (all of which is similar to
arm64).

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Andrew Waterman <andrew@sifive.com>
Cc: Olof Johansson <olof@lixom.net>
Cc: Michael Clark <michaeljclark@mac.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Zong Li <zong@andestech.com>
---
 arch/riscv/Kconfig                 | 23 +++++++++++++++++++++++
 arch/riscv/include/asm/pgtable.h   | 21 +++++++++++++++++----
 arch/riscv/include/asm/sparsemem.h | 11 +++++++++++
 arch/riscv/kernel/setup.c          |  4 +++-
 arch/riscv/mm/init.c               |  8 ++++++++
 5 files changed, 62 insertions(+), 5 deletions(-)
 create mode 100644 arch/riscv/include/asm/sparsemem.h

Comments

Christoph Hellwig Jan. 15, 2019, 1:58 p.m. UTC | #1
Looks fine,

Reviewed-by: Christoph Hellwig <hch@lst.de>
Greentime Hu July 31, 2019, 6:30 a.m. UTC | #2
Hi Logan,

Logan Gunthorpe <logang@deltatee.com> 於 2019年1月10日 週四 上午5:07寫道:
>
> This patch implements sparsemem support for risc-v which helps pave the
> way for memory hotplug and eventually P2P support.
>
> We introduce Kconfig options for virtual and physical address bits which
> are used to calculate the size of the vmemmap and set the
> MAX_PHYSMEM_BITS.
>
> The vmemmap is located directly before the VMALLOC region and sized
> such that we can allocate enough pages to populate all the virtual
> address space in the system (similar to the way it's done in arm64).
>
> During initialization, call memblocks_present() and sparse_init(),
> and provide a stub for vmemmap_populate() (all of which is similar to
> arm64).
>
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> Reviewed-by: Palmer Dabbelt <palmer@sifive.com>
> Cc: Albert Ou <aou@eecs.berkeley.edu>
> Cc: Andrew Waterman <andrew@sifive.com>
> Cc: Olof Johansson <olof@lixom.net>
> Cc: Michael Clark <michaeljclark@mac.com>
> Cc: Rob Herring <robh@kernel.org>
> Cc: Zong Li <zong@andestech.com>
> ---
>  arch/riscv/Kconfig                 | 23 +++++++++++++++++++++++
>  arch/riscv/include/asm/pgtable.h   | 21 +++++++++++++++++----
>  arch/riscv/include/asm/sparsemem.h | 11 +++++++++++
>  arch/riscv/kernel/setup.c          |  4 +++-
>  arch/riscv/mm/init.c               |  8 ++++++++
>  5 files changed, 62 insertions(+), 5 deletions(-)
>  create mode 100644 arch/riscv/include/asm/sparsemem.h
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e0d7d61779a6..bd659327bc6b 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -54,12 +54,32 @@ config ZONE_DMA32
>         bool
>         default y if 64BIT
>
> +config VA_BITS
> +       int
> +       default 32 if 32BIT
> +       default 39 if 64BIT
> +
> +config PA_BITS
> +       int
> +       default 34 if 32BIT
> +       default 56 if 64BIT
> +
>  config PAGE_OFFSET
>         hex
>         default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
>         default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
>         default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB
>
> +config ARCH_FLATMEM_ENABLE
> +       def_bool y
> +
> +config ARCH_SPARSEMEM_ENABLE
> +       def_bool y
> +       select SPARSEMEM_VMEMMAP_ENABLE
> +
> +config ARCH_SELECT_MEMORY_MODEL
> +       def_bool ARCH_SPARSEMEM_ENABLE
> +
>  config STACKTRACE_SUPPORT
>         def_bool y
>
> @@ -94,6 +114,9 @@ config PGTABLE_LEVELS
>  config HAVE_KPROBES
>         def_bool n
>
> +config HAVE_ARCH_PFN_VALID
> +       def_bool y
> +
>  menu "Platform type"
>
>  choice
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 16301966d65b..e1162336f5ea 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -89,6 +89,23 @@ extern pgd_t swapper_pg_dir[];
>  #define __S110 PAGE_SHARED_EXEC
>  #define __S111 PAGE_SHARED_EXEC
>
> +#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
> +#define VMALLOC_END      (PAGE_OFFSET - 1)
> +#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
> +
> +/*
> + * Roughly size the vmemmap space to be large enough to fit enough
> + * struct pages to map half the virtual address space. Then
> + * position vmemmap directly below the VMALLOC region.
> + */
> +#define VMEMMAP_SHIFT \
> +       (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
> +#define VMEMMAP_SIZE   (1UL << VMEMMAP_SHIFT)
> +#define VMEMMAP_END    (VMALLOC_START - 1)
> +#define VMEMMAP_START  (VMALLOC_START - VMEMMAP_SIZE)
> +
> +#define vmemmap                ((struct page *)VMEMMAP_START)
> +
>  /*
>   * ZERO_PAGE is a global shared page that is always zero,
>   * used for zero-mapped memory areas, etc.
> @@ -411,10 +428,6 @@ static inline void pgtable_cache_init(void)
>         /* No page table caches to initialize */
>  }
>
> -#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
> -#define VMALLOC_END      (PAGE_OFFSET - 1)
> -#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
> -
>  /*
>   * Task size is 0x40000000000 for RV64 or 0xb800000 for RV32.
>   * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
> diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h
> new file mode 100644
> index 000000000000..b58ba2d9ed6e
> --- /dev/null
> +++ b/arch/riscv/include/asm/sparsemem.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_SPARSEMEM_H
> +#define __ASM_SPARSEMEM_H
> +
> +#ifdef CONFIG_SPARSEMEM
> +#define MAX_PHYSMEM_BITS       CONFIG_PA_BITS
> +#define SECTION_SIZE_BITS      27
> +#endif /* CONFIG_SPARSEMEM */
> +
> +#endif /* __ASM_SPARSEMEM_H */
> diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
> index fc8006a042eb..98f39adefb1a 100644
> --- a/arch/riscv/kernel/setup.c
> +++ b/arch/riscv/kernel/setup.c
> @@ -193,6 +193,9 @@ static void __init setup_bootmem(void)
>                                   PFN_PHYS(end_pfn - start_pfn),
>                                   &memblock.memory, 0);
>         }
> +
> +       memblocks_present();
> +       sparse_init();
>  }

I just applied this patch to Linux kernel 5.2.
I used a dts with 2 memory nodes with hole int it.

memory@80000000 {
    device_type = "memory";
    reg = <0x0 0x80000000 0x0 0x40000000>;
};
memory@180000000 {
    device_type = "memory";
    reg = <0x1 0x80000000 0x0 0x40000000>;
};

I found it will boot failure. Did I miss anything?

[ 0.000000] Sorting __ex_table...
[ 0.000000] BUG: Bad page state in process swapper pfn:180001
[ 0.000000] page:ffffffcf05400038 refcount:0 mapcount:94371937
mapping:00000000ffffffff index:0x4000000000000000
[ 0.000000] anon
[ 0.000000] flags: 0x0()
[ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000
00000000ffffffff
[ 0.000000] raw: 4000000000000000 ffffffcf05a00060 0000000005a00060
[ 0.000000] page dumped because: non-NULL mapping
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.0-00001-g737d8214d9a9 #3
[ 0.000000] Call Trace:
[ 0.000000] [<ffffffe00017759c>] walk_stackframe+0x0/0xa0
[ 0.000000] [<ffffffe00017769c>] show_stack+0x2a/0x34
[ 0.000000] [<ffffffe00070c53e>] dump_stack+0x62/0x7c
[ 0.000000] [<ffffffe0002330ae>] bad_page+0xca/0x120
[ 0.000000] [<ffffffe00023313c>] free_pages_check_bad+0x38/0x7a
[ 0.000000] [<ffffffe00023368a>] __free_pages_ok+0x496/0x4ba
[ 0.000000] [<ffffffe000234a82>] __free_pages.part.4+0xe/0x22
[ 0.000000] [<ffffffe000234c9e>] __free_pages_core+0x9a/0xa6
[ 0.000000] [<ffffffe000009b0a>] memblock_free_pages+0x12/0x1a
[ 0.000000] [<ffffffe00000b496>] memblock_free_all+0x144/0x1a8
[ 0.000000] [<ffffffe00000274a>] mem_init+0x28/0x36
[ 0.000000] [<ffffffe0000008a0>] start_kernel+0x1bc/0x360
[ 0.000000] [<ffffffe000000074>] clear_bss_done+0x34/0x38
[ 0.000000] Disabling lock debugging due to kernel taint
[ 0.000000] BUG: Bad page state in process swapper pfn:180002
[ 0.000000] page:ffffffcf05400070 refcount:0 mapcount:94371993
mapping:00000000ffffffff index:0x4000000000000000
[ 0.000000] anon
[ 0.000000] flags: 0x0()
[ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000
00000000ffffffff
[ 0.000000] raw: 4000000000000000 ffffffcf05a00098 0000000005a00098
[ 0.000000] page dumped because: non-NULL mapping
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G B
5.2.0-00001-g737d8214d9a9 #3
[ 0.000000] Call Trace:
[ 0.000000] [<ffffffe00017759c>] walk_stackframe+0x0/0xa0
[ 0.000000] [<ffffffe00017769c>] show_stack+0x2a/0x34
[ 0.000000] [<ffffffe00070c53e>] dump_stack+0x62/0x7c
[ 0.000000] [<ffffffe0002330ae>] bad_page+0xca/0x120
[ 0.000000] [<ffffffe00023313c>] free_pages_check_bad+0x38/0x7a
[ 0.000000] [<ffffffe00023368a>] __free_pages_ok+0x496/0x4ba
[ 0.000000] [<ffffffe000234a82>] __free_pages.part.4+0xe/0x22
[ 0.000000] [<ffffffe000234c9e>] __free_pages_core+0x9a/0xa6
[ 0.000000] [<ffffffe000009b0a>] memblock_free_pages+0x12/0x1a
[ 0.000000] [<ffffffe00000b496>] memblock_free_all+0x144/0x1a8
[ 0.000000] [<ffffffe00000274a>] mem_init+0x28/0x36
[ 0.000000] [<ffffffe0000008a0>] start_kernel+0x1bc/0x360
[ 0.000000] [<ffffffe000000074>] clear_bss_done+0x34/0x38
[ 0.000000] BUG: Bad page state in process swapper pfn:180003
[ 0.000000] page:ffffffcf054000a8 refcount:0 mapcount:94372049
mapping:00000000ffffffff index:0x4000000000000000
[ 0.000000] anon
[ 0.000000] flags: 0x0()
[ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000
00000000ffffffff
[ 0.000000] raw: 4000000000000000 ffffffcf05a000d0 0000000005a000d0
[ 0.000000] page dumped because: non-NULL mapping

I look this issue more closely.
I found it always sets each memblock region to node 0. Does this make sense?
I am not sure if I understand this correctly. Do you have any idea for
this? Thank you. :)

for_each_memblock(memory, reg) {
    unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
    unsigned long end_pfn = memblock_region_memory_end_pfn(reg);
    memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn -
start_pfn), &memblock.memory, 0);


                     ^^^
}

[ 0.000000] Early memory node ranges
[ 0.000000] node 0: [mem 0x0000000080200000-0x00000000bfffffff]
[ 0.000000] node 0: [mem 0x0000000180000000-0x00000001bfffffff]
[ 0.000000] Initmem setup node 0 [mem 0x0000000080200000-0x00000001bfffffff]
Logan Gunthorpe July 31, 2019, 5:07 p.m. UTC | #3
On 2019-07-31 12:30 a.m., Greentime Hu wrote:
> I look this issue more closely.
> I found it always sets each memblock region to node 0. Does this make sense?
> I am not sure if I understand this correctly. Do you have any idea for
> this? Thank you. :)

Yes, I think this is normal. When we talk about memory nodes we're
talking about NUMA nodes which is unrelated to device tree nodes.

I'm not really sure what's causing the crash. Have you verified it's
this patch that causes it? Is it related to there being a hole in your
memory, does it work if you only have one memory node?

Thanks,

Logan
Greentime Hu Aug. 1, 2019, 3:34 a.m. UTC | #4
Hi Logan,

Logan Gunthorpe <logang@deltatee.com> 於 2019年8月1日 週四 上午1:08寫道:
>
>
>
> On 2019-07-31 12:30 a.m., Greentime Hu wrote:
> > I look this issue more closely.
> > I found it always sets each memblock region to node 0. Does this make sense?
> > I am not sure if I understand this correctly. Do you have any idea for
> > this? Thank you. :)
>
> Yes, I think this is normal. When we talk about memory nodes we're
> talking about NUMA nodes which is unrelated to device tree nodes.

Ok, but it seems the second memblock_region may overwrite the first
memblock_region in for_each_memblock(memory, reg)  loop. It always
uses this API to set to node 0.
memblock_set_node(PFN_PHYS(start_pfn),PFN_PHYS(end_pfn - start_pfn),
&memblock.memory,0)


> I'm not really sure what's causing the crash. Have you verified it's
> this patch that causes it? Is it related to there being a hole in your
> memory, does it work if you only have one memory node?
>

It works fine if there is only one memory node described in dts.

I think it is related to there being a hole in the device tree script.
I don't actually have a platform with a hole in the memory region, so
I use device tree script to describe it.

The physical address layout will be like this.
2GB-3GB-hole-6GB-7GB

memory@80000000 {
    device_type = "memory";
    reg = <0x0 0x80000000 0x0 0x40000000>;
};
memory@180000000 {
    device_type = "memory";
    reg = <0x1 0x80000000 0x0 0x40000000>;
};

Thank you for the quick reply. :)
Logan Gunthorpe Aug. 9, 2019, 3:46 p.m. UTC | #5
On 2019-08-08 10:23 p.m., Greentime Hu wrote:
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 3f12b069af1d..208b3e14ccd8 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
>         default 2
> 
>  config HAVE_ARCH_PFN_VALID
> -       def_bool y
> +       bool
> +       default !SPARSEMEM_VMEMMAP
> 
>  menu "Platform type"
> 
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 8ddb6c7fedac..6991f7a5a4a7 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn;
>  #define virt_to_pfn(vaddr)     (phys_to_pfn(__pa(vaddr)))
>  #define pfn_to_virt(pfn)       (__va(pfn_to_phys(pfn)))
> 
> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
> +#define pfn_valid(pfn) \
> +       (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
>  #define virt_to_page(vaddr)    (pfn_to_page(virt_to_pfn(vaddr)))
>  #define page_to_virt(page)     (pfn_to_virt(page_to_pfn(page)))
> +#else
> +#define virt_to_page(vaddr)    ((struct page *)((((u64)vaddr -
> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START))
> +#define page_to_virt(pg)       ((void *)(((((u64)pg - VMEMMAP_START) /
> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset))
> +#endif

This doesn't make sense to me at all. It should always use pfn_to_page()
for virt_to_page() and the generic pfn_to_page()/page_to_pfn()
implementations essentially already do what you are doing in a cleaner
way. So I'd be really surprised if this does anything at all.

Logan
Greentime Hu Aug. 9, 2019, 5:01 p.m. UTC | #6
Hi Logan,

Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道:
>
>
>
> On 2019-08-08 10:23 p.m., Greentime Hu wrote:
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 3f12b069af1d..208b3e14ccd8 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
> >         default 2
> >
> >  config HAVE_ARCH_PFN_VALID
> > -       def_bool y
> > +       bool
> > +       default !SPARSEMEM_VMEMMAP
> >
> >  menu "Platform type"
> >
> > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> > index 8ddb6c7fedac..6991f7a5a4a7 100644
> > --- a/arch/riscv/include/asm/page.h
> > +++ b/arch/riscv/include/asm/page.h
> > @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn;
> >  #define virt_to_pfn(vaddr)     (phys_to_pfn(__pa(vaddr)))
> >  #define pfn_to_virt(pfn)       (__va(pfn_to_phys(pfn)))
> >
> > +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
> > +#define pfn_valid(pfn) \
> > +       (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
> >  #define virt_to_page(vaddr)    (pfn_to_page(virt_to_pfn(vaddr)))
> >  #define page_to_virt(page)     (pfn_to_virt(page_to_pfn(page)))
> > +#else
> > +#define virt_to_page(vaddr)    ((struct page *)((((u64)vaddr -
> > va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START))
> > +#define page_to_virt(pg)       ((void *)(((((u64)pg - VMEMMAP_START) /
> > sizeof(struct page)) * PAGE_SIZE) + va_pa_offset))
> > +#endif
>
> This doesn't make sense to me at all. It should always use pfn_to_page()
> for virt_to_page() and the generic pfn_to_page()/page_to_pfn()
> implementations essentially already do what you are doing in a cleaner
> way. So I'd be really surprised if this does anything at all.
>

Thank you for point me out that. I just checked the generic
implementation and I should use that one.
Sorry I didn't check the generic one and just implement it again.
I think the only patch we need is the first part to use generic
pfn_valid(). I just tested it and yes it can boot successfully in dts
with hole.

It will fail in this check ((pfn)-pfn_base) < max_mapnr.
In my test case it will be
((6GB>>PAGE_SHIFT)-(2GB>>PAGE_SHIFT)=(4GB>>PAGE_SHIFT) <
(3GB>>PAGE_SHIFT) to return false.
 memory@80000000 {
         device_type = "memory";
         reg = <0x0 0x80000000 0x0 0x80000000>;
 };
 memory@180000000 {
         device_type = "memory";
         reg = <0x1 0x80000000 0x0 0x40000000>;
 };

To cause the check here failed to initialize page struct.

for (pfn = start_pfn; pfn < end_pfn; pfn++) {
        /*
         * There can be holes in boot-time mem_map[]s handed to this
         * function.  They do not exist on hotplugged memory.
         */
        if (context == MEMMAP_EARLY) {
                if (!early_pfn_valid(pfn))
                        continue;
                if (!early_pfn_in_nid(pfn, nid))
                        continue;
                if (overlap_memmap_init(zone, &pfn))
                        continue;
                if (defer_init(nid, pfn, end_pfn))
                        break;
        }

        page = pfn_to_page(pfn);
        __init_single_page(page, pfn, zone, nid);


------------------------------------------------------------------------------

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3f12b069af1d..208b3e14ccd8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -116,7 +116,8 @@ config PGTABLE_LEVELS
        default 2

 config HAVE_ARCH_PFN_VALID
-       def_bool y
+       bool
+       default !SPARSEMEM_VMEMMAP

 menu "Platform type"

diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 8ddb6c7fedac..80d28fa1e2eb 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -100,8 +100,10 @@ extern unsigned long min_low_pfn;
 #define page_to_bus(page)      (page_to_phys(page))
 #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))

+#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define pfn_valid(pfn) \
        (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
+#endif

 #define ARCH_PFN_OFFSET                (pfn_base)
Logan Gunthorpe Aug. 9, 2019, 7:03 p.m. UTC | #7
On 2019-08-09 11:01 a.m., Greentime Hu wrote:
> Hi Logan,
> 
> Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道:
>>
>>
>>
>> On 2019-08-08 10:23 p.m., Greentime Hu wrote:
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index 3f12b069af1d..208b3e14ccd8 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
>>>         default 2
>>>
>>>  config HAVE_ARCH_PFN_VALID
>>> -       def_bool y
>>> +       bool
>>> +       default !SPARSEMEM_VMEMMAP
>>>
>>>  menu "Platform type"
>>>
>>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>>> index 8ddb6c7fedac..6991f7a5a4a7 100644
>>> --- a/arch/riscv/include/asm/page.h
>>> +++ b/arch/riscv/include/asm/page.h
>>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn;
>>>  #define virt_to_pfn(vaddr)     (phys_to_pfn(__pa(vaddr)))
>>>  #define pfn_to_virt(pfn)       (__va(pfn_to_phys(pfn)))
>>>
>>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
>>> +#define pfn_valid(pfn) \
>>> +       (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
>>>  #define virt_to_page(vaddr)    (pfn_to_page(virt_to_pfn(vaddr)))
>>>  #define page_to_virt(page)     (pfn_to_virt(page_to_pfn(page)))
>>> +#else
>>> +#define virt_to_page(vaddr)    ((struct page *)((((u64)vaddr -
>>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START))
>>> +#define page_to_virt(pg)       ((void *)(((((u64)pg - VMEMMAP_START) /
>>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset))
>>> +#endif
>>
>> This doesn't make sense to me at all. It should always use pfn_to_page()
>> for virt_to_page() and the generic pfn_to_page()/page_to_pfn()
>> implementations essentially already do what you are doing in a cleaner
>> way. So I'd be really surprised if this does anything at all.
>>
> 
> Thank you for point me out that. I just checked the generic
> implementation and I should use that one.
> Sorry I didn't check the generic one and just implement it again.
> I think the only patch we need is the first part to use generic
> pfn_valid(). I just tested it and yes it can boot successfully in dts
> with hole.
> 
> It will fail in this check ((pfn)-pfn_base) < max_mapnr.

Sounds to me like max_mapnr is not set correctly. See the code in
setup_bootmem(). Seems like 'mem_size' should be set to the largest
memory block, not just the one that contains the kernel...


> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 3f12b069af1d..208b3e14ccd8 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
>         default 2
> 
>  config HAVE_ARCH_PFN_VALID
> -       def_bool y
> +       bool
> +       default !SPARSEMEM_VMEMMAP
> 
>  menu "Platform type"
> 
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 8ddb6c7fedac..80d28fa1e2eb 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn;
>  #define page_to_bus(page)      (page_to_phys(page))
>  #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))
> 
> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
>  #define pfn_valid(pfn) \
>         (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
> +#endif
> 
>  #define ARCH_PFN_OFFSET                (pfn_base)


This patch still makes no sense. I'm not sure why we have an arch
specific pfn_valid() because it's very similar to the generic one. But
my guess is there's a reason for it and it's not doing what it is
supposed when you remove it for the sparsemem case.

Logan
Greentime Hu Aug. 12, 2019, 4:01 a.m. UTC | #8
Hi Logan,

Logan Gunthorpe <logang@deltatee.com> 於 2019年8月10日 週六 上午3:03寫道:
>
>
>
> On 2019-08-09 11:01 a.m., Greentime Hu wrote:
> > Hi Logan,
> >
> > Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道:
> >>
> >>
> >>
> >> On 2019-08-08 10:23 p.m., Greentime Hu wrote:
> >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> >>> index 3f12b069af1d..208b3e14ccd8 100644
> >>> --- a/arch/riscv/Kconfig
> >>> +++ b/arch/riscv/Kconfig
> >>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
> >>>         default 2
> >>>
> >>>  config HAVE_ARCH_PFN_VALID
> >>> -       def_bool y
> >>> +       bool
> >>> +       default !SPARSEMEM_VMEMMAP
> >>>
> >>>  menu "Platform type"
> >>>
> >>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> >>> index 8ddb6c7fedac..6991f7a5a4a7 100644
> >>> --- a/arch/riscv/include/asm/page.h
> >>> +++ b/arch/riscv/include/asm/page.h
> >>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn;
> >>>  #define virt_to_pfn(vaddr)     (phys_to_pfn(__pa(vaddr)))
> >>>  #define pfn_to_virt(pfn)       (__va(pfn_to_phys(pfn)))
> >>>
> >>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
> >>> +#define pfn_valid(pfn) \
> >>> +       (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
> >>>  #define virt_to_page(vaddr)    (pfn_to_page(virt_to_pfn(vaddr)))
> >>>  #define page_to_virt(page)     (pfn_to_virt(page_to_pfn(page)))
> >>> +#else
> >>> +#define virt_to_page(vaddr)    ((struct page *)((((u64)vaddr -
> >>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START))
> >>> +#define page_to_virt(pg)       ((void *)(((((u64)pg - VMEMMAP_START) /
> >>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset))
> >>> +#endif
> >>
> >> This doesn't make sense to me at all. It should always use pfn_to_page()
> >> for virt_to_page() and the generic pfn_to_page()/page_to_pfn()
> >> implementations essentially already do what you are doing in a cleaner
> >> way. So I'd be really surprised if this does anything at all.
> >>
> >
> > Thank you for point me out that. I just checked the generic
> > implementation and I should use that one.
> > Sorry I didn't check the generic one and just implement it again.
> > I think the only patch we need is the first part to use generic
> > pfn_valid(). I just tested it and yes it can boot successfully in dts
> > with hole.
> >
> > It will fail in this check ((pfn)-pfn_base) < max_mapnr.
>
> Sounds to me like max_mapnr is not set correctly. See the code in
> setup_bootmem(). Seems like 'mem_size' should be set to the largest
> memory block, not just the one that contains the kernel...
>
>
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 3f12b069af1d..208b3e14ccd8 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
> >         default 2
> >
> >  config HAVE_ARCH_PFN_VALID
> > -       def_bool y
> > +       bool
> > +       default !SPARSEMEM_VMEMMAP
> >
> >  menu "Platform type"
> >
> > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> > index 8ddb6c7fedac..80d28fa1e2eb 100644
> > --- a/arch/riscv/include/asm/page.h
> > +++ b/arch/riscv/include/asm/page.h
> > @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn;
> >  #define page_to_bus(page)      (page_to_phys(page))
> >  #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))
> >
> > +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
> >  #define pfn_valid(pfn) \
> >         (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
> > +#endif
> >
> >  #define ARCH_PFN_OFFSET                (pfn_base)
>
>
> This patch still makes no sense. I'm not sure why we have an arch
> specific pfn_valid() because it's very similar to the generic one. But
> my guess is there's a reason for it and it's not doing what it is
> supposed when you remove it for the sparsemem case.

It will use another pfn_valid() implementation in
include/linux/mmzone.h if CONFIG_SPARSEMEM and
!CONFIG_HAVE_ARCH_PFN_VALID
It will be this one.

static inline int pfn_valid(unsigned long pfn)
{
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;
        return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
}

This generic pfn_valid() API can check the pfn is valid or not even if
there a hole in the memory.
For example:
A hole is between 0x100000000 to 0x180000000 (4GB-6GB) in my dts test case.

[    0.000000] In setup_bootmem, pfn_valid(0x180000)=1
[    0.000000] In setup_bootmem, pfn_valid(0x80000)=1
[    0.000000] In setup_bootmem, pfn_valid(0x80200)=1
[    0.000000] In setup_bootmem, pfn_valid(0x80300)=1
[    0.000000] In setup_bootmem, pfn_valid(0x160000)=0
[    0.000000] In setup_bootmem, pfn_valid(0x17ffff)=0
[    0.000000] In setup_bootmem, pfn_valid(0x120000)=0
[    0.000000] In setup_bootmem, pfn_valid(0x100000)=0
[    0.000000] In setup_bootmem, pfn_valid(0xfffff)=1

This generic pfn_valid() could tell the pfn is valid or not.


I think this one is only available for flatmem.
#define pfn_valid(pfn)  (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
Logan Gunthorpe Aug. 12, 2019, 3:51 p.m. UTC | #9
On 2019-08-11 10:01 p.m., Greentime Hu wrote:
> Hi Logan,
> 
> Logan Gunthorpe <logang@deltatee.com> 於 2019年8月10日 週六 上午3:03寫道:
>>
>>
>>
>> On 2019-08-09 11:01 a.m., Greentime Hu wrote:
>>> Hi Logan,
>>>
>>> Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道:
>>>>
>>>>
>>>>
>>>> On 2019-08-08 10:23 p.m., Greentime Hu wrote:
>>>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>>>> index 3f12b069af1d..208b3e14ccd8 100644
>>>>> --- a/arch/riscv/Kconfig
>>>>> +++ b/arch/riscv/Kconfig
>>>>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
>>>>>         default 2
>>>>>
>>>>>  config HAVE_ARCH_PFN_VALID
>>>>> -       def_bool y
>>>>> +       bool
>>>>> +       default !SPARSEMEM_VMEMMAP
>>>>>
>>>>>  menu "Platform type"
>>>>>
>>>>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>>>>> index 8ddb6c7fedac..6991f7a5a4a7 100644
>>>>> --- a/arch/riscv/include/asm/page.h
>>>>> +++ b/arch/riscv/include/asm/page.h
>>>>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn;
>>>>>  #define virt_to_pfn(vaddr)     (phys_to_pfn(__pa(vaddr)))
>>>>>  #define pfn_to_virt(pfn)       (__va(pfn_to_phys(pfn)))
>>>>>
>>>>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
>>>>> +#define pfn_valid(pfn) \
>>>>> +       (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
>>>>>  #define virt_to_page(vaddr)    (pfn_to_page(virt_to_pfn(vaddr)))
>>>>>  #define page_to_virt(page)     (pfn_to_virt(page_to_pfn(page)))
>>>>> +#else
>>>>> +#define virt_to_page(vaddr)    ((struct page *)((((u64)vaddr -
>>>>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START))
>>>>> +#define page_to_virt(pg)       ((void *)(((((u64)pg - VMEMMAP_START) /
>>>>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset))
>>>>> +#endif
>>>>
>>>> This doesn't make sense to me at all. It should always use pfn_to_page()
>>>> for virt_to_page() and the generic pfn_to_page()/page_to_pfn()
>>>> implementations essentially already do what you are doing in a cleaner
>>>> way. So I'd be really surprised if this does anything at all.
>>>>
>>>
>>> Thank you for point me out that. I just checked the generic
>>> implementation and I should use that one.
>>> Sorry I didn't check the generic one and just implement it again.
>>> I think the only patch we need is the first part to use generic
>>> pfn_valid(). I just tested it and yes it can boot successfully in dts
>>> with hole.
>>>
>>> It will fail in this check ((pfn)-pfn_base) < max_mapnr.
>>
>> Sounds to me like max_mapnr is not set correctly. See the code in
>> setup_bootmem(). Seems like 'mem_size' should be set to the largest
>> memory block, not just the one that contains the kernel...
>>
>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index 3f12b069af1d..208b3e14ccd8 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
>>>         default 2
>>>
>>>  config HAVE_ARCH_PFN_VALID
>>> -       def_bool y
>>> +       bool
>>> +       default !SPARSEMEM_VMEMMAP
>>>
>>>  menu "Platform type"
>>>
>>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>>> index 8ddb6c7fedac..80d28fa1e2eb 100644
>>> --- a/arch/riscv/include/asm/page.h
>>> +++ b/arch/riscv/include/asm/page.h
>>> @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn;
>>>  #define page_to_bus(page)      (page_to_phys(page))
>>>  #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))
>>>
>>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
>>>  #define pfn_valid(pfn) \
>>>         (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
>>> +#endif
>>>
>>>  #define ARCH_PFN_OFFSET                (pfn_base)
>>
>>
>> This patch still makes no sense. I'm not sure why we have an arch
>> specific pfn_valid() because it's very similar to the generic one. But
>> my guess is there's a reason for it and it's not doing what it is
>> supposed when you remove it for the sparsemem case.
> 
> It will use another pfn_valid() implementation in
> include/linux/mmzone.h if CONFIG_SPARSEMEM and
> !CONFIG_HAVE_ARCH_PFN_VALID
> It will be this one.
> 
> static inline int pfn_valid(unsigned long pfn)
> {
>         if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
>                 return 0;
>         return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
> }

Ah, ok I see. "page.h" is only included in no-mmu arches. Which explains
why riscv re-implements that macro. Couple follow up questions then:

* Did you test the memory-with-hole scenario without the sparsemem
patches? It seems pfn_valid() will be wrong regardless of sparse/flat mem.

* Any chance we can just use the generic pfn_valid() function in all
cases not just sparsemem? Can you test that?

Thanks,

Logan
Greentime Hu Aug. 13, 2019, 6:04 a.m. UTC | #10
Hi Logan,

Logan Gunthorpe <logang@deltatee.com> 於 2019年8月12日 週一 下午11:52寫道:
>
>
>
> On 2019-08-11 10:01 p.m., Greentime Hu wrote:
> > Hi Logan,
> >
> > Logan Gunthorpe <logang@deltatee.com> 於 2019年8月10日 週六 上午3:03寫道:
> >>
> >>
> >>
> >> On 2019-08-09 11:01 a.m., Greentime Hu wrote:
> >>> Hi Logan,
> >>>
> >>> Logan Gunthorpe <logang@deltatee.com> 於 2019年8月9日 週五 下午11:47寫道:
> >>>>
> >>>>
> >>>>
> >>>> On 2019-08-08 10:23 p.m., Greentime Hu wrote:
> >>>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> >>>>> index 3f12b069af1d..208b3e14ccd8 100644
> >>>>> --- a/arch/riscv/Kconfig
> >>>>> +++ b/arch/riscv/Kconfig
> >>>>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
> >>>>>         default 2
> >>>>>
> >>>>>  config HAVE_ARCH_PFN_VALID
> >>>>> -       def_bool y
> >>>>> +       bool
> >>>>> +       default !SPARSEMEM_VMEMMAP
> >>>>>
> >>>>>  menu "Platform type"
> >>>>>
> >>>>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> >>>>> index 8ddb6c7fedac..6991f7a5a4a7 100644
> >>>>> --- a/arch/riscv/include/asm/page.h
> >>>>> +++ b/arch/riscv/include/asm/page.h
> >>>>> @@ -93,16 +93,20 @@ extern unsigned long min_low_pfn;
> >>>>>  #define virt_to_pfn(vaddr)     (phys_to_pfn(__pa(vaddr)))
> >>>>>  #define pfn_to_virt(pfn)       (__va(pfn_to_phys(pfn)))
> >>>>>
> >>>>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
> >>>>> +#define pfn_valid(pfn) \
> >>>>> +       (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
> >>>>>  #define virt_to_page(vaddr)    (pfn_to_page(virt_to_pfn(vaddr)))
> >>>>>  #define page_to_virt(page)     (pfn_to_virt(page_to_pfn(page)))
> >>>>> +#else
> >>>>> +#define virt_to_page(vaddr)    ((struct page *)((((u64)vaddr -
> >>>>> va_pa_offset) / PAGE_SIZE) * sizeof(struct page) + VMEMMAP_START))
> >>>>> +#define page_to_virt(pg)       ((void *)(((((u64)pg - VMEMMAP_START) /
> >>>>> sizeof(struct page)) * PAGE_SIZE) + va_pa_offset))
> >>>>> +#endif
> >>>>
> >>>> This doesn't make sense to me at all. It should always use pfn_to_page()
> >>>> for virt_to_page() and the generic pfn_to_page()/page_to_pfn()
> >>>> implementations essentially already do what you are doing in a cleaner
> >>>> way. So I'd be really surprised if this does anything at all.
> >>>>
> >>>
> >>> Thank you for point me out that. I just checked the generic
> >>> implementation and I should use that one.
> >>> Sorry I didn't check the generic one and just implement it again.
> >>> I think the only patch we need is the first part to use generic
> >>> pfn_valid(). I just tested it and yes it can boot successfully in dts
> >>> with hole.
> >>>
> >>> It will fail in this check ((pfn)-pfn_base) < max_mapnr.
> >>
> >> Sounds to me like max_mapnr is not set correctly. See the code in
> >> setup_bootmem(). Seems like 'mem_size' should be set to the largest
> >> memory block, not just the one that contains the kernel...
> >>
> >>
> >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> >>> index 3f12b069af1d..208b3e14ccd8 100644
> >>> --- a/arch/riscv/Kconfig
> >>> +++ b/arch/riscv/Kconfig
> >>> @@ -116,7 +116,8 @@ config PGTABLE_LEVELS
> >>>         default 2
> >>>
> >>>  config HAVE_ARCH_PFN_VALID
> >>> -       def_bool y
> >>> +       bool
> >>> +       default !SPARSEMEM_VMEMMAP
> >>>
> >>>  menu "Platform type"
> >>>
> >>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> >>> index 8ddb6c7fedac..80d28fa1e2eb 100644
> >>> --- a/arch/riscv/include/asm/page.h
> >>> +++ b/arch/riscv/include/asm/page.h
> >>> @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn;
> >>>  #define page_to_bus(page)      (page_to_phys(page))
> >>>  #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))
> >>>
> >>> +#if !defined(CONFIG_SPARSEMEM_VMEMMAP)
> >>>  #define pfn_valid(pfn) \
> >>>         (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
> >>> +#endif
> >>>
> >>>  #define ARCH_PFN_OFFSET                (pfn_base)
> >>
> >>
> >> This patch still makes no sense. I'm not sure why we have an arch
> >> specific pfn_valid() because it's very similar to the generic one. But
> >> my guess is there's a reason for it and it's not doing what it is
> >> supposed when you remove it for the sparsemem case.
> >
> > It will use another pfn_valid() implementation in
> > include/linux/mmzone.h if CONFIG_SPARSEMEM and
> > !CONFIG_HAVE_ARCH_PFN_VALID
> > It will be this one.
> >
> > static inline int pfn_valid(unsigned long pfn)
> > {
> >         if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
> >                 return 0;
> >         return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
> > }
>
> Ah, ok I see. "page.h" is only included in no-mmu arches. Which explains
> why riscv re-implements that macro. Couple follow up questions then:
>
> * Did you test the memory-with-hole scenario without the sparsemem
> patches? It seems pfn_valid() will be wrong regardless of sparse/flat mem.
>
> * Any chance we can just use the generic pfn_valid() function in all
> cases not just sparsemem? Can you test that?
>

I think  flat mem doesn't support memory-with-hole scenario.
In mm/Kconfig, it says
"
          For systems that have holes in their physical address
          spaces and for features like NUMA and memory hotplug,
          choose "Sparse Memory"
"
IMHO, the memory-with-hole scenario should only be tested for sparse
mem but flat mem.

The generic pfn_valid() is just for non-mmu arches. Every architecture
with mmu defines their own pfn_valid().
This is supposed to be another separate patch that do we need to
implement a generic pfn_valid().
Logan Gunthorpe Aug. 13, 2019, 4:14 p.m. UTC | #11
On 2019-08-13 12:04 a.m., Greentime Hu wrote:
> I think  flat mem doesn't support memory-with-hole scenario.
> In mm/Kconfig, it says
> "
>           For systems that have holes in their physical address
>           spaces and for features like NUMA and memory hotplug,
>           choose "Sparse Memory"
> "
> IMHO, the memory-with-hole scenario should only be tested for sparse
> mem but flat mem.

Fair enough.

> The generic pfn_valid() is just for non-mmu arches. 

The generic pfn_valid() in asm-generic is only for non-mmu arches.

> Every architecture
> with mmu defines their own pfn_valid().

Not true. Arm64, for example just uses the generic implementation in
mmzone.h. My main question is whether we can just do that. If we can't
we should probably structure it like powerpc where they only use the
arch-specific helper for CONFIG_FLATMEM instead of when CONFIG_SPARSEMEM
isn't set.

Logan
Paul Walmsley Aug. 13, 2019, 4:39 p.m. UTC | #12
On Tue, 13 Aug 2019, Logan Gunthorpe wrote:

> On 2019-08-13 12:04 a.m., Greentime Hu wrote:
> 
> > Every architecture with mmu defines their own pfn_valid().
> 
> Not true. Arm64, for example just uses the generic implementation in
> mmzone.h. 

arm64 seems to define their own:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235

While there are many architectures which have their own pfn_valid(); 
oddly, almost none of them set HAVE_ARCH_PFN_VALID ?


- Paul
Paul Walmsley Aug. 13, 2019, 4:48 p.m. UTC | #13
On Tue, 13 Aug 2019, Paul Walmsley wrote:

> On Tue, 13 Aug 2019, Logan Gunthorpe wrote:
> 
> > On 2019-08-13 12:04 a.m., Greentime Hu wrote:
> > 
> > > Every architecture with mmu defines their own pfn_valid().
> > 
> > Not true. Arm64, for example just uses the generic implementation in
> > mmzone.h. 
> 
> arm64 seems to define their own:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235
> 
> While there are many architectures which have their own pfn_valid(); 
> oddly, almost none of them set HAVE_ARCH_PFN_VALID ?

(fixed the linux-mm@ address)


- Paul
Logan Gunthorpe Aug. 13, 2019, 4:49 p.m. UTC | #14
On 2019-08-13 10:39 a.m., Paul Walmsley wrote:
> On Tue, 13 Aug 2019, Logan Gunthorpe wrote:
> 
>> On 2019-08-13 12:04 a.m., Greentime Hu wrote:
>>
>>> Every architecture with mmu defines their own pfn_valid().
>>
>> Not true. Arm64, for example just uses the generic implementation in
>> mmzone.h. 
> 
> arm64 seems to define their own:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899

Oh, yup. My mistake.

> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235
> 
> While there are many architectures which have their own pfn_valid(); 
> oddly, almost none of them set HAVE_ARCH_PFN_VALID ?

Yes, much of this is super confusing. Seems HAVE_ARCH_PFN_VALID only
matters if SPARSEMEM is set. So risc-v probably doesn't need to set it
and we just need a #ifdef !CONFIG_FLATMEM around the pfn_valid
definition like other arches.

Logan
Greentime Hu Aug. 14, 2019, 1:35 p.m. UTC | #15
Logan Gunthorpe <logang@deltatee.com> 於 2019年8月14日 週三 上午12:50寫道:
>
> On 2019-08-13 10:39 a.m., Paul Walmsley wrote:
> > On Tue, 13 Aug 2019, Logan Gunthorpe wrote:
> >
> >> On 2019-08-13 12:04 a.m., Greentime Hu wrote:
> >>
> >>> Every architecture with mmu defines their own pfn_valid().
> >>
> >> Not true. Arm64, for example just uses the generic implementation in
> >> mmzone.h.
> >
> > arm64 seems to define their own:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899
>
> Oh, yup. My mistake.
>
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235
> >
> > While there are many architectures which have their own pfn_valid();
> > oddly, almost none of them set HAVE_ARCH_PFN_VALID ?
>
> Yes, much of this is super confusing. Seems HAVE_ARCH_PFN_VALID only
> matters if SPARSEMEM is set. So risc-v probably doesn't need to set it
> and we just need a #ifdef !CONFIG_FLATMEM around the pfn_valid
> definition like other arches.
>

Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM.
https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85

BTW, I found another issue here.
#define FIXADDR_TOP            (VMALLOC_START)
#define FIXADDR_START           (FIXADDR_TOP - FIXADDR_SIZE)
#define VMEMMAP_END    (VMALLOC_START - 1)
#define VMEMMAP_START  (VMALLOC_START - VMEMMAP_SIZE)
These 2 regions are overlapped.

How about this fix? Not sure if it is good for everyone.

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3f12b069af1d..3c4d394679d0 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -115,9 +115,6 @@ config PGTABLE_LEVELS
        default 3 if 64BIT
        default 2

-config HAVE_ARCH_PFN_VALID
-       def_bool y
-
 menu "Platform type"

 choice
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index c207f6634b91..72e106b60bc5 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -26,7 +26,7 @@ enum fixed_addresses {
 };

 #define FIXADDR_SIZE           (__end_of_fixed_addresses * PAGE_SIZE)
-#define FIXADDR_TOP            (VMALLOC_START)
+#define FIXADDR_TOP            (VMEMMAP_START)
 #define FIXADDR_START          (FIXADDR_TOP - FIXADDR_SIZE)

 #define FIXMAP_PAGE_IO         PAGE_KERNEL
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 8ddb6c7fedac..83830997dce6 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -100,8 +100,10 @@ extern unsigned long min_low_pfn;
 #define page_to_bus(page)      (page_to_phys(page))
 #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))

+#if defined(CONFIG_FLATMEM)
 #define pfn_valid(pfn) \
        (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
+#endif

 #define ARCH_PFN_OFFSET                (pfn_base)
Logan Gunthorpe Aug. 14, 2019, 4:56 p.m. UTC | #16
On 2019-08-14 7:35 a.m., Greentime Hu wrote:
> Logan Gunthorpe <logang@deltatee.com> 於 2019年8月14日 週三 上午12:50寫道:
>>
>> On 2019-08-13 10:39 a.m., Paul Walmsley wrote:
>>> On Tue, 13 Aug 2019, Logan Gunthorpe wrote:
>>>
>>>> On 2019-08-13 12:04 a.m., Greentime Hu wrote:
>>>>
>>>>> Every architecture with mmu defines their own pfn_valid().
>>>>
>>>> Not true. Arm64, for example just uses the generic implementation in
>>>> mmzone.h.
>>>
>>> arm64 seems to define their own:
>>>
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig#n899
>>
>> Oh, yup. My mistake.
>>
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/mm/init.c#n235
>>>
>>> While there are many architectures which have their own pfn_valid();
>>> oddly, almost none of them set HAVE_ARCH_PFN_VALID ?
>>
>> Yes, much of this is super confusing. Seems HAVE_ARCH_PFN_VALID only
>> matters if SPARSEMEM is set. So risc-v probably doesn't need to set it
>> and we just need a #ifdef !CONFIG_FLATMEM around the pfn_valid
>> definition like other arches.
>>
> 
> Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM.
> https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85
> 
> BTW, I found another issue here.
> #define FIXADDR_TOP            (VMALLOC_START)
> #define FIXADDR_START           (FIXADDR_TOP - FIXADDR_SIZE)
> #define VMEMMAP_END    (VMALLOC_START - 1)
> #define VMEMMAP_START  (VMALLOC_START - VMEMMAP_SIZE)
> These 2 regions are overlapped.
> 
> How about this fix? Not sure if it is good for everyone.

Yes, this looks good to me. I can fold these changes into my patch and
send a v5 to the list.

Thanks!

Logan


> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 3f12b069af1d..3c4d394679d0 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -115,9 +115,6 @@ config PGTABLE_LEVELS
>         default 3 if 64BIT
>         default 2
> 
> -config HAVE_ARCH_PFN_VALID
> -       def_bool y
> -
>  menu "Platform type"
> 
>  choice
> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> index c207f6634b91..72e106b60bc5 100644
> --- a/arch/riscv/include/asm/fixmap.h
> +++ b/arch/riscv/include/asm/fixmap.h
> @@ -26,7 +26,7 @@ enum fixed_addresses {
>  };
> 
>  #define FIXADDR_SIZE           (__end_of_fixed_addresses * PAGE_SIZE)
> -#define FIXADDR_TOP            (VMALLOC_START)
> +#define FIXADDR_TOP            (VMEMMAP_START)
>  #define FIXADDR_START          (FIXADDR_TOP - FIXADDR_SIZE)
> 
>  #define FIXMAP_PAGE_IO         PAGE_KERNEL
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 8ddb6c7fedac..83830997dce6 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -100,8 +100,10 @@ extern unsigned long min_low_pfn;
>  #define page_to_bus(page)      (page_to_phys(page))
>  #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))
> 
> +#if defined(CONFIG_FLATMEM)
>  #define pfn_valid(pfn) \
>         (((pfn) >= pfn_base) && (((pfn)-pfn_base) < max_mapnr))
> +#endif
> 
>  #define ARCH_PFN_OFFSET                (pfn_base)
Paul Walmsley Aug. 14, 2019, 5:40 p.m. UTC | #17
On Wed, 14 Aug 2019, Logan Gunthorpe wrote:

> On 2019-08-14 7:35 a.m., Greentime Hu wrote:
>
> > Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM.
> > https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85
> > 
> > BTW, I found another issue here.
> > #define FIXADDR_TOP            (VMALLOC_START)
> > #define FIXADDR_START           (FIXADDR_TOP - FIXADDR_SIZE)
> > #define VMEMMAP_END    (VMALLOC_START - 1)
> > #define VMEMMAP_START  (VMALLOC_START - VMEMMAP_SIZE)
> > These 2 regions are overlapped.
> > 
> > How about this fix? Not sure if it is good for everyone.
> 
> Yes, this looks good to me. I can fold these changes into my patch and
> send a v5 to the list.

The change to FIXADDR_TOP should be separated out into its own patch - it 
probably needs to go up as a fix.


- Paul
Logan Gunthorpe Aug. 14, 2019, 5:46 p.m. UTC | #18
On 2019-08-14 11:40 a.m., Paul Walmsley wrote:
> On Wed, 14 Aug 2019, Logan Gunthorpe wrote:
> 
>> On 2019-08-14 7:35 a.m., Greentime Hu wrote:
>>
>>> Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM.
>>> https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85
>>>
>>> BTW, I found another issue here.
>>> #define FIXADDR_TOP            (VMALLOC_START)
>>> #define FIXADDR_START           (FIXADDR_TOP - FIXADDR_SIZE)
>>> #define VMEMMAP_END    (VMALLOC_START - 1)
>>> #define VMEMMAP_START  (VMALLOC_START - VMEMMAP_SIZE)
>>> These 2 regions are overlapped.
>>>
>>> How about this fix? Not sure if it is good for everyone.
>>
>> Yes, this looks good to me. I can fold these changes into my patch and
>> send a v5 to the list.
> 
> The change to FIXADDR_TOP should be separated out into its own patch - it 
> probably needs to go up as a fix.

I don't think so... VMEMMAP_START doesn't exist until the sparsemem
patch so it can't be changed until after the sparsemem patch and no
sense adding a bug in the sparsemem patch...

Logan
Paul Walmsley Aug. 14, 2019, 8:09 p.m. UTC | #19
On Wed, 14 Aug 2019, Logan Gunthorpe wrote:

> On 2019-08-14 11:40 a.m., Paul Walmsley wrote:
> > On Wed, 14 Aug 2019, Logan Gunthorpe wrote:
> > 
> >> On 2019-08-14 7:35 a.m., Greentime Hu wrote:
> >>
> >>> Maybe this commit explains why it used HAVE_ARCH_PFN_VALID instead of SPARSEMEM.
> >>> https://github.com/torvalds/linux/commit/7b7bf499f79de3f6c85a340c8453a78789523f85
> >>>
> >>> BTW, I found another issue here.
> >>> #define FIXADDR_TOP            (VMALLOC_START)
> >>> #define FIXADDR_START           (FIXADDR_TOP - FIXADDR_SIZE)
> >>> #define VMEMMAP_END    (VMALLOC_START - 1)
> >>> #define VMEMMAP_START  (VMALLOC_START - VMEMMAP_SIZE)
> >>> These 2 regions are overlapped.
> >>>
> >>> How about this fix? Not sure if it is good for everyone.
> >>
> >> Yes, this looks good to me. I can fold these changes into my patch and
> >> send a v5 to the list.
> > 
> > The change to FIXADDR_TOP should be separated out into its own patch - it 
> > probably needs to go up as a fix.
> 
> I don't think so... VMEMMAP_START doesn't exist until the sparsemem
> patch so it can't be changed until after the sparsemem patch and no
> sense adding a bug in the sparsemem patch...

OK, that's fine then.


- Paul
Logan Gunthorpe Aug. 14, 2019, 10:21 p.m. UTC | #20
Hey,

On 2019-08-14 7:35 a.m., Greentime Hu wrote:
> How about this fix? Not sure if it is good for everyone.

I applied your fix to the patch and it seems ok. But it doesn't seem to
work on a recent version of the kernel. Have you got it working on v5.3?
It seems the following patch breaks things:

671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages")

I don't really have time right now to debug this any further.

Logan
Greentime Hu Aug. 15, 2019, 9:31 a.m. UTC | #21
Hi Logan,

On Thu, Aug 15, 2019 at 6:21 AM Logan Gunthorpe <logang@deltatee.com> wrote:
>
> Hey,
>
> On 2019-08-14 7:35 a.m., Greentime Hu wrote:
> > How about this fix? Not sure if it is good for everyone.
>
> I applied your fix to the patch and it seems ok. But it doesn't seem to
> work on a recent version of the kernel. Have you got it working on v5.3?
> It seems the following patch breaks things:
>
> 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages")
>
> I don't really have time right now to debug this any further.
>

I just tried v5.3-rc4 and it failed. I try to debug this case.
I found it failed might be because of an unmapped virtual address is used
in memblocks_present() -> memblock_alloc ().

In this commit 671f9a3e2e24 ("RISC-V: Setup initial page tables in two
stages"), it finishes all the VA/PA mapping after setup_vm_final() is
called.
So we have to call memblocks_present() and sparse_init() right after
setup_vm_final().

Here is my patch and I tested with memory-with-hole.
It can boot normally in Unleashed board after applying this patch.

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 27d1d847fb2d..35aacb0c93e5 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -138,8 +138,6 @@ void __init setup_bootmem(void)
                                  PFN_PHYS(end_pfn - start_pfn),
                                  &memblock.memory, 0);
        }
-       memblocks_present();
-       sparse_init();
 }

 unsigned long va_pa_offset;
@@ -452,6 +450,8 @@ static void __init setup_vm_final(void)
 void __init paging_init(void)
 {
        setup_vm_final();
+       memblocks_present();
+       sparse_init();
        setup_zero_page();
        zone_sizes_init();
 }

Test case:
memory@80000000 {
        device_type = "memory";
        reg = <0x0 0x80000000 0x0 0x80000000>;
};
memory@180000000 {
        device_type = "memory";
        reg = <0x1 0x80000000 0x0 0x40000000>;
};


# cat /proc/meminfo
MemTotal:        3003496 kB
MemFree:         2986584 kB
MemAvailable:    2970176 kB
Buffers:               0 kB
Cached:             3540 kB
SwapCached:            0 kB
Active:             3920 kB
Inactive:             68 kB
Active(anon):       3920 kB
Inactive(anon):       68 kB
Active(file):          0 kB
Inactive(file):        0 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:                 0 kB
Writeback:             0 kB
AnonPages:           528 kB
Mapped:             1984 kB
Shmem:              3540 kB
KReclaimable:        688 kB
Slab:               5700 kB
SReclaimable:        688 kB
SUnreclaim:         5012 kB
KernelStack:         424 kB
PageTables:           80 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:     1501748 kB
Committed_AS:       3200 kB
VmallocTotal:   67108863 kB
VmallocUsed:          12 kB
VmallocChunk:          0 kB
Percpu:              272 kB
# uname -a
Linux buildroot 5.3.0-rc4-00001-g44404421c481-dirty #10 SMP Thu Aug 15
16:28:24 DST 2019 riscv64 GNU/Lin[ 2352.443621] random: fast init done
ux
# zcat /proc/config.gz |grep SPARSE
CONFIG_SPARSE_IRQ=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_SPARSEMEM_MANUAL=y
CONFIG_SPARSEMEM=y
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_VMEMMAP=y
# CONFIG_INPUT_SPARSEKMAP is not set
Logan Gunthorpe Aug. 15, 2019, 4:20 p.m. UTC | #22
On 2019-08-15 3:31 a.m., Greentime Hu wrote:
> Hi Logan,
> 
> On Thu, Aug 15, 2019 at 6:21 AM Logan Gunthorpe <logang@deltatee.com> wrote:
>>
>> Hey,
>>
>> On 2019-08-14 7:35 a.m., Greentime Hu wrote:
>>> How about this fix? Not sure if it is good for everyone.
>>
>> I applied your fix to the patch and it seems ok. But it doesn't seem to
>> work on a recent version of the kernel. Have you got it working on v5.3?
>> It seems the following patch breaks things:
>>
>> 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages")
>>
>> I don't really have time right now to debug this any further.
>>
> 
> I just tried v5.3-rc4 and it failed. I try to debug this case.
> I found it failed might be because of an unmapped virtual address is used
> in memblocks_present() -> memblock_alloc ().
> 
> In this commit 671f9a3e2e24 ("RISC-V: Setup initial page tables in two
> stages"), it finishes all the VA/PA mapping after setup_vm_final() is
> called.
> So we have to call memblocks_present() and sparse_init() right after
> setup_vm_final().
> 
> Here is my patch and I tested with memory-with-hole.
> It can boot normally in Unleashed board after applying this patch.

Great, thanks! I'll roll this into my patch and send v5 out when I have
a moment. Can I add your Signed-off-by for the bits you've contributed
to give you credit for your work?

Logan
Greentime Hu Aug. 16, 2019, 2:07 a.m. UTC | #23
On Fri, Aug 16, 2019 at 12:20 AM Logan Gunthorpe <logang@deltatee.com> wrote:
>
>
>
> On 2019-08-15 3:31 a.m., Greentime Hu wrote:
> > Hi Logan,
> >
> > On Thu, Aug 15, 2019 at 6:21 AM Logan Gunthorpe <logang@deltatee.com> wrote:
> >>
> >> Hey,
> >>
> >> On 2019-08-14 7:35 a.m., Greentime Hu wrote:
> >>> How about this fix? Not sure if it is good for everyone.
> >>
> >> I applied your fix to the patch and it seems ok. But it doesn't seem to
> >> work on a recent version of the kernel. Have you got it working on v5.3?
> >> It seems the following patch breaks things:
> >>
> >> 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages")
> >>
> >> I don't really have time right now to debug this any further.
> >>
> >
> > I just tried v5.3-rc4 and it failed. I try to debug this case.
> > I found it failed might be because of an unmapped virtual address is used
> > in memblocks_present() -> memblock_alloc ().
> >
> > In this commit 671f9a3e2e24 ("RISC-V: Setup initial page tables in two
> > stages"), it finishes all the VA/PA mapping after setup_vm_final() is
> > called.
> > So we have to call memblocks_present() and sparse_init() right after
> > setup_vm_final().
> >
> > Here is my patch and I tested with memory-with-hole.
> > It can boot normally in Unleashed board after applying this patch.
>
> Great, thanks! I'll roll this into my patch and send v5 out when I have
> a moment. Can I add your Signed-off-by for the bits you've contributed
> to give you credit for your work?

Sure. Please use my Signed-off-by: Greentime Hu <greentime.hu@sifive.com>
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e0d7d61779a6..bd659327bc6b 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -54,12 +54,32 @@  config ZONE_DMA32
 	bool
 	default y if 64BIT
 
+config VA_BITS
+	int
+	default 32 if 32BIT
+	default 39 if 64BIT
+
+config PA_BITS
+	int
+	default 34 if 32BIT
+	default 56 if 64BIT
+
 config PAGE_OFFSET
 	hex
 	default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
 	default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
 	default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB
 
+config ARCH_FLATMEM_ENABLE
+	def_bool y
+
+config ARCH_SPARSEMEM_ENABLE
+	def_bool y
+	select SPARSEMEM_VMEMMAP_ENABLE
+
+config ARCH_SELECT_MEMORY_MODEL
+	def_bool ARCH_SPARSEMEM_ENABLE
+
 config STACKTRACE_SUPPORT
 	def_bool y
 
@@ -94,6 +114,9 @@  config PGTABLE_LEVELS
 config HAVE_KPROBES
 	def_bool n
 
+config HAVE_ARCH_PFN_VALID
+	def_bool y
+
 menu "Platform type"
 
 choice
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 16301966d65b..e1162336f5ea 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -89,6 +89,23 @@  extern pgd_t swapper_pg_dir[];
 #define __S110	PAGE_SHARED_EXEC
 #define __S111	PAGE_SHARED_EXEC
 
+#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
+#define VMALLOC_END      (PAGE_OFFSET - 1)
+#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
+
+/*
+ * Roughly size the vmemmap space to be large enough to fit enough
+ * struct pages to map half the virtual address space. Then
+ * position vmemmap directly below the VMALLOC region.
+ */
+#define VMEMMAP_SHIFT \
+	(CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
+#define VMEMMAP_SIZE	(1UL << VMEMMAP_SHIFT)
+#define VMEMMAP_END	(VMALLOC_START - 1)
+#define VMEMMAP_START	(VMALLOC_START - VMEMMAP_SIZE)
+
+#define vmemmap		((struct page *)VMEMMAP_START)
+
 /*
  * ZERO_PAGE is a global shared page that is always zero,
  * used for zero-mapped memory areas, etc.
@@ -411,10 +428,6 @@  static inline void pgtable_cache_init(void)
 	/* No page table caches to initialize */
 }
 
-#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
-#define VMALLOC_END      (PAGE_OFFSET - 1)
-#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
-
 /*
  * Task size is 0x40000000000 for RV64 or 0xb800000 for RV32.
  * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h
new file mode 100644
index 000000000000..b58ba2d9ed6e
--- /dev/null
+++ b/arch/riscv/include/asm/sparsemem.h
@@ -0,0 +1,11 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_SPARSEMEM_H
+#define __ASM_SPARSEMEM_H
+
+#ifdef CONFIG_SPARSEMEM
+#define MAX_PHYSMEM_BITS	CONFIG_PA_BITS
+#define SECTION_SIZE_BITS	27
+#endif /* CONFIG_SPARSEMEM */
+
+#endif /* __ASM_SPARSEMEM_H */
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index fc8006a042eb..98f39adefb1a 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -193,6 +193,9 @@  static void __init setup_bootmem(void)
 		                  PFN_PHYS(end_pfn - start_pfn),
 		                  &memblock.memory, 0);
 	}
+
+	memblocks_present();
+	sparse_init();
 }
 
 void __init setup_arch(char **cmdline_p)
@@ -224,4 +227,3 @@  void __init setup_arch(char **cmdline_p)
 
 	riscv_fill_hwcap();
 }
-
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 1d9bfaff60bc..09568d5bc5b8 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -69,3 +69,11 @@  void free_initrd_mem(unsigned long start, unsigned long end)
 {
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
+
+#ifdef CONFIG_SPARSEMEM
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+			       struct vmem_altmap *altmap)
+{
+	return vmemmap_populate_basepages(start, end, node);
+}
+#endif