
[v2,4/8] riscv: mm: Add memory hotplugging support

Message ID 20240514140446.538622-5-bjorn@kernel.org (mailing list archive)
State Superseded
Series riscv: Memory Hot(Un)Plug support

Checks

Context Check Description
conchuod/vmtest-fixes-PR fail PR summary
conchuod/patch-4-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-4-test-2 success .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-4-test-3 success .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-4-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-4-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-4-test-6 success .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-4-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-4-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-4-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-4-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-4-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-4-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Björn Töpel May 14, 2024, 2:04 p.m. UTC
From: Björn Töpel <bjorn@rivosinc.com>

For an architecture to support memory hotplugging, a couple of
callbacks need to be implemented:

 arch_add_memory()
  This callback is responsible for adding the physical memory into the
  direct map, and calls into the generic memory hotplugging code via
  __add_pages(), which adds the corresponding struct page entries and
  updates the vmemmap mapping.

 arch_remove_memory()
  This is the inverse of the callback above.

 vmemmap_free()
  This function tears down the vmemmap mappings (if
  CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
  backing vmemmap pages. Note that for persistent memory, an
  alternative allocator for the backing pages can be used: the
  vmem_altmap. This means that when the backing pages are freed,
  extra care is needed so that the correct deallocation method is
  used.

 arch_get_mappable_range()
  This function returns the PA range that the direct map can cover.
  Used by the MHP internals for sanity checks.
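
For orientation, the sketch below shows roughly how the generic memory
hotplug (MHP) core ends up driving these callbacks. It is a simplified
view of the mm/memory_hotplug.c flow, not the exact code:

/*
 * Hotplug (simplified):
 *
 *   add_memory(nid, start, size, flags)
 *     -> range sanity-checked against arch_get_mappable_range()
 *     -> arch_add_memory(nid, start, size, params)
 *          -> arch maps [start, start + size) into the direct map
 *          -> __add_pages() creates the struct pages / vmemmap
 *
 * Hotunplug (simplified):
 *
 *   remove_memory(start, size)
 *     -> arch_remove_memory(start, size, altmap)
 *          -> __remove_pages() tears down the struct pages, reaching
 *             vmemmap_free() via the sparsemem teardown path
 *          -> arch unmaps the range from the direct map
 */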

The page table unmap/teardown functions are heavily based on code from
the x86 tree. The same remove_pgd_mapping() function is used in both
vmemmap_free() and arch_remove_memory(), but in the latter function
the backing pages are not removed.
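
Concretely, the two call sites differ only in the is_vmemmap flag
(taken from the patch below):

	/* arch_remove_memory() path: unmap only, keep backing pages */
	remove_pgd_mapping(va, end, false, NULL);

	/* vmemmap_free() path: unmap and free the backing pages */
	remove_pgd_mapping(start, end, true, altmap);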

Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
---
 arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)

Comments

David Hildenbrand May 14, 2024, 4:04 p.m. UTC | #1
On 14.05.24 16:04, Björn Töpel wrote:
> From: Björn Töpel <bjorn@rivosinc.com>
> 
> For an architecture to support memory hotplugging, a couple of
> callbacks need to be implemented:
> 
>   arch_add_memory()
>    This callback is responsible for adding the physical memory into the
>    direct map, and calls into the generic memory hotplugging code via
>    __add_pages(), which adds the corresponding struct page entries and
>    updates the vmemmap mapping.
> 
>   arch_remove_memory()
>    This is the inverse of the callback above.
> 
>   vmemmap_free()
>    This function tears down the vmemmap mappings (if
>    CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>    backing vmemmap pages. Note that for persistent memory, an
>    alternative allocator for the backing pages can be used: the
>    vmem_altmap. This means that when the backing pages are freed,
>    extra care is needed so that the correct deallocation method is
>    used.
> 
>   arch_get_mappable_range()
>    This function returns the PA range that the direct map can cover.
>    Used by the MHP internals for sanity checks.
> 
> The page table unmap/teardown functions are heavily based on code from
> the x86 tree. The same remove_pgd_mapping() function is used in both
> vmemmap_free() and arch_remove_memory(), but in the latter function
> the backing pages are not removed.
> 
> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
> ---
>   arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 242 insertions(+)
> 
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 6f72b0b2b854..7f0b921a3d3a 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
>   	}
>   }
>   #endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +	pte_t *pte;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = pte_start + i;
> +		if (!pte_none(*pte))
> +			return;
> +	}
> +
> +	free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
> +	pmd_clear(pmd);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +	pmd_t *pmd;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		pmd = pmd_start + i;
> +		if (!pmd_none(*pmd))
> +			return;
> +	}
> +
> +	free_pages((unsigned long)page_address(pud_page(*pud)), 0);
> +	pud_clear(pud);
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
> +{
> +	pud_t *pud;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		pud = pud_start + i;
> +		if (!pud_none(*pud))
> +			return;
> +	}
> +
> +	free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
> +	p4d_clear(p4d);
> +}
> +
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> +					   struct vmem_altmap *altmap)
> +{
> +	if (altmap)
> +		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> +	else
> +		free_pages((unsigned long)page_address(page), get_order(size));

If you unplug a DIMM that was added during boot (can happen on x86-64, 
can it happen on riscv?), free_pages() would not be sufficient. You'd be 
freeing a PG_reserved page that has to be freed differently.
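
For reference, x86 handles this case in arch/x86/mm/init_64.c:free_pagetable().
The sketch below paraphrases that approach to show the distinction David is
pointing at; it is not a drop-in riscv fix, and details may differ from the
current tree:

/* Sketch, paraphrased from x86: bootmem-allocated backing pages carry
 * PG_reserved and must not be handed to free_pages(). */
static void free_pagetable_sketch(struct page *page, int order)
{
	unsigned int nr_pages = 1 << order;
	unsigned long magic;

	if (PageReserved(page)) {
		/* Allocated from bootmem, not from the buddy allocator. */
		magic = page->index;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			while (nr_pages--)
				put_page_bootmem(page++);
		} else {
			while (nr_pages--)
				free_reserved_page(page++);
		}
	} else {
		free_pages((unsigned long)page_address(page), order);
	}
}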
Björn Töpel May 14, 2024, 4:34 p.m. UTC | #2
David Hildenbrand <david@redhat.com> writes:

> On 14.05.24 16:04, Björn Töpel wrote:
>> From: Björn Töpel <bjorn@rivosinc.com>
>> 
>> For an architecture to support memory hotplugging, a couple of
>> callbacks need to be implemented:
>> 
>>   arch_add_memory()
>>    This callback is responsible for adding the physical memory into the
>>    direct map, and calls into the generic memory hotplugging code via
>>    __add_pages(), which adds the corresponding struct page entries and
>>    updates the vmemmap mapping.
>> 
>>   arch_remove_memory()
>>    This is the inverse of the callback above.
>> 
>>   vmemmap_free()
>>    This function tears down the vmemmap mappings (if
>>    CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>>    backing vmemmap pages. Note that for persistent memory, an
>>    alternative allocator for the backing pages can be used: the
>>    vmem_altmap. This means that when the backing pages are freed,
>>    extra care is needed so that the correct deallocation method is
>>    used.
>> 
>>   arch_get_mappable_range()
>>    This function returns the PA range that the direct map can cover.
>>    Used by the MHP internals for sanity checks.
>> 
>> The page table unmap/teardown functions are heavily based on code from
>> the x86 tree. The same remove_pgd_mapping() function is used in both
>> vmemmap_free() and arch_remove_memory(), but in the latter function
>> the backing pages are not removed.
>> 
>> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
>> ---
>>   arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 242 insertions(+)
>> 
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 6f72b0b2b854..7f0b921a3d3a 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
>>   	}
>>   }
>>   #endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> +	pte_t *pte;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PTE; i++) {
>> +		pte = pte_start + i;
>> +		if (!pte_none(*pte))
>> +			return;
>> +	}
>> +
>> +	free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
>> +	pmd_clear(pmd);
>> +}
>> +
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> +	pmd_t *pmd;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PMD; i++) {
>> +		pmd = pmd_start + i;
>> +		if (!pmd_none(*pmd))
>> +			return;
>> +	}
>> +
>> +	free_pages((unsigned long)page_address(pud_page(*pud)), 0);
>> +	pud_clear(pud);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
>> +{
>> +	pud_t *pud;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PUD; i++) {
>> +		pud = pud_start + i;
>> +		if (!pud_none(*pud))
>> +			return;
>> +	}
>> +
>> +	free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
>> +	p4d_clear(p4d);
>> +}
>> +
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +					   struct vmem_altmap *altmap)
>> +{
>> +	if (altmap)
>> +		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +	else
>> +		free_pages((unsigned long)page_address(page), get_order(size));
>
> If you unplug a DIMM that was added during boot (can happen on x86-64, 
> can it happen on riscv?), free_pages() would not be sufficient. You'd be 
> freeing a PG_reserved page that has to be freed differently.

I'd say if it can happen on x86-64, it probably can on RISC-V. I'll look
into this for the next spin!

Thanks for spending time on the series!


Cheers,
Björn
Alexandre Ghiti May 14, 2024, 5:49 p.m. UTC | #3
On Tue, May 14, 2024 at 4:05 PM Björn Töpel <bjorn@kernel.org> wrote:
>
> From: Björn Töpel <bjorn@rivosinc.com>
>
> For an architecture to support memory hotplugging, a couple of
> callbacks need to be implemented:
>
>  arch_add_memory()
>   This callback is responsible for adding the physical memory into the
>   direct map, and calls into the generic memory hotplugging code via
>   __add_pages(), which adds the corresponding struct page entries and
>   updates the vmemmap mapping.
>
>  arch_remove_memory()
>   This is the inverse of the callback above.
>
>  vmemmap_free()
>   This function tears down the vmemmap mappings (if
>   CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>   backing vmemmap pages. Note that for persistent memory, an
>   alternative allocator for the backing pages can be used: the
>   vmem_altmap. This means that when the backing pages are freed,
>   extra care is needed so that the correct deallocation method is
>   used.
>
>  arch_get_mappable_range()
>   This function returns the PA range that the direct map can cover.
>   Used by the MHP internals for sanity checks.
>
> The page table unmap/teardown functions are heavily based on code from
> the x86 tree. The same remove_pgd_mapping() function is used in both
> vmemmap_free() and arch_remove_memory(), but in the latter function
> the backing pages are not removed.
>
> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
> ---
>  arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 242 insertions(+)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 6f72b0b2b854..7f0b921a3d3a 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
>         }
>  }
>  #endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +       pte_t *pte;
> +       int i;
> +
> +       for (i = 0; i < PTRS_PER_PTE; i++) {
> +               pte = pte_start + i;
> +               if (!pte_none(*pte))
> +                       return;
> +       }
> +
> +       free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
> +       pmd_clear(pmd);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +       pmd_t *pmd;
> +       int i;
> +
> +       for (i = 0; i < PTRS_PER_PMD; i++) {
> +               pmd = pmd_start + i;
> +               if (!pmd_none(*pmd))
> +                       return;
> +       }
> +
> +       free_pages((unsigned long)page_address(pud_page(*pud)), 0);
> +       pud_clear(pud);
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
> +{
> +       pud_t *pud;
> +       int i;
> +
> +       for (i = 0; i < PTRS_PER_PUD; i++) {
> +               pud = pud_start + i;
> +               if (!pud_none(*pud))
> +                       return;
> +       }
> +
> +       free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
> +       p4d_clear(p4d);
> +}
> +
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> +                                          struct vmem_altmap *altmap)
> +{
> +       if (altmap)
> +               vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> +       else
> +               free_pages((unsigned long)page_address(page), get_order(size));
> +}
> +
> +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       pte_t *ptep, pte;
> +
> +       for (; addr < end; addr = next) {
> +               next = (addr + PAGE_SIZE) & PAGE_MASK;
> +               if (next > end)
> +                       next = end;
> +
> +               ptep = pte_base + pte_index(addr);
> +               pte = READ_ONCE(*ptep);
> +
> +               if (!pte_present(*ptep))
> +                       continue;
> +
> +               pte_clear(&init_mm, addr, ptep);
> +               if (is_vmemmap)
> +                       free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
> +       }
> +}
> +
> +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       pte_t *pte_base;
> +       pmd_t *pmdp, pmd;
> +
> +       for (; addr < end; addr = next) {
> +               next = pmd_addr_end(addr, end);
> +               pmdp = pmd_base + pmd_index(addr);
> +               pmd = READ_ONCE(*pmdp);
> +
> +               if (!pmd_present(pmd))
> +                       continue;
> +
> +               if (pmd_leaf(pmd)) {
> +                       pmd_clear(pmdp);
> +                       if (is_vmemmap)
> +                               free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
> +                       continue;
> +               }
> +
> +               pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
> +               remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
> +               free_pte_table(pte_base, pmdp);
> +       }
> +}
> +
> +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       pud_t *pudp, pud;
> +       pmd_t *pmd_base;
> +
> +       for (; addr < end; addr = next) {
> +               next = pud_addr_end(addr, end);
> +               pudp = pud_base + pud_index(addr);
> +               pud = READ_ONCE(*pudp);
> +
> +               if (!pud_present(pud))
> +                       continue;
> +
> +               if (pud_leaf(pud)) {
> +                       if (pgtable_l4_enabled) {
> +                               pud_clear(pudp);
> +                               if (is_vmemmap)
> +                                       free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
> +                       }
> +                       continue;
> +               }
> +
> +               pmd_base = pmd_offset(pudp, 0);
> +               remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
> +
> +               if (pgtable_l4_enabled)
> +                       free_pmd_table(pmd_base, pudp);
> +       }
> +}
> +
> +static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       p4d_t *p4dp, p4d;
> +       pud_t *pud_base;
> +
> +       for (; addr < end; addr = next) {
> +               next = p4d_addr_end(addr, end);
> +               p4dp = p4d_base + p4d_index(addr);
> +               p4d = READ_ONCE(*p4dp);
> +
> +               if (!p4d_present(p4d))
> +                       continue;
> +
> +               if (p4d_leaf(p4d)) {
> +                       if (pgtable_l5_enabled) {
> +                               p4d_clear(p4dp);
> +                               if (is_vmemmap)
> +                                       free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
> +                       }
> +                       continue;
> +               }
> +
> +               pud_base = pud_offset(p4dp, 0);
> +               remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
> +
> +               if (pgtable_l5_enabled)
> +                       free_pud_table(pud_base, p4dp);
> +       }
> +}
> +
> +static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
> +                                        struct vmem_altmap *altmap)
> +{
> +       unsigned long addr, next;
> +       p4d_t *p4d_base;
> +       pgd_t *pgd;
> +
> +       for (addr = va; addr < end; addr = next) {
> +               next = pgd_addr_end(addr, end);
> +               pgd = pgd_offset_k(addr);
> +
> +               if (!pgd_present(*pgd))
> +                       continue;
> +
> +               if (pgd_leaf(*pgd))
> +                       continue;
> +
> +               p4d_base = p4d_offset(pgd, 0);
> +               remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap);
> +       }
> +
> +       flush_tlb_all();
> +}
> +
> +static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
> +{
> +       unsigned long va = (unsigned long)__va(start);
> +       unsigned long end = (unsigned long)__va(start + size);
> +
> +       remove_pgd_mapping(va, end, false, NULL);
> +}
> +
> +struct range arch_get_mappable_range(void)
> +{
> +       struct range mhp_range;
> +
> +       mhp_range.start = __pa(PAGE_OFFSET);
> +       mhp_range.end = __pa(PAGE_END - 1);
> +       return mhp_range;
> +}
> +
> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
> +{
> +       int ret;
> +
> +       create_linear_mapping_range(start, start + size, 0, &params->pgprot);
> +       flush_tlb_all();
> +       ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
> +       if (ret) {
> +               remove_linear_mapping(start, size);
> +               return ret;
> +       }
> +

You need to flush the TLB here too, since __add_pages() populates the
page table with the new vmemmap mapping (only because riscv allows
caching of invalid entries; I'll adapt this in my next version of Svvptc
support).

> +       max_pfn = PFN_UP(start + size);
> +       max_low_pfn = max_pfn;
> +       return 0;
> +}
> +
> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
> +{
> +       __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
> +       remove_linear_mapping(start, size);

You need to flush the TLB here too.

> +}
> +
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
> +{
> +       remove_pgd_mapping(start, end, true, altmap);
> +}
> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> --
> 2.40.1
>
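
To make the requested flush concrete, here is a minimal sketch of
arch_add_memory() with an extra TLB flush after __add_pages(); the
placement is an assumption, not the final patch:

int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
{
	int ret;

	create_linear_mapping_range(start, start + size, 0, &params->pgprot);
	flush_tlb_all();

	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
	if (ret) {
		remove_linear_mapping(start, size);
		return ret;
	}

	/* __add_pages() populated new vmemmap entries; flush those too. */
	flush_tlb_all();

	max_pfn = PFN_UP(start + size);
	max_low_pfn = max_pfn;
	return 0;
}

Per the second comment, arch_remove_memory() would gain a flush as well;
its exact placement is left to the next revision.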
Björn Töpel May 14, 2024, 7:31 p.m. UTC | #4
Alexandre Ghiti <alexghiti@rivosinc.com> writes:

> On Tue, May 14, 2024 at 4:05 PM Björn Töpel <bjorn@kernel.org> wrote:

>> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
>> +{
>> +       int ret;
>> +
>> +       create_linear_mapping_range(start, start + size, 0, &params->pgprot);
>> +       flush_tlb_all();
>> +       ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
>> +       if (ret) {
>> +               remove_linear_mapping(start, size);
>> +               return ret;
>> +       }
>> +
>
> You need to flush the TLB here too, since __add_pages() populates the
> page table with the new vmemmap mapping (only because riscv allows
> caching of invalid entries; I'll adapt this in my next version of Svvptc
> support).
>
>> +       max_pfn = PFN_UP(start + size);
>> +       max_low_pfn = max_pfn;
>> +       return 0;
>> +}
>> +
>> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
>> +{
>> +       __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
>> +       remove_linear_mapping(start, size);
>
> You need to flush the TLB here too.

I'll address all of the above in the next version. Thanks for reviewing
the series!


Björn
Oscar Salvador May 14, 2024, 8:49 p.m. UTC | #5
On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote:
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> +					   struct vmem_altmap *altmap)
> +{
> +	if (altmap)
> +		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> +	else
> +		free_pages((unsigned long)page_address(page), get_order(size));

David already pointed this out, but you can check
arch/x86/mm/init_64.c:free_pagetable().

You will see that we have to do some magic for bootmem memory (DIMMs
that were already present at boot rather than hotplugged).

> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
> +{
> +	remove_pgd_mapping(start, end, true, altmap);
> +}
> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
> +#endif /* CONFIG_MEMORY_HOTPLUG */

I will comment on the patch where you add support for hotplug and the
dependency, but in a track at LSFMM today, we decided that most likely
we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP
environments.
So, since you are adding this afresh, please consider tying the
hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP.
As a bonus, you will only have to maintain one flavour of functions.
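
As an illustration, the dependency could be wired up in arch/riscv/Kconfig
roughly like the fragment below (an assumption about the eventual shape,
mirroring how other architectures gate hotplug; not the actual patch):

config RISCV
	...
	# Illustrative only: make hotplug depend on SPARSEMEM_VMEMMAP.
	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP && 64BIT && MMU
	select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG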
Björn Töpel May 15, 2024, 5:41 a.m. UTC | #6
Oscar Salvador <osalvador@suse.de> writes:

> On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote:
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +					   struct vmem_altmap *altmap)
>> +{
>> +	if (altmap)
>> +		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +	else
>> +		free_pages((unsigned long)page_address(page), get_order(size));
>
> David already pointed this out, but you can check
> arch/x86/mm/init_64.c:free_pagetable().
>
> You will see that we have to do some magic for bootmem memory (DIMMs
> that were already present at boot rather than hotplugged).

Thank you!

>> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
>> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
>> +{
>> +	remove_pgd_mapping(start, end, true, altmap);
>> +}
>> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
>> +#endif /* CONFIG_MEMORY_HOTPLUG */
>
> I will comment on the patch where you add support for hotplug and the
> dependency, but in a track at LSFMM today, we decided that most likely
> we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP
> environments.
> So, since you are adding this afresh, please consider tying the
> hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP.
> As a bonus, you will only have to maintain one flavour of functions.

Ah, yeah, I saw it mentioned on the LSF/MM/BPF topics. Less is
definitely more -- I'll make the next version depend on
SPARSEMEM_VMEMMAP.


Björn

Patch

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 6f72b0b2b854..7f0b921a3d3a 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1493,3 +1493,245 @@  void __init pgtable_cache_init(void)
 	}
 }
 #endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
+	pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	free_pages((unsigned long)page_address(pud_page(*pud)), 0);
+	pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
+	p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+					   struct vmem_altmap *altmap)
+{
+	if (altmap)
+		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+	else
+		free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *ptep, pte;
+
+	for (; addr < end; addr = next) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		ptep = pte_base + pte_index(addr);
+		pte = READ_ONCE(*ptep);
+
+		if (!pte_present(*ptep))
+			continue;
+
+		pte_clear(&init_mm, addr, ptep);
+		if (is_vmemmap)
+			free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+	}
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmdp, pmd;
+
+	for (; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_base + pmd_index(addr);
+		pmd = READ_ONCE(*pmdp);
+
+		if (!pmd_present(pmd))
+			continue;
+
+		if (pmd_leaf(pmd)) {
+			pmd_clear(pmdp);
+			if (is_vmemmap)
+				free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
+			continue;
+		}
+
+		pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
+		remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
+		free_pte_table(pte_base, pmdp);
+	}
+}
+
+static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+	pmd_t *pmd_base;
+
+	for (; addr < end; addr = next) {
+		next = pud_addr_end(addr, end);
+		pudp = pud_base + pud_index(addr);
+		pud = READ_ONCE(*pudp);
+
+		if (!pud_present(pud))
+			continue;
+
+		if (pud_leaf(pud)) {
+			if (pgtable_l4_enabled) {
+				pud_clear(pudp);
+				if (is_vmemmap)
+					free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
+			}
+			continue;
+		}
+
+		pmd_base = pmd_offset(pudp, 0);
+		remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
+
+		if (pgtable_l4_enabled)
+			free_pmd_table(pmd_base, pudp);
+	}
+}
+
+static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	p4d_t *p4dp, p4d;
+	pud_t *pud_base;
+
+	for (; addr < end; addr = next) {
+		next = p4d_addr_end(addr, end);
+		p4dp = p4d_base + p4d_index(addr);
+		p4d = READ_ONCE(*p4dp);
+
+		if (!p4d_present(p4d))
+			continue;
+
+		if (p4d_leaf(p4d)) {
+			if (pgtable_l5_enabled) {
+				p4d_clear(p4dp);
+				if (is_vmemmap)
+					free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
+			}
+			continue;
+		}
+
+		pud_base = pud_offset(p4dp, 0);
+		remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
+
+		if (pgtable_l5_enabled)
+			free_pud_table(pud_base, p4dp);
+	}
+}
+
+static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
+					 struct vmem_altmap *altmap)
+{
+	unsigned long addr, next;
+	p4d_t *p4d_base;
+	pgd_t *pgd;
+
+	for (addr = va; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset_k(addr);
+
+		if (!pgd_present(*pgd))
+			continue;
+
+		if (pgd_leaf(*pgd))
+			continue;
+
+		p4d_base = p4d_offset(pgd, 0);
+		remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap);
+	}
+
+	flush_tlb_all();
+}
+
+static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
+{
+	unsigned long va = (unsigned long)__va(start);
+	unsigned long end = (unsigned long)__va(start + size);
+
+	remove_pgd_mapping(va, end, false, NULL);
+}
+
+struct range arch_get_mappable_range(void)
+{
+	struct range mhp_range;
+
+	mhp_range.start = __pa(PAGE_OFFSET);
+	mhp_range.end = __pa(PAGE_END - 1);
+	return mhp_range;
+}
+
+int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
+{
+	int ret;
+
+	create_linear_mapping_range(start, start + size, 0, &params->pgprot);
+	flush_tlb_all();
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
+	if (ret) {
+		remove_linear_mapping(start, size);
+		return ret;
+	}
+
+	max_pfn = PFN_UP(start + size);
+	max_low_pfn = max_pfn;
+	return 0;
+}
+
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+{
+	__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
+	remove_linear_mapping(start, size);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
+{
+	remove_pgd_mapping(start, end, true, altmap);
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+#endif /* CONFIG_MEMORY_HOTPLUG */