diff mbox series

[v3,5/9] riscv: mm: Add memory hotplugging support

Message ID 20240521114830.841660-6-bjorn@kernel.org (mailing list archive)
State New
Headers show
Series riscv: Memory Hot(Un)Plug support | expand

Commit Message

Björn Töpel May 21, 2024, 11:48 a.m. UTC
From: Björn Töpel <bjorn@rivosinc.com>

For an architecture to support memory hotplugging, a couple of
callbacks need to be implemented:

 arch_add_memory()
  This callback is responsible for adding the physical memory into the
  direct map, and for calling into the generic memory hotplugging code
  via __add_pages(), which adds the corresponding struct page entries
  and updates the vmemmap mapping.

 arch_remove_memory()
  This is the inverse of the callback above.

 vmemmap_free()
  This function tears down the vmemmap mappings (if
  CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
  backing vmemmap pages. Note that for persistent memory, an
  alternative allocator for the backing pages can be used: the
  vmem_altmap. This means that when the backing pages are freed,
  extra care is needed so that the correct deallocation method is
  used.

 arch_get_mappable_range()
  This function returns the PA range that the direct map can map.
  Used by the MHP internals for sanity checks.

The page table unmap/teardown functions are heavily based on code from
the x86 tree. The same remove_pgd_mapping() function is used in both
vmemmap_free() and arch_remove_memory(), but in the latter function
the backing pages are not removed.

Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
---
 arch/riscv/mm/init.c | 261 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 261 insertions(+)

Comments

Alexandre Ghiti May 21, 2024, 1:19 p.m. UTC | #1
On Tue, May 21, 2024 at 1:49 PM Björn Töpel <bjorn@kernel.org> wrote:
>
> From: Björn Töpel <bjorn@rivosinc.com>
>
> For an architecture to support memory hotplugging, a couple of
> callbacks needs to be implemented:
>
>  arch_add_memory()
>   This callback is responsible for adding the physical memory into the
>   direct map, and call into the memory hotplugging generic code via
>   __add_pages() that adds the corresponding struct page entries, and
>   updates the vmemmap mapping.
>
>  arch_remove_memory()
>   This is the inverse of the callback above.
>
>  vmemmap_free()
>   This function tears down the vmemmap mappings (if
>   CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>   backing vmemmap pages. Note that for persistent memory, an
>   alternative allocator for the backing pages can be used; The
>   vmem_altmap. This means that when the backing pages are cleared,
>   extra care is needed so that the correct deallocation method is
>   used.
>
>  arch_get_mappable_range()
>   This functions returns the PA range that the direct map can map.
>   Used by the MHP internals for sanity checks.
>
> The page table unmap/teardown functions are heavily based on code from
> the x86 tree. The same remove_pgd_mapping() function is used in both
> vmemmap_free() and arch_remove_memory(), but in the latter function
> the backing pages are not removed.
>
> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
> ---
>  arch/riscv/mm/init.c | 261 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 261 insertions(+)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 6f72b0b2b854..6693b742bf2f 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void)
>         }
>  }
>  #endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void __meminit free_pagetable(struct page *page, int order)
> +{
> +       unsigned int nr_pages = 1 << order;
> +
> +       /*
> +        * vmemmap/direct page tables can be reserved, if added at
> +        * boot.
> +        */
> +       if (PageReserved(page)) {
> +               __ClearPageReserved(page);

What's the difference between __ClearPageReserved() and
ClearPageReserved()? Because it seems like free_reserved_page() calls
the latter already, so why would you need to call
__ClearPageReserved() on the first page?

> +               while (nr_pages--)
> +                       free_reserved_page(page++);
> +               return;
> +       }
> +
> +       free_pages((unsigned long)page_address(page), order);
> +}
> +
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +       pte_t *pte;
> +       int i;
> +
> +       for (i = 0; i < PTRS_PER_PTE; i++) {
> +               pte = pte_start + i;
> +               if (!pte_none(*pte))
> +                       return;
> +       }
> +
> +       free_pagetable(pmd_page(*pmd), 0);
> +       pmd_clear(pmd);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +       pmd_t *pmd;
> +       int i;
> +
> +       for (i = 0; i < PTRS_PER_PMD; i++) {
> +               pmd = pmd_start + i;
> +               if (!pmd_none(*pmd))
> +                       return;
> +       }
> +
> +       free_pagetable(pud_page(*pud), 0);
> +       pud_clear(pud);
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
> +{
> +       pud_t *pud;
> +       int i;
> +
> +       for (i = 0; i < PTRS_PER_PUD; i++) {
> +               pud = pud_start + i;
> +               if (!pud_none(*pud))
> +                       return;
> +       }
> +
> +       free_pagetable(p4d_page(*p4d), 0);
> +       p4d_clear(p4d);
> +}
> +
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> +                                          struct vmem_altmap *altmap)
> +{
> +       if (altmap)
> +               vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> +       else
> +               free_pagetable(page, get_order(size));
> +}
> +
> +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       pte_t *ptep, pte;
> +
> +       for (; addr < end; addr = next) {
> +               next = (addr + PAGE_SIZE) & PAGE_MASK;

Nit: use ALIGN() instead.

> +               if (next > end)
> +                       next = end;
> +
> +               ptep = pte_base + pte_index(addr);
> +               pte = READ_ONCE(*ptep);

Nit: Use ptep_get()

> +
> +               if (!pte_present(*ptep))
> +                       continue;
> +
> +               pte_clear(&init_mm, addr, ptep);
> +               if (is_vmemmap)
> +                       free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
> +       }
> +}
> +
> +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       pte_t *pte_base;
> +       pmd_t *pmdp, pmd;
> +
> +       for (; addr < end; addr = next) {
> +               next = pmd_addr_end(addr, end);
> +               pmdp = pmd_base + pmd_index(addr);
> +               pmd = READ_ONCE(*pmdp);

Nit: Use pmdp_get()

> +
> +               if (!pmd_present(pmd))
> +                       continue;
> +
> +               if (pmd_leaf(pmd)) {
> +                       pmd_clear(pmdp);
> +                       if (is_vmemmap)
> +                               free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
> +                       continue;
> +               }
> +
> +               pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
> +               remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
> +               free_pte_table(pte_base, pmdp);
> +       }
> +}
> +
> +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       pud_t *pudp, pud;
> +       pmd_t *pmd_base;
> +
> +       for (; addr < end; addr = next) {
> +               next = pud_addr_end(addr, end);
> +               pudp = pud_base + pud_index(addr);
> +               pud = READ_ONCE(*pudp);

Nit: Use pudp_get()

> +
> +               if (!pud_present(pud))
> +                       continue;
> +
> +               if (pud_leaf(pud)) {
> +                       if (pgtable_l4_enabled) {
> +                               pud_clear(pudp);
> +                               if (is_vmemmap)
> +                                       free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
> +                       }
> +                       continue;
> +               }
> +
> +               pmd_base = pmd_offset(pudp, 0);
> +               remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
> +
> +               if (pgtable_l4_enabled)
> +                       free_pmd_table(pmd_base, pudp);
> +       }
> +}
> +
> +static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> +       unsigned long next;
> +       p4d_t *p4dp, p4d;
> +       pud_t *pud_base;
> +
> +       for (; addr < end; addr = next) {
> +               next = p4d_addr_end(addr, end);
> +               p4dp = p4d_base + p4d_index(addr);
> +               p4d = READ_ONCE(*p4dp);

Nit: Use p4dp_get()

> +
> +               if (!p4d_present(p4d))
> +                       continue;
> +
> +               if (p4d_leaf(p4d)) {
> +                       if (pgtable_l5_enabled) {
> +                               p4d_clear(p4dp);
> +                               if (is_vmemmap)
> +                                       free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
> +                       }
> +                       continue;
> +               }
> +
> +               pud_base = pud_offset(p4dp, 0);
> +               remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
> +
> +               if (pgtable_l5_enabled)
> +                       free_pud_table(pud_base, p4dp);
> +       }
> +}
> +
> +static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
> +                                        struct vmem_altmap *altmap)
> +{
> +       unsigned long addr, next;
> +       p4d_t *p4d_base;
> +       pgd_t *pgd;
> +
> +       for (addr = va; addr < end; addr = next) {
> +               next = pgd_addr_end(addr, end);
> +               pgd = pgd_offset_k(addr);
> +
> +               if (!pgd_present(*pgd))
> +                       continue;
> +
> +               if (pgd_leaf(*pgd))
> +                       continue;
> +
> +               p4d_base = p4d_offset(pgd, 0);
> +               remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap);
> +       }
> +
> +       flush_tlb_all();
> +}
> +
> +static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
> +{
> +       unsigned long va = (unsigned long)__va(start);
> +       unsigned long end = (unsigned long)__va(start + size);
> +
> +       remove_pgd_mapping(va, end, false, NULL);
> +}
> +
> +struct range arch_get_mappable_range(void)
> +{
> +       struct range mhp_range;
> +
> +       mhp_range.start = __pa(PAGE_OFFSET);
> +       mhp_range.end = __pa(PAGE_END - 1);
> +       return mhp_range;
> +}
> +
> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
> +{
> +       int ret = 0;
> +
> +       create_linear_mapping_range(start, start + size, 0, &params->pgprot);
> +       ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
> +       if (ret) {
> +               remove_linear_mapping(start, size);
> +               goto out;
> +       }
> +
> +       max_pfn = PFN_UP(start + size);
> +       max_low_pfn = max_pfn;
> +
> + out:
> +       flush_tlb_all();
> +       return ret;
> +}
> +
> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
> +{
> +       __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
> +       remove_linear_mapping(start, size);
> +       flush_tlb_all();
> +}
> +
> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
> +{
> +       remove_pgd_mapping(start, end, true, altmap);
> +}
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> --
> 2.40.1
>
Björn Töpel May 21, 2024, 2:18 p.m. UTC | #2
Alexandre Ghiti <alexghiti@rivosinc.com> writes:

> On Tue, May 21, 2024 at 1:49 PM Björn Töpel <bjorn@kernel.org> wrote:
>>
>> From: Björn Töpel <bjorn@rivosinc.com>
>>
>> For an architecture to support memory hotplugging, a couple of
>> callbacks needs to be implemented:
>>
>>  arch_add_memory()
>>   This callback is responsible for adding the physical memory into the
>>   direct map, and call into the memory hotplugging generic code via
>>   __add_pages() that adds the corresponding struct page entries, and
>>   updates the vmemmap mapping.
>>
>>  arch_remove_memory()
>>   This is the inverse of the callback above.
>>
>>  vmemmap_free()
>>   This function tears down the vmemmap mappings (if
>>   CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>>   backing vmemmap pages. Note that for persistent memory, an
>>   alternative allocator for the backing pages can be used; The
>>   vmem_altmap. This means that when the backing pages are cleared,
>>   extra care is needed so that the correct deallocation method is
>>   used.
>>
>>  arch_get_mappable_range()
>>   This functions returns the PA range that the direct map can map.
>>   Used by the MHP internals for sanity checks.
>>
>> The page table unmap/teardown functions are heavily based on code from
>> the x86 tree. The same remove_pgd_mapping() function is used in both
>> vmemmap_free() and arch_remove_memory(), but in the latter function
>> the backing pages are not removed.
>>
>> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
>> ---
>>  arch/riscv/mm/init.c | 261 +++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 261 insertions(+)
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 6f72b0b2b854..6693b742bf2f 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void)
>>         }
>>  }
>>  #endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pagetable(struct page *page, int order)
>> +{
>> +       unsigned int nr_pages = 1 << order;
>> +
>> +       /*
>> +        * vmemmap/direct page tables can be reserved, if added at
>> +        * boot.
>> +        */
>> +       if (PageReserved(page)) {
>> +               __ClearPageReserved(page);
>
> What's the difference between __ClearPageReserved() and
> ClearPageReserved()? Because it seems like free_reserved_page() calls
> the latter already, so why would you need to call
> __ClearPageReserved() on the first page?

Indeed! x86 copy pasta (which uses bootmem info page that RV doesn't).

>> +               while (nr_pages--)
>> +                       free_reserved_page(page++);
>> +               return;
>> +       }
>> +
>> +       free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> +       pte_t *pte;
>> +       int i;
>> +
>> +       for (i = 0; i < PTRS_PER_PTE; i++) {
>> +               pte = pte_start + i;
>> +               if (!pte_none(*pte))
>> +                       return;
>> +       }
>> +
>> +       free_pagetable(pmd_page(*pmd), 0);
>> +       pmd_clear(pmd);
>> +}
>> +
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> +       pmd_t *pmd;
>> +       int i;
>> +
>> +       for (i = 0; i < PTRS_PER_PMD; i++) {
>> +               pmd = pmd_start + i;
>> +               if (!pmd_none(*pmd))
>> +                       return;
>> +       }
>> +
>> +       free_pagetable(pud_page(*pud), 0);
>> +       pud_clear(pud);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
>> +{
>> +       pud_t *pud;
>> +       int i;
>> +
>> +       for (i = 0; i < PTRS_PER_PUD; i++) {
>> +               pud = pud_start + i;
>> +               if (!pud_none(*pud))
>> +                       return;
>> +       }
>> +
>> +       free_pagetable(p4d_page(*p4d), 0);
>> +       p4d_clear(p4d);
>> +}
>> +
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +                                          struct vmem_altmap *altmap)
>> +{
>> +       if (altmap)
>> +               vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +       else
>> +               free_pagetable(page, get_order(size));
>> +}
>> +
>> +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
>> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> +       unsigned long next;
>> +       pte_t *ptep, pte;
>> +
>> +       for (; addr < end; addr = next) {
>> +               next = (addr + PAGE_SIZE) & PAGE_MASK;
>
> Nit: use ALIGN() instead.
>
>> +               if (next > end)
>> +                       next = end;
>> +
>> +               ptep = pte_base + pte_index(addr);
>> +               pte = READ_ONCE(*ptep);
>
> Nit: Use ptep_get()
>
>> +
>> +               if (!pte_present(*ptep))
>> +                       continue;
>> +
>> +               pte_clear(&init_mm, addr, ptep);
>> +               if (is_vmemmap)
>> +                       free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
>> +       }
>> +}
>> +
>> +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
>> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> +       unsigned long next;
>> +       pte_t *pte_base;
>> +       pmd_t *pmdp, pmd;
>> +
>> +       for (; addr < end; addr = next) {
>> +               next = pmd_addr_end(addr, end);
>> +               pmdp = pmd_base + pmd_index(addr);
>> +               pmd = READ_ONCE(*pmdp);
>
> Nit: Use pmdp_get()
>
>> +
>> +               if (!pmd_present(pmd))
>> +                       continue;
>> +
>> +               if (pmd_leaf(pmd)) {
>> +                       pmd_clear(pmdp);
>> +                       if (is_vmemmap)
>> +                               free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
>> +                       continue;
>> +               }
>> +
>> +               pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
>> +               remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
>> +               free_pte_table(pte_base, pmdp);
>> +       }
>> +}
>> +
>> +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
>> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> +       unsigned long next;
>> +       pud_t *pudp, pud;
>> +       pmd_t *pmd_base;
>> +
>> +       for (; addr < end; addr = next) {
>> +               next = pud_addr_end(addr, end);
>> +               pudp = pud_base + pud_index(addr);
>> +               pud = READ_ONCE(*pudp);
>
> Nit: Use pudp_get()
>
>> +
>> +               if (!pud_present(pud))
>> +                       continue;
>> +
>> +               if (pud_leaf(pud)) {
>> +                       if (pgtable_l4_enabled) {
>> +                               pud_clear(pudp);
>> +                               if (is_vmemmap)
>> +                                       free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
>> +                       }
>> +                       continue;
>> +               }
>> +
>> +               pmd_base = pmd_offset(pudp, 0);
>> +               remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
>> +
>> +               if (pgtable_l4_enabled)
>> +                       free_pmd_table(pmd_base, pudp);
>> +       }
>> +}
>> +
>> +static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
>> +                                        bool is_vmemmap, struct vmem_altmap *altmap)
>> +{
>> +       unsigned long next;
>> +       p4d_t *p4dp, p4d;
>> +       pud_t *pud_base;
>> +
>> +       for (; addr < end; addr = next) {
>> +               next = p4d_addr_end(addr, end);
>> +               p4dp = p4d_base + p4d_index(addr);
>> +               p4d = READ_ONCE(*p4dp);
>
> Nit: Use p4dp_get()

...and I'll make sure to address these nits as well.


Thanks!
Björn
Oscar Salvador May 21, 2024, 2:20 p.m. UTC | #3
On Tue, May 21, 2024 at 03:19:37PM +0200, Alexandre Ghiti wrote:
> On Tue, May 21, 2024 at 1:49 PM Björn Töpel <bjorn@kernel.org> wrote:
> > +       if (PageReserved(page)) {
> > +               __ClearPageReserved(page);
> 
> What's the difference between __ClearPageReserved() and
> ClearPageReserved()? Because it seems like free_reserved_page() calls
> the latter already, so why would you need to call
> __ClearPageReserved() on the first page?

__{Set,Clear}Page are the non-atomic versions.
Usually used when you know that no one else can fiddle with the page, which
should be the case here since we are removing the memory.

As to why we have __ClearPageReserved and then having
free_reserved_page() call ClearPageReserved I do not really know.
Looking at the history, it has always been like this.

I remember I looked at this a few years ago but I cannot remember the outcome
of that.

Maybe David remembers better, but I think we could remove that
__ClearPageReserved.
Looking at powerpc implementation code, it does not do the
__ClearPageReserved and relies only on free_reserved_page().

I will have a look.
diff mbox series

Patch

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 6f72b0b2b854..6693b742bf2f 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1493,3 +1493,264 @@  void __init pgtable_cache_init(void)
 	}
 }
 #endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Free the 2^order pages starting at @page, which backed either a page
+ * table or a chunk of the vmemmap.
+ *
+ * vmemmap/direct page tables can be reserved, if added at boot. Such
+ * pages are released one at a time through free_reserved_page(); all
+ * other pages are returned with free_pages().
+ */
+static void __meminit free_pagetable(struct page *page, int order)
+{
+	unsigned int nr_pages = 1 << order;
+
+	if (PageReserved(page)) {
+		/*
+		 * No explicit __ClearPageReserved() here: free_reserved_page()
+		 * clears the reserved flag of every page it releases, so
+		 * clearing it up front on the first page only was redundant.
+		 */
+		while (nr_pages--)
+			free_reserved_page(page++);
+		return;
+	}
+
+	free_pages((unsigned long)page_address(page), order);
+}
+
+/*
+ * Release the PTE table referenced by @pmd, but only if none of its
+ * PTRS_PER_PTE entries (starting at @pte_start) is still in use. In that
+ * case the table page is freed and the PMD entry is cleared; otherwise
+ * nothing is touched.
+ */
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *ptep = pte_start;
+	pte_t *table_end = pte_start + PTRS_PER_PTE;
+
+	/* Bail out as soon as a populated entry is found. */
+	while (ptep < table_end) {
+		if (!pte_none(*ptep))
+			return;
+		ptep++;
+	}
+
+	free_pagetable(pmd_page(*pmd), 0);
+	pmd_clear(pmd);
+}
+
+/*
+ * Release the PMD table referenced by @pud, but only if none of its
+ * PTRS_PER_PMD entries (starting at @pmd_start) is still in use. In that
+ * case the table page is freed and the PUD entry is cleared; otherwise
+ * nothing is touched.
+ */
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmdp = pmd_start;
+	pmd_t *table_end = pmd_start + PTRS_PER_PMD;
+
+	/* Bail out as soon as a populated entry is found. */
+	while (pmdp < table_end) {
+		if (!pmd_none(*pmdp))
+			return;
+		pmdp++;
+	}
+
+	free_pagetable(pud_page(*pud), 0);
+	pud_clear(pud);
+}
+
+/*
+ * Release the PUD table referenced by @p4d, but only if none of its
+ * PTRS_PER_PUD entries (starting at @pud_start) is still in use. In that
+ * case the table page is freed and the P4D entry is cleared; otherwise
+ * nothing is touched.
+ */
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+	pud_t *pudp = pud_start;
+	pud_t *table_end = pud_start + PTRS_PER_PUD;
+
+	/* Bail out as soon as a populated entry is found. */
+	while (pudp < table_end) {
+		if (!pud_none(*pudp))
+			return;
+		pudp++;
+	}
+
+	free_pagetable(p4d_page(*p4d), 0);
+	p4d_clear(p4d);
+}
+
+/*
+ * Give back the memory that backed a vmemmap mapping of @size bytes.
+ * When an @altmap is supplied the pages are returned to it with
+ * vmem_altmap_free(); otherwise they are freed through free_pagetable().
+ */
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+					   struct vmem_altmap *altmap)
+{
+	if (!altmap) {
+		free_pagetable(page, get_order(size));
+		return;
+	}
+
+	vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+}
+
+/*
+ * Clear every present PTE that maps [@addr, @end) in the table at
+ * @pte_base.
+ *
+ * @is_vmemmap: when true, the page each cleared entry pointed to is also
+ *              released through free_vmemmap_storage().
+ * @altmap:     passed on to free_vmemmap_storage(); may be NULL.
+ */
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *ptep, pte;
+
+	for (; addr < end; addr = next) {
+		/* Advance to the next page boundary, clamped to @end. */
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		ptep = pte_base + pte_index(addr);
+		pte = READ_ONCE(*ptep);
+
+		/*
+		 * Test the snapshot rather than dereferencing ptep a second
+		 * time, so that the presence check and the page freed below
+		 * are derived from the same entry value.
+		 */
+		if (!pte_present(pte))
+			continue;
+
+		pte_clear(&init_mm, addr, ptep);
+		if (is_vmemmap)
+			free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+	}
+}
+
+/*
+ * Tear down the mappings for [@addr, @end) below the PMD table at
+ * @pmd_base.
+ *
+ * Leaf PMDs are cleared directly (and, for the vmemmap, their PMD_SIZE
+ * backing storage is released). For non-leaf PMDs the walk descends into
+ * the PTE table, which is then freed if it ended up empty.
+ *
+ * @is_vmemmap: when true, backing pages are released as well.
+ * @altmap:     passed on to free_vmemmap_storage(); may be NULL.
+ */
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmdp, pmd;
+
+	for (; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_base + pmd_index(addr);
+		pmd = READ_ONCE(*pmdp);
+
+		if (!pmd_present(pmd))
+			continue;
+
+		if (pmd_leaf(pmd)) {
+			pmd_clear(pmdp);
+			if (is_vmemmap)
+				free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
+			continue;
+		}
+
+		/*
+		 * Derive the PTE table address from the snapshot that was
+		 * validated above instead of re-reading *pmdp.
+		 */
+		pte_base = (pte_t *)pmd_page_vaddr(pmd);
+		remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
+		free_pte_table(pte_base, pmdp);
+	}
+}
+
+/*
+ * Tear down the mappings for [@addr, @end) below the PUD table at
+ * @pud_base.
+ *
+ * Leaf PUDs are cleared (and their PUD_SIZE vmemmap backing released)
+ * only when pgtable_l4_enabled is set. For non-leaf PUDs the walk
+ * descends into the PMD level; the PMD table itself is considered for
+ * freeing only when pgtable_l4_enabled is set.
+ */
+static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+
+	for (; addr < end; addr = next) {
+		pud_t *pudp = pud_base + pud_index(addr);
+		pud_t pud = READ_ONCE(*pudp);
+		pmd_t *pmd_base;
+
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(pud))
+			continue;
+
+		if (pud_leaf(pud)) {
+			/* Leaf entries are left alone unless the PUD level is in use. */
+			if (!pgtable_l4_enabled)
+				continue;
+
+			pud_clear(pudp);
+			if (is_vmemmap)
+				free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
+			continue;
+		}
+
+		pmd_base = pmd_offset(pudp, 0);
+		remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
+
+		if (pgtable_l4_enabled)
+			free_pmd_table(pmd_base, pudp);
+	}
+}
+
+/*
+ * Tear down the mappings for [@addr, @end) below the P4D table at
+ * @p4d_base.
+ *
+ * Leaf P4Ds are cleared (and their P4D_SIZE vmemmap backing released)
+ * only when pgtable_l5_enabled is set. For non-leaf P4Ds the walk
+ * descends into the PUD level; the PUD table itself is considered for
+ * freeing only when pgtable_l5_enabled is set.
+ */
+static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+
+	for (; addr < end; addr = next) {
+		p4d_t *p4dp = p4d_base + p4d_index(addr);
+		p4d_t p4d = READ_ONCE(*p4dp);
+		pud_t *pud_base;
+
+		next = p4d_addr_end(addr, end);
+
+		if (!p4d_present(p4d))
+			continue;
+
+		if (p4d_leaf(p4d)) {
+			/* Leaf entries are left alone unless the P4D level is in use. */
+			if (!pgtable_l5_enabled)
+				continue;
+
+			p4d_clear(p4dp);
+			if (is_vmemmap)
+				free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
+			continue;
+		}
+
+		pud_base = pud_offset(p4dp, 0);
+		remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
+
+		if (pgtable_l5_enabled)
+			free_pud_table(pud_base, p4dp);
+	}
+}
+
+/*
+ * Remove the kernel mappings for the virtual range [@va, @end).
+ *
+ * Walks the kernel page table one PGD entry at a time and hands every
+ * present, non-leaf entry to remove_p4d_mapping(). A full TLB flush is
+ * issued once the walk is complete.
+ *
+ * @is_vmemmap: when true, the pages backing the mappings are released
+ *              too; when false only the mappings are removed.
+ * @altmap:     alternative allocator for the backing pages; may be NULL.
+ */
+static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
+					 struct vmem_altmap *altmap)
+{
+	unsigned long addr = va;
+
+	while (addr < end) {
+		unsigned long next = pgd_addr_end(addr, end);
+		pgd_t *pgd = pgd_offset_k(addr);
+
+		/* Absent and leaf PGD entries are skipped. */
+		if (pgd_present(*pgd) && !pgd_leaf(*pgd))
+			remove_p4d_mapping(p4d_offset(pgd, 0), addr, next, is_vmemmap, altmap);
+
+		addr = next;
+	}
+
+	flush_tlb_all();
+}
+
+/*
+ * Remove the direct-map entries covering @size bytes of physical memory
+ * starting at @start. Only the mappings go away; the mapped pages
+ * themselves are not freed.
+ */
+static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
+{
+	unsigned long va_start = (unsigned long)__va(start);
+	unsigned long va_end = (unsigned long)__va(start + size);
+
+	remove_pgd_mapping(va_start, va_end, false, NULL);
+}
+
+/*
+ * Report the physical address range that the direct map can map:
+ * from __pa(PAGE_OFFSET) up to and including __pa(PAGE_END - 1).
+ */
+struct range arch_get_mappable_range(void)
+{
+	struct range mhp_range = {
+		.start = __pa(PAGE_OFFSET),
+		.end = __pa(PAGE_END - 1),
+	};
+
+	return mhp_range;
+}
+
+/*
+ * Hot-add @size bytes of physical memory starting at @start to node @nid.
+ *
+ * The range is first entered into the direct map with the protection
+ * from @params->pgprot and then handed to __add_pages(). If that fails,
+ * the direct-map entries are removed again.
+ *
+ * Returns 0 on success or the error code from __add_pages().
+ */
+int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
+{
+	unsigned long end_pfn = PFN_UP(start + size);
+	int ret;
+
+	create_linear_mapping_range(start, start + size, 0, &params->pgprot);
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
+	if (ret) {
+		remove_linear_mapping(start, size);
+		goto out;
+	}
+
+	/*
+	 * Only ever raise max_pfn: the new range may lie below memory that
+	 * is already present, and assigning unconditionally would then move
+	 * the recorded end of memory backwards.
+	 */
+	if (end_pfn > max_pfn)
+		max_pfn = end_pfn;
+	max_low_pfn = max_pfn;
+
+ out:
+	flush_tlb_all();
+	return ret;
+}
+
+/*
+ * Inverse of arch_add_memory(): hand the page range back through
+ * __remove_pages(), drop its direct-map entries and flush the TLB.
+ */
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+{
+	unsigned long first_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	__remove_pages(first_pfn, nr_pages, altmap);
+	remove_linear_mapping(start, size);
+	flush_tlb_all();
+}
+
+/*
+ * Tear down the vmemmap mappings for the virtual range [@start, @end)
+ * and release the pages backing them. @altmap, when non-NULL, is the
+ * alternative allocator the backing pages are returned to.
+ */
+void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
+{
+	/* is_vmemmap == true: free the backing storage, not just the mappings. */
+	remove_pgd_mapping(start, end, true, altmap);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */