@@ -273,6 +273,9 @@ config ZONE_DMA32
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ def_bool y
+
config SMP
def_bool y
@@ -722,6 +722,250 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(pte));
}
+
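+/*
+ * Return a runtime allocated page range (e.g. vmemmap backing pages or
+ * a page table page) to the buddy allocator. The WARN_ON() catches
+ * attempts to free reserved (i.e. boot time allocated) memory.
+ */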
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size)
+{
+ WARN_ON(!page || PageReserved(page));
+ free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+ free_hotplug_page_range(page, PAGE_SIZE);
+}
+
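+/*
+ * free_pte_table(), free_pmd_table() and free_pud_table() free a page
+ * table page at the given level once every entry in it has been
+ * cleared: the parent entry is cleared, its walk cache entry is
+ * invalidated via __flush_tlb_kernel_pgtable() and the table page is
+ * freed. The PMD and PUD variants are no-ops when that level is folded.
+ */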
+static void free_pte_table(pmd_t *pmdp, unsigned long addr)
+{
+ struct page *page;
+ pte_t *ptep;
+ int i;
+
+ ptep = pte_offset_kernel(pmdp, 0UL);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ if (!pte_none(READ_ONCE(ptep[i])))
+ return;
+ }
+
+ page = pmd_page(READ_ONCE(*pmdp));
+ pmd_clear(pmdp);
+ __flush_tlb_kernel_pgtable(addr);
+ free_hotplug_pgtable_page(page);
+}
+
+static void free_pmd_table(pud_t *pudp, unsigned long addr)
+{
+ struct page *page;
+ pmd_t *pmdp;
+ int i;
+
+ if (CONFIG_PGTABLE_LEVELS <= 2)
+ return;
+
+ pmdp = pmd_offset(pudp, 0UL);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(READ_ONCE(pmdp[i])))
+ return;
+ }
+
+ page = pud_page(READ_ONCE(*pudp));
+ pud_clear(pudp);
+ __flush_tlb_kernel_pgtable(addr);
+ free_hotplug_pgtable_page(page);
+}
+
+static void free_pud_table(pgd_t *pgdp, unsigned long addr)
+{
+ struct page *page;
+ pud_t *pudp;
+ int i;
+
+ if (CONFIG_PGTABLE_LEVELS <= 3)
+ return;
+
+ pudp = pud_offset(pgdp, 0UL);
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (!pud_none(READ_ONCE(pudp[i])))
+ return;
+ }
+
+ page = pgd_page(READ_ONCE(*pgdp));
+ pgd_clear(pgdp);
+ __flush_tlb_kernel_pgtable(addr);
+ free_hotplug_pgtable_page(page);
+}
+
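+/*
+ * The unmap_hotplug_*_range() walkers clear all leaf entries mapping
+ * [addr, end), flush the TLB for the unmapped range and, when
+ * sparse_vmap is true (the vmemmap case), also free the pages that
+ * backed the mapping.
+ */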
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, bool sparse_vmap)
+{
+ struct page *page;
+ pte_t *ptep, pte;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+ if (pte_none(pte))
+ continue;
+
+ WARN_ON(!pte_present(pte));
+ page = sparse_vmap ? pte_page(pte) : NULL;
+ pte_clear(&init_mm, addr, ptep);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (sparse_vmap)
+ free_hotplug_page_range(page, PAGE_SIZE);
+ } while (addr += PAGE_SIZE, addr < end);
+}
+
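+/*
+ * A PMD entry can be a section (block) mapping covering PMD_SIZE, which
+ * is cleared and freed as a whole, or a table entry, in which case the
+ * walk descends to the PTE level.
+ */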
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+ unsigned long end, bool sparse_vmap)
+{
+ unsigned long next;
+ struct page *page;
+ pmd_t *pmdp, pmd;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd));
+ if (pmd_sect(pmd)) {
+ page = sparse_vmap ? pmd_page(pmd) : NULL;
+ pmd_clear(pmdp);
+ flush_tlb_kernel_range(addr, next);
+ if (sparse_vmap)
+ free_hotplug_page_range(page, PMD_SIZE);
+ continue;
+ }
+ WARN_ON(!pmd_table(pmd));
+ unmap_hotplug_pte_range(pmdp, addr, next, sparse_vmap);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_pud_range(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, bool sparse_vmap)
+{
+ unsigned long next;
+ struct page *page;
+ pud_t *pudp, pud;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(pgdp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud));
+ if (pud_sect(pud)) {
+ page = sparse_vmap ? pud_page(pud) : NULL;
+ pud_clear(pudp);
+ flush_tlb_kernel_range(addr, next);
+ if (sparse_vmap)
+ free_hotplug_page_range(page, PUD_SIZE);
+ continue;
+ }
+ WARN_ON(!pud_table(pud));
+ unmap_hotplug_pmd_range(pudp, addr, next, sparse_vmap);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+ bool sparse_vmap)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ unmap_hotplug_pud_range(pgdp, addr, next, sparse_vmap);
+ } while (addr = next, addr < end);
+}
+
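+/*
+ * The free_empty_*_table() walkers run after unmap_hotplug_range() and
+ * release intermediate page table pages that are now empty. At the PTE
+ * level there is nothing left to free, so the walk only sanity checks
+ * that every entry has already been cleared.
+ */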
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
+ unsigned long end)
+{
+ pte_t *ptep, pte;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+ WARN_ON(!pte_none(pte));
+ } while (addr += PAGE_SIZE, addr < end);
+}
+
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmdp, pmd;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+ free_empty_pte_table(pmdp, addr, next);
+ free_pte_table(pmdp, addr);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_pud_table(pgd_t *pgdp, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(pgdp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+ free_empty_pmd_table(pudp, addr, next);
+ free_pmd_table(pudp, addr);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_tables(unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ free_empty_pud_table(pgdp, addr, next);
+ free_pud_table(pgdp, addr);
+ } while (addr = next, addr < end);
+}
+
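+/*
+ * Tear down the kernel mapping of [start, end) in two passes: first
+ * unmap (and optionally free) the leaf entries, then free the page
+ * table pages that have become empty.
+ */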
+static void remove_pagetable(unsigned long start, unsigned long end,
+ bool sparse_vmap)
+{
+ unmap_hotplug_range(start, end, sparse_vmap);
+ free_empty_tables(start, end);
+}
+#endif
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#if !ARM64_SWAPPER_USES_SECTION_MAPS
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -769,6 +1013,27 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * FIXME: We should have called remove_pagetable(start, end, true).
+	 * The vmemmap and vmalloc virtual ranges might share intermediate
+	 * kernel page table entries. Removing the vmemmap range's page table
+	 * pages here could therefore conflict with a concurrent vmalloc()
+	 * allocation.
+	 *
+	 * This is primarily because vmalloc() does not take the init_mm ptl
+	 * for the entire page table walk and its modification. Instead it
+	 * just takes the lock while allocating and installing page table
+	 * pages via [p4d|pud|pmd|pte]_alloc(). A page table entry vanishing
+	 * concurrently via memory hot remove can leave vmalloc()'s kernel
+	 * page table walk pointers invalid on the fly, which can cause
+	 * corruption or, worse, a crash.
+	 *
+	 * To avoid this problem, let's not free empty page table pages for
+	 * the vmemmap range being hot-removed here. Just unmap and free the
+	 * range instead.
+ */
+ unmap_hotplug_range(start, end, true);
+#endif
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
@@ -1060,10 +1325,18 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
}
#ifdef CONFIG_MEMORY_HOTPLUG
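+/*
+ * Tear down the linear mapping of a hot-removed (or failed to hot-add)
+ * range and free the now empty page table pages. sparse_vmap is false
+ * here: the mapped pages are the memory being removed and must not be
+ * freed by this path.
+ */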
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+ unsigned long end = start + size;
+
+ WARN_ON(pgdir != init_mm.pgd);
+ remove_pagetable(start, end, false);
+}
+
int arch_add_memory(int nid, u64 start, u64 size,
struct mhp_restrictions *restrictions)
{
- int flags = 0;
+ int ret, flags = 0;
if (rodata_full || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1071,9 +1344,14 @@ int arch_add_memory(int nid, u64 start, u64 size,
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
- return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+ ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
restrictions);
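+	/* Tear down the linear mapping created above if adding pages failed */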
+ if (ret)
+ __remove_pgd_mapping(swapper_pg_dir,
+ __phys_to_virt(start), size);
+ return ret;
}
+
void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
{
@@ -1081,14 +1359,8 @@ void arch_remove_memory(int nid, u64 start, u64 size,
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
- /*
- * FIXME: Cleanup page tables (also in arch_add_memory() in case
- * adding fails). Until then, this function should only be used
- * during memory hotplug (adding memory), not for memory
- * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
- * unlocked yet.
- */
zone = page_zone(pfn_to_page(start_pfn));
__remove_pages(zone, start_pfn, nr_pages, altmap);
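+	/* Finally drop the kernel linear mapping for the removed range */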
+ __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
#endif