[V3,4/4] arm64/mm: Enable memory hot remove

Message ID 1557824407-19092-5-git-send-email-anshuman.khandual@arm.com
State New
Series
  • arm64/mm: Enable memory hot remove

Commit Message

Anshuman Khandual May 14, 2019, 9 a.m. UTC
Memory removal from an arch perspective involves tearing down two different
kernel based mappings i.e vmemmap and linear while releasing related page
table and any mapped pages allocated for given physical memory range to be
removed.

Define a common kernel page table tear down helper remove_pagetable() which
can be used to unmap given kernel virtual address range. In effect it can
tear down both vmemap or kernel linear mappings. This new helper is called
from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.

For linear mapping there are no actual allocated pages which are mapped to
create the translation. Any pfn on a given entry is derived from physical
address (__va(PA) --> PA) whose linear translation is to be created. They
need not be freed as they were never allocated in the first place. But for
vmemmap which is a real virtual mapping (like vmalloc) physical pages are
allocated either from buddy or memblock which get mapped in the kernel page
table. These allocated and mapped pages need to be freed during translation
tear down. But page table pages need to be freed in both these cases.

These mappings need to be differentiated while deciding if a mapped page at
any level i.e [pte|pmd|pud]_page() should be freed or not. Callers for the
mapping tear down process should pass on 'sparse_vmap' variable identifying
kernel vmemmap mappings.

While here update arch_add_mempory() to handle __add_pages() failures by
just unmapping recently added kernel linear mapping. Now enable memory hot
remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.

This implementation is overall inspired from kernel page table tear down
procedure on X86 architecture.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/Kconfig  |   3 +
 arch/arm64/mm/mmu.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 205 insertions(+), 2 deletions(-)

Comments

David Hildenbrand May 14, 2019, 9:08 a.m. UTC | #1
On 14.05.19 11:00, Anshuman Khandual wrote:
> Memory removal from an arch perspective involves tearing down two different
> kernel based mappings i.e vmemmap and linear while releasing related page
> table and any mapped pages allocated for given physical memory range to be
> removed.
> 
> Define a common kernel page table tear down helper remove_pagetable() which
> can be used to unmap given kernel virtual address range. In effect it can
> tear down both vmemap or kernel linear mappings. This new helper is called
> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> 
> For linear mapping there are no actual allocated pages which are mapped to
> create the translation. Any pfn on a given entry is derived from physical
> address (__va(PA) --> PA) whose linear translation is to be created. They
> need not be freed as they were never allocated in the first place. But for
> vmemmap which is a real virtual mapping (like vmalloc) physical pages are
> allocated either from buddy or memblock which get mapped in the kernel page
> table. These allocated and mapped pages need to be freed during translation
> tear down. But page table pages need to be freed in both these cases.
> 
> These mappings need to be differentiated while deciding if a mapped page at
> any level i.e [pte|pmd|pud]_page() should be freed or not. Callers for the
> mapping tear down process should pass on 'sparse_vmap' variable identifying
> kernel vmemmap mappings.
> 
> While here update arch_add_mempory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. Now enable memory hot
> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
> 
> This implementation is overall inspired from kernel page table tear down
> procedure on X86 architecture.
> 
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>  arch/arm64/Kconfig  |   3 +
>  arch/arm64/mm/mmu.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 205 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1c0cb51..bb4e571 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -268,6 +268,9 @@ config HAVE_GENERIC_GUP
>  config ARCH_ENABLE_MEMORY_HOTPLUG
>  	def_bool y
>  
> +config ARCH_ENABLE_MEMORY_HOTREMOVE
> +	def_bool y
> +
>  config SMP
>  	def_bool y
>  
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 37a902c..bd2d003 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -733,6 +733,177 @@ int kern_addr_valid(unsigned long addr)
>  
>  	return pfn_valid(pte_pfn(pte));
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_hotplug_page_range(struct page *page, ssize_t size)
> +{
> +	WARN_ON(PageReserved(page));
> +	free_pages((unsigned long)page_address(page), get_order(size));
> +}
> +
> +static void free_hotplug_pgtable_page(struct page *page)
> +{
> +	free_hotplug_page_range(page, PAGE_SIZE);
> +}
> +
> +static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		if (!pte_none(ptep[i]))
> +			return;
> +	}
> +
> +	page = pmd_page(*pmdp);
> +	pmd_clear(pmdp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +
> +#if (CONFIG_PGTABLE_LEVELS > 2)
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		if (!pmd_none(pmdp[i]))
> +			return;
> +	}
> +
> +	page = pud_page(*pudp);
> +	pud_clear(pudp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +#else
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr) { }
> +#endif
> +
> +#if (CONFIG_PGTABLE_LEVELS > 3)
> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		if (!pud_none(pudp[i]))
> +			return;
> +	}
> +
> +	page = pgd_page(*pgdp);
> +	pgd_clear(pgdp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +#else
> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr) { }
> +#endif
> +
> +static void
> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	struct page *page;
> +	pte_t *ptep;
> +	unsigned long start = addr;
> +
> +	for (; addr < end; addr += PAGE_SIZE) {
> +		ptep = pte_offset_kernel(pmdp, addr);
> +		if (!pte_present(*ptep))
> +			continue;
> +
> +		if (sparse_vmap) {
> +			page = pte_page(READ_ONCE(*ptep));
> +			free_hotplug_page_range(page, PAGE_SIZE);
> +		}
> +		pte_clear(&init_mm, addr, ptep);
> +	}
> +	flush_tlb_kernel_range(start, end);
> +}
> +
> +static void
> +remove_pmd_table(pud_t *pudp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long next;
> +	struct page *page;
> +	pte_t *ptep_base;
> +	pmd_t *pmdp;
> +
> +	for (; addr < end; addr = next) {
> +		next = pmd_addr_end(addr, end);
> +		pmdp = pmd_offset(pudp, addr);
> +		if (!pmd_present(*pmdp))
> +			continue;
> +
> +		if (pmd_sect(*pmdp)) {
> +			if (sparse_vmap) {
> +				page = pmd_page(READ_ONCE(*pmdp));
> +				free_hotplug_page_range(page, PMD_SIZE);
> +			}
> +			pmd_clear(pmdp);
> +			continue;
> +		}
> +		ptep_base = pte_offset_kernel(pmdp, 0UL);
> +		remove_pte_table(pmdp, addr, next, sparse_vmap);
> +		free_pte_table(ptep_base, pmdp, addr);
> +	}
> +}
> +
> +static void
> +remove_pud_table(pgd_t *pgdp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long next;
> +	struct page *page;
> +	pmd_t *pmdp_base;
> +	pud_t *pudp;
> +
> +	for (; addr < end; addr = next) {
> +		next = pud_addr_end(addr, end);
> +		pudp = pud_offset(pgdp, addr);
> +		if (!pud_present(*pudp))
> +			continue;
> +
> +		if (pud_sect(*pudp)) {
> +			if (sparse_vmap) {
> +				page = pud_page(READ_ONCE(*pudp));
> +				free_hotplug_page_range(page, PUD_SIZE);
> +			}
> +			pud_clear(pudp);
> +			continue;
> +		}
> +		pmdp_base = pmd_offset(pudp, 0UL);
> +		remove_pmd_table(pudp, addr, next, sparse_vmap);
> +		free_pmd_table(pmdp_base, pudp, addr);
> +	}
> +}
> +
> +static void
> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long addr, next;
> +	pud_t *pudp_base;
> +	pgd_t *pgdp;
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	for (addr = start; addr < end; addr = next) {
> +		next = pgd_addr_end(addr, end);
> +		pgdp = pgd_offset_k(addr);
> +		if (!pgd_present(*pgdp))
> +			continue;
> +
> +		pudp_base = pud_offset(pgdp, 0UL);
> +		remove_pud_table(pgdp, addr, next, sparse_vmap);
> +		free_pud_table(pudp_base, pgdp, addr);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +#endif
> +
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>  #if !ARM64_SWAPPER_USES_SECTION_MAPS
>  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> @@ -780,6 +951,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>  void vmemmap_free(unsigned long start, unsigned long end,
>  		struct vmem_altmap *altmap)
>  {
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +	remove_pagetable(start, end, true);
> +#endif
>  }
>  #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
>  
> @@ -1070,10 +1244,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
> +{
> +	WARN_ON(pgdir != init_mm.pgd);
> +	remove_pagetable(start, start + size, false);
> +}
> +
>  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  		    bool want_memblock)
>  {
> -	int flags = 0;
> +	int ret, flags = 0;
>  
>  	if (rodata_full || debug_pagealloc_enabled())
>  		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> @@ -1081,7 +1261,27 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>  			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>  
> -	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
> +	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>  			   altmap, want_memblock);
> +	if (ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +					__phys_to_virt(start), size);
> +	return ret;
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)

You need to rebase to -next (linus master soon). This function is now a
void ...

> +{
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	unsigned long nr_pages = size >> PAGE_SHIFT;
> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +	int ret = 0;
> +
> +	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);

.. and this call can no longer fail :) Which simplifies this patch.

> +	if (!ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +					__phys_to_virt(start), size);
> +	return ret;
> +}
> +#endif
>  #endif
>
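
With those -next prototypes (both arch_remove_memory() and
__remove_pages() returning void), the removal path would reduce to
roughly the following (a sketch against those assumed signatures, not
the final patch):

void arch_remove_memory(int nid, u64 start, u64 size,
			struct vmem_altmap *altmap)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(start_pfn));

	/* __remove_pages() cannot fail, so always unmap afterwards. */
	__remove_pages(zone, start_pfn, nr_pages, altmap);
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}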
Mark Rutland May 15, 2019, 11:49 a.m. UTC | #2
Hi Anshuman,

On Tue, May 14, 2019 at 02:30:07PM +0530, Anshuman Khandual wrote:
> Memory removal from an arch perspective involves tearing down two different
> kernel based mappings i.e vmemmap and linear while releasing related page
> table and any mapped pages allocated for given physical memory range to be
> removed.
> 
> Define a common kernel page table tear down helper remove_pagetable() which
> can be used to unmap given kernel virtual address range. In effect it can
> tear down both vmemap or kernel linear mappings. This new helper is called
> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> 
> For linear mapping there are no actual allocated pages which are mapped to
> create the translation. Any pfn on a given entry is derived from physical
> address (__va(PA) --> PA) whose linear translation is to be created. They
> need not be freed as they were never allocated in the first place. But for
> vmemmap which is a real virtual mapping (like vmalloc) physical pages are
> allocated either from buddy or memblock which get mapped in the kernel page
> table. These allocated and mapped pages need to be freed during translation
> tear down. But page table pages need to be freed in both these cases.

As previously discussed, we should only hot-remove memory which was
hot-added, so we shouldn't encounter memory allocated from memblock.

> These mappings need to be differentiated while deciding if a mapped page at
> any level i.e [pte|pmd|pud]_page() should be freed or not. Callers for the
> mapping tear down process should pass on 'sparse_vmap' variable identifying
> kernel vmemmap mappings.

I think that you can simplify the paragraphs above down to:

  The arch code for hot-remove must tear down portions of the linear map
  and vmemmap corresponding to memory being removed. In both cases the
  page tables mapping these regions must be freed, and when sparse
  vmemmap is in use the memory backing the vmemmap must also be freed.

  This patch adds a new remove_pagetable() helper which can be used to
  tear down either region, and calls it from vmemmap_free() and
  ___remove_pgd_mapping(). The sparse_vmap argument determines whether
  the backing memory will be freed.

Could you add a paragraph describing when we can encounter partial
tables (for which we need the p??_none() checks)? IIUC that's not just
for cleaning up a failed hot-add, and it would be good to call that out.

> While here update arch_add_mempory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. Now enable memory hot
> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.

Nit: s/arch_add_mempory/arch_add_memory/.

[...]

> +#if (CONFIG_PGTABLE_LEVELS > 2)
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
> +{
> +	struct page *page;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		if (!pmd_none(pmdp[i]))
> +			return;
> +	}
> +
> +	page = pud_page(*pudp);
> +	pud_clear(pudp);
> +	__flush_tlb_kernel_pgtable(addr);
> +	free_hotplug_pgtable_page(page);
> +}
> +#else
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr) { }
> +#endif

Can we fold the check in and remove the ifdeferry? e.g.

static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
{
	struct page *page;
	int i;

	if (CONFIG_PGTABLE_LEVELS <= 2)
		return;
	
	...
}

... that would ensure that we always got build coverage here, and
minimize duplication. We do similar in map_kernel() and
early_fixmap_init() today.

Likewise for the other levels.

For arm64, the general policy is to use READ_ONCE() when reading a page
table entry (even if not strictly necessary), so please do so
consistently.

[...]

> +static void
> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
> +			unsigned long end, bool sparse_vmap)
> +{
> +	struct page *page;
> +	pte_t *ptep;
> +	unsigned long start = addr;
> +
> +	for (; addr < end; addr += PAGE_SIZE) {
> +		ptep = pte_offset_kernel(pmdp, addr);
> +		if (!pte_present(*ptep))
> +			continue;
> +
> +		if (sparse_vmap) {
> +			page = pte_page(READ_ONCE(*ptep));
> +			free_hotplug_page_range(page, PAGE_SIZE);
> +		}
> +		pte_clear(&init_mm, addr, ptep);
> +	}
> +	flush_tlb_kernel_range(start, end);
> +}

Please use a temporary pte variable here, e.g.

static void remove_pte_table(pmd_t *pmdp, unsigned long addr,
			     unsigned long end, bool sparse_vmap)
{
	unsigned long start = addr;
	struct page *page;
	pte_t *ptep, pte;

	for (; addr < end; addr += PAGE_SIZE) {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = READ_ONCE(*ptep);

		if (!pte_present(pte))
			continue;
		
		if (sparse_vmap) {
			page = pte_page(pte);
			free_hotplug_page_range(page, PAGE_SIZE);
		}

		pte_clear(&init_mm, addr, ptep);
	}

	flush_tlb_kernel_range(start, end);
}

Likewise for the other levels.

[...]

> +static void
> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
> +{
> +	unsigned long addr, next;
> +	pud_t *pudp_base;
> +	pgd_t *pgdp;
> +
> +	spin_lock(&init_mm.page_table_lock);

It would be good to explain why we need to take the ptl here.

IIUC that shouldn't be necessary for the linear map. Am I mistaken?

Is there a specific race when tearing down the vmemmap?

Thanks,
Mark.
Anshuman Khandual May 16, 2019, 5:34 a.m. UTC | #3
On 05/15/2019 05:19 PM, Mark Rutland wrote:
> Hi Anshuman,
> 
> On Tue, May 14, 2019 at 02:30:07PM +0530, Anshuman Khandual wrote:
>> Memory removal from an arch perspective involves tearing down two different
>> kernel based mappings i.e vmemmap and linear while releasing related page
>> table and any mapped pages allocated for given physical memory range to be
>> removed.
>>
>> Define a common kernel page table tear down helper remove_pagetable() which
>> can be used to unmap given kernel virtual address range. In effect it can
>> tear down both vmemap or kernel linear mappings. This new helper is called
>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>>
>> For linear mapping there are no actual allocated pages which are mapped to
>> create the translation. Any pfn on a given entry is derived from physical
>> address (__va(PA) --> PA) whose linear translation is to be created. They
>> need not be freed as they were never allocated in the first place. But for
>> vmemmap which is a real virtual mapping (like vmalloc) physical pages are
>> allocated either from buddy or memblock which get mapped in the kernel page
>> table. These allocated and mapped pages need to be freed during translation
>> tear down. But page table pages need to be freed in both these cases.
> 
> As previously discussed, we should only hot-remove memory which was
> hot-added, so we shouldn't encounter memory allocated from memblock.

Right, not applicable any more. Will drop this word.

> 
>> These mappings need to be differentiated while deciding if a mapped page at
>> any level i.e [pte|pmd|pud]_page() should be freed or not. Callers for the
>> mapping tear down process should pass on 'sparse_vmap' variable identifying
>> kernel vmemmap mappings.
> 
> I think that you can simplify the paragraphs above down to:
> 
>   The arch code for hot-remove must tear down portions of the linear map
>   and vmemmap corresponding to memory being removed. In both cases the
>   page tables mapping these regions must be freed, and when sparse
>   vmemmap is in use the memory backing the vmemmap must also be freed.
> 
>   This patch adds a new remove_pagetable() helper which can be used to
>   tear down either region, and calls it from vmemmap_free() and
>   ___remove_pgd_mapping(). The sparse_vmap argument determines whether
>   the backing memory will be freed.

The current one is a bit more descriptive on detail. Anyways will replace with
the above writeup if that is preferred.

> 
> Could you add a paragraph describing when we can encounter partial
> tables (for which we need the p??_none() checks)? IIUC that's not just
> for cleaning up a failed hot-add, and it would be good to call that out.

Sure, will do.

> 
>> While here update arch_add_mempory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
> 
> Nit: s/arch_add_mempory/arch_add_memory/.

Oops, will do.

> 
> [...]
> 
>> +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
>> +{
>> +	struct page *page;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PMD; i++) {
>> +		if (!pmd_none(pmdp[i]))
>> +			return;
>> +	}
>> +
>> +	page = pud_page(*pudp);
>> +	pud_clear(pudp);
>> +	__flush_tlb_kernel_pgtable(addr);
>> +	free_hotplug_pgtable_page(page);
>> +}
>> +#else
>> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr) { }
>> +#endif
> 
> Can we fold the check in and remove the ifdeferry? e.g.
> 
> static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
> {
> 	struct page *page;
> 	int i;
> 
> 	if (CONFIG_PGTABLE_LEVELS <= 2)
> 		return;
> 	
> 	...
> }
> 
> ... that would ensure that we always got build coverage here, and

That's true. This will get compiled for all combinations.

> minimize duplication. We do similar in map_kernel() and
> early_fixmap_init() today.
> 
> Likewise for the other levels.

Sure, will do.

> 
> For arm64, the general policy is to use READ_ONCE() when reading a page
> table entry (even if not strictly necessary), so please do so
> consistently.

For the likes of "page = p???_page(*p???p)" which got missed? Will fix it.
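
For example, with the check folded in and READ_ONCE() used when reading
the entries, the PMD level helper could end up roughly like the below
(just a sketch of the planned change):

static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
{
	struct page *page;
	pud_t pud;
	int i;

	if (CONFIG_PGTABLE_LEVELS <= 2)
		return;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(READ_ONCE(pmdp[i])))
			return;
	}

	pud = READ_ONCE(*pudp);
	page = pud_page(pud);
	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	free_hotplug_pgtable_page(page);
}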

> 
> [...]
> 
>> +static void
>> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
>> +			unsigned long end, bool sparse_vmap)
>> +{
>> +	struct page *page;
>> +	pte_t *ptep;
>> +	unsigned long start = addr;
>> +
>> +	for (; addr < end; addr += PAGE_SIZE) {
>> +		ptep = pte_offset_kernel(pmdp, addr);
>> +		if (!pte_present(*ptep))
>> +			continue;
>> +
>> +		if (sparse_vmap) {
>> +			page = pte_page(READ_ONCE(*ptep));
>> +			free_hotplug_page_range(page, PAGE_SIZE);
>> +		}
>> +		pte_clear(&init_mm, addr, ptep);
>> +	}
>> +	flush_tlb_kernel_range(start, end);
>> +}
> 
> Please use a temporary pte variable here, e.g.
> 
> static void remove_pte_table(pmd_t *pmdp, unsigned long addr,
> 			     unsigned long end, bool sparse_vmap)
> {
> 	unsigned long start = addr;
> 	struct page *page;
> 	pte_t *ptep, pte;
> 
> 	for (; addr < end; addr += PAGE_SIZE) {
> 		ptep = pte_offset_kernel(pmdp, addr);
> 		pte = READ_ONCE(*ptep);
> 
> 		if (!pte_present(pte))
> 			continue;
> 		
> 		if (sparse_vmap) {
> 			page = pte_page(pte);
> 			free_hotplug_page_range(page, PAGE_SIZE);
> 		}
> 
> 		pte_clear(&init_mm, addr, ptep);
> 	}
> 
> 	flush_tlb_kernel_range(start, end);
> }
> 
> Likewise for the other levels.

Makes sense. Will do.
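
Applying the same pattern at the PMD level would be something like the
below (sketch only):

static void
remove_pmd_table(pud_t *pudp, unsigned long addr,
		 unsigned long end, bool sparse_vmap)
{
	unsigned long next;
	struct page *page;
	pte_t *ptep_base;
	pmd_t *pmdp, pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);

		if (!pmd_present(pmd))
			continue;

		if (pmd_sect(pmd)) {
			if (sparse_vmap) {
				page = pmd_page(pmd);
				free_hotplug_page_range(page, PMD_SIZE);
			}
			pmd_clear(pmdp);
			continue;
		}

		ptep_base = pte_offset_kernel(pmdp, 0UL);
		remove_pte_table(pmdp, addr, next, sparse_vmap);
		free_pte_table(ptep_base, pmdp, addr);
	}
}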

> 
> [...]
> 
>> +static void
>> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
>> +{
>> +	unsigned long addr, next;
>> +	pud_t *pudp_base;
>> +	pgd_t *pgdp;
>> +
>> +	spin_lock(&init_mm.page_table_lock);
> 
> It would be good to explain why we need to take the ptl here.

Will update both commit message and add an in-code comment here.

> 
> IIUC that shouldn't be necessary for the linear map. Am I mistaken?

It's not absolutely necessary for linear map right now because both memory hot
plug & ptdump which modify or walk the page table ranges respectively take
memory hotplug lock. That apart, no other caller creates or destroys linear
mapping at runtime.

> 
> Is there a specific race when tearing down the vmemmap?

This is trickier than linear map. vmemmap additions would be protected with
memory hotplug lock but this can potentially collide with vmalloc/IO regions.
Even if they don't right now that will be because they don't share intermediate
page table levels.

Memory hot-remove is not a very performance critical path, not even as critical
as memory hot add. Hence it's not worth relying on the current non-overlapping
kernel virtual address range placement as a reason for not taking this lock,
which might become problematic if things change later. Let's be on the safer
side and keep this lock.
Mark Rutland May 16, 2019, 10:57 a.m. UTC | #4
On Thu, May 16, 2019 at 11:04:48AM +0530, Anshuman Khandual wrote:
> On 05/15/2019 05:19 PM, Mark Rutland wrote:
> > On Tue, May 14, 2019 at 02:30:07PM +0530, Anshuman Khandual wrote:
> >> Memory removal from an arch perspective involves tearing down two different
> >> kernel based mappings i.e vmemmap and linear while releasing related page
> >> table and any mapped pages allocated for given physical memory range to be
> >> removed.
> >>
> >> Define a common kernel page table tear down helper remove_pagetable() which
> >> can be used to unmap given kernel virtual address range. In effect it can
> >> tear down both vmemap or kernel linear mappings. This new helper is called
> >> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> >>
> >> For linear mapping there are no actual allocated pages which are mapped to
> >> create the translation. Any pfn on a given entry is derived from physical
> >> address (__va(PA) --> PA) whose linear translation is to be created. They
> >> need not be freed as they were never allocated in the first place. But for
> >> vmemmap which is a real virtual mapping (like vmalloc) physical pages are
> >> allocated either from buddy or memblock which get mapped in the kernel page
> >> table. These allocated and mapped pages need to be freed during translation
> >> tear down. But page table pages need to be freed in both these cases.
> > 
> > As previously discussed, we should only hot-remove memory which was
> > hot-added, so we shouldn't encounter memory allocated from memblock.
> 
> Right, not applicable any more. Will drop this word.
> 
> >> These mappings need to be differentiated while deciding if a mapped page at
> >> any level i.e [pte|pmd|pud]_page() should be freed or not. Callers for the
> >> mapping tear down process should pass on 'sparse_vmap' variable identifying
> >> kernel vmemmap mappings.
> > 
> > I think that you can simplify the paragraphs above down to:
> > 
> >   The arch code for hot-remove must tear down portions of the linear map
> >   and vmemmap corresponding to memory being removed. In both cases the
> >   page tables mapping these regions must be freed, and when sparse
> >   vmemmap is in use the memory backing the vmemmap must also be freed.
> > 
> >   This patch adds a new remove_pagetable() helper which can be used to
> >   tear down either region, and calls it from vmemmap_free() and
> >   ___remove_pgd_mapping(). The sparse_vmap argument determines whether
> >   the backing memory will be freed.
> 
> The current one is a bit more descriptive on detail. Anyways will replace with
> the above writeup if that is preferred.

I would prefer the suggested form above, as it's easier to extract the
necessary details from it.

[...]

> >> +static void
> >> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
> >> +{
> >> +	unsigned long addr, next;
> >> +	pud_t *pudp_base;
> >> +	pgd_t *pgdp;
> >> +
> >> +	spin_lock(&init_mm.page_table_lock);
> > 
> > It would be good to explain why we need to take the ptl here.
> 
> Will update both commit message and add an in-code comment here.
> 
> > 
> > IIUC that shouldn't be necessary for the linear map. Am I mistaken?
> 
> It's not absolutely necessary for linear map right now because both memory hot
> plug & ptdump which modify or walk the page table ranges respectively take
> memory hotplug lock. That apart, no other caller creates or destroys linear
> mapping at runtime.
> 
> > 
> > Is there a specific race when tearing down the vmemmap?
> 
> This is trickier than linear map. vmemmap additions would be protected with
> memory hotplug lock but this can potentially collide with vmalloc/IO regions.
> Even if they don't right now that will be because they don't share intermediate
> page table levels.

Sure; if we could just state something like:

  The vmemmap region may share levels of table with the vmalloc region.
  Take the ptl so that we can safely free potentially-shared tables.

... I think that would be sufficient.

Thanks,
Mark.
Anshuman Khandual May 17, 2019, 3:15 a.m. UTC | #5
On 05/16/2019 04:27 PM, Mark Rutland wrote:
> On Thu, May 16, 2019 at 11:04:48AM +0530, Anshuman Khandual wrote:
>> On 05/15/2019 05:19 PM, Mark Rutland wrote:
>>> On Tue, May 14, 2019 at 02:30:07PM +0530, Anshuman Khandual wrote:
>>>> Memory removal from an arch perspective involves tearing down two different
>>>> kernel based mappings i.e vmemmap and linear while releasing related page
>>>> table and any mapped pages allocated for given physical memory range to be
>>>> removed.
>>>>
>>>> Define a common kernel page table tear down helper remove_pagetable() which
>>>> can be used to unmap given kernel virtual address range. In effect it can
>>>> tear down both vmemap or kernel linear mappings. This new helper is called
>>>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>>>>
>>>> For linear mapping there are no actual allocated pages which are mapped to
>>>> create the translation. Any pfn on a given entry is derived from physical
>>>> address (__va(PA) --> PA) whose linear translation is to be created. They
>>>> need not be freed as they were never allocated in the first place. But for
>>>> vmemmap which is a real virtual mapping (like vmalloc) physical pages are
>>>> allocated either from buddy or memblock which get mapped in the kernel page
>>>> table. These allocated and mapped pages need to be freed during translation
>>>> tear down. But page table pages need to be freed in both these cases.
>>>
>>> As previously discussed, we should only hot-remove memory which was
>>> hot-added, so we shouldn't encounter memory allocated from memblock.
>>
>> Right, not applicable any more. Will drop this word.
>>
>>>> These mappings need to be differentiated while deciding if a mapped page at
>>>> any level i.e [pte|pmd|pud]_page() should be freed or not. Callers for the
>>>> mapping tear down process should pass on 'sparse_vmap' variable identifying
>>>> kernel vmemmap mappings.
>>>
>>> I think that you can simplify the paragraphs above down to:
>>>
>>>   The arch code for hot-remove must tear down portions of the linear map
>>>   and vmemmap corresponding to memory being removed. In both cases the
>>>   page tables mapping these regions must be freed, and when sparse
>>>   vmemmap is in use the memory backing the vmemmap must also be freed.
>>>
>>>   This patch adds a new remove_pagetable() helper which can be used to
>>>   tear down either region, and calls it from vmemmap_free() and
>>>   ___remove_pgd_mapping(). The sparse_vmap argument determines whether
>>>   the backing memory will be freed.
>>
>> The current one is a bit more descriptive on detail. Anyways will replace with
>> the above writeup if that is preferred.
> 
> I would prefer the suggested form above, as it's easier to extract the
> necessary details from it.

Fair enough.

> 
> [...]
> 
>>>> +static void
>>>> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
>>>> +{
>>>> +	unsigned long addr, next;
>>>> +	pud_t *pudp_base;
>>>> +	pgd_t *pgdp;
>>>> +
>>>> +	spin_lock(&init_mm.page_table_lock);
>>>
>>> It would be good to explain why we need to take the ptl here.
>>
>> Will update both commit message and add an in-code comment here.
>>
>>>
>>> IIUC that shouldn't be necessary for the linear map. Am I mistaken?
>>
>> It's not absolutely necessary for linear map right now because both memory hot
>> plug & ptdump which modify or walk the page table ranges respectively take
>> memory hotplug lock. That apart, no other caller creates or destroys linear
>> mapping at runtime.
>>
>>>
>>> Is there a specific race when tearing down the vmemmap?
>>
>> This is trickier than linear map. vmemmap additions would be protected with
>> memory hotplug lock but this can potentially collide with vmalloc/IO regions.
>> Even if they don't right now that will be because they don't share intermediate
>> page table levels.
> 
> Sure; if we could just state something like:
> 
>   The vmemmap region may share levels of table with the vmalloc region.
>   Take the ptl so that we can safely free potentially-shared tables.
> 
> ... I think that would be sufficient.

Will do.
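
i.e. something along these lines in remove_pagetable() (sketch):

static void
remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
{
	unsigned long addr, next;
	pud_t *pudp_base;
	pgd_t *pgdp;

	/*
	 * The vmemmap region may share levels of table with the vmalloc
	 * region. Take the ptl so that we can safely free
	 * potentially-shared tables.
	 */
	spin_lock(&init_mm.page_table_lock);
	...
}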

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1c0cb51..bb4e571 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -268,6 +268,9 @@  config HAVE_GENERIC_GUP
 config ARCH_ENABLE_MEMORY_HOTPLUG
 	def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+	def_bool y
+
 config SMP
 	def_bool y
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 37a902c..bd2d003 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -733,6 +733,177 @@  int kern_addr_valid(unsigned long addr)
 
 	return pfn_valid(pte_pfn(pte));
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, ssize_t size)
+{
+	WARN_ON(PageReserved(page));
+	free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+	free_hotplug_page_range(page, PAGE_SIZE);
+}
+
+static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		if (!pte_none(ptep[i]))
+			return;
+	}
+
+	page = pmd_page(*pmdp);
+	pmd_clear(pmdp);
+	__flush_tlb_kernel_pgtable(addr);
+	free_hotplug_pgtable_page(page);
+}
+
+#if (CONFIG_PGTABLE_LEVELS > 2)
+static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(pmdp[i]))
+			return;
+	}
+
+	page = pud_page(*pudp);
+	pud_clear(pudp);
+	__flush_tlb_kernel_pgtable(addr);
+	free_hotplug_pgtable_page(page);
+}
+#else
+static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr) { }
+#endif
+
+#if (CONFIG_PGTABLE_LEVELS > 3)
+static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		if (!pud_none(pudp[i]))
+			return;
+	}
+
+	page = pgd_page(*pgdp);
+	pgd_clear(pgdp);
+	__flush_tlb_kernel_pgtable(addr);
+	free_hotplug_pgtable_page(page);
+}
+#else
+static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr) { }
+#endif
+
+static void
+remove_pte_table(pmd_t *pmdp, unsigned long addr,
+			unsigned long end, bool sparse_vmap)
+{
+	struct page *page;
+	pte_t *ptep;
+	unsigned long start = addr;
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		ptep = pte_offset_kernel(pmdp, addr);
+		if (!pte_present(*ptep))
+			continue;
+
+		if (sparse_vmap) {
+			page = pte_page(READ_ONCE(*ptep));
+			free_hotplug_page_range(page, PAGE_SIZE);
+		}
+		pte_clear(&init_mm, addr, ptep);
+	}
+	flush_tlb_kernel_range(start, end);
+}
+
+static void
+remove_pmd_table(pud_t *pudp, unsigned long addr,
+			unsigned long end, bool sparse_vmap)
+{
+	unsigned long next;
+	struct page *page;
+	pte_t *ptep_base;
+	pmd_t *pmdp;
+
+	for (; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		if (!pmd_present(*pmdp))
+			continue;
+
+		if (pmd_sect(*pmdp)) {
+			if (sparse_vmap) {
+				page = pmd_page(READ_ONCE(*pmdp));
+				free_hotplug_page_range(page, PMD_SIZE);
+			}
+			pmd_clear(pmdp);
+			continue;
+		}
+		ptep_base = pte_offset_kernel(pmdp, 0UL);
+		remove_pte_table(pmdp, addr, next, sparse_vmap);
+		free_pte_table(ptep_base, pmdp, addr);
+	}
+}
+
+static void
+remove_pud_table(pgd_t *pgdp, unsigned long addr,
+			unsigned long end, bool sparse_vmap)
+{
+	unsigned long next;
+	struct page *page;
+	pmd_t *pmdp_base;
+	pud_t *pudp;
+
+	for (; addr < end; addr = next) {
+		next = pud_addr_end(addr, end);
+		pudp = pud_offset(pgdp, addr);
+		if (!pud_present(*pudp))
+			continue;
+
+		if (pud_sect(*pudp)) {
+			if (sparse_vmap) {
+				page = pud_page(READ_ONCE(*pudp));
+				free_hotplug_page_range(page, PUD_SIZE);
+			}
+			pud_clear(pudp);
+			continue;
+		}
+		pmdp_base = pmd_offset(pudp, 0UL);
+		remove_pmd_table(pudp, addr, next, sparse_vmap);
+		free_pmd_table(pmdp_base, pudp, addr);
+	}
+}
+
+static void
+remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
+{
+	unsigned long addr, next;
+	pud_t *pudp_base;
+	pgd_t *pgdp;
+
+	spin_lock(&init_mm.page_table_lock);
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+		pgdp = pgd_offset_k(addr);
+		if (!pgd_present(*pgdp))
+			continue;
+
+		pudp_base = pud_offset(pgdp, 0UL);
+		remove_pud_table(pgdp, addr, next, sparse_vmap);
+		free_pud_table(pudp_base, pgdp, addr);
+	}
+	spin_unlock(&init_mm.page_table_lock);
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -780,6 +951,9 @@  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	remove_pagetable(start, end, true);
+#endif
 }
 #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -1070,10 +1244,16 @@  int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+	WARN_ON(pgdir != init_mm.pgd);
+	remove_pagetable(start, start + size, false);
+}
+
 int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 		    bool want_memblock)
 {
-	int flags = 0;
+	int ret, flags = 0;
 
 	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1081,7 +1261,27 @@  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
 			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
 
-	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   altmap, want_memblock);
+	if (ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+					__phys_to_virt(start), size);
+	return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone = page_zone(pfn_to_page(start_pfn));
+	int ret = 0;
+
+	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+	if (!ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+					__phys_to_virt(start), size);
+	return ret;
+}
+#endif
 #endif