[2/6] arm64/mm: Enable memory hot remove

Message ID 1554265806-11501-3-git-send-email-anshuman.khandual@arm.com (mailing list archive)
State New, archived
Series arm64/mm: Enable memory hot remove and ZONE_DEVICE

Commit Message

Anshuman Khandual April 3, 2019, 4:30 a.m. UTC
Memory removal from an arch perspective involves tearing down two different
kernel-based mappings, i.e. vmemmap and linear, while releasing the related
page table pages allocated for the physical memory range being removed.

Define a common kernel page table tear down helper remove_pagetable() which
can be used to unmap a given kernel virtual address range. In effect it can
tear down both vmemmap and kernel linear mappings. This new helper is called
from both vmemmap_free() and ___remove_pgd_mapping() during memory removal.
The argument 'direct' identifies kernel linear mappings.

Page table pages for vmemmap mappings are allocated through sparse mem
helper functions like vmemmap_alloc_block(), which do not cycle the pages
through pgtable_page_ctor(). Hence the removal path skips the corresponding
destructor pgtable_page_dtor() for them.

While here, update arch_add_memory() to handle __add_pages() failures by
unmapping the just-added kernel linear mapping. Now enable memory hot
remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.

This implementation is broadly inspired by the kernel page table tear down
procedure on the x86 architecture.
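
In outline (mirroring the hunks below), the new helper ends up with just two
callers, which differ only in the 'direct' flag:

	/* vmemmap teardown: the backing pages are freed as well */
	remove_pagetable(start, end, false);

	/* linear ("direct") map teardown: only the mapping is removed */
	remove_pagetable(start, start + size, true);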

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/Kconfig               |   3 +
 arch/arm64/include/asm/pgtable.h |  14 +++
 arch/arm64/mm/mmu.c              | 227 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 241 insertions(+), 3 deletions(-)

Comments

Robin Murphy April 3, 2019, 12:37 p.m. UTC | #1
[ +Steve ]

Hi Anshuman,

On 03/04/2019 05:30, Anshuman Khandual wrote:
> Memory removal from an arch perspective involves tearing down two different
> kernel based mappings i.e vmemmap and linear while releasing related page
> table pages allocated for the physical memory range to be removed.
> 
> Define a common kernel page table tear down helper remove_pagetable() which
> can be used to unmap given kernel virtual address range. In effect it can
> tear down both vmemap or kernel linear mappings. This new helper is called
> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> The argument 'direct' here identifies kernel linear mappings.
> 
> Vmemmap mappings page table pages are allocated through sparse mem helper
> functions like vmemmap_alloc_block() which does not cycle the pages through
> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
> destructor construct pgtable_page_dtor().
> 
> While here update arch_add_mempory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. Now enable memory hot
> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
> 
> This implementation is overall inspired from kernel page table tear down
> procedure on X86 architecture.

A bit of a nit, but since this depends on at least patch #4 to work 
properly, it would be good to reorder the series appropriately.
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>   arch/arm64/Kconfig               |   3 +
>   arch/arm64/include/asm/pgtable.h |  14 +++
>   arch/arm64/mm/mmu.c              | 227 ++++++++++++++++++++++++++++++++++++++-
>   3 files changed, 241 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index a2418fb..db3e625 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -266,6 +266,9 @@ config HAVE_GENERIC_GUP
>   config ARCH_ENABLE_MEMORY_HOTPLUG
>   	def_bool y
>   
> +config ARCH_ENABLE_MEMORY_HOTREMOVE
> +	def_bool y
> +
>   config ARCH_MEMORY_PROBE
>   	bool "Enable /sys/devices/system/memory/probe interface"
>   	depends on MEMORY_HOTPLUG
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index de70c1e..858098e 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -355,6 +355,18 @@ static inline int pmd_protnone(pmd_t pmd)
>   }
>   #endif
>   
> +#if (CONFIG_PGTABLE_LEVELS > 2)
> +#define pmd_large(pmd)	(pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
> +#else
> +#define pmd_large(pmd) 0
> +#endif
> +
> +#if (CONFIG_PGTABLE_LEVELS > 3)
> +#define pud_large(pud)	(pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT))
> +#else
> +#define pud_large(pmd) 0
> +#endif

These seem rather different from the versions that Steve is proposing in 
the generic pagewalk series - can you reach an agreement on which 
implementation is preferred?

> +
>   /*
>    * THP definitions.
>    */
> @@ -555,6 +567,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
>   
>   #else
>   
> +#define pmd_index(addr) 0
>   #define pud_page_paddr(pud)	({ BUILD_BUG(); 0; })
>   
>   /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
> @@ -612,6 +625,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
>   
>   #else
>   
> +#define pud_index(adrr)	0
>   #define pgd_page_paddr(pgd)	({ BUILD_BUG(); 0;})
>   
>   /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index e97f018..ae0777b 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -714,6 +714,198 @@ int kern_addr_valid(unsigned long addr)
>   
>   	return pfn_valid(pte_pfn(pte));
>   }
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void __meminit free_pagetable(struct page *page, int order)

Do these need to be __meminit? AFAICS it's effectively redundant with 
the containing #ifdef, and removal feels like it's inherently a 
later-than-init thing anyway.

> +{
> +	unsigned long magic;
> +	unsigned int nr_pages = 1 << order;
> +
> +	if (PageReserved(page)) {
> +		__ClearPageReserved(page);
> +
> +		magic = (unsigned long)page->freelist;
> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
> +			while (nr_pages--)
> +				put_page_bootmem(page++);
> +		} else
> +			while (nr_pages--)
> +				free_reserved_page(page++);
> +	} else
> +		free_pages((unsigned long)page_address(page), order);
> +}
> +
> +#if (CONFIG_PGTABLE_LEVELS > 2)
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
> +{
> +	pte_t *pte;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = pte_start + i;
> +		if (!pte_none(*pte))
> +			return;
> +	}
> +
> +	if (direct)
> +		pgtable_page_dtor(pmd_page(*pmd));
> +	free_pagetable(pmd_page(*pmd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pmd_clear(pmd);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +#else
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
> +{
> +}
> +#endif
> +
> +#if (CONFIG_PGTABLE_LEVELS > 3)
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
> +{
> +	pmd_t *pmd;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		pmd = pmd_start + i;
> +		if (!pmd_none(*pmd))
> +			return;
> +	}
> +
> +	if (direct)
> +		pgtable_page_dtor(pud_page(*pud));
> +	free_pagetable(pud_page(*pud), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pud_clear(pud);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
> +{
> +	pud_t *pud;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		pud = pud_start + i;
> +		if (!pud_none(*pud))
> +			return;
> +	}
> +
> +	if (direct)
> +		pgtable_page_dtor(pgd_page(*pgd));
> +	free_pagetable(pgd_page(*pgd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pgd_clear(pgd);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +#else
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
> +{
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
> +{
> +}
> +#endif
> +
> +static void __meminit
> +remove_pte_table(pte_t *pte_start, unsigned long addr,
> +			unsigned long end, bool direct)
> +{
> +	pte_t *pte;
> +
> +	pte = pte_start + pte_index(addr);
> +	for (; addr < end; addr += PAGE_SIZE, pte++) {
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		if (!direct)
> +			free_pagetable(pte_page(*pte), 0);
> +		spin_lock(&init_mm.page_table_lock);
> +		pte_clear(&init_mm, addr, pte);
> +		spin_unlock(&init_mm.page_table_lock);
> +	}
> +}
> +
> +static void __meminit
> +remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> +			unsigned long end, bool direct)
> +{
> +	unsigned long next;
> +	pte_t *pte_base;
> +	pmd_t *pmd;
> +
> +	pmd = pmd_start + pmd_index(addr);
> +	for (; addr < end; addr = next, pmd++) {
> +		next = pmd_addr_end(addr, end);
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if (!direct)
> +				free_pagetable(pmd_page(*pmd),
> +						get_order(PMD_SIZE));
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_clear(pmd);
> +			spin_unlock(&init_mm.page_table_lock);
> +			continue;
> +		}
> +		pte_base = pte_offset_kernel(pmd, 0UL);
> +		remove_pte_table(pte_base, addr, next, direct);
> +		free_pte_table(pte_base, pmd, direct);
> +	}
> +}
> +
> +static void __meminit
> +remove_pud_table(pud_t *pud_start, unsigned long addr,
> +			unsigned long end, bool direct)
> +{
> +	unsigned long next;
> +	pmd_t *pmd_base;
> +	pud_t *pud;
> +
> +	pud = pud_start + pud_index(addr);
> +	for (; addr < end; addr = next, pud++) {
> +		next = pud_addr_end(addr, end);
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if (!direct)
> +				free_pagetable(pud_page(*pud),
> +						get_order(PUD_SIZE));
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_clear(pud);
> +			spin_unlock(&init_mm.page_table_lock);
> +			continue;
> +		}
> +		pmd_base = pmd_offset(pud, 0UL);
> +		remove_pmd_table(pmd_base, addr, next, direct);
> +		free_pmd_table(pmd_base, pud, direct);
> +	}
> +}
> +
> +static void __meminit
> +remove_pagetable(unsigned long start, unsigned long end, bool direct)
> +{
> +	unsigned long addr, next;
> +	pud_t *pud_base;
> +	pgd_t *pgd;
> +
> +	for (addr = start; addr < end; addr = next) {
> +		next = pgd_addr_end(addr, end);
> +		pgd = pgd_offset_k(addr);
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		pud_base = pud_offset(pgd, 0UL);
> +		remove_pud_table(pud_base, addr, next, direct);
> +		free_pud_table(pud_base, pgd, direct);
> +	}
> +	flush_tlb_kernel_range(start, end);
> +}
> +#endif
> +
>   #ifdef CONFIG_SPARSEMEM_VMEMMAP
>   #if !ARM64_SWAPPER_USES_SECTION_MAPS
>   int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> @@ -758,9 +950,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>   	return 0;
>   }
>   #endif	/* CONFIG_ARM64_64K_PAGES */
> -void vmemmap_free(unsigned long start, unsigned long end,
> +void __ref vmemmap_free(unsigned long start, unsigned long end,

Why is the __ref needed? Presumably it's avoidable by addressing the 
__meminit thing above.

>   		struct vmem_altmap *altmap)
>   {
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +	remove_pagetable(start, end, false);
> +#endif
>   }
>   #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
>   
> @@ -1046,10 +1241,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>   }
>   
>   #ifdef CONFIG_MEMORY_HOTPLUG
> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
> +{
> +	WARN_ON(pgdir != init_mm.pgd);
> +	remove_pagetable(start, start + size, true);
> +}
> +
>   int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>   		    bool want_memblock)
>   {
> -	int flags = 0;
> +	int flags = 0, ret = 0;

Initialising ret here is unnecessary.

Robin.

>   
>   	if (rodata_full || debug_pagealloc_enabled())
>   		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> @@ -1057,7 +1258,27 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>   	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>   			     size, PAGE_KERNEL, pgd_pgtable_alloc, flags);
>   
> -	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
> +	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>   			   altmap, want_memblock);
> +	if (ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +					__phys_to_virt(start), size);
> +	return ret;
>   }
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
> +{
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	unsigned long nr_pages = size >> PAGE_SHIFT;
> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +	int ret;
> +
> +	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
> +	if (!ret)
> +		__remove_pgd_mapping(swapper_pg_dir,
> +					__phys_to_virt(start), size);
> +	return ret;
> +}
> +#endif
>   #endif
>
Steven Price April 3, 2019, 1:15 p.m. UTC | #2
On 03/04/2019 13:37, Robin Murphy wrote:
> [ +Steve ]
> 
> Hi Anshuman,
> 
> On 03/04/2019 05:30, Anshuman Khandual wrote:

<snip>

>> diff --git a/arch/arm64/include/asm/pgtable.h
>> b/arch/arm64/include/asm/pgtable.h
>> index de70c1e..858098e 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -355,6 +355,18 @@ static inline int pmd_protnone(pmd_t pmd)
>>   }
>>   #endif
>>   +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +#define pmd_large(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) &
>> PMD_TABLE_BIT))
>> +#else
>> +#define pmd_large(pmd) 0
>> +#endif
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>> +#define pud_large(pud)    (pud_val(pud) && !(pud_val(pud) &
>> PUD_TABLE_BIT))
>> +#else
>> +#define pud_large(pmd) 0
>> +#endif
> 
> These seem rather different from the versions that Steve is proposing in
> the generic pagewalk series - can you reach an agreement on which
> implementation is preferred?

Indeed this doesn't match the version in my series, although it is quite
similar.

My desire is that p?d_large represents the hardware architectural
definition of large page/huge page/section (pick your naming). Although
now that I look more closely, this is actually broken in my series (I'll
fix that up and send a new version shortly) - p?d_sect() is similarly
conditional.

Is there a good reason not to use the existing p?d_sect() macros
available on arm64?

I'm also surprised by the CONFIG_PGTABLE_LEVELS conditions as they don't
match the existing conditions for p?d_sect(). Might be worth double
checking it actually does what you expect.
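
For reference, the existing arm64 helpers look roughly like this (paraphrased
from <asm/pgtable.h> of this era; the exact guards may differ slightly):

#define pmd_sect(pmd)	((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_SECT)

#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
#define pud_sect(pud)	(0)
#else
#define pud_sect(pud)	((pud_val(pud) & PUD_TYPE_MASK) == PUD_TYPE_SECT)
#endif

Note that pud_sect() is also conditional on CONFIG_ARM64_64K_PAGES, not just
on the number of page table levels.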

Steve
Logan Gunthorpe April 3, 2019, 5:32 p.m. UTC | #3
On 2019-04-02 10:30 p.m., Anshuman Khandual wrote:
> Memory removal from an arch perspective involves tearing down two different
> kernel based mappings i.e vmemmap and linear while releasing related page
> table pages allocated for the physical memory range to be removed.
> 
> Define a common kernel page table tear down helper remove_pagetable() which
> can be used to unmap given kernel virtual address range. In effect it can
> tear down both vmemap or kernel linear mappings. This new helper is called
> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> The argument 'direct' here identifies kernel linear mappings.
> 
> Vmemmap mappings page table pages are allocated through sparse mem helper
> functions like vmemmap_alloc_block() which does not cycle the pages through
> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
> destructor construct pgtable_page_dtor().
> 
> While here update arch_add_mempory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. Now enable memory hot
> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
> 
> This implementation is overall inspired from kernel page table tear down
> procedure on X86 architecture.

I've been working on very similar things for RISC-V. In fact, I'm
currently in progress on a very similar stripped down version of
remove_pagetable(). (Though I'm fairly certain I've done a bunch of
stuff wrong.)

Would it be possible to move this work into common code that can be used
by all arches? Seems like, to start, we should be able to support both
arm64 and RISC-V... and maybe even x86 too.

I'd be happy to help integrate and test such functions in RISC-V.

Thanks,

Logan
Robin Murphy April 3, 2019, 5:57 p.m. UTC | #4
On 03/04/2019 18:32, Logan Gunthorpe wrote:
> 
> 
> On 2019-04-02 10:30 p.m., Anshuman Khandual wrote:
>> Memory removal from an arch perspective involves tearing down two different
>> kernel based mappings i.e vmemmap and linear while releasing related page
>> table pages allocated for the physical memory range to be removed.
>>
>> Define a common kernel page table tear down helper remove_pagetable() which
>> can be used to unmap given kernel virtual address range. In effect it can
>> tear down both vmemap or kernel linear mappings. This new helper is called
>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>> The argument 'direct' here identifies kernel linear mappings.
>>
>> Vmemmap mappings page table pages are allocated through sparse mem helper
>> functions like vmemmap_alloc_block() which does not cycle the pages through
>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>> destructor construct pgtable_page_dtor().
>>
>> While here update arch_add_mempory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture.
> 
> I've been working on very similar things for RISC-V. In fact, I'm
> currently in progress on a very similar stripped down version of
> remove_pagetable(). (Though I'm fairly certain I've done a bunch of
> stuff wrong.)
> 
> Would it be possible to move this work into common code that can be used
> by all arches? Seems like, to start, we should be able to support both
> arm64 and RISC-V... and maybe even x86 too.
> 
> I'd be happy to help integrate and test such functions in RISC-V.

Indeed, I had hoped we might be able to piggyback off generic code for 
this anyway, given that we have generic pagetable code which knows how 
to free process pagetables, and kernel pagetables are also pagetables.

I did actually hack up such a patch[1], and other than 
p?d_none_or_clear_bad() being loud it does actually appear to function 
OK in terms of withstanding repeated add/remove cycles and not crashing, 
but all the pagetable accounting and other stuff I don't really know 
about mean it's probably not viable without a lot more core work.

Robin.

[1] 
http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=75934a2c4f737ad9f26903861108d5b0658e86bb
Anshuman Khandual April 4, 2019, 5:39 a.m. UTC | #5
On 04/03/2019 06:07 PM, Robin Murphy wrote:
> [ +Steve ]
> 
> Hi Anshuman,
> 
> On 03/04/2019 05:30, Anshuman Khandual wrote:
>> Memory removal from an arch perspective involves tearing down two different
>> kernel based mappings i.e vmemmap and linear while releasing related page
>> table pages allocated for the physical memory range to be removed.
>>
>> Define a common kernel page table tear down helper remove_pagetable() which
>> can be used to unmap given kernel virtual address range. In effect it can
>> tear down both vmemap or kernel linear mappings. This new helper is called
>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>> The argument 'direct' here identifies kernel linear mappings.
>>
>> Vmemmap mappings page table pages are allocated through sparse mem helper
>> functions like vmemmap_alloc_block() which does not cycle the pages through
>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>> destructor construct pgtable_page_dtor().
>>
>> While here update arch_add_mempory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture.
> 
> A bit of a nit, but since this depends on at least patch #4 to work properly, it would be good to reorder the series appropriately.

Sure, will move the generic changes forward.

>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>   arch/arm64/Kconfig               |   3 +
>>   arch/arm64/include/asm/pgtable.h |  14 +++
>>   arch/arm64/mm/mmu.c              | 227 ++++++++++++++++++++++++++++++++++++++-
>>   3 files changed, 241 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index a2418fb..db3e625 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -266,6 +266,9 @@ config HAVE_GENERIC_GUP
>>   config ARCH_ENABLE_MEMORY_HOTPLUG
>>       def_bool y
>>   +config ARCH_ENABLE_MEMORY_HOTREMOVE
>> +    def_bool y
>> +
>>   config ARCH_MEMORY_PROBE
>>       bool "Enable /sys/devices/system/memory/probe interface"
>>       depends on MEMORY_HOTPLUG
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index de70c1e..858098e 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -355,6 +355,18 @@ static inline int pmd_protnone(pmd_t pmd)
>>   }
>>   #endif
>>   +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +#define pmd_large(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
>> +#else
>> +#define pmd_large(pmd) 0
>> +#endif
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>> +#define pud_large(pud)    (pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT))
>> +#else
>> +#define pud_large(pmd) 0
>> +#endif
> 
> These seem rather different from the versions that Steve is proposing in the generic pagewalk series - can you reach an agreement on which implementation is preferred?

Sure will take a look.

> 
>> +
>>   /*
>>    * THP definitions.
>>    */
>> @@ -555,6 +567,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
>>     #else
>>   +#define pmd_index(addr) 0
>>   #define pud_page_paddr(pud)    ({ BUILD_BUG(); 0; })
>>     /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
>> @@ -612,6 +625,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
>>     #else
>>   +#define pud_index(adrr)    0
>>   #define pgd_page_paddr(pgd)    ({ BUILD_BUG(); 0;})
>>     /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index e97f018..ae0777b 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -714,6 +714,198 @@ int kern_addr_valid(unsigned long addr)
>>         return pfn_valid(pte_pfn(pte));
>>   }
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pagetable(struct page *page, int order)
> 
> Do these need to be __meminit? AFAICS it's effectively redundant with the containing #ifdef, and removal feels like it's inherently a later-than-init thing anyway.

I was a bit confused here, but even x86 does exactly the same. All these functions
are still labeled __meminit and all wrapped under CONFIG_MEMORY_HOTPLUG. Is there
any concern with having __meminit here?

> 
>> +{
>> +    unsigned long magic;
>> +    unsigned int nr_pages = 1 << order;
>> +
>> +    if (PageReserved(page)) {
>> +        __ClearPageReserved(page);
>> +
>> +        magic = (unsigned long)page->freelist;
>> +        if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
>> +            while (nr_pages--)
>> +                put_page_bootmem(page++);
>> +        } else
>> +            while (nr_pages--)
>> +                free_reserved_page(page++);
>> +    } else
>> +        free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
>> +{
>> +    pte_t *pte;
>> +    int i;
>> +
>> +    for (i = 0; i < PTRS_PER_PTE; i++) {
>> +        pte = pte_start + i;
>> +        if (!pte_none(*pte))
>> +            return;
>> +    }
>> +
>> +    if (direct)
>> +        pgtable_page_dtor(pmd_page(*pmd));
>> +    free_pagetable(pmd_page(*pmd), 0);
>> +    spin_lock(&init_mm.page_table_lock);
>> +    pmd_clear(pmd);
>> +    spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#else
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
>> +{
>> +}
>> +#endif
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
>> +{
>> +    pmd_t *pmd;
>> +    int i;
>> +
>> +    for (i = 0; i < PTRS_PER_PMD; i++) {
>> +        pmd = pmd_start + i;
>> +        if (!pmd_none(*pmd))
>> +            return;
>> +    }
>> +
>> +    if (direct)
>> +        pgtable_page_dtor(pud_page(*pud));
>> +    free_pagetable(pud_page(*pud), 0);
>> +    spin_lock(&init_mm.page_table_lock);
>> +    pud_clear(pud);
>> +    spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
>> +{
>> +    pud_t *pud;
>> +    int i;
>> +
>> +    for (i = 0; i < PTRS_PER_PUD; i++) {
>> +        pud = pud_start + i;
>> +        if (!pud_none(*pud))
>> +            return;
>> +    }
>> +
>> +    if (direct)
>> +        pgtable_page_dtor(pgd_page(*pgd));
>> +    free_pagetable(pgd_page(*pgd), 0);
>> +    spin_lock(&init_mm.page_table_lock);
>> +    pgd_clear(pgd);
>> +    spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#else
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
>> +{
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
>> +{
>> +}
>> +#endif
>> +
>> +static void __meminit
>> +remove_pte_table(pte_t *pte_start, unsigned long addr,
>> +            unsigned long end, bool direct)
>> +{
>> +    pte_t *pte;
>> +
>> +    pte = pte_start + pte_index(addr);
>> +    for (; addr < end; addr += PAGE_SIZE, pte++) {
>> +        if (!pte_present(*pte))
>> +            continue;
>> +
>> +        if (!direct)
>> +            free_pagetable(pte_page(*pte), 0);
>> +        spin_lock(&init_mm.page_table_lock);
>> +        pte_clear(&init_mm, addr, pte);
>> +        spin_unlock(&init_mm.page_table_lock);
>> +    }
>> +}
>> +
>> +static void __meminit
>> +remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
>> +            unsigned long end, bool direct)
>> +{
>> +    unsigned long next;
>> +    pte_t *pte_base;
>> +    pmd_t *pmd;
>> +
>> +    pmd = pmd_start + pmd_index(addr);
>> +    for (; addr < end; addr = next, pmd++) {
>> +        next = pmd_addr_end(addr, end);
>> +        if (!pmd_present(*pmd))
>> +            continue;
>> +
>> +        if (pmd_large(*pmd)) {
>> +            if (!direct)
>> +                free_pagetable(pmd_page(*pmd),
>> +                        get_order(PMD_SIZE));
>> +            spin_lock(&init_mm.page_table_lock);
>> +            pmd_clear(pmd);
>> +            spin_unlock(&init_mm.page_table_lock);
>> +            continue;
>> +        }
>> +        pte_base = pte_offset_kernel(pmd, 0UL);
>> +        remove_pte_table(pte_base, addr, next, direct);
>> +        free_pte_table(pte_base, pmd, direct);
>> +    }
>> +}
>> +
>> +static void __meminit
>> +remove_pud_table(pud_t *pud_start, unsigned long addr,
>> +            unsigned long end, bool direct)
>> +{
>> +    unsigned long next;
>> +    pmd_t *pmd_base;
>> +    pud_t *pud;
>> +
>> +    pud = pud_start + pud_index(addr);
>> +    for (; addr < end; addr = next, pud++) {
>> +        next = pud_addr_end(addr, end);
>> +        if (!pud_present(*pud))
>> +            continue;
>> +
>> +        if (pud_large(*pud)) {
>> +            if (!direct)
>> +                free_pagetable(pud_page(*pud),
>> +                        get_order(PUD_SIZE));
>> +            spin_lock(&init_mm.page_table_lock);
>> +            pud_clear(pud);
>> +            spin_unlock(&init_mm.page_table_lock);
>> +            continue;
>> +        }
>> +        pmd_base = pmd_offset(pud, 0UL);
>> +        remove_pmd_table(pmd_base, addr, next, direct);
>> +        free_pmd_table(pmd_base, pud, direct);
>> +    }
>> +}
>> +
>> +static void __meminit
>> +remove_pagetable(unsigned long start, unsigned long end, bool direct)
>> +{
>> +    unsigned long addr, next;
>> +    pud_t *pud_base;
>> +    pgd_t *pgd;
>> +
>> +    for (addr = start; addr < end; addr = next) {
>> +        next = pgd_addr_end(addr, end);
>> +        pgd = pgd_offset_k(addr);
>> +        if (!pgd_present(*pgd))
>> +            continue;
>> +
>> +        pud_base = pud_offset(pgd, 0UL);
>> +        remove_pud_table(pud_base, addr, next, direct);
>> +        free_pud_table(pud_base, pgd, direct);
>> +    }
>> +    flush_tlb_kernel_range(start, end);
>> +}
>> +#endif
>> +
>>   #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>   #if !ARM64_SWAPPER_USES_SECTION_MAPS
>>   int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>> @@ -758,9 +950,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>>       return 0;
>>   }
>>   #endif    /* CONFIG_ARM64_64K_PAGES */
>> -void vmemmap_free(unsigned long start, unsigned long end,
>> +void __ref vmemmap_free(unsigned long start, unsigned long end,
> 
> Why is the __ref needed? Presumably it's avoidable by addressing the __meminit thing above.

Right.

> 
>>           struct vmem_altmap *altmap)
>>   {
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +    remove_pagetable(start, end, false);
>> +#endif
>>   }
>>   #endif    /* CONFIG_SPARSEMEM_VMEMMAP */
>>   @@ -1046,10 +1241,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>>   }
>>     #ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
>> +{
>> +    WARN_ON(pgdir != init_mm.pgd);
>> +    remove_pagetable(start, start + size, true);
>> +}
>> +
>>   int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>>               bool want_memblock)
>>   {
>> -    int flags = 0;
>> +    int flags = 0, ret = 0;
> 
> Initialising ret here is unnecessary.

Sure. Will change.
Anshuman Khandual April 4, 2019, 6:51 a.m. UTC | #6
On 04/03/2019 06:45 PM, Steven Price wrote:
> On 03/04/2019 13:37, Robin Murphy wrote:
>> [ +Steve ]
>>
>> Hi Anshuman,

Hi Steve,

>>
>> On 03/04/2019 05:30, Anshuman Khandual wrote:
> 
> <snip>
> 
>>> diff --git a/arch/arm64/include/asm/pgtable.h
>>> b/arch/arm64/include/asm/pgtable.h
>>> index de70c1e..858098e 100644
>>> --- a/arch/arm64/include/asm/pgtable.h
>>> +++ b/arch/arm64/include/asm/pgtable.h
>>> @@ -355,6 +355,18 @@ static inline int pmd_protnone(pmd_t pmd)
>>>   }
>>>   #endif
>>>   +#if (CONFIG_PGTABLE_LEVELS > 2)
>>> +#define pmd_large(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) &
>>> PMD_TABLE_BIT))
>>> +#else
>>> +#define pmd_large(pmd) 0
>>> +#endif
>>> +
>>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>>> +#define pud_large(pud)    (pud_val(pud) && !(pud_val(pud) &
>>> PUD_TABLE_BIT))
>>> +#else
>>> +#define pud_large(pmd) 0
>>> +#endif
>>
>> These seem rather different from the versions that Steve is proposing in
>> the generic pagewalk series - can you reach an agreement on which
>> implementation is preferred?
> 
> Indeed this doesn't match the version in my series although is quite
> similar.
> 
> My desire is that p?d_large represents the hardware architectural
> definition of large page/huge page/section (pick your naming). Although
> now I look more closely this is actually broken in my series (I'll fix
> that up and send a new version shortly) - p?d_sect() is similarly
> conditional.
> 
> Is there a good reason not to use the existing p?d_sect() macros
> available on arm64?

Nothing specific. I just tried using pud|pmd_sect(), which looks good on
multiple configs for 4K/16K/64K. Will migrate pmd|pud_large() to the more
arch-specific pmd|pud_sect(), which would also help in staying clear of your
series.

> 
> I'm also surprised by the CONFIG_PGTABLE_LEVEL conditions as they don't
> match the existing conditions for p?d_sect(). Might be worth double
> checking it actually does what you expect.

Right, they are a bit different. Surely will check. But if pmd|pud_sect() works
out okay, will probably go with it as it's been there for some time.
Anshuman Khandual April 4, 2019, 7:07 a.m. UTC | #7
On 04/03/2019 11:02 PM, Logan Gunthorpe wrote:
> 
> 
> On 2019-04-02 10:30 p.m., Anshuman Khandual wrote:
>> Memory removal from an arch perspective involves tearing down two different
>> kernel based mappings i.e vmemmap and linear while releasing related page
>> table pages allocated for the physical memory range to be removed.
>>
>> Define a common kernel page table tear down helper remove_pagetable() which
>> can be used to unmap given kernel virtual address range. In effect it can
>> tear down both vmemap or kernel linear mappings. This new helper is called
>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>> The argument 'direct' here identifies kernel linear mappings.
>>
>> Vmemmap mappings page table pages are allocated through sparse mem helper
>> functions like vmemmap_alloc_block() which does not cycle the pages through
>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>> destructor construct pgtable_page_dtor().
>>
>> While here update arch_add_mempory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture.
> 
> I've been working on very similar things for RISC-V. In fact, I'm
> currently in progress on a very similar stripped down version of
> remove_pagetable(). (Though I'm fairly certain I've done a bunch of
> stuff wrong.)
> 
> Would it be possible to move this work into common code that can be used
> by all arches? Seems like, to start, we should be able to support both
> arm64 and RISC-V... and maybe even x86 too.
> 
> I'd be happy to help integrate and test such functions in RISC-V.

Sure, that will be great. The only impediment is pgtable_page_ctor() for the
kernel linear mapping. This series is based on current arm64, where linear
mapping pgtable pages go through the pgtable_page_ctor() init sequence, but
that might be changing soon. If RISC-V does not have pgtable_page_ctor() init
for the linear mapping and no other arch specific stuff comes up later, we can
try to consolidate remove_pagetable() at least for both architectures.

Then I am wondering whether I can transition pud|pmd_large() to pud|pmd_sect().
Anshuman Khandual April 4, 2019, 8:23 a.m. UTC | #8
On 04/03/2019 11:27 PM, Robin Murphy wrote:
> On 03/04/2019 18:32, Logan Gunthorpe wrote:
>>
>>
>> On 2019-04-02 10:30 p.m., Anshuman Khandual wrote:
>>> Memory removal from an arch perspective involves tearing down two different
>>> kernel based mappings i.e vmemmap and linear while releasing related page
>>> table pages allocated for the physical memory range to be removed.
>>>
>>> Define a common kernel page table tear down helper remove_pagetable() which
>>> can be used to unmap given kernel virtual address range. In effect it can
>>> tear down both vmemap or kernel linear mappings. This new helper is called
>>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>>> The argument 'direct' here identifies kernel linear mappings.
>>>
>>> Vmemmap mappings page table pages are allocated through sparse mem helper
>>> functions like vmemmap_alloc_block() which does not cycle the pages through
>>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>>> destructor construct pgtable_page_dtor().
>>>
>>> While here update arch_add_mempory() to handle __add_pages() failures by
>>> just unmapping recently added kernel linear mapping. Now enable memory hot
>>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>>
>>> This implementation is overall inspired from kernel page table tear down
>>> procedure on X86 architecture.
>>
>> I've been working on very similar things for RISC-V. In fact, I'm
>> currently in progress on a very similar stripped down version of
>> remove_pagetable(). (Though I'm fairly certain I've done a bunch of
>> stuff wrong.)
>>
>> Would it be possible to move this work into common code that can be used
>> by all arches? Seems like, to start, we should be able to support both
>> arm64 and RISC-V... and maybe even x86 too.
>>
>> I'd be happy to help integrate and test such functions in RISC-V.

I am more inclined towards consolidating remove_pagetable() across platforms
like arm64 and RISC-V (and probably others). But there are clear distinctions
between the user page table and kernel page table tear down processes.

> 
> Indeed, I had hoped we might be able to piggyback off generic code for this anyway,
> given that we have generic pagetable code which knows how to free process pagetables,
> and kernel pagetables are also pagetables.

But there are differences. To list some:

* Freeing mapped and pagetable pages

	- Memory hot remove deals with both vmemmap and linear mappings
	- Selectively call pgtable_page_dtor() for linear mappings (arch specific)
	- Not actually freeing PTE|PMD|PUD mapped pages for linear mappings
	- Freeing mapped pages for vmemmap mappings

* TLB shootdown

	- User page table process uses mmu_gather mechanism for TLB flush
	- Kernel page table tear down can do with fewer TLB flush invocations
		- Don't have to care about flush deferral etc

* THP and HugeTLB

	- Kernel page table tear down procedure does not have to understand THP or HugeTLB
	- Though it has to understand possible arch specific special block mappings

		- Specific kernel linear mappings on arm64
			- PUD|PMD|CONT_PMD|CONT_PTE large page mappings

		- Specific vmemmap mappings on arm64
			- PMD large or PTE mappings

	- User page table tear down procedure needs to understand THP and HugeTLB

* Page table locking

	- Kernel procedure locks init_mm.page_table_lock while clearing an individual entry
	- Kernel procedure does not have to worry about mmap_sem

* ZONE_DEVICE struct vmem_altmap

	- Kernel page table tear down procedure needs to accommodate 'struct vmem_altmap' when
	vmemmap mappings are created with pages allocated from 'struct vmem_altmap' (ZONE_DEVICE)
	rather than buddy allocator or memblock.
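
For instance, the first distinction above shows up directly at the leaf level
of the patch: only the non-direct (vmemmap) path frees the mapped page itself,
while both paths clear the entry under init_mm.page_table_lock:

	if (!direct)
		free_pagetable(pte_page(*pte), 0);
	spin_lock(&init_mm.page_table_lock);
	pte_clear(&init_mm, addr, pte);
	spin_unlock(&init_mm.page_table_lock);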
Steven Price April 4, 2019, 9:16 a.m. UTC | #9
On 04/04/2019 08:07, Anshuman Khandual wrote:
> 
> 
> On 04/03/2019 11:02 PM, Logan Gunthorpe wrote:
>>
>>
>> On 2019-04-02 10:30 p.m., Anshuman Khandual wrote:
>>> Memory removal from an arch perspective involves tearing down two different
>>> kernel based mappings i.e vmemmap and linear while releasing related page
>>> table pages allocated for the physical memory range to be removed.
>>>
>>> Define a common kernel page table tear down helper remove_pagetable() which
>>> can be used to unmap given kernel virtual address range. In effect it can
>>> tear down both vmemap or kernel linear mappings. This new helper is called
>>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>>> The argument 'direct' here identifies kernel linear mappings.
>>>
>>> Vmemmap mappings page table pages are allocated through sparse mem helper
>>> functions like vmemmap_alloc_block() which does not cycle the pages through
>>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>>> destructor construct pgtable_page_dtor().
>>>
>>> While here update arch_add_mempory() to handle __add_pages() failures by
>>> just unmapping recently added kernel linear mapping. Now enable memory hot
>>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>>
>>> This implementation is overall inspired from kernel page table tear down
>>> procedure on X86 architecture.
>>
>> I've been working on very similar things for RISC-V. In fact, I'm
>> currently in progress on a very similar stripped down version of
>> remove_pagetable(). (Though I'm fairly certain I've done a bunch of
>> stuff wrong.)
>>
>> Would it be possible to move this work into common code that can be used
>> by all arches? Seems like, to start, we should be able to support both
>> arm64 and RISC-V... and maybe even x86 too.
>>
>> I'd be happy to help integrate and test such functions in RISC-V.
> 
> Sure that will be great. The only impediment is pgtable_page_ctor() for kernel
> linear mapping. This series is based on current arm64 where linear mapping
> pgtable pages go through pgtable_page_ctor() init sequence but that might be
> changing soon. If RISC-V does not have pgtable_page_ctor() init for linear
> mapping and no other arch specific stuff later on we can try to consolidate
> remove_pagetable() atleast for both the architectures.
> 
> Then I wondering whether I can transition pud|pmd_large() to pud|pmd_sect().

The first 10 patches of my generic page walk series[1] add p?d_large()
as a common feature, so probably best sticking with p?d_large() if this
is going to be common, and basing on top of those patches.

[1]
https://lore.kernel.org/lkml/20190403141627.11664-1-steven.price@arm.com/T/

Steve
Oscar Salvador April 4, 2019, 11:58 a.m. UTC | #10
On Thu, Apr 04, 2019 at 11:09:22AM +0530, Anshuman Khandual wrote:
> > Do these need to be __meminit? AFAICS it's effectively redundant with the containing #ifdef, and removal feels like it's inherently a later-than-init thing anyway.
> 
> I was confused here a bit but even X86 does exactly the same. All these functions
> are still labeled __meminit and all wrapped under CONFIG_MEMORY_HOTPLUG. Is there
> any concern to have __meminit here ?

We do not really need it as long as the code is within #ifdef CONFIG_MEMORY_HOTPLUG.
__meminit is used when functions that are going to be needed for hotplug have
to stay around.

/* Used for MEMORY_HOTPLUG */
#define __meminit        __section(.meminit.text) __cold notrace \
                                                  __latent_entropy

#if defined(CONFIG_MEMORY_HOTPLUG)
#define MEM_KEEP(sec)    *(.mem##sec)
#define MEM_DISCARD(sec)
#else
#define MEM_KEEP(sec)
#define MEM_DISCARD(sec) *(.mem##sec)
#endif

So it is kind of redundant to have both.
I will clean it up when reposting [1] and [2].

[1] https://patchwork.kernel.org/patch/10875019/
[2] https://patchwork.kernel.org/patch/10875021/
Anshuman Khandual April 4, 2019, 1:03 p.m. UTC | #11
On 04/04/2019 05:28 PM, Oscar Salvador wrote:
> On Thu, Apr 04, 2019 at 11:09:22AM +0530, Anshuman Khandual wrote:
>>> Do these need to be __meminit? AFAICS it's effectively redundant with the containing #ifdef, and removal feels like it's inherently a later-than-init thing anyway.
>>
>> I was confused here a bit but even X86 does exactly the same. All these functions
>> are still labeled __meminit and all wrapped under CONFIG_MEMORY_HOTPLUG. Is there
>> any concern to have __meminit here ?
> 
> We do not really need it as long as the code is within #ifdef CONFIG_MEMORY_HOTPLUG.
> __meminit is being used when functions that are going to be need for hotplug need
> to stay around.

Makes sense.

> 
> /* Used for MEMORY_HOTPLUG */
> #define __meminit        __section(.meminit.text) __cold notrace \
>                                                   __latent_entropy
> 
> #if defined(CONFIG_MEMORY_HOTPLUG)
> #define MEM_KEEP(sec)    *(.mem##sec)
> #define MEM_DISCARD(sec)
> #else
> #define MEM_KEEP(sec)
> #define MEM_DISCARD(sec) *(.mem##sec)
> #endif
> 
> So it is kind of redundant to have both.
> I will clean it up when reposting [1] and [2].
> 
> [1] https://patchwork.kernel.org/patch/10875019/
> [2] https://patchwork.kernel.org/patch/10875021/
> 

Sure. Will remove them from the proposed functions next time around.
Oscar Salvador April 4, 2019, 3:19 p.m. UTC | #12
On Thu, Apr 04, 2019 at 06:33:09PM +0530, Anshuman Khandual wrote:
> Sure. Will remove them from the proposed functions next time around.

Just need to make sure that the function does not call, directly or indirectly,
another __meminit function; then it is safe to remove it.

E.g.:

sparse_add_one_section() is wrapped around CONFIG_MEMORY_HOTPLUG, but the
__meminit must stay because it calls sparse_index_init(),
sparse_init_one_section() and sparse_mem_map_populate(), all three marked as
__meminit because they are also used out of hotplug scope, during early boot.
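
A minimal hypothetical illustration of the rule (the function names are made
up, only the section placement matters):

#ifdef CONFIG_MEMORY_HOTPLUG
/* Hypothetical __meminit callee, standing in for e.g. the sparse helpers. */
static int __meminit some_meminit_callee(void)
{
	return 0;
}

/* Inside this #ifdef the code only exists when hotplug is enabled, so
 * marking it __meminit buys nothing: plain .text is fine. */
static int remove_helper(void)
{
	return 0;
}

/* This one must keep __meminit (or gain __ref), because it calls into a
 * __meminit function and modpost would otherwise warn about a section
 * mismatch - the same reason vmemmap_free() grew a __ref in this patch. */
static int __meminit add_helper(void)
{
	return some_meminit_callee();
}
#endif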

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2418fb..db3e625 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -266,6 +266,9 @@  config HAVE_GENERIC_GUP
 config ARCH_ENABLE_MEMORY_HOTPLUG
 	def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+	def_bool y
+
 config ARCH_MEMORY_PROBE
 	bool "Enable /sys/devices/system/memory/probe interface"
 	depends on MEMORY_HOTPLUG
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index de70c1e..858098e 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -355,6 +355,18 @@  static inline int pmd_protnone(pmd_t pmd)
 }
 #endif
 
+#if (CONFIG_PGTABLE_LEVELS > 2)
+#define pmd_large(pmd)	(pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
+#else
+#define pmd_large(pmd) 0
+#endif
+
+#if (CONFIG_PGTABLE_LEVELS > 3)
+#define pud_large(pud)	(pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT))
+#else
+#define pud_large(pud) 0
+#endif
+
 /*
  * THP definitions.
  */
@@ -555,6 +567,7 @@  static inline phys_addr_t pud_page_paddr(pud_t pud)
 
 #else
 
+#define pmd_index(addr) 0
 #define pud_page_paddr(pud)	({ BUILD_BUG(); 0; })
 
 /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
@@ -612,6 +625,7 @@  static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 
 #else
 
+#define pud_index(addr)	0
 #define pgd_page_paddr(pgd)	({ BUILD_BUG(); 0;})
 
 /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index e97f018..ae0777b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -714,6 +714,198 @@  int kern_addr_valid(unsigned long addr)
 
 	return pfn_valid(pte_pfn(pte));
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pagetable(struct page *page, int order)
+{
+	unsigned long magic;
+	unsigned int nr_pages = 1 << order;
+
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+
+		magic = (unsigned long)page->freelist;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else
+			while (nr_pages--)
+				free_reserved_page(page++);
+	} else
+		free_pages((unsigned long)page_address(page), order);
+}
+
+#if (CONFIG_PGTABLE_LEVELS > 2)
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	if (direct)
+		pgtable_page_dtor(pmd_page(*pmd));
+	free_pagetable(pmd_page(*pmd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+#else
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
+{
+}
+#endif
+
+#if (CONFIG_PGTABLE_LEVELS > 3)
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	if (direct)
+		pgtable_page_dtor(pud_page(*pud));
+	free_pagetable(pud_page(*pud), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	if (direct)
+		pgtable_page_dtor(pgd_page(*pgd));
+	free_pagetable(pgd_page(*pgd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+#else
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
+{
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
+{
+}
+#endif
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr,
+			unsigned long end, bool direct)
+{
+	pte_t *pte;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr += PAGE_SIZE, pte++) {
+		if (!pte_present(*pte))
+			continue;
+
+		if (!direct)
+			free_pagetable(pte_page(*pte), 0);
+		spin_lock(&init_mm.page_table_lock);
+		pte_clear(&init_mm, addr, pte);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+			unsigned long end, bool direct)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmd;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_large(*pmd)) {
+			if (!direct)
+				free_pagetable(pmd_page(*pmd),
+						get_order(PMD_SIZE));
+			spin_lock(&init_mm.page_table_lock);
+			pmd_clear(pmd);
+			spin_unlock(&init_mm.page_table_lock);
+			continue;
+		}
+		pte_base = pte_offset_kernel(pmd, 0UL);
+		remove_pte_table(pte_base, addr, next, direct);
+		free_pte_table(pte_base, pmd, direct);
+	}
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr,
+			unsigned long end, bool direct)
+{
+	unsigned long next;
+	pmd_t *pmd_base;
+	pud_t *pud;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_large(*pud)) {
+			if (!direct)
+				free_pagetable(pud_page(*pud),
+						get_order(PUD_SIZE));
+			spin_lock(&init_mm.page_table_lock);
+			pud_clear(pud);
+			spin_unlock(&init_mm.page_table_lock);
+			continue;
+		}
+		pmd_base = pmd_offset(pud, 0UL);
+		remove_pmd_table(pmd_base, addr, next, direct);
+		free_pmd_table(pmd_base, pud, direct);
+	}
+}
+
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long addr, next;
+	pud_t *pud_base;
+	pgd_t *pgd;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset_k(addr);
+		if (!pgd_present(*pgd))
+			continue;
+
+		pud_base = pud_offset(pgd, 0UL);
+		remove_pud_table(pud_base, addr, next, direct);
+		free_pud_table(pud_base, pgd, direct);
+	}
+	flush_tlb_kernel_range(start, end);
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -758,9 +950,12 @@  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	return 0;
 }
 #endif	/* CONFIG_ARM64_64K_PAGES */
-void vmemmap_free(unsigned long start, unsigned long end,
+void __ref vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	remove_pagetable(start, end, false);
+#endif
 }
 #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -1046,10 +1241,16 @@  int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+	WARN_ON(pgdir != init_mm.pgd);
+	remove_pagetable(start, start + size, true);
+}
+
 int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 		    bool want_memblock)
 {
-	int flags = 0;
+	int flags = 0, ret = 0;
 
 	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1057,7 +1258,27 @@  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
 			     size, PAGE_KERNEL, pgd_pgtable_alloc, flags);
 
-	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   altmap, want_memblock);
+	if (ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+					__phys_to_virt(start), size);
+	return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone = page_zone(pfn_to_page(start_pfn));
+	int ret;
+
+	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+	if (!ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+					__phys_to_virt(start), size);
+	return ret;
+}
+#endif
 #endif