[V2,2/2] arm64/mm: Enable memory hot remove

Message ID 1555221553-18845-3-git-send-email-anshuman.khandual@arm.com (mailing list archive)
State New, archived
Series arm64/mm: Enable memory hot remove

Commit Message

Anshuman Khandual April 14, 2019, 5:59 a.m. UTC
Memory removal from an arch perspective involves tearing down two different
kernel mappings, i.e. vmemmap and linear, while releasing the related page
table pages allocated for the physical memory range being removed.

Define a common kernel page table tear down helper remove_pagetable() which
can be used to unmap a given kernel virtual address range. In effect it can
tear down both vmemmap and kernel linear mappings. This new helper is called
from both vmemmap_free() and __remove_pgd_mapping() during memory removal.
The argument 'direct' here identifies kernel linear mappings.

Page table pages for vmemmap mappings are allocated through sparse mem helper
functions like vmemmap_alloc_block() which do not cycle the pages through
the pgtable_page_ctor() construct. Hence the removal path skips the
corresponding destructor, pgtable_page_dtor().

While here, update arch_add_memory() to handle __add_pages() failures by
just unmapping the recently added kernel linear mapping. Now enable memory
hot remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.

This implementation is broadly inspired by the kernel page table tear down
procedure on the x86 architecture.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/Kconfig               |   3 +
 arch/arm64/include/asm/pgtable.h |   2 +
 arch/arm64/mm/mmu.c              | 221 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 224 insertions(+), 2 deletions(-)

Comments

Mark Rutland April 15, 2019, 1:48 p.m. UTC | #1
Hi Anshuman,

On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
> Memory removal from an arch perspective involves tearing down two different
> kernel based mappings i.e vmemmap and linear while releasing related page
> table pages allocated for the physical memory range to be removed.
> 
> Define a common kernel page table tear down helper remove_pagetable() which
> can be used to unmap given kernel virtual address range. In effect it can
> tear down both vmemap or kernel linear mappings. This new helper is called
> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> The argument 'direct' here identifies kernel linear mappings.

Can you please explain why we need to treat these differently? I thought
the next paragraph was going to do that, but as per my comment there it
doesn't seem to be relevant. :/

> Vmemmap mappings page table pages are allocated through sparse mem helper
> functions like vmemmap_alloc_block() which does not cycle the pages through
> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
> destructor construct pgtable_page_dtor().

I thought the ctor/dtor dance wasn't necessary for any init_mm tables,
so why do we need to mention it here specifically for the vmemmap
tables?

> While here update arch_add_mempory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. 

Is this a latent bug?

> Now enable memory hot remove on arm64 platforms by default with
> ARCH_ENABLE_MEMORY_HOTREMOVE.
> 
> This implementation is overall inspired from kernel page table tear down
> procedure on X86 architecture.
> 
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>  arch/arm64/Kconfig               |   3 +
>  arch/arm64/include/asm/pgtable.h |   2 +
>  arch/arm64/mm/mmu.c              | 221 ++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 224 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index c383625..a870eb2 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -267,6 +267,9 @@ config HAVE_GENERIC_GUP
>  config ARCH_ENABLE_MEMORY_HOTPLUG
>  	def_bool y
>  
> +config ARCH_ENABLE_MEMORY_HOTREMOVE
> +	def_bool y
> +
>  config SMP
>  	def_bool y
>  
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index de70c1e..1ee22ff 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -555,6 +555,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
>  
>  #else
>  
> +#define pmd_index(addr) 0
>  #define pud_page_paddr(pud)	({ BUILD_BUG(); 0; })
>  
>  /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
> @@ -612,6 +613,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
>  
>  #else
>  
> +#define pud_index(adrr)	0
>  #define pgd_page_paddr(pgd)	({ BUILD_BUG(); 0;})
>  
>  /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index ef82312..a4750fe 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -733,6 +733,194 @@ int kern_addr_valid(unsigned long addr)
>  
>  	return pfn_valid(pte_pfn(pte));
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_pagetable(struct page *page, int order)

On arm64, all of the stage-1 page tables other than the PGD are always
PAGE_SIZE. We shouldn't need to pass an order around in order to free
page tables.

It looks like this function is misnamed, and is used to free vmemmap
backing pages in addition to page tables used to map them. It would be
nicer to come up with a better naming scheme.

> +{
> +	unsigned long magic;
> +	unsigned int nr_pages = 1 << order;
> +
> +	if (PageReserved(page)) {
> +		__ClearPageReserved(page);
> +
> +		magic = (unsigned long)page->freelist;
> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {

Not a new problem, but it's unfortunate that the core code reuses the
page::freelist field for this, given it also uses page::private for the
section number. Using fields from different parts of the union doesn't
seem robust.

It would seem nicer to have a private2 field in the struct for anonymous
pages.

> +			while (nr_pages--)
> +				put_page_bootmem(page++);
> +		} else {
> +			while (nr_pages--)
> +				free_reserved_page(page++);
> +		}
> +	} else {
> +		free_pages((unsigned long)page_address(page), order);
> +	}
> +}
> +
> +#if (CONFIG_PGTABLE_LEVELS > 2)
> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)

As a general note, for arm64 please append a 'p' for pointers to
entries, i.e. these should be ptep and pmdp.

> +{
> +	pte_t *pte;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = pte_start + i;
> +		if (!pte_none(*pte))
> +			return;
> +	}

You could get rid of the pte temporary, rename pte_start to ptep, and
simplify this to:

	for (i = 0; i < PTRS_PER_PTE; i++)
		if (!pte_none(ptep[i]))
			return;

Similar applies at the other levels.

I take it that some higher-level serialization prevents concurrent
modification to this table. Where does that happen?

> +
> +	free_pagetable(pmd_page(*pmd), 0);

Here we free the pte level of table...

> +	spin_lock(&init_mm.page_table_lock);
> +	pmd_clear(pmd);

... but only here do we disconnect it from the PMD level of table, and
we don't do any TLB maintenance just yet. The page could be poisoned
and/or reallocated before we invalidate the TLB, which is not safe. In
all cases, we must follow the sequence:

1) clear the pointer to a table
2) invalidate any corresponding TLB entries
3) free the table page

... or we risk a number of issues resulting from erroneous programming
of the TLBs. See pmd_free_pte_page() for an example of how to do this
correctly.

I'd have thought similar applied to x86, so that implementation looks
suspicious to me too...
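
A minimal sketch of the ordering described above, for freeing an empty pte
table hanging off *pmdp (free_pgtable_page() is an illustrative helper name
echoed later in this thread, not something in the patch as posted):

	struct page *page = pmd_page(*pmdp);	/* backing page of the pte table */

	pmd_clear(pmdp);			/* 1) clear the pointer to the table */
	__flush_tlb_kernel_pgtable(addr);	/* 2) invalidate cached walks/TLB entries */
	free_pgtable_page(page);		/* 3) only now free the table page */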

> +	spin_unlock(&init_mm.page_table_lock);

What precisely is the page_table_lock intended to protect? 

It seems odd to me that we're happy to walk the tables without the lock,
but only grab the lock when performing a modification. That implies we
either have some higher-level mutual exclusion, or we're not holding the
lock in all cases we need to be.

> +}
> +#else
> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +}
> +#endif

I'm surprised that we never need to free a pte table for 2 level paging.
Is that definitely the case?

> +
> +#if (CONFIG_PGTABLE_LEVELS > 3)
> +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +	pmd_t *pmd;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		pmd = pmd_start + i;
> +		if (!pmd_none(*pmd))
> +			return;
> +	}
> +
> +	free_pagetable(pud_page(*pud), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pud_clear(pud);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
> +{
> +	pud_t *pud;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		pud = pud_start + i;
> +		if (!pud_none(*pud))
> +			return;
> +	}
> +
> +	free_pagetable(pgd_page(*pgd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pgd_clear(pgd);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +#else
> +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +}
> +
> +static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
> +{
> +}
> +#endif

It seems very odd to me that we suddenly need both of these, rather than
requiring one before the other. Naively, I'd have expected that we'd
need:

- free_pte_table for CONFIG_PGTABLE_LEVELS > 1 (i.e. always)
- free_pmd_table for CONFIG_PGTABLE_LEVELS > 2
- free_pud_table for CONFIG_PGTABLE_LEVELS > 3

... matching the cases where the levels "really" exist. What am I
missing that ties the pmd and pud levels together?

> +static void
> +remove_pte_table(pte_t *pte_start, unsigned long addr,
> +			unsigned long end, bool direct)
> +{
> +	pte_t *pte;
> +
> +	pte = pte_start + pte_index(addr);
> +	for (; addr < end; addr += PAGE_SIZE, pte++) {
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		if (!direct)
> +			free_pagetable(pte_page(*pte), 0);

This is really confusing. Here we're freeing a page of memory backing
the vmemmap, which it _not_ a page table.

At the least, can we please rename "direct" to something like
"free_backing", inverting its polarity?

> +		spin_lock(&init_mm.page_table_lock);
> +		pte_clear(&init_mm, addr, pte);
> +		spin_unlock(&init_mm.page_table_lock);
> +	}
> +}

Rather than explicitly using pte_index(), the usual style for arm64 is
to pass the pmdp in and use pte_offset_kernel() to find the relevant
ptep, e.g.

static void remove_pte_table(pmd_t *pmdp, unsigned long addr,
			     unsigned long end, bool direct)
{
	pte_t *ptep = pte_offset_kernel(pmdp, addr);

	do {
		if (!pte_present(*ptep))
			continue;

		...

	} while (ptep++, addr += PAGE_SIZE, addr != end);
}

... with similar applying at all levels.

Thanks,
Mark.
David Hildenbrand April 15, 2019, 1:55 p.m. UTC | #2
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
> +{
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	unsigned long nr_pages = size >> PAGE_SHIFT;
> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +	int ret;
> +
> +	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
> +	if (!ret)

Please note that I posted patches that remove all error handling
from arch_remove_memory and __remove_pages(). They are already in next/master

So this gets a lot simpler and more predictable.


Author: David Hildenbrand <david@redhat.com>
Date:   Wed Apr 10 11:02:27 2019 +1000

    mm/memory_hotplug: make __remove_pages() and arch_remove_memory() never fail
    
    All callers of arch_remove_memory() ignore errors.  And we should really
    try to remove any errors from the memory removal path.  No more errors are
    reported from __remove_pages().  BUG() in s390x code in case
    arch_remove_memory() is triggered.  We may implement that properly later.
    WARN in case powerpc code failed to remove the section mapping, which is
    better than ignoring the error completely right now.



> +		__remove_pgd_mapping(swapper_pg_dir,
> +					__phys_to_virt(start), size);
> +	return ret;
> +}
> +#endif
>  #endif
>
Anshuman Khandual April 16, 2019, 9:52 a.m. UTC | #3
On 04/15/2019 07:25 PM, David Hildenbrand wrote:
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
>> +{
>> +	unsigned long start_pfn = start >> PAGE_SHIFT;
>> +	unsigned long nr_pages = size >> PAGE_SHIFT;
>> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
>> +	int ret;
>> +
>> +	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
>> +	if (!ret)
> Please note that I posted patches that remove all error handling
> from arch_remove_memory and __remove_pages(). They are already in next/master
> 
> So this gets a lot simpler and more predictable.
> 
> 
> Author: David Hildenbrand <david@redhat.com>
> Date:   Wed Apr 10 11:02:27 2019 +1000
> 
>     mm/memory_hotplug: make __remove_pages() and arch_remove_memory() never fail
>     
>     All callers of arch_remove_memory() ignore errors.  And we should really
>     try to remove any errors from the memory removal path.  No more errors are
>     reported from __remove_pages().  BUG() in s390x code in case
>     arch_remove_memory() is triggered.  We may implement that properly later.
>     WARN in case powerpc code failed to remove the section mapping, which is
>     better than ignoring the error completely right now

Sure will follow suit next time around.
Anshuman Khandual April 17, 2019, 9:58 a.m. UTC | #4
On 04/15/2019 07:18 PM, Mark Rutland wrote:
> Hi Anshuman,
> 
> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
>> Memory removal from an arch perspective involves tearing down two different
>> kernel based mappings i.e vmemmap and linear while releasing related page
>> table pages allocated for the physical memory range to be removed.
>>
>> Define a common kernel page table tear down helper remove_pagetable() which
>> can be used to unmap given kernel virtual address range. In effect it can
>> tear down both vmemap or kernel linear mappings. This new helper is called
>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>> The argument 'direct' here identifies kernel linear mappings.
> 
> Can you please explain why we need to treat these differently? I thought
> the next paragraph was going to do that, but as per my comment there it
> doesn't seem to be relevant. :/

For the linear mapping there is no actual allocated page which is mapped. It's
the pfn derived from the physical address (from the __va(PA)-->PA translation)
which is in the page table entry, and it need not be freed anywhere during tear
down.

But in the case of vmemmap (the struct page mapping for a given range), which is
a real virtual mapping (like vmalloc), real pages are allocated (from buddy or
memblock) and are mapped in its page table entries to effect the translation.
These pages need to be freed while tearing down the translation. But for both
mappings (linear and vmemmap) their page table pages need to be freed.

This differentiation is needed when deciding whether [pte|pmd|pud]_page() at any
given level needs to be freed or not. Will update the commit message with this
explanation if required.
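
To make the distinction concrete, below is a condensed sketch of how the
'direct' flag gates this at the pte level, modeled on the patch's
remove_pte_table() (locking and the TLB maintenance discussed elsewhere in
this thread are omitted):

static void remove_pte_table(pte_t *ptep, unsigned long addr,
			     unsigned long end, bool direct)
{
	for (ptep += pte_index(addr); addr < end; addr += PAGE_SIZE, ptep++) {
		if (!pte_present(*ptep))
			continue;

		/* vmemmap (!direct): free the page backing this part of the vmemmap */
		if (!direct)
			free_pagetable(pte_page(*ptep), 0);

		/* both cases: clear the entry; the linear map has nothing to free */
		pte_clear(&init_mm, addr, ptep);
	}
}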

> 
>> Vmemmap mappings page table pages are allocated through sparse mem helper
>> functions like vmemmap_alloc_block() which does not cycle the pages through
>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>> destructor construct pgtable_page_dtor().
> 
> I thought the ctor/dtor dance wasn't necessary for any init_mm tables,
> so why do we need to mention it here specifically for the vmemmap
> tables?

Yeah not necessary any more. Will drop it.

> 
>> While here update arch_add_mempory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. 
> 
> Is this a latent bug?

Did not get it. __add_pages() could fail because of __add_section(), in which
case we should remove the linear mapping added previously in the first step.
Is there any concern here?

> 
>> Now enable memory hot remove on arm64 platforms by default with
>> ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture.
>>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>  arch/arm64/Kconfig               |   3 +
>>  arch/arm64/include/asm/pgtable.h |   2 +
>>  arch/arm64/mm/mmu.c              | 221 ++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 224 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index c383625..a870eb2 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -267,6 +267,9 @@ config HAVE_GENERIC_GUP
>>  config ARCH_ENABLE_MEMORY_HOTPLUG
>>  	def_bool y
>>  
>> +config ARCH_ENABLE_MEMORY_HOTREMOVE
>> +	def_bool y
>> +
>>  config SMP
>>  	def_bool y
>>  
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index de70c1e..1ee22ff 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -555,6 +555,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
>>  
>>  #else
>>  
>> +#define pmd_index(addr) 0
>>  #define pud_page_paddr(pud)	({ BUILD_BUG(); 0; })
>>  
>>  /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
>> @@ -612,6 +613,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
>>  
>>  #else
>>  
>> +#define pud_index(adrr)	0
>>  #define pgd_page_paddr(pgd)	({ BUILD_BUG(); 0;})
>>  
>>  /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index ef82312..a4750fe 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -733,6 +733,194 @@ int kern_addr_valid(unsigned long addr)
>>  
>>  	return pfn_valid(pte_pfn(pte));
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void free_pagetable(struct page *page, int order)
> 
> On arm64, all of the stage-1 page tables other than the PGD are always
> PAGE_SIZE. We shouldn't need to pass an order around in order to free
> page tables.
> 
> It looks like this function is misnamed, and is used to free vmemmap
> backing pages in addition to page tables used to map them. It would be
> nicer to come up with a better naming scheme.

free_pagetable() is being used both for freeing page table pages as well as
pages mapped at various levels (for vmemmap). As you rightly mentioned, page
table pages are invariably PAGE_SIZE (other than the pgd), but the size of the
mapped pages can vary across levels. free_pagetable() is a very generic helper
which can accommodate pages allocated from buddy as well as memblock. But I
agree that the naming is misleading.

Will something like this be better?

void free_pagetable_mapped_page(struct page *page, int order)
{
	.......................
	.......................
}

void free_pagetable_page(struct page *page)
{
	free_pagetable_mapped_page(page, 0);
}

- Call free_pgtable_page() while freeing pagetable pages
- Call free_pgtable_mapped_page() while freeing mapped pages

> 
>> +{
>> +	unsigned long magic;
>> +	unsigned int nr_pages = 1 << order;
>> +
>> +	if (PageReserved(page)) {
>> +		__ClearPageReserved(page);
>> +
>> +		magic = (unsigned long)page->freelist;
>> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
> 
> Not a new problem, but it's unfortunate that the core code reuses the
> page::freelist field for this, given it also uses page::private for the
> section number. Using fields from different parts of the union doesn't
> seem robust.> 
> It would seem nicer to have a private2 field in the struct for anonymous
> pages.

Okay. But I guess its not something for us to investigate in this context.

> 
>> +			while (nr_pages--)
>> +				put_page_bootmem(page++);
>> +		} else {
>> +			while (nr_pages--)
>> +				free_reserved_page(page++);
>> +		}
>> +	} else {
>> +		free_pages((unsigned long)page_address(page), order);
>> +	}
>> +}
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
> 
> As a general note, for arm64 please append a 'p' for pointers to
> entries, i.e. these should be ptep and pmdp.

Sure will fix across the entire patch.

> 
>> +{
>> +	pte_t *pte;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PTE; i++) {
>> +		pte = pte_start + i;
>> +		if (!pte_none(*pte))
>> +			return;
>> +	}
> 
> You could get rid of the pte temporary, rename pte_start to ptep, and
> simplify this to:
> 
> 	for (i = 0; i < PTRS_PER_PTE; i++)
> 		if (!pte_none(ptep[i]))
> 			return;
> 
> Similar applies at the other levels.

Sure will do.

> 
> I take it that some higher-level serialization prevents concurrent
> modification to this table. Where does that happen?

mem_hotplug_begin()
mem_hotplug_end()

which operate on DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock).

- arch_remove_memory() called from (__remove_memory || devm_memremap_pages_release)
- vmemmap_free() called from __remove_pages(), itself called from (arch_remove_memory || devm_memremap_pages_release)

Both __remove_memory() and devm_memremap_pages_release() are protected with a
pair of these calls.

The vmemmap tear down happens before the linear mapping tear down, and in
sequence.

> 
>> +
>> +	free_pagetable(pmd_page(*pmd), 0);
> 
> Here we free the pte level of table...
> 
>> +	spin_lock(&init_mm.page_table_lock);
>> +	pmd_clear(pmd);
> 
> ... but only here do we disconnect it from the PMD level of table, and
> we don't do any TLB maintenance just yet. The page could be poisoned
> and/or reallocated before we invalidate the TLB, which is not safe. In
> all cases, we must follow the sequence:
> 
> 1) clear the pointer to a table
> 2) invalidate any corresponding TLB entries
> 3) free the table page
> 
> ... or we risk a number of issues resulting from erroneous programming
> of the TLBs. See pmd_free_pte_page() for an example of how to do this
> correctly.

Okay, will send 'addr' into these functions and do something like this
at all levels, as in the case of pmd_free_pte_page():

        page = pud_page(*pudp);
        pud_clear(pudp);
        __flush_tlb_kernel_pgtable(addr);
        free_pgtable_page(page);


> 
> I'd have thought similar applied to x86, so that implementation looks
> suspicious to me too...
> 
>> +	spin_unlock(&init_mm.page_table_lock);
> 
> What precisely is the page_table_lock intended to protect?

Concurrent modification to kernel page table (init_mm) while clearing entries.

> 
> It seems odd to me that we're happy to walk the tables without the lock,
> but only grab the lock when performing a modification. That implies we
> either have some higher-level mutual exclusion, or we're not holding the
> lock in all cases we need to be.

On arm64

- the linear mapping is half the kernel virtual range (unlikely to share a PGD with anything else)
- vmemmap and vmalloc might or might not be aligned properly to avoid PGD/PUD/PMD overlap
- this kernel virtual space layout is not fixed and can change in the future

Hence, just to be on the safer side, let's take init_mm.page_table_lock for the entire tear
down process in remove_pagetable(). put_page_bootmem/free_reserved_page/free_pages should
not block for long periods, unlike allocation paths. Hence it should be safe with an overall
spin lock on init_mm.page_table_lock, unless there are some other concerns.
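
At the call site, the coarse-grained alternative proposed here would look
something like the following (remove_pagetable()'s argument list is taken
from the commit message and is approximate):

	spin_lock(&init_mm.page_table_lock);
	remove_pagetable(start, end, direct);	/* whole tear down under the init_mm ptl */
	spin_unlock(&init_mm.page_table_lock);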

> 
>> +}
>> +#else
>> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> +}
>> +#endif
> 
> I'm surprised that we never need to free a pte table for 2 level paging.
> Is that definitely the case?

Will fix it.

> 
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>> +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> +	pmd_t *pmd;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PMD; i++) {
>> +		pmd = pmd_start + i;
>> +		if (!pmd_none(*pmd))
>> +			return;
>> +	}
>> +
>> +	free_pagetable(pud_page(*pud), 0);
>> +	spin_lock(&init_mm.page_table_lock);
>> +	pud_clear(pud);
>> +	spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
>> +{
>> +	pud_t *pud;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PUD; i++) {
>> +		pud = pud_start + i;
>> +		if (!pud_none(*pud))
>> +			return;
>> +	}
>> +
>> +	free_pagetable(pgd_page(*pgd), 0);
>> +	spin_lock(&init_mm.page_table_lock);
>> +	pgd_clear(pgd);
>> +	spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#else
>> +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> +}
>> +
>> +static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
>> +{
>> +}
>> +#endif
> 
> It seems very odd to me that we suddenly need both of these, rather than
> requiring one before the other. Naively, I'd have expected that we'd
> need:
> 
> - free_pte_table for CONFIG_PGTABLE_LEVELS > 1 (i.e. always)
> - free_pmd_table for CONFIG_PGTABLE_LEVELS > 2
> - free_pud_table for CONFIG_PGTABLE_LEVELS > 3
> 
> ... matching the cases where the levels "really" exist. What am I
> missing that ties the pmd and pud levels together?

Might have got something wrong here. Will fix it.

> 
>> +static void
>> +remove_pte_table(pte_t *pte_start, unsigned long addr,
>> +			unsigned long end, bool direct)
>> +{
>> +	pte_t *pte;
>> +
>> +	pte = pte_start + pte_index(addr);
>> +	for (; addr < end; addr += PAGE_SIZE, pte++) {
>> +		if (!pte_present(*pte))
>> +			continue;
>> +
>> +		if (!direct)
>> +			free_pagetable(pte_page(*pte), 0);
> 
> This is really confusing. Here we're freeing a page of memory backing
> the vmemmap, which it _not_ a page table.

Right. The new naming scheme proposed before should take care of this.

> 
> At the least, can we please rename "direct" to something like
> "free_backing", inverting its polarity?

Will use 'sparse_vmap' instead and invert the polarity.

> 
>> +		spin_lock(&init_mm.page_table_lock);
>> +		pte_clear(&init_mm, addr, pte);
>> +		spin_unlock(&init_mm.page_table_lock);
>> +	}
>> +}
> 
> Rather than explicitly using pte_index(), the usual style for arm64 is
> to pass the pmdp in and use pte_offset_kernel() to find the relevant
> ptep, e.g.
> 
> static void remove pte_table(pmd_t *pmdp, unsigned long addr,
> 			     unsigned long end, bool direct)
> {
> 	pte_t *ptep = pte_offset_kernel(pmdp, addr);
> 
> 	do {
> 		if (!pte_present(*ptep)
> 			continue;
> 
> 		...
> 
> 	} while (ptep++, addr += PAGE_SIZE, addr != end);

Will probably stick with 'next = [pgd|pud|pmd]_addr_end(addr, end)', which is
used for the iteration as well as for calling the next level function.

> }
> 
> ... with similar applying at all levels.

Sure will change and this will probably get rid of additional [pmd|pud]_index()
definitions added here.
Mark Rutland April 17, 2019, 2:21 p.m. UTC | #5
On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
> On 04/15/2019 07:18 PM, Mark Rutland wrote:
> > On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
> >> Memory removal from an arch perspective involves tearing down two different
> >> kernel based mappings i.e vmemmap and linear while releasing related page
> >> table pages allocated for the physical memory range to be removed.
> >>
> >> Define a common kernel page table tear down helper remove_pagetable() which
> >> can be used to unmap given kernel virtual address range. In effect it can
> >> tear down both vmemap or kernel linear mappings. This new helper is called
> >> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
> >> The argument 'direct' here identifies kernel linear mappings.
> > 
> > Can you please explain why we need to treat these differently? I thought
> > the next paragraph was going to do that, but as per my comment there it
> > doesn't seem to be relevant. :/
> 
> For linear mapping there is no actual allocated page which is mapped. Its the
> pfn derived from physical address (from __va(PA)-->PA translation) which is
> there in the page table entry and need not be freed any where during tear down.
> 
> But in case of vmemmap (struct page mapping for a given range) which is a real
> virtual mapping (like vmalloc) real pages are allocated (buddy or memblock) and
> are mapped in it's page table entries to effect the translation. These pages
> need to be freed while tearing down the translation. But for both mappings
> (linear and vmemmap) their page table pages need to be freed.
> 
> This differentiation is needed while deciding if [pte|pmd|pud]_page() at any
> given level needs to be freed or not. Will update the commit message with this
> explanation if required.

Ok. I think you just need to say:

  When removing a vmemmap pagetable range, we must also free the pages
  used to back this range of the vmemmap.

> >> While here update arch_add_mempory() to handle __add_pages() failures by
> >> just unmapping recently added kernel linear mapping. 
> > 
> > Is this a latent bug?
> 
> Did not get it. __add_pages() could fail because of __add_section() in which
> case we should remove the linear mapping added previously in the first step.
> Is there any concern here ?

That is the question.

If that were to fail _before_ this series were applied, does that permit
anything bad to happen? e.g. is it permitted that when arch_add_memory()
fails, the relevant memory can be physically removed?

If so, that could result in a number of problems, and would be a latent
bug...

[...]

> >> +#ifdef CONFIG_MEMORY_HOTPLUG
> >> +static void free_pagetable(struct page *page, int order)
> > 
> > On arm64, all of the stage-1 page tables other than the PGD are always
> > PAGE_SIZE. We shouldn't need to pass an order around in order to free
> > page tables.
> > 
> > It looks like this function is misnamed, and is used to free vmemmap
> > backing pages in addition to page tables used to map them. It would be
> > nicer to come up with a better naming scheme.
> 
> free_pagetable() is being used both for freeing page table pages as well
> mapped entries at various level (for vmemmap). As you rightly mentioned
> page table pages are invariably PAGE_SIZE (other than pgd) but theses
> mapped pages size can vary at various level. free_pagetable() is a very
> generic helper which can accommodate pages allocated from buddy as well
> as memblock. But I agree that the naming is misleading.
> 
> Will something like this will be better ?
> 
> void free_pagetable_mapped_page(struct page *page, int order)
> {
> 	.......................
> 	.......................
> }
> 
> void free_pagetable_page(struct page *page)
> {
> 	free_pagetable_mapped_page(page, 0);
> }
> 
> - Call free_pgtable_page() while freeing pagetable pages
> - Call free_pgtable_mapped_page() while freeing mapped pages

I think the "pgtable" naming isn't necessary. These functions are passed
the relevant page, and free that page (or a range starting at that
page).

I think it would be better to have something like:

static void free_hotplug_page_range(struct page *page, unsigned long size)
{
	int order = get_order(size);
	int nr_pages = 1 << order;

	...
}

static void free_hotplug_page(struct page *page)
{
	free_hotplug_page_range(page, PAGE_SIZE);
}

... which avoids having to place get_order() in all the callers, and
makes things a bit easier to read.
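
For example, callers would then just pass the mapped size (PMD_SIZE below
refers to the PMD-sized vmemmap backing blocks mentioned later in this
thread; illustrative only):

	free_hotplug_page(pte_page(*ptep));			/* a PAGE_SIZE backing page */
	free_hotplug_page_range(pmd_page(*pmdp), PMD_SIZE);	/* a PMD-sized vmemmap block */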

> > 
> >> +{
> >> +	unsigned long magic;
> >> +	unsigned int nr_pages = 1 << order;
> >> +
> >> +	if (PageReserved(page)) {
> >> +		__ClearPageReserved(page);
> >> +
> >> +		magic = (unsigned long)page->freelist;
> >> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
> > 
> > Not a new problem, but it's unfortunate that the core code reuses the
> > page::freelist field for this, given it also uses page::private for the
> > section number. Using fields from different parts of the union doesn't
> > seem robust.> 
> > It would seem nicer to have a private2 field in the struct for anonymous
> > pages.
> 
> Okay. But I guess its not something for us to investigate in this context.
> 
> > 
> >> +			while (nr_pages--)
> >> +				put_page_bootmem(page++);
> >> +		} else {
> >> +			while (nr_pages--)
> >> +				free_reserved_page(page++);
> >> +		}
> >> +	} else {
> >> +		free_pages((unsigned long)page_address(page), order);
> >> +	}
> >> +}

Looking at this again, I'm surprised that we'd ever free bootmem pages.
I'd expect that we'd only remove memory that was added as part of a
hotplug, and that shouldn't have come from bootmem.

Will we ever really try to free bootmem pages like this?

[...]

> > I take it that some higher-level serialization prevents concurrent
> > modification to this table. Where does that happen?
> 
> mem_hotplug_begin()
> mem_hotplug_end()
> 
> which operates on DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)
> 
> - arch_remove_memory() called from (__remove_memory || devm_memremap_pages_release)
> - vmemmap_free() called from __remove_pages called from (arch_remove_memory || devm_memremap_pages_release)
> 
> Both __remove_memory() and devm_memremap_pages_release() are protected with
> pair of these.
> 
> mem_hotplug_begin()
> mem_hotplug_end()
> 
> vmemmap tear down happens before linear mapping and in sequence.
> 
> > 
> >> +
> >> +	free_pagetable(pmd_page(*pmd), 0);
> > 
> > Here we free the pte level of table...
> > 
> >> +	spin_lock(&init_mm.page_table_lock);
> >> +	pmd_clear(pmd);
> > 
> > ... but only here do we disconnect it from the PMD level of table, and
> > we don't do any TLB maintenance just yet. The page could be poisoned
> > and/or reallocated before we invalidate the TLB, which is not safe. In
> > all cases, we must follow the sequence:
> > 
> > 1) clear the pointer to a table
> > 2) invalidate any corresponding TLB entries
> > 3) free the table page
> > 
> > ... or we risk a number of issues resulting from erroneous programming
> > of the TLBs. See pmd_free_pte_page() for an example of how to do this
> > correctly.
> 
> Okay will send 'addr' into these functions and do somehting like this
> at all levels as in case for pmd_free_pte_page().
> 
>         page = pud_page(*pudp);
>         pud_clear(pudp);
>         __flush_tlb_kernel_pgtable(addr);
>         free_pgtable_page(page);

That looks correct to me!

> > I'd have thought similar applied to x86, so that implementation looks
> > suspicious to me too...
> > 
> >> +	spin_unlock(&init_mm.page_table_lock);
> > 
> > What precisely is the page_table_lock intended to protect?
> 
> Concurrent modification to kernel page table (init_mm) while clearing entries.

Concurrent modification by what code?

If something else can *modify* the portion of the table that we're
manipulating, then I don't see how we can safely walk the table up to
this point without holding the lock, nor how we can safely add memory.

Even if this is to protect something else which *reads* the tables,
other code in arm64 which modifies the kernel page tables doesn't take
the lock.

Usually, if you can do a lockless walk you have to verify that things
didn't change once you've taken the lock, but we don't follow that
pattern here.

As things stand it's not clear to me whether this is necessary or
sufficient.

> > It seems odd to me that we're happy to walk the tables without the lock,
> > but only grab the lock when performing a modification. That implies we
> > either have some higher-level mutual exclusion, or we're not holding the
> > lock in all cases we need to be.
> 
> On arm64
> 
> - linear mapping is half kernel virtual range (unlikely to share PGD with any other)
> - vmemmap and vmalloc might or might not be aligned properly to avoid PGD/PUD/PMD overlap
> - This kernel virtual space layout is not fixed and can change in future
> 
> Hence just to be on safer side lets take init_mm.page_table_lock for the entire tear
> down process in remove_pagetable(). put_page_bootmem/free_reserved_page/free_pages should
> not block for longer period unlike allocation paths. Hence it should be safe with overall
> spin lock on init_mm.page_table_lock unless if there are some other concerns.

Given the other issues with the x86 hot-remove code, it's not clear to
me whether that locking is correct or necessary. I think that before we
make any claim as to whether that's safe we should figure out how the
lock is actually used today.

I do not think we should mindlessly copy that.

Thanks,
Mark.
Anshuman Khandual April 17, 2019, 4:45 p.m. UTC | #6
On 04/17/2019 07:51 PM, Mark Rutland wrote:
> On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
>> On 04/15/2019 07:18 PM, Mark Rutland wrote:
>>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
>>>> Memory removal from an arch perspective involves tearing down two different
>>>> kernel based mappings i.e vmemmap and linear while releasing related page
>>>> table pages allocated for the physical memory range to be removed.
>>>>
>>>> Define a common kernel page table tear down helper remove_pagetable() which
>>>> can be used to unmap given kernel virtual address range. In effect it can
>>>> tear down both vmemap or kernel linear mappings. This new helper is called
>>>> from both vmemamp_free() and ___remove_pgd_mapping() during memory removal.
>>>> The argument 'direct' here identifies kernel linear mappings.
>>>
>>> Can you please explain why we need to treat these differently? I thought
>>> the next paragraph was going to do that, but as per my comment there it
>>> doesn't seem to be relevant. :/
>>
>> For linear mapping there is no actual allocated page which is mapped. Its the
>> pfn derived from physical address (from __va(PA)-->PA translation) which is
>> there in the page table entry and need not be freed any where during tear down.
>>
>> But in case of vmemmap (struct page mapping for a given range) which is a real
>> virtual mapping (like vmalloc) real pages are allocated (buddy or memblock) and
>> are mapped in it's page table entries to effect the translation. These pages
>> need to be freed while tearing down the translation. But for both mappings
>> (linear and vmemmap) their page table pages need to be freed.
>>
>> This differentiation is needed while deciding if [pte|pmd|pud]_page() at any
>> given level needs to be freed or not. Will update the commit message with this
>> explanation if required.
> 
> Ok. I think you just need to say:
> 
>   When removing a vmemmap pagetable range, we must also free the pages
>   used to back this range of the vmemmap.

Sure will update the commit message with something similar.

> 
>>>> While here update arch_add_mempory() to handle __add_pages() failures by
>>>> just unmapping recently added kernel linear mapping. 
>>>
>>> Is this a latent bug?
>>
>> Did not get it. __add_pages() could fail because of __add_section() in which
>> case we should remove the linear mapping added previously in the first step.
>> Is there any concern here ?
> 
> That is the question.
> 
> If that were to fail _before_ this series were applied, does that permit
> anything bad to happen? e.g. is it permitted that when arch_add_memory()
> fails, the relevant memory can be physically removed?

Before this patch? Yeah, if __add_pages() fails then there are no memory
sections created for the range (which means it can never be onlined), but it
still has a linear mapping, which is not good.

Yeah, you can remove it, but there are linear mapping entries in init_mm which,
when accessed, will cause a load/store instruction abort even after a
successful MMU translation and should raise an exception on the CPU doing it.

> 
> If so, that could result in a number of problems, and would be a latent
> bug...

IIUC your question is about the possibility of a latent bug. Yes, if we just
leave the linear mapping intact, which does not have valid memory block
sections, struct pages, vmemmap etc., it can cause unexpected behavior.
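
Schematically, the failure handling being discussed looks something like the
sketch below. The prototypes and the __create_pgd_mapping()/__add_pages()
arguments are simplified relative to the actual tree, so treat it as an
illustration rather than the real hunk:

int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
{
	int ret;

	/* 1) create the kernel linear mapping for the new range */
	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     size, PAGE_KERNEL, __pgd_pgtable_alloc, 0);

	/* 2) create the memory sections and their vmemmap */
	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);

	/* 3) on failure, do not leave a stale linear mapping behind */
	if (ret)
		__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);

	return ret;
}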

> 
> [...]
> 
>>>> +#ifdef CONFIG_MEMORY_HOTPLUG
>>>> +static void free_pagetable(struct page *page, int order)
>>>
>>> On arm64, all of the stage-1 page tables other than the PGD are always
>>> PAGE_SIZE. We shouldn't need to pass an order around in order to free
>>> page tables.
>>>
>>> It looks like this function is misnamed, and is used to free vmemmap
>>> backing pages in addition to page tables used to map them. It would be
>>> nicer to come up with a better naming scheme.
>>
>> free_pagetable() is being used both for freeing page table pages as well
>> mapped entries at various level (for vmemmap). As you rightly mentioned
>> page table pages are invariably PAGE_SIZE (other than pgd) but theses
>> mapped pages size can vary at various level. free_pagetable() is a very
>> generic helper which can accommodate pages allocated from buddy as well
>> as memblock. But I agree that the naming is misleading.
>>
>> Will something like this will be better ?
>>
>> void free_pagetable_mapped_page(struct page *page, int order)
>> {
>> 	.......................
>> 	.......................
>> }
>>
>> void free_pagetable_page(struct page *page)
>> {
>> 	free_pagetable_mapped_page(page, 0);
>> }
>>
>> - Call free_pgtable_page() while freeing pagetable pages
>> - Call free_pgtable_mapped_page() while freeing mapped pages
> 
> I think the "pgtable" naming isn't necessary. These functions are passed
> the relevant page, and free that page (or a range starting at that
> page).

Though they are just freeing pages, these helpers free pages belonging to the
kernel page table (init_mm) which either contain entries or are memory mapped
by those entries. Appending 'pgtable' to their names seems appropriate and also
helps prevent others from using them as generic wrappers to free pages. But I
guess it is just a matter of code choice.

> 
> I think it would be better to have something like:
> 
> static void free_hotplug_page_range(struct page *page, unsigned long size)
> {
> 	int order = get_order(size);
> 	int nr_pages = 1 << order;
> 
> 	...
> }
> 
> static void free_hotplug_page(struct page *page)
> {
> 	free_hotplug_page_range(page, PAGE_SIZE);
> }
> 
> ... which avoids having to place get_order() in all the callers, and
> makes things a bit easier to read.

Don't have a strong opinion on this, but I guess it's just code preference.

> 
>>>
>>>> +{
>>>> +	unsigned long magic;
>>>> +	unsigned int nr_pages = 1 << order;
>>>> +
>>>> +	if (PageReserved(page)) {
>>>> +		__ClearPageReserved(page);
>>>> +
>>>> +		magic = (unsigned long)page->freelist;
>>>> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
>>>
>>> Not a new problem, but it's unfortunate that the core code reuses the
>>> page::freelist field for this, given it also uses page::private for the
>>> section number. Using fields from different parts of the union doesn't
>>> seem robust.> 
>>> It would seem nicer to have a private2 field in the struct for anonymous
>>> pages.
>>
>> Okay. But I guess its not something for us to investigate in this context.
>>
>>>
>>>> +			while (nr_pages--)
>>>> +				put_page_bootmem(page++);
>>>> +		} else {
>>>> +			while (nr_pages--)
>>>> +				free_reserved_page(page++);
>>>> +		}
>>>> +	} else {
>>>> +		free_pages((unsigned long)page_address(page), order);
>>>> +	}
>>>> +}
> 
> Looking at this again, I'm surprised that we'd ever free bootmem pages.
> I'd expect that we'd only remove memory that was added as part of a
> hotplug, and that shouldn't have come from bootmem.
> 
> Will we ever really try to free bootmem pages like this?
> 
> [...]
> 

There are page table pages and page table mapped pages for each mapping.

VMEMMAP Mapping:

1) vmemmap_populate (ARM64_4K_PAGES)

a) Vmemmap backing pages:
	- vmemmap_alloc_block_buf(PMD_SIZE, node)
		- sparse_buffer_alloc(size)
			- memblock_alloc_try_nid_raw()	(NA as sparse_buffer is gone [1])

		- OR

		-  vmemmap_alloc_block(size, node)
			- alloc_pages_node()		-> When slab available (runtime)
			- __earlyonly_bootmem_alloc 	-> When slab not available (NA)

b) Vmemmap pgtable pages:

vmemmap_pgd_populate()
vmemmap_pud_populate()
	 vmemmap_alloc_block_zero()
		vmemmap_alloc_block()
			alloc_pages_node()		-> When slab available (runtime)
			__earlyonly_bootmem_alloc()	-> When slab not available (NA)

2) vmemmap_populate (ARM64_64K_PAGES || ARM64_16K_PAGES)

a) Vmemmap backing pages:
	vmemmap_pte_populate()
		vmemmap_alloc_block_buf(PAGE_SIZE, node)
		- sparse_buffer_alloc(size)
			- memblock_alloc_try_nid_raw()	(NA as sparse_buffer is gone [1])

		- OR

		-  vmemmap_alloc_block(size, node)
			- alloc_pages_node()		-> When slab available (runtime)
			- __earlyonly_bootmem_alloc 	-> When slab not available (NA)


b) Vmemmap pgtable pages:

vmemmap_pgd_populate()
vmemmap_pud_populate()
vmemmap_pmd_populate()
	 vmemmap_alloc_block_zero()
		vmemmap_alloc_block()
			alloc_pages_node()		-> When slab available (runtime)
			__earlyonly_bootmem_alloc()	-> When slab not available (NA)
			
LINEAR Mapping:

a) There are no backing pages here

b) Linear pgtable pages:

	Gets allocated through __pgd_pgtable_alloc() --> __get_free_page() which is buddy.

You might be right that we never allocate from memblock during memory hotplug, but I
would still suggest keeping the PageReserved() check to deal with 'accidental/unexpected'
memblock pages. If we are really sure that this is never going to happen, then at least
let's do a WARN_ON() if this function ever gets a non-buddy page.
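
That defensive check could be as simple as the following inside the hotplug
free helper (illustrative sketch):

	/* hotplugged ranges are expected to hand us only buddy-allocated pages */
	if (WARN_ON(PageReserved(page)))
		return;

	free_pages((unsigned long)page_address(page), order);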

I did give this a try (just calling free_pages) on a couple of configs and did not see
any problem/crash or bad_page() errors. Will do more investigation and tests tomorrow.

[1] Though the sparse_buffer gets allocated from memblock, it does not stay until runtime.
Early memory uses chunks from it but the rest gets released back. Runtime hotplug ideally
should not have got its vmemmap backing memory from memblock.

 
>>> I take it that some higher-level serialization prevents concurrent
>>> modification to this table. Where does that happen?
>>
>> mem_hotplug_begin()
>> mem_hotplug_end()
>>
>> which operates on DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)
>>
>> - arch_remove_memory() called from (__remove_memory || devm_memremap_pages_release)
>> - vmemmap_free() called from __remove_pages called from (arch_remove_memory || devm_memremap_pages_release)
>>
>> Both __remove_memory() and devm_memremap_pages_release() are protected with
>> pair of these.
>>
>> mem_hotplug_begin()
>> mem_hotplug_end()
>>
>> vmemmap tear down happens before linear mapping and in sequence.
>>
>>>
>>>> +
>>>> +	free_pagetable(pmd_page(*pmd), 0);
>>>
>>> Here we free the pte level of table...
>>>
>>>> +	spin_lock(&init_mm.page_table_lock);
>>>> +	pmd_clear(pmd);
>>>
>>> ... but only here do we disconnect it from the PMD level of table, and
>>> we don't do any TLB maintenance just yet. The page could be poisoned
>>> and/or reallocated before we invalidate the TLB, which is not safe. In
>>> all cases, we must follow the sequence:
>>>
>>> 1) clear the pointer to a table
>>> 2) invalidate any corresponding TLB entries
>>> 3) free the table page
>>>
>>> ... or we risk a number of issues resulting from erroneous programming
>>> of the TLBs. See pmd_free_pte_page() for an example of how to do this
>>> correctly.
>>
>> Okay will send 'addr' into these functions and do somehting like this
>> at all levels as in case for pmd_free_pte_page().
>>
>>         page = pud_page(*pudp);
>>         pud_clear(pudp);
>>         __flush_tlb_kernel_pgtable(addr);
>>         free_pgtable_page(page);
> 
> That looks correct to me!
> 
>>> I'd have thought similar applied to x86, so that implementation looks
>>> suspicious to me too...
>>>
>>>> +	spin_unlock(&init_mm.page_table_lock);
>>>
>>> What precisely is the page_table_lock intended to protect?
>>
>> Concurrent modification to kernel page table (init_mm) while clearing entries.
> 
> Concurrent modification by what code?
> 
> If something else can *modify* the portion of the table that we're
> manipulating, then I don't see how we can safely walk the table up to
> this point without holding the lock, nor how we can safely add memory.
> 
> Even if this is to protect something else which *reads* the tables,
> other code in arm64 which modifies the kernel page tables doesn't take
> the lock.
> 
> Usually, if you can do a lockless walk you have to verify that things
> didn't change once you've taken the lock, but we don't follow that
> pattern here.
> 
> As things stand it's not clear to me whether this is necessary or
> sufficient.

Hence let's take a more conservative approach and wrap the entire
remove_pagetable() process under init_mm.page_table_lock, which looks safe,
unless in the worst case free_pages() gets stuck for some reason, in which
case we have a bigger memory problem to deal with than a soft lockup.

> 
>>> It seems odd to me that we're happy to walk the tables without the lock,
>>> but only grab the lock when performing a modification. That implies we
>>> either have some higher-level mutual exclusion, or we're not holding the
>>> lock in all cases we need to be.
>>
>> On arm64
>>
>> - linear mapping is half kernel virtual range (unlikely to share PGD with any other)
>> - vmemmap and vmalloc might or might not be aligned properly to avoid PGD/PUD/PMD overlap
>> - This kernel virtual space layout is not fixed and can change in future
>>
>> Hence just to be on safer side lets take init_mm.page_table_lock for the entire tear
>> down process in remove_pagetable(). put_page_bootmem/free_reserved_page/free_pages should
>> not block for longer period unlike allocation paths. Hence it should be safe with overall
>> spin lock on init_mm.page_table_lock unless if there are some other concerns.
> 
> Given the other issues with the x86 hot-remove code, it's not clear to
> me whether that locking is correct or necessary. I think that before we
> make any claim as to whether that's safe we should figure out how the
> lock is actually used today.

Did not get you. What is the exact concern with the proposed lock, which covers
remove_pagetable() end to end and guarantees that the kernel page table
(init_mm) won't be modified concurrently? That's what the definition of
mm->page_table_lock says. Is that not enough?

struct mm_struct {
................
................
................
spinlock_t page_table_lock; /* Protects page tables and some
                             * counters
                             */
...............
...............
}

The worst case would be that hot-remove is a bit slower, which should be an
acceptable proposition IMHO, but again I am open to ideas on this.

> 
> I do not think we should mindlessly copy that.

The overall protection for remove_pagetable() with init_mm.page_table_lock is a
deviation from what x86 is doing right now.
Mark Rutland April 17, 2019, 5:39 p.m. UTC | #7
On Wed, Apr 17, 2019 at 10:15:35PM +0530, Anshuman Khandual wrote:
> On 04/17/2019 07:51 PM, Mark Rutland wrote:
> > On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
> >> On 04/15/2019 07:18 PM, Mark Rutland wrote:
> >>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:

> >>>> +	spin_unlock(&init_mm.page_table_lock);
> >>>
> >>> What precisely is the page_table_lock intended to protect?
> >>
> >> Concurrent modification to kernel page table (init_mm) while clearing entries.
> > 
> > Concurrent modification by what code?
> > 
> > If something else can *modify* the portion of the table that we're
> > manipulating, then I don't see how we can safely walk the table up to
> > this point without holding the lock, nor how we can safely add memory.
> > 
> > Even if this is to protect something else which *reads* the tables,
> > other code in arm64 which modifies the kernel page tables doesn't take
> > the lock.
> > 
> > Usually, if you can do a lockless walk you have to verify that things
> > didn't change once you've taken the lock, but we don't follow that
> > pattern here.
> > 
> > As things stand it's not clear to me whether this is necessary or
> > sufficient.
> 
> Hence lets take more conservative approach and wrap the entire process of
> remove_pagetable() under init_mm.page_table_lock which looks safe unless
> in the worst case when free_pages() gets stuck for some reason in which
> case we have bigger memory problem to deal with than a soft lock up.

Sorry, but I'm not happy with _any_ solution until we understand where
and why we need to take the init_mm ptl, and have made some effort to
ensure that the kernel correctly does so elsewhere. It is not sufficient
to consider this code in isolation.

IIUC, before this patch we never clear non-leaf entries in the kernel
page tables, so readers don't presently need to take the ptl in order to
safely walk down to a leaf entry.

For example, the arm64 ptdump code never takes the ptl, and as of this
patch it will blow up if it races with a hot-remove, regardless of
whether the hot-remove code itself holds the ptl.

Note that the same applies to the x86 ptdump code; we cannot assume that
just because x86 does something that it happens to be correct.

I strongly suspect there are other cases that would fall afoul of this,
in both arm64 and generic code.

Thanks,
Mark.
Anshuman Khandual April 18, 2019, 5:28 a.m. UTC | #8
On 04/17/2019 11:09 PM, Mark Rutland wrote:
> On Wed, Apr 17, 2019 at 10:15:35PM +0530, Anshuman Khandual wrote:
>> On 04/17/2019 07:51 PM, Mark Rutland wrote:
>>> On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
>>>> On 04/15/2019 07:18 PM, Mark Rutland wrote:
>>>>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
> 
>>>>>> +	spin_unlock(&init_mm.page_table_lock);
>>>>>
>>>>> What precisely is the page_table_lock intended to protect?
>>>>
>>>> Concurrent modification to kernel page table (init_mm) while clearing entries.
>>>
>>> Concurrent modification by what code?
>>>
>>> If something else can *modify* the portion of the table that we're
>>> manipulating, then I don't see how we can safely walk the table up to
>>> this point without holding the lock, nor how we can safely add memory.
>>>
>>> Even if this is to protect something else which *reads* the tables,
>>> other code in arm64 which modifies the kernel page tables doesn't take
>>> the lock.
>>>
>>> Usually, if you can do a lockless walk you have to verify that things
>>> didn't change once you've taken the lock, but we don't follow that
>>> pattern here.
>>>
>>> As things stand it's not clear to me whether this is necessary or
>>> sufficient.
>>
>> Hence lets take more conservative approach and wrap the entire process of
>> remove_pagetable() under init_mm.page_table_lock which looks safe unless
>> in the worst case when free_pages() gets stuck for some reason in which
>> case we have bigger memory problem to deal with than a soft lock up.
> 
> Sorry, but I'm not happy with _any_ solution until we understand where
> and why we need to take the init_mm ptl, and have made some effort to
> ensure that the kernel correctly does so elsewhere. It is not sufficient
> to consider this code in isolation.

We will have to take the kernel page table lock to avoid assumptions regarding the
present or any future possible kernel VA space layout. Wrapping the entire
remove_pagetable() will be coarse grained, but I don't see why it should not be
sufficient, at least for this particular tear down operation, regardless of
how this might affect other kernel pgtable walkers.

IIUC your concern is regarding other parts of kernel code (arm64/generic) which
assume that the kernel page table won't be changing, and hence they normally walk
the table without holding the pgtable lock. Those current pgtable walkers will be
affected after this change.

> 
> IIUC, before this patch we never clear non-leaf entries in the kernel
> page tables, so readers don't presently need to take the ptl in order to
> safely walk down to a leaf entry.

Got it. Will look into this.

> 
> For example, the arm64 ptdump code never takes the ptl, and as of this
> patch it will blow up if it races with a hot-remove, regardless of
> whether the hot-remove code itself holds the ptl.

Got it. Are there more such examples where this can be problematic? I
will be happy to investigate all such places and change/add the locking scheme
there to make them work with memory hot remove.

> 
> Note that the same applies to the x86 ptdump code; we cannot assume that
> just because x86 does something that it happens to be correct.

I understand. Will look into other non-x86 platforms as well on how they are
dealing with this.

> 
> I strongly suspect there are other cases that would fall afoul of this,
> in both arm64 and generic code.

Will start looking into all such possible cases both on arm64 and in generic code.
Meanwhile, more such pointers would be really helpful.
Anshuman Khandual April 23, 2019, 7:31 a.m. UTC | #9
On 04/18/2019 10:58 AM, Anshuman Khandual wrote:
> On 04/17/2019 11:09 PM, Mark Rutland wrote:
>> On Wed, Apr 17, 2019 at 10:15:35PM +0530, Anshuman Khandual wrote:
>>> On 04/17/2019 07:51 PM, Mark Rutland wrote:
>>>> On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
>>>>> On 04/15/2019 07:18 PM, Mark Rutland wrote:
>>>>>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
>>
>>>>>>> +	spin_unlock(&init_mm.page_table_lock);
>>>>>>
>>>>>> What precisely is the page_table_lock intended to protect?
>>>>>
>>>>> Concurrent modification to kernel page table (init_mm) while clearing entries.
>>>>
>>>> Concurrent modification by what code?
>>>>
>>>> If something else can *modify* the portion of the table that we're
>>>> manipulating, then I don't see how we can safely walk the table up to
>>>> this point without holding the lock, nor how we can safely add memory.
>>>>
>>>> Even if this is to protect something else which *reads* the tables,
>>>> other code in arm64 which modifies the kernel page tables doesn't take
>>>> the lock.
>>>>
>>>> Usually, if you can do a lockless walk you have to verify that things
>>>> didn't change once you've taken the lock, but we don't follow that
>>>> pattern here.
>>>>
>>>> As things stand it's not clear to me whether this is necessary or
>>>> sufficient.
>>>
>>> Hence lets take more conservative approach and wrap the entire process of
>>> remove_pagetable() under init_mm.page_table_lock which looks safe unless
>>> in the worst case when free_pages() gets stuck for some reason in which
>>> case we have bigger memory problem to deal with than a soft lock up.
>>
>> Sorry, but I'm not happy with _any_ solution until we understand where
>> and why we need to take the init_mm ptl, and have made some effort to
>> ensure that the kernel correctly does so elsewhere. It is not sufficient
>> to consider this code in isolation.
> 
> We will have to take the kernel page table lock to prevent assumption regarding
> present or future possible kernel VA space layout. Wrapping around the entire
> remove_pagetable() will be at coarse granularity but I dont see why it should
> not sufficient atleast from this particular tear down operation regardless of
> how this might affect other kernel pgtable walkers.
> 
> IIUC your concern is regarding other parts of kernel code (arm64/generic) which
> assume that kernel page table wont be changing and hence they normally walk the
> table without holding pgtable lock. Hence those current pgtabe walker will be
> affected after this change.
> 
>>
>> IIUC, before this patch we never clear non-leaf entries in the kernel
>> page tables, so readers don't presently need to take the ptl in order to
>> safely walk down to a leaf entry.
> 
> Got it. Will look into this.
> 
>>
>> For example, the arm64 ptdump code never takes the ptl, and as of this
>> patch it will blow up if it races with a hot-remove, regardless of
>> whether the hot-remove code itself holds the ptl.
> 
> Got it. Are there there more such examples where this can be problematic. I
> will be happy to investigate all such places and change/add locking scheme
> in there to make them work with memory hot remove.
> 
>>
>> Note that the same applies to the x86 ptdump code; we cannot assume that
>> just because x86 does something that it happens to be correct.
> 
> I understand. Will look into other non-x86 platforms as well on how they are
> dealing with this.
> 
>>
>> I strongly suspect there are other cases that would fall afoul of this,
>> in both arm64 and generic code.

On X86

- kernel_physical_mapping_init() takes the lock for pgtable page allocations as well
  as for all leaf level entries, including large mappings.

On Power

- remove_pagetable() takes an overall high level init_mm.page_table_lock as I had
  suggested before. __map_kernel_page() calls [pud|pmd|pte]_alloc_[kernel], whose
  page table page allocations are protected with init_mm.page_table_lock, but the
  actual setting of the leaf entries is not (unlike x86)

	arch_add_memory()
		create_section_mapping()
			radix__create_section_mapping()
				create_physical_mapping()
					__map_kernel_page()
On arm64.

Both the kernel page table dump and linear mapping creation (__create_pgd_mapping on
init_mm) will require init_mm.page_table_lock to be really safe against this new memory
hot remove code. I will do the necessary changes as part of this series next time
around. IIUC there is no equivalent generic feature for ARM64_PTDUMP_CORE/DEBUGFS.
> Will start looking into all such possible cases both on arm64 and generic.
> Mean while more such pointers would be really helpful.

Generic usage of init_mm.page_table_lock

Unless I have missed something, these are the generic init_mm kernel page table
modifiers at runtime (at least those which use init_mm.page_table_lock):

	1. ioremap_page_range()		/* Mapped I/O memory area */
	2. apply_to_page_range()	/* Change existing kernel linear map */
	3. vmap_page_range()		/* Vmalloc area */

A. IOREMAP

ioremap_page_range()
	ioremap_p4d_range()
		p4d_alloc()
		ioremap_try_huge_p4d() -> p4d_set_huge() -> set_p4d()
		ioremap_pud_range()
			pud_alloc()
			ioremap_try_huge_pud() -> pud_set_huge() -> set_pud()
			ioremap_pmd_range()
				pmd_alloc()
				ioremap_try_huge_pmd() -> pmd_set_huge() -> set_pmd()
				ioremap_pte_range()
					pte_alloc_kernel()
						set_pte_at() -> set_pte()
B. APPLY_TO_PAGE_RANGE

apply_to_page_range()
	apply_to_p4d_range()
		p4d_alloc()
		apply_to_pud_range()
			pud_alloc()
			apply_to_pmd_range()
				pmd_alloc()
				apply_to_pte_range()
					pte_alloc_kernel()

C. VMAP_PAGE_RANGE

vmap_page_range()
vmap_page_range_noflush()
	vmap_p4d_range()
		p4d_alloc()
		vmap_pud_range()
			pud_alloc()
			vmap_pmd_range()
				pmd_alloc()
				vmap_pte_range()
					pte_alloc_kernel()
					set_pte_at()

In all of the above.

- Page table page allocations via [p4d|pud|pmd|pte]_alloc[_kernel]() are protected with init_mm.page_table_lock
- Should these not require init_mm.page_table_lock for all leaf level (PUD|PMD|PTE) modifications as well ?
- Should these not require init_mm.page_table_lock for the page table walk itself ?

Not taking an overall lock for all three of these operations will potentially race with an ongoing memory
hot remove operation which takes an overall lock as proposed. Wondering if this has been safe till
now ?
David Hildenbrand April 23, 2019, 7:37 a.m. UTC | #10
On 23.04.19 09:31, Anshuman Khandual wrote:
> 
> 
> On 04/18/2019 10:58 AM, Anshuman Khandual wrote:
>> On 04/17/2019 11:09 PM, Mark Rutland wrote:
>>> On Wed, Apr 17, 2019 at 10:15:35PM +0530, Anshuman Khandual wrote:
>>>> On 04/17/2019 07:51 PM, Mark Rutland wrote:
>>>>> On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
>>>>>> On 04/15/2019 07:18 PM, Mark Rutland wrote:
>>>>>>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
>>>
>>>>>>>> +	spin_unlock(&init_mm.page_table_lock);
>>>>>>>
>>>>>>> What precisely is the page_table_lock intended to protect?
>>>>>>
>>>>>> Concurrent modification to kernel page table (init_mm) while clearing entries.
>>>>>
>>>>> Concurrent modification by what code?
>>>>>
>>>>> If something else can *modify* the portion of the table that we're
>>>>> manipulating, then I don't see how we can safely walk the table up to
>>>>> this point without holding the lock, nor how we can safely add memory.
>>>>>
>>>>> Even if this is to protect something else which *reads* the tables,
>>>>> other code in arm64 which modifies the kernel page tables doesn't take
>>>>> the lock.
>>>>>
>>>>> Usually, if you can do a lockless walk you have to verify that things
>>>>> didn't change once you've taken the lock, but we don't follow that
>>>>> pattern here.
>>>>>
>>>>> As things stand it's not clear to me whether this is necessary or
>>>>> sufficient.
>>>>
>>>> Hence lets take more conservative approach and wrap the entire process of
>>>> remove_pagetable() under init_mm.page_table_lock which looks safe unless
>>>> in the worst case when free_pages() gets stuck for some reason in which
>>>> case we have bigger memory problem to deal with than a soft lock up.
>>>
>>> Sorry, but I'm not happy with _any_ solution until we understand where
>>> and why we need to take the init_mm ptl, and have made some effort to
>>> ensure that the kernel correctly does so elsewhere. It is not sufficient
>>> to consider this code in isolation.
>>
>> We will have to take the kernel page table lock to prevent assumption regarding
>> present or future possible kernel VA space layout. Wrapping around the entire
>> remove_pagetable() will be at coarse granularity but I dont see why it should
>> not sufficient atleast from this particular tear down operation regardless of
>> how this might affect other kernel pgtable walkers.
>>
>> IIUC your concern is regarding other parts of kernel code (arm64/generic) which
>> assume that kernel page table wont be changing and hence they normally walk the
>> table without holding pgtable lock. Hence those current pgtabe walker will be
>> affected after this change.
>>
>>>
>>> IIUC, before this patch we never clear non-leaf entries in the kernel
>>> page tables, so readers don't presently need to take the ptl in order to
>>> safely walk down to a leaf entry.
>>
>> Got it. Will look into this.
>>
>>>
>>> For example, the arm64 ptdump code never takes the ptl, and as of this
>>> patch it will blow up if it races with a hot-remove, regardless of
>>> whether the hot-remove code itself holds the ptl.
>>
>> Got it. Are there there more such examples where this can be problematic. I
>> will be happy to investigate all such places and change/add locking scheme
>> in there to make them work with memory hot remove.
>>
>>>
>>> Note that the same applies to the x86 ptdump code; we cannot assume that
>>> just because x86 does something that it happens to be correct.
>>
>> I understand. Will look into other non-x86 platforms as well on how they are
>> dealing with this.
>>
>>>
>>> I strongly suspect there are other cases that would fall afoul of this,
>>> in both arm64 and generic code.
> 
> On X86
> 
> - kernel_physical_mapping_init() takes the lock for pgtable page allocations as well
>   as all leaf level entries including large mappings.
> 
> On Power
> 
> - remove_pagetable() take an overall high level init_mm.page_table_lock as I had
>   suggested before. __map_kernel_page() calls [pud|pmd|pte]_alloc_[kernel] which
>   allocates page table pages are protected with init_mm.page_table_lock but then
>   the actual setting of the leaf entries are not (unlike x86)
> 
> 	arch_add_memory()
> 		create_section_mapping()
> 			radix__create_section_mapping()
> 				create_physical_mapping()
> 					__map_kernel_page()
> On arm64.
> 
> Both kernel page table dump and linear mapping (__create_pgd_mapping on init_mm)
> will require init_mm.page_table_lock to be really safe against this new memory
> hot remove code. I will do the necessary changes as part of this series next time
> around. IIUC there is no equivalent generic feature for ARM64_PTDUMP_CORE/DEBUGFS.
> 	 > 
>> Will start looking into all such possible cases both on arm64 and generic.
>> Mean while more such pointers would be really helpful.
> 
> Generic usage for init_mm.pagetable_lock
> 
> Unless I have missed something else these are the generic init_mm kernel page table
> modifiers at runtime (at least which uses init_mm.page_table_lock)
> 
> 	1. ioremap_page_range()		/* Mapped I/O memory area */
> 	2. apply_to_page_range()	/* Change existing kernel linear map */
> 	3. vmap_page_range()		/* Vmalloc area */
> 
> A. IOREMAP
> 
> ioremap_page_range()
> 	ioremap_p4d_range()
> 		p4d_alloc()
> 		ioremap_try_huge_p4d() -> p4d_set_huge() -> set_p4d()
> 		ioremap_pud_range()
> 			pud_alloc()
> 			ioremap_try_huge_pud() -> pud_set_huge() -> set_pud()
> 			ioremap_pmd_range()
> 				pmd_alloc()
> 				ioremap_try_huge_pmd() -> pmd_set_huge() -> set_pmd()
> 				ioremap_pte_range()
> 					pte_alloc_kernel()
> 						set_pte_at() -> set_pte()
> B. APPLY_TO_PAGE_RANGE
> 
> apply_to_page_range()
> 	apply_to_p4d_range()
> 		p4d_alloc()
> 		apply_to_pud_range()
> 			pud_alloc()
> 			apply_to_pmd_range()
> 				pmd_alloc()
> 				apply_to_pte_range()
> 					pte_alloc_kernel()
> 
> C. VMAP_PAGE_RANGE
> 
> vmap_page_range()
> vmap_page_range_noflush()
> 	vmap_p4d_range()
> 		p4d_alloc()
> 		vmap_pud_range()
> 			pud_alloc()
> 			vmap_pmd_range()
> 				pmd_alloc()
> 				vmap_pte_range()
> 					pte_alloc_kernel()
> 					set_pte_at()
> 
> In all of the above.
> 
> - Page table pages [p4d|pud|pmd|pte]_alloc_[kernel] settings are protected with init_mm.page_table_lock
> - Should not it require init_mm.page_table_lock for all leaf level (PUD|PMD|PTE) modification as well ?
> - Should not this require init_mm.page_table_lock for page table walk itself ?
> 
> Not taking an overall lock for all these three operations will potentially race with an ongoing memory
> hot remove operation which takes an overall lock as proposed. Wondering if this has this been safe till
> now ?
> 

All memory add/remove operations are currently guarded by
mem_hotplug_lock as far as I know.
Anshuman Khandual April 23, 2019, 7:45 a.m. UTC | #11
On 04/23/2019 01:07 PM, David Hildenbrand wrote:
> On 23.04.19 09:31, Anshuman Khandual wrote:
>>
>>
>> On 04/18/2019 10:58 AM, Anshuman Khandual wrote:
>>> On 04/17/2019 11:09 PM, Mark Rutland wrote:
>>>> On Wed, Apr 17, 2019 at 10:15:35PM +0530, Anshuman Khandual wrote:
>>>>> On 04/17/2019 07:51 PM, Mark Rutland wrote:
>>>>>> On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
>>>>>>> On 04/15/2019 07:18 PM, Mark Rutland wrote:
>>>>>>>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
>>>>
>>>>>>>>> +	spin_unlock(&init_mm.page_table_lock);
>>>>>>>>
>>>>>>>> What precisely is the page_table_lock intended to protect?
>>>>>>>
>>>>>>> Concurrent modification to kernel page table (init_mm) while clearing entries.
>>>>>>
>>>>>> Concurrent modification by what code?
>>>>>>
>>>>>> If something else can *modify* the portion of the table that we're
>>>>>> manipulating, then I don't see how we can safely walk the table up to
>>>>>> this point without holding the lock, nor how we can safely add memory.
>>>>>>
>>>>>> Even if this is to protect something else which *reads* the tables,
>>>>>> other code in arm64 which modifies the kernel page tables doesn't take
>>>>>> the lock.
>>>>>>
>>>>>> Usually, if you can do a lockless walk you have to verify that things
>>>>>> didn't change once you've taken the lock, but we don't follow that
>>>>>> pattern here.
>>>>>>
>>>>>> As things stand it's not clear to me whether this is necessary or
>>>>>> sufficient.
>>>>>
>>>>> Hence lets take more conservative approach and wrap the entire process of
>>>>> remove_pagetable() under init_mm.page_table_lock which looks safe unless
>>>>> in the worst case when free_pages() gets stuck for some reason in which
>>>>> case we have bigger memory problem to deal with than a soft lock up.
>>>>
>>>> Sorry, but I'm not happy with _any_ solution until we understand where
>>>> and why we need to take the init_mm ptl, and have made some effort to
>>>> ensure that the kernel correctly does so elsewhere. It is not sufficient
>>>> to consider this code in isolation.
>>>
>>> We will have to take the kernel page table lock to prevent assumption regarding
>>> present or future possible kernel VA space layout. Wrapping around the entire
>>> remove_pagetable() will be at coarse granularity but I dont see why it should
>>> not sufficient atleast from this particular tear down operation regardless of
>>> how this might affect other kernel pgtable walkers.
>>>
>>> IIUC your concern is regarding other parts of kernel code (arm64/generic) which
>>> assume that kernel page table wont be changing and hence they normally walk the
>>> table without holding pgtable lock. Hence those current pgtabe walker will be
>>> affected after this change.
>>>
>>>>
>>>> IIUC, before this patch we never clear non-leaf entries in the kernel
>>>> page tables, so readers don't presently need to take the ptl in order to
>>>> safely walk down to a leaf entry.
>>>
>>> Got it. Will look into this.
>>>
>>>>
>>>> For example, the arm64 ptdump code never takes the ptl, and as of this
>>>> patch it will blow up if it races with a hot-remove, regardless of
>>>> whether the hot-remove code itself holds the ptl.
>>>
>>> Got it. Are there there more such examples where this can be problematic. I
>>> will be happy to investigate all such places and change/add locking scheme
>>> in there to make them work with memory hot remove.
>>>
>>>>
>>>> Note that the same applies to the x86 ptdump code; we cannot assume that
>>>> just because x86 does something that it happens to be correct.
>>>
>>> I understand. Will look into other non-x86 platforms as well on how they are
>>> dealing with this.
>>>
>>>>
>>>> I strongly suspect there are other cases that would fall afoul of this,
>>>> in both arm64 and generic code.
>>
>> On X86
>>
>> - kernel_physical_mapping_init() takes the lock for pgtable page allocations as well
>>   as all leaf level entries including large mappings.
>>
>> On Power
>>
>> - remove_pagetable() take an overall high level init_mm.page_table_lock as I had
>>   suggested before. __map_kernel_page() calls [pud|pmd|pte]_alloc_[kernel] which
>>   allocates page table pages are protected with init_mm.page_table_lock but then
>>   the actual setting of the leaf entries are not (unlike x86)
>>
>> 	arch_add_memory()
>> 		create_section_mapping()
>> 			radix__create_section_mapping()
>> 				create_physical_mapping()
>> 					__map_kernel_page()
>> On arm64.
>>
>> Both kernel page table dump and linear mapping (__create_pgd_mapping on init_mm)
>> will require init_mm.page_table_lock to be really safe against this new memory
>> hot remove code. I will do the necessary changes as part of this series next time
>> around. IIUC there is no equivalent generic feature for ARM64_PTDUMP_CORE/DEBUGFS.
>> 	 > 
>>> Will start looking into all such possible cases both on arm64 and generic.
>>> Mean while more such pointers would be really helpful.
>>
>> Generic usage for init_mm.pagetable_lock
>>
>> Unless I have missed something else these are the generic init_mm kernel page table
>> modifiers at runtime (at least which uses init_mm.page_table_lock)
>>
>> 	1. ioremap_page_range()		/* Mapped I/O memory area */
>> 	2. apply_to_page_range()	/* Change existing kernel linear map */
>> 	3. vmap_page_range()		/* Vmalloc area */
>>
>> A. IOREMAP
>>
>> ioremap_page_range()
>> 	ioremap_p4d_range()
>> 		p4d_alloc()
>> 		ioremap_try_huge_p4d() -> p4d_set_huge() -> set_p4d()
>> 		ioremap_pud_range()
>> 			pud_alloc()
>> 			ioremap_try_huge_pud() -> pud_set_huge() -> set_pud()
>> 			ioremap_pmd_range()
>> 				pmd_alloc()
>> 				ioremap_try_huge_pmd() -> pmd_set_huge() -> set_pmd()
>> 				ioremap_pte_range()
>> 					pte_alloc_kernel()
>> 						set_pte_at() -> set_pte()
>> B. APPLY_TO_PAGE_RANGE
>>
>> apply_to_page_range()
>> 	apply_to_p4d_range()
>> 		p4d_alloc()
>> 		apply_to_pud_range()
>> 			pud_alloc()
>> 			apply_to_pmd_range()
>> 				pmd_alloc()
>> 				apply_to_pte_range()
>> 					pte_alloc_kernel()
>>
>> C. VMAP_PAGE_RANGE
>>
>> vmap_page_range()
>> vmap_page_range_noflush()
>> 	vmap_p4d_range()
>> 		p4d_alloc()
>> 		vmap_pud_range()
>> 			pud_alloc()
>> 			vmap_pmd_range()
>> 				pmd_alloc()
>> 				vmap_pte_range()
>> 					pte_alloc_kernel()
>> 					set_pte_at()
>>
>> In all of the above.
>>
>> - Page table pages [p4d|pud|pmd|pte]_alloc_[kernel] settings are protected with init_mm.page_table_lock
>> - Should not it require init_mm.page_table_lock for all leaf level (PUD|PMD|PTE) modification as well ?
>> - Should not this require init_mm.page_table_lock for page table walk itself ?
>>
>> Not taking an overall lock for all these three operations will potentially race with an ongoing memory
>> hot remove operation which takes an overall lock as proposed. Wondering if this has this been safe till
>> now ?
>>
> 
> All memory add/remove operations are currently guarded by
> mem_hotplug_lock as far as I know.

Right, but it seems like it guards against concurrent memory hot add or remove operations with
respect to memory blocks, sections, sysfs etc. But does it also cover the other init_mm
modifiers or accessors ?
David Hildenbrand April 23, 2019, 7:51 a.m. UTC | #12
On 23.04.19 09:45, Anshuman Khandual wrote:
> 
> 
> On 04/23/2019 01:07 PM, David Hildenbrand wrote:
>> On 23.04.19 09:31, Anshuman Khandual wrote:
>>>
>>>
>>> On 04/18/2019 10:58 AM, Anshuman Khandual wrote:
>>>> On 04/17/2019 11:09 PM, Mark Rutland wrote:
>>>>> On Wed, Apr 17, 2019 at 10:15:35PM +0530, Anshuman Khandual wrote:
>>>>>> On 04/17/2019 07:51 PM, Mark Rutland wrote:
>>>>>>> On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
>>>>>>>> On 04/15/2019 07:18 PM, Mark Rutland wrote:
>>>>>>>>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
>>>>>
>>>>>>>>>> +	spin_unlock(&init_mm.page_table_lock);
>>>>>>>>>
>>>>>>>>> What precisely is the page_table_lock intended to protect?
>>>>>>>>
>>>>>>>> Concurrent modification to kernel page table (init_mm) while clearing entries.
>>>>>>>
>>>>>>> Concurrent modification by what code?
>>>>>>>
>>>>>>> If something else can *modify* the portion of the table that we're
>>>>>>> manipulating, then I don't see how we can safely walk the table up to
>>>>>>> this point without holding the lock, nor how we can safely add memory.
>>>>>>>
>>>>>>> Even if this is to protect something else which *reads* the tables,
>>>>>>> other code in arm64 which modifies the kernel page tables doesn't take
>>>>>>> the lock.
>>>>>>>
>>>>>>> Usually, if you can do a lockless walk you have to verify that things
>>>>>>> didn't change once you've taken the lock, but we don't follow that
>>>>>>> pattern here.
>>>>>>>
>>>>>>> As things stand it's not clear to me whether this is necessary or
>>>>>>> sufficient.
>>>>>>
>>>>>> Hence lets take more conservative approach and wrap the entire process of
>>>>>> remove_pagetable() under init_mm.page_table_lock which looks safe unless
>>>>>> in the worst case when free_pages() gets stuck for some reason in which
>>>>>> case we have bigger memory problem to deal with than a soft lock up.
>>>>>
>>>>> Sorry, but I'm not happy with _any_ solution until we understand where
>>>>> and why we need to take the init_mm ptl, and have made some effort to
>>>>> ensure that the kernel correctly does so elsewhere. It is not sufficient
>>>>> to consider this code in isolation.
>>>>
>>>> We will have to take the kernel page table lock to prevent assumption regarding
>>>> present or future possible kernel VA space layout. Wrapping around the entire
>>>> remove_pagetable() will be at coarse granularity but I dont see why it should
>>>> not sufficient atleast from this particular tear down operation regardless of
>>>> how this might affect other kernel pgtable walkers.
>>>>
>>>> IIUC your concern is regarding other parts of kernel code (arm64/generic) which
>>>> assume that kernel page table wont be changing and hence they normally walk the
>>>> table without holding pgtable lock. Hence those current pgtabe walker will be
>>>> affected after this change.
>>>>
>>>>>
>>>>> IIUC, before this patch we never clear non-leaf entries in the kernel
>>>>> page tables, so readers don't presently need to take the ptl in order to
>>>>> safely walk down to a leaf entry.
>>>>
>>>> Got it. Will look into this.
>>>>
>>>>>
>>>>> For example, the arm64 ptdump code never takes the ptl, and as of this
>>>>> patch it will blow up if it races with a hot-remove, regardless of
>>>>> whether the hot-remove code itself holds the ptl.
>>>>
>>>> Got it. Are there there more such examples where this can be problematic. I
>>>> will be happy to investigate all such places and change/add locking scheme
>>>> in there to make them work with memory hot remove.
>>>>
>>>>>
>>>>> Note that the same applies to the x86 ptdump code; we cannot assume that
>>>>> just because x86 does something that it happens to be correct.
>>>>
>>>> I understand. Will look into other non-x86 platforms as well on how they are
>>>> dealing with this.
>>>>
>>>>>
>>>>> I strongly suspect there are other cases that would fall afoul of this,
>>>>> in both arm64 and generic code.
>>>
>>> On X86
>>>
>>> - kernel_physical_mapping_init() takes the lock for pgtable page allocations as well
>>>   as all leaf level entries including large mappings.
>>>
>>> On Power
>>>
>>> - remove_pagetable() take an overall high level init_mm.page_table_lock as I had
>>>   suggested before. __map_kernel_page() calls [pud|pmd|pte]_alloc_[kernel] which
>>>   allocates page table pages are protected with init_mm.page_table_lock but then
>>>   the actual setting of the leaf entries are not (unlike x86)
>>>
>>> 	arch_add_memory()
>>> 		create_section_mapping()
>>> 			radix__create_section_mapping()
>>> 				create_physical_mapping()
>>> 					__map_kernel_page()
>>> On arm64.
>>>
>>> Both kernel page table dump and linear mapping (__create_pgd_mapping on init_mm)
>>> will require init_mm.page_table_lock to be really safe against this new memory
>>> hot remove code. I will do the necessary changes as part of this series next time
>>> around. IIUC there is no equivalent generic feature for ARM64_PTDUMP_CORE/DEBUGFS.
>>> 	 > 
>>>> Will start looking into all such possible cases both on arm64 and generic.
>>>> Mean while more such pointers would be really helpful.
>>>
>>> Generic usage for init_mm.pagetable_lock
>>>
>>> Unless I have missed something else these are the generic init_mm kernel page table
>>> modifiers at runtime (at least which uses init_mm.page_table_lock)
>>>
>>> 	1. ioremap_page_range()		/* Mapped I/O memory area */
>>> 	2. apply_to_page_range()	/* Change existing kernel linear map */
>>> 	3. vmap_page_range()		/* Vmalloc area */
>>>
>>> A. IOREMAP
>>>
>>> ioremap_page_range()
>>> 	ioremap_p4d_range()
>>> 		p4d_alloc()
>>> 		ioremap_try_huge_p4d() -> p4d_set_huge() -> set_p4d()
>>> 		ioremap_pud_range()
>>> 			pud_alloc()
>>> 			ioremap_try_huge_pud() -> pud_set_huge() -> set_pud()
>>> 			ioremap_pmd_range()
>>> 				pmd_alloc()
>>> 				ioremap_try_huge_pmd() -> pmd_set_huge() -> set_pmd()
>>> 				ioremap_pte_range()
>>> 					pte_alloc_kernel()
>>> 						set_pte_at() -> set_pte()
>>> B. APPLY_TO_PAGE_RANGE
>>>
>>> apply_to_page_range()
>>> 	apply_to_p4d_range()
>>> 		p4d_alloc()
>>> 		apply_to_pud_range()
>>> 			pud_alloc()
>>> 			apply_to_pmd_range()
>>> 				pmd_alloc()
>>> 				apply_to_pte_range()
>>> 					pte_alloc_kernel()
>>>
>>> C. VMAP_PAGE_RANGE
>>>
>>> vmap_page_range()
>>> vmap_page_range_noflush()
>>> 	vmap_p4d_range()
>>> 		p4d_alloc()
>>> 		vmap_pud_range()
>>> 			pud_alloc()
>>> 			vmap_pmd_range()
>>> 				pmd_alloc()
>>> 				vmap_pte_range()
>>> 					pte_alloc_kernel()
>>> 					set_pte_at()
>>>
>>> In all of the above.
>>>
>>> - Page table pages [p4d|pud|pmd|pte]_alloc_[kernel] settings are protected with init_mm.page_table_lock
>>> - Should not it require init_mm.page_table_lock for all leaf level (PUD|PMD|PTE) modification as well ?
>>> - Should not this require init_mm.page_table_lock for page table walk itself ?
>>>
>>> Not taking an overall lock for all these three operations will potentially race with an ongoing memory
>>> hot remove operation which takes an overall lock as proposed. Wondering if this has this been safe till
>>> now ?
>>>
>>
>> All memory add/remove operations are currently guarded by
>> mem_hotplug_lock as far as I know.
> 
> Right but it seems like it guards against concurrent memory hot add or remove operations with
> respect to memory block, sections, sysfs etc. But does it cover with respect to other init_mm
> modifiers or accessors ?
> 

Don't think so, it's purely for memory add/remove via
add_memory/remove_memory/devm_memremap_pages, not anything beyond that.
Whoever uses get_online_mems/put_online_mems is safe with respect to that
- mostly slab/slub.
Anshuman Khandual April 23, 2019, 8:37 a.m. UTC | #13
On 04/23/2019 01:21 PM, David Hildenbrand wrote:
> On 23.04.19 09:45, Anshuman Khandual wrote:
>>
>>
>> On 04/23/2019 01:07 PM, David Hildenbrand wrote:
>>> On 23.04.19 09:31, Anshuman Khandual wrote:
>>>>
>>>>
>>>> On 04/18/2019 10:58 AM, Anshuman Khandual wrote:
>>>>> On 04/17/2019 11:09 PM, Mark Rutland wrote:
>>>>>> On Wed, Apr 17, 2019 at 10:15:35PM +0530, Anshuman Khandual wrote:
>>>>>>> On 04/17/2019 07:51 PM, Mark Rutland wrote:
>>>>>>>> On Wed, Apr 17, 2019 at 03:28:18PM +0530, Anshuman Khandual wrote:
>>>>>>>>> On 04/15/2019 07:18 PM, Mark Rutland wrote:
>>>>>>>>>> On Sun, Apr 14, 2019 at 11:29:13AM +0530, Anshuman Khandual wrote:
>>>>>>
>>>>>>>>>>> +	spin_unlock(&init_mm.page_table_lock);
>>>>>>>>>>
>>>>>>>>>> What precisely is the page_table_lock intended to protect?
>>>>>>>>>
>>>>>>>>> Concurrent modification to kernel page table (init_mm) while clearing entries.
>>>>>>>>
>>>>>>>> Concurrent modification by what code?
>>>>>>>>
>>>>>>>> If something else can *modify* the portion of the table that we're
>>>>>>>> manipulating, then I don't see how we can safely walk the table up to
>>>>>>>> this point without holding the lock, nor how we can safely add memory.
>>>>>>>>
>>>>>>>> Even if this is to protect something else which *reads* the tables,
>>>>>>>> other code in arm64 which modifies the kernel page tables doesn't take
>>>>>>>> the lock.
>>>>>>>>
>>>>>>>> Usually, if you can do a lockless walk you have to verify that things
>>>>>>>> didn't change once you've taken the lock, but we don't follow that
>>>>>>>> pattern here.
>>>>>>>>
>>>>>>>> As things stand it's not clear to me whether this is necessary or
>>>>>>>> sufficient.
>>>>>>>
>>>>>>> Hence lets take more conservative approach and wrap the entire process of
>>>>>>> remove_pagetable() under init_mm.page_table_lock which looks safe unless
>>>>>>> in the worst case when free_pages() gets stuck for some reason in which
>>>>>>> case we have bigger memory problem to deal with than a soft lock up.
>>>>>>
>>>>>> Sorry, but I'm not happy with _any_ solution until we understand where
>>>>>> and why we need to take the init_mm ptl, and have made some effort to
>>>>>> ensure that the kernel correctly does so elsewhere. It is not sufficient
>>>>>> to consider this code in isolation.
>>>>>
>>>>> We will have to take the kernel page table lock to prevent assumption regarding
>>>>> present or future possible kernel VA space layout. Wrapping around the entire
>>>>> remove_pagetable() will be at coarse granularity but I dont see why it should
>>>>> not sufficient atleast from this particular tear down operation regardless of
>>>>> how this might affect other kernel pgtable walkers.
>>>>>
>>>>> IIUC your concern is regarding other parts of kernel code (arm64/generic) which
>>>>> assume that kernel page table wont be changing and hence they normally walk the
>>>>> table without holding pgtable lock. Hence those current pgtabe walker will be
>>>>> affected after this change.
>>>>>
>>>>>>
>>>>>> IIUC, before this patch we never clear non-leaf entries in the kernel
>>>>>> page tables, so readers don't presently need to take the ptl in order to
>>>>>> safely walk down to a leaf entry.
>>>>>
>>>>> Got it. Will look into this.
>>>>>
>>>>>>
>>>>>> For example, the arm64 ptdump code never takes the ptl, and as of this
>>>>>> patch it will blow up if it races with a hot-remove, regardless of
>>>>>> whether the hot-remove code itself holds the ptl.
>>>>>
>>>>> Got it. Are there there more such examples where this can be problematic. I
>>>>> will be happy to investigate all such places and change/add locking scheme
>>>>> in there to make them work with memory hot remove.
>>>>>
>>>>>>
>>>>>> Note that the same applies to the x86 ptdump code; we cannot assume that
>>>>>> just because x86 does something that it happens to be correct.
>>>>>
>>>>> I understand. Will look into other non-x86 platforms as well on how they are
>>>>> dealing with this.
>>>>>
>>>>>>
>>>>>> I strongly suspect there are other cases that would fall afoul of this,
>>>>>> in both arm64 and generic code.
>>>>
>>>> On X86
>>>>
>>>> - kernel_physical_mapping_init() takes the lock for pgtable page allocations as well
>>>>   as all leaf level entries including large mappings.
>>>>
>>>> On Power
>>>>
>>>> - remove_pagetable() take an overall high level init_mm.page_table_lock as I had
>>>>   suggested before. __map_kernel_page() calls [pud|pmd|pte]_alloc_[kernel] which
>>>>   allocates page table pages are protected with init_mm.page_table_lock but then
>>>>   the actual setting of the leaf entries are not (unlike x86)
>>>>
>>>> 	arch_add_memory()
>>>> 		create_section_mapping()
>>>> 			radix__create_section_mapping()
>>>> 				create_physical_mapping()
>>>> 					__map_kernel_page()
>>>> On arm64.
>>>>
>>>> Both kernel page table dump and linear mapping (__create_pgd_mapping on init_mm)
>>>> will require init_mm.page_table_lock to be really safe against this new memory
>>>> hot remove code. I will do the necessary changes as part of this series next time
>>>> around. IIUC there is no equivalent generic feature for ARM64_PTDUMP_CORE/DEBUGFS.
>>>> 	 > 
>>>>> Will start looking into all such possible cases both on arm64 and generic.
>>>>> Mean while more such pointers would be really helpful.
>>>>
>>>> Generic usage for init_mm.pagetable_lock
>>>>
>>>> Unless I have missed something else these are the generic init_mm kernel page table
>>>> modifiers at runtime (at least which uses init_mm.page_table_lock)
>>>>
>>>> 	1. ioremap_page_range()		/* Mapped I/O memory area */
>>>> 	2. apply_to_page_range()	/* Change existing kernel linear map */
>>>> 	3. vmap_page_range()		/* Vmalloc area */
>>>>
>>>> A. IOREMAP
>>>>
>>>> ioremap_page_range()
>>>> 	ioremap_p4d_range()
>>>> 		p4d_alloc()
>>>> 		ioremap_try_huge_p4d() -> p4d_set_huge() -> set_p4d()
>>>> 		ioremap_pud_range()
>>>> 			pud_alloc()
>>>> 			ioremap_try_huge_pud() -> pud_set_huge() -> set_pud()
>>>> 			ioremap_pmd_range()
>>>> 				pmd_alloc()
>>>> 				ioremap_try_huge_pmd() -> pmd_set_huge() -> set_pmd()
>>>> 				ioremap_pte_range()
>>>> 					pte_alloc_kernel()
>>>> 						set_pte_at() -> set_pte()
>>>> B. APPLY_TO_PAGE_RANGE
>>>>
>>>> apply_to_page_range()
>>>> 	apply_to_p4d_range()
>>>> 		p4d_alloc()
>>>> 		apply_to_pud_range()
>>>> 			pud_alloc()
>>>> 			apply_to_pmd_range()
>>>> 				pmd_alloc()
>>>> 				apply_to_pte_range()
>>>> 					pte_alloc_kernel()
>>>>
>>>> C. VMAP_PAGE_RANGE
>>>>
>>>> vmap_page_range()
>>>> vmap_page_range_noflush()
>>>> 	vmap_p4d_range()
>>>> 		p4d_alloc()
>>>> 		vmap_pud_range()
>>>> 			pud_alloc()
>>>> 			vmap_pmd_range()
>>>> 				pmd_alloc()
>>>> 				vmap_pte_range()
>>>> 					pte_alloc_kernel()
>>>> 					set_pte_at()
>>>>
>>>> In all of the above.
>>>>
>>>> - Page table pages [p4d|pud|pmd|pte]_alloc_[kernel] settings are protected with init_mm.page_table_lock
>>>> - Should not it require init_mm.page_table_lock for all leaf level (PUD|PMD|PTE) modification as well ?
>>>> - Should not this require init_mm.page_table_lock for page table walk itself ?
>>>>
>>>> Not taking an overall lock for all these three operations will potentially race with an ongoing memory
>>>> hot remove operation which takes an overall lock as proposed. Wondering if this has this been safe till
>>>> now ?
>>>>
>>>
>>> All memory add/remove operations are currently guarded by
>>> mem_hotplug_lock as far as I know.
>>
>> Right but it seems like it guards against concurrent memory hot add or remove operations with
>> respect to memory block, sections, sysfs etc. But does it cover with respect to other init_mm
>> modifiers or accessors ?
>>
> 
> Don't think so, it's purely for memory add/remove via
> add_memory/remove_memory/devm_memremap_pages, not anything beyond that.
> Whoever uses get_online_mems/put_online_mems is save in respect to that
> - mostly slab/slub.

There are two distinct locking requirements here:

* Protect against clearing of intermediate pgtable pages to prevent unhandled aborts while walking
  the kernel page table.

	- pgtable page clearing happens only through memory hot-remove, not through any other modifiers
	- At minimum we would require get_online_mems() for all concurrent walkers of the kernel page
	  table (see the sketch below)

		- Kernel page table dump on arm64 (with ARM64_PTDUMP_CORE/DEBUGFS)
		- Existing generic MM modifiers. Without get_online_mems() I am not sure how these can
		  really be safe against memory hot-remove.

			1. ioremap_page_range()
			2. apply_to_page_range()
			3. vmap_page_range()

* Protect against each other's accesses (apart from get_online_mems)

	- Take an overall init_mm.page_table_lock for [ioremap|apply_to|vmap]_page_range()
	- Take an overall lock while creating/destroying the linear mapping on each platform
	- The hotplug lock through get_online_mems only protects against clearing/adding of intermediate
	  pgtable pages, not against general modifications by non-hotplug accessors.

I will appreciate it if folks could review the assumptions made above as I might have missed something.
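
As an illustration of the get_online_mems() point above, a lockless walker such
as the arm64 ptdump code could bracket its walk roughly like this (sketch only;
walk_init_mm_tables() is a placeholder, not the actual ptdump entry point):

#include <linux/memory_hotplug.h>

/* Placeholder for the existing walker (e.g. the arm64 ptdump code). */
static void walk_init_mm_tables(void) { }

/*
 * Bracket a lockless walk of init_mm with the hotplug lock so that a
 * concurrent hot-remove cannot free intermediate page table pages
 * underneath the walker.
 */
static void dump_kernel_page_tables_safe(void)
{
	get_online_mems();
	walk_init_mm_tables();
	put_online_mems();
}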
Mark Rutland April 23, 2019, 4:05 p.m. UTC | #14
On Tue, Apr 23, 2019 at 01:01:58PM +0530, Anshuman Khandual wrote:
> Generic usage for init_mm.pagetable_lock
> 
> Unless I have missed something else these are the generic init_mm kernel page table
> modifiers at runtime (at least which uses init_mm.page_table_lock)
> 
> 	1. ioremap_page_range()		/* Mapped I/O memory area */
> 	2. apply_to_page_range()	/* Change existing kernel linear map */
> 	3. vmap_page_range()		/* Vmalloc area */

Internally, those all use the __p??_alloc() functions to handle racy
additions by transiently taking the PTL when installing a new table, but
otherwise walk kernel tables _without_ the PTL held. Note that none of
these ever free an intermediate level of table.
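
For reference, that racy-addition handling has roughly the following shape
(paraphrased and simplified from the generic __pte_alloc_kernel() of this era;
details may differ):

int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);

	if (!new)
		return -ENOMEM;

	smp_wmb();	/* make the new table visible before it is linked */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* has another walker populated it? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

The PTL is only held for the pmd_none() re-check and the publish; the rest of
the walk and the leaf-level stores happen without it.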

I believe that the idea is that operations on separate VMAs should never
conflict at the leaf level, and operations on the same VMA should be
serialised somehow w.r.t. that VMA.

AFAICT, these functions are _never_ called on the linear/direct map or
vmemmap VA ranges, and whether or not these can conflict with hot-remove
is entirely dependent on whether those ranges can share a level of table
with the vmalloc region.

Do you know how likely that is to occur? e.g. what proportion of the
vmalloc region may share a level of table with the linear or vmemmap
regions in a typical arm64 or x86 configuration? Can we deliberately
provoke this failure case?
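
One way to answer that empirically could be a small debug helper along these
lines (hypothetical, illustration only), fed with the linear map, vmemmap and
vmalloc bounds of a given configuration:

/*
 * Report whether two kernel VA ranges can share a top-level (PGD) entry,
 * i.e. whether freeing intermediate tables for one range could pull
 * entries out from under walkers of the other. The same overlap check
 * can be repeated at PUD/PMD granularity.
 */
static bool ranges_share_pgd_entry(unsigned long start1, unsigned long end1,
				   unsigned long start2, unsigned long end2)
{
	return pgd_index(start1) <= pgd_index(end2 - 1) &&
	       pgd_index(start2) <= pgd_index(end1 - 1);
}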

[...]

> In all of the above.
> 
> - Page table pages [p4d|pud|pmd|pte]_alloc_[kernel] settings are
>   protected with init_mm.page_table_lock

Racy addition is protected in this manner.

> - Should not it require init_mm.page_table_lock for all leaf level
>   (PUD|PMD|PTE) modification as well ?

As above, I believe that the PTL is assumed to not be necessary there
since other mutual exclusion should be in effect to prevent racy
modification of leaf entries.

> - Should not this require init_mm.page_table_lock for page table walk
>   itself ?
> 
> Not taking an overall lock for all these three operations will
> potentially race with an ongoing memory hot remove operation which
> takes an overall lock as proposed. Wondering if this has this been
> safe till now ?

I suspect that the answer is that hot-remove is not thoroughly
stress-tested today, and conflicts are possible but rare.

As above, can we figure out how likely conflicts are, and try to come up
with a stress test?

Is it possible to avoid these specific conflicts (ignoring ptdump) by
aligning VA regions such that they cannot share intermediate levels of
table?

Thanks,
Mark.
Anshuman Khandual April 24, 2019, 5:59 a.m. UTC | #15
On 04/23/2019 09:35 PM, Mark Rutland wrote:
> On Tue, Apr 23, 2019 at 01:01:58PM +0530, Anshuman Khandual wrote:
>> Generic usage for init_mm.pagetable_lock
>>
>> Unless I have missed something else these are the generic init_mm kernel page table
>> modifiers at runtime (at least which uses init_mm.page_table_lock)
>>
>> 	1. ioremap_page_range()		/* Mapped I/O memory area */
>> 	2. apply_to_page_range()	/* Change existing kernel linear map */
>> 	3. vmap_page_range()		/* Vmalloc area */
> 
> Internally, those all use the __p??_alloc() functions to handle racy
> additions by transiently taking the PTL when installing a new table, but
> otherwise walk kernel tables _without_ the PTL held. Note that none of
> these ever free an intermediate level of table.

Right, they don't free intermediate level page tables, but I was curious only
about the leaf level modifications.

> 
> I believe that the idea is that operations on separate VMAs should never

I guess by 'VMA' you meant a kernel virtual range, not an actual VMA
(vm_area_struct), which is applicable only to user space, not the kernel.

> conflict at the leaf level, and operations on the same VMA should be
> serialised somehow w.r.t. that VMA.

AFAICT there is nothing other than the hotplug lock, i.e. mem_hotplug_lock, which
prevents concurrent init_mm modifications, and the current situation is only safe
because somehow these VA areas don't overlap with respect to intermediate page
table level spans.

> 
> AFAICT, these functions are _never_ called on the linear/direct map or
> vmemmap VA ranges, and whether or not these can conflict with hot-remove
> is entirely dependent on whether those ranges can share a level of table
> with the vmalloc region.

Right, but all these VA ranges (linear, vmemmap, vmalloc) are wired into init_mm,
hence I am wondering if it is prudent to assume a layout scheme, which varies a lot
across architectures, while deciding on possible race protections. Wondering why
these users should not call [get|put]_online_mems() to prevent racing with hotplug.
Will try this out.

Only if generic MM requires the platform to lay out these VA ranges (linear, vmemmap,
vmalloc) in a manner that guarantees non-overlap at intermediate page table level
spans would we not need a lock.
 
> 
> Do you know how likely that is to occur? e.g. what proportion of the

TBH I dont know.

> vmalloc region may share a level of table with the linear or vmemmap
> regions in a typical arm64 or x86 configuration? Can we deliberately
> provoke this failure case?

I have not enumerated those yet, but there are multiple configs on arm64, and
probably on x86, which decide the kernel VA space layout and hence cause these
potential races. But regardless, it is not right to make assumptions about the
vmalloc range span and not take a lock.

Not sure how to provoke this failure case from user space with simple hotplug,
because the physical backing of vmalloc allocations normally cannot be controlled
without a hacked-up kernel change.

> 
> [...]
> 
>> In all of the above.
>>
>> - Page table pages [p4d|pud|pmd|pte]_alloc_[kernel] settings are
>>   protected with init_mm.page_table_lock
> 
> Racy addition is protect in this manner.

Right.

> 
>> - Should not it require init_mm.page_table_lock for all leaf level
>>   (PUD|PMD|PTE) modification as well ?
> 
> As above, I believe that the PTL is assumed to not be necessary there
> since other mutual exclusion should be in effect to prevent racy
> modification of leaf entries.

Wondering what those mutual exclusions are, other than the memory hotplug lock.
Again, if it relies on kernel VA space layout assumptions, it is not a good idea.

> 
>> - Should not this require init_mm.page_table_lock for page table walk
>>   itself ?
>>
>> Not taking an overall lock for all these three operations will
>> potentially race with an ongoing memory hot remove operation which
>> takes an overall lock as proposed. Wondering if this has this been
>> safe till now ?
> 
> I suspect that the answer is that hot-remove is not thoroughly
> stress-tested today, and conflicts are possible but rare.

Will make these generic modifiers call [get|put]_online_mems() in a separate
patch, at least to protect themselves from a memory hot remove operation.

> 
> As above, can we figure out how likely conflicts are, and try to come up
> with a stress test?

Will try something out by hot plugging a memory range without actually onlining it,
while another vmalloc stress test is running on the system.

> 
> Is it possible to avoid these specific conflicts (ignoring ptdump) by
> aligning VA regions such that they cannot share intermediate levels of
> table?

Kernel VA space layout is platform specific, and core MM does not mandate much about
it. Hence generic modifiers should not make any assumptions regarding it, but should
instead protect themselves with locks. Doing anything other than that just pushes the
problem into the future.
Mark Rutland April 24, 2019, 8:19 a.m. UTC | #16
On Wed, Apr 24, 2019 at 11:29:28AM +0530, Anshuman Khandual wrote:
> On 04/23/2019 09:35 PM, Mark Rutland wrote:
> > On Tue, Apr 23, 2019 at 01:01:58PM +0530, Anshuman Khandual wrote:
> >> Generic usage for init_mm.pagetable_lock
> >>
> >> Unless I have missed something else these are the generic init_mm kernel page table
> >> modifiers at runtime (at least which uses init_mm.page_table_lock)
> >>
> >> 	1. ioremap_page_range()		/* Mapped I/O memory area */
> >> 	2. apply_to_page_range()	/* Change existing kernel linear map */
> >> 	3. vmap_page_range()		/* Vmalloc area */
> > 
> > Internally, those all use the __p??_alloc() functions to handle racy
> > additions by transiently taking the PTL when installing a new table, but
> > otherwise walk kernel tables _without_ the PTL held. Note that none of
> > these ever free an intermediate level of table.
> 
> Right they dont free intermediate level page table but I was curious about the
> only the leaf level modifications.

Sure thing; I just wanted to point that out explicitly for everyone else's
benefit. :)

> > I believe that the idea is that operations on separate VMAs should never
> 
> I guess you meant kernel virtual range with 'VMA' but not the actual VMA which is
> vm_area_struct applicable only for the user space not the kernel.

Sure. In the kernel we'd reserve a kernel VA range with a vm_struct via
get_vm_area() or similar.

The key point is that we reserve page-granular VA ranges which cannot overlap
at the leaf level (but may share intermediate levels of table). Whoever owns
that area is in charge of necessary mutual exclusion for the leaf entries.

> > conflict at the leaf level, and operations on the same VMA should be
> > serialised somehow w.r.t. that VMA.
> 
> AFAICT see there is nothing other than hotplug lock i.e mem_hotplug_lock which
> prevents concurrent init_mm modifications and the current situation is only safe
> because some how these VA areas dont overlap with respect to intermediate page
> table level spans.

Here I was ignoring hotplug to describe the general principle (which I've
expanded upon above).

> > AFAICT, these functions are _never_ called on the linear/direct map or
> > vmemmap VA ranges, and whether or not these can conflict with hot-remove
> > is entirely dependent on whether those ranges can share a level of table
> > with the vmalloc region.
> 
> Right but all these VA ranges (linear, vmemmap, vmalloc) are wired in on init_mm
> hence wondering if it is prudent to assume layout scheme which varies a lot based
> on different architectures while deciding possible race protections.

One thing to consider is that we could turn this implicit assumption into a
requirement, if this isn't too invasive.

> Wondering why these user should not call [get|put]_online_mems() to prevent
> race with hotplug.

I suspect that this is because they were written before memory hotplug was
added, and they were never reconsidered in the presence of hot-remove.

> Will try this out.
> 
> Unless generic MM expects these VA ranges (linear, vmemmap, vmalloc) layout to be
> in certain manner from the platform guaranteeing non-overlap at intermediate level
> page table spans. Only then we would not a lock.

I think that we might be able to make this a requirement for hot-remove. I
suspect this happens to be the case in practice by chance, even if it isn't
strictly guaranteed.

> > Do you know how likely that is to occur? e.g. what proportion of the
> 
> TBH I dont know.
> 
> > vmalloc region may share a level of table with the linear or vmemmap
> > regions in a typical arm64 or x86 configuration? Can we deliberately
> > provoke this failure case?
> 
> I have not enumerated those yet but there are multiple configs on arm64 and
> probably on x86 which decides kernel VA space layout causing these potential
> races. But regardless its not right to assume on vmalloc range span and not
> take a lock.
> 
> Not sure how to provoke this failure case from user space with simple hotplug
> because vmalloc physical allocation normally cannot be controlled without a
> hacked kernel change.

I believe that we can write a module which:

 - Looks at the various memory ranges, and determines whether they may share an
   intermediate level of table.
 - reserves a potentially-conflicting region with __get_vm_area_node()
 - in a loop, maps/unmaps something in that range

... while in parallel, adding/removing a potentially-conflicting region of
memory.

So long as we have the same sort of serialization we'd have for a regular
vmalloc(), ioremap() or vmap(), that would be sufficient to demonstrate that
this is a real problem (a rough sketch of such a module follows below).
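
A rough sketch of such a module (hypothetical and untested; it simply hammers
vmap()/vunmap() in a loop rather than explicitly reserving a known-conflicting
region via __get_vm_area_node(), which would need the range checks described
earlier):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *stress_task;

static int stress_fn(void *unused)
{
	while (!kthread_should_stop()) {
		struct page *page = alloc_page(GFP_KERNEL);
		void *va;

		if (!page) {
			cond_resched();
			continue;
		}
		/* vmap()/vunmap() populate and tear down init_mm entries in
		 * the vmalloc range, much like ioremap_page_range() would. */
		va = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
		if (va)
			vunmap(va);
		__free_page(page);
		cond_resched();
	}
	return 0;
}

static int __init vmap_stress_init(void)
{
	stress_task = kthread_run(stress_fn, NULL, "vmap-stress");
	return PTR_ERR_OR_ZERO(stress_task);
}

static void __exit vmap_stress_exit(void)
{
	kthread_stop(stress_task);
}

module_init(vmap_stress_init);
module_exit(vmap_stress_exit);
MODULE_LICENSE("GPL");

Running this while repeatedly hot-adding and hot-removing a memory block from
user space should give the race a chance to trigger, if the regions involved do
share a level of table.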

[...]

> > Is it possible to avoid these specific conflicts (ignoring ptdump) by
> > aligning VA regions such that they cannot share intermediate levels of
> > table?
> 
> Kernel VA space layout is platform specific where core MM does not mandate much. 
> Hence generic modifiers should not make any assumptions regarding it but protect
> themselves with locks. Doing any thing other than that is just pushing the problem
> to future.

My point is that we can make this a _requirement_ of the core code, which we
could document and enforce.

Thanks,
Mark.
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c383625..a870eb2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -267,6 +267,9 @@  config HAVE_GENERIC_GUP
 config ARCH_ENABLE_MEMORY_HOTPLUG
 	def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+	def_bool y
+
 config SMP
 	def_bool y
 
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index de70c1e..1ee22ff 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -555,6 +555,7 @@  static inline phys_addr_t pud_page_paddr(pud_t pud)
 
 #else
 
+#define pmd_index(addr) 0
 #define pud_page_paddr(pud)	({ BUILD_BUG(); 0; })
 
 /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
@@ -612,6 +613,7 @@  static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 
 #else
 
+#define pud_index(addr)	0
 #define pgd_page_paddr(pgd)	({ BUILD_BUG(); 0;})
 
 /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ef82312..a4750fe 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -733,6 +733,194 @@  int kern_addr_valid(unsigned long addr)
 
 	return pfn_valid(pte_pfn(pte));
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pagetable(struct page *page, int order)
+{
+	unsigned long magic;
+	unsigned int nr_pages = 1 << order;
+
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+
+		magic = (unsigned long)page->freelist;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			while (nr_pages--)
+				free_reserved_page(page++);
+		}
+	} else {
+		free_pages((unsigned long)page_address(page), order);
+	}
+}
+
+#if (CONFIG_PGTABLE_LEVELS > 2)
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	free_pagetable(pmd_page(*pmd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+#else
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+}
+#endif
+
+#if (CONFIG_PGTABLE_LEVELS > 3)
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	free_pagetable(pud_page(*pud), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	free_pagetable(pgd_page(*pgd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+#else
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+}
+#endif
+
+static void
+remove_pte_table(pte_t *pte_start, unsigned long addr,
+			unsigned long end, bool direct)
+{
+	pte_t *pte;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr += PAGE_SIZE, pte++) {
+		if (!pte_present(*pte))
+			continue;
+
+		if (!direct)
+			free_pagetable(pte_page(*pte), 0);
+		spin_lock(&init_mm.page_table_lock);
+		pte_clear(&init_mm, addr, pte);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+}
+
+static void
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+			unsigned long end, bool direct)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmd;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_sect(*pmd)) {
+			if (!direct)
+				free_pagetable(pmd_page(*pmd),
+						get_order(PMD_SIZE));
+			spin_lock(&init_mm.page_table_lock);
+			pmd_clear(pmd);
+			spin_unlock(&init_mm.page_table_lock);
+			continue;
+		}
+		pte_base = pte_offset_kernel(pmd, 0UL);
+		remove_pte_table(pte_base, addr, next, direct);
+		free_pte_table(pte_base, pmd);
+	}
+}
+
+static void
+remove_pud_table(pud_t *pud_start, unsigned long addr,
+			unsigned long end, bool direct)
+{
+	unsigned long next;
+	pmd_t *pmd_base;
+	pud_t *pud;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_sect(*pud)) {
+			if (!direct)
+				free_pagetable(pud_page(*pud),
+						get_order(PUD_SIZE));
+			spin_lock(&init_mm.page_table_lock);
+			pud_clear(pud);
+			spin_unlock(&init_mm.page_table_lock);
+			continue;
+		}
+		pmd_base = pmd_offset(pud, 0UL);
+		remove_pmd_table(pmd_base, addr, next, direct);
+		free_pmd_table(pmd_base, pud);
+	}
+}
+
+static void
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long addr, next;
+	pud_t *pud_base;
+	pgd_t *pgd;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset_k(addr);
+		if (!pgd_present(*pgd))
+			continue;
+
+		pud_base = pud_offset(pgd, 0UL);
+		remove_pud_table(pud_base, addr, next, direct);
+		free_pud_table(pud_base, pgd);
+	}
+	flush_tlb_kernel_range(start, end);
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -780,6 +968,9 @@  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	remove_pagetable(start, end, false);
+#endif
 }
 #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -1065,10 +1256,16 @@  int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+	WARN_ON(pgdir != init_mm.pgd);
+	remove_pagetable(start, start + size, true);
+}
+
 int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 		    bool want_memblock)
 {
-	int flags = 0;
+	int ret, flags = 0;
 
 	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1076,7 +1273,27 @@  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
 			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
 
-	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   altmap, want_memblock);
+	if (ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+					__phys_to_virt(start), size);
+	return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone = page_zone(pfn_to_page(start_pfn));
+	int ret;
+
+	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+	if (!ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+					__phys_to_virt(start), size);
+	return ret;
+}
+#endif
 #endif