diff mbox series

[v4,3/3] x86/vmemmap: Handle unpopulated sub-pmd ranges

Message ID 20210301083230.30924-4-osalvador@suse.de (mailing list archive)
State New
Headers show
Series Cleanup and fixups for vmemmap handling | expand

Commit Message

Oscar Salvador March 1, 2021, 8:32 a.m. UTC
When the size of a struct page is not multiple of 2MB, sections do
not span a PMD anymore and so when populating them some parts of the
PMD will remain unused.
Because of this, PMDs will be left behind when depopulating sections
since remove_pmd_table() thinks that those unused parts are still in
use.

Fix this by marking the unused parts with PAGE_UNUSED, so memchr_inv()
will do the right thing and will let us free the PMD when the last user
of it is gone.

This patch is based on a similar patch by David Hildenbrand:

https://lore.kernel.org/linux-mm/20200722094558.9828-9-david@redhat.com/
https://lore.kernel.org/linux-mm/20200722094558.9828-10-david@redhat.com/

Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 arch/x86/mm/init_64.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 98 insertions(+), 8 deletions(-)

Comments

Dave Hansen March 4, 2021, 5:02 p.m. UTC | #1
On 3/1/21 12:32 AM, Oscar Salvador wrote:
> When the size of a struct page is not multiple of 2MB, sections do
> not span a PMD anymore and so when populating them some parts of the
> PMD will remain unused.

Multiples of 2MB are 2MB, 4MB, 6MB, etc...

I think you meant here that 2MB must be a multiple of the 'struct page'
size.  I don't think there are any non-power-of-2 factors of 2MB, so I
think it's probably simpler and more accurate to say:

	When sizeof(struct page) is not a power of 2...

I also don't think I realized that 2MB of 'struct page'
(2M/sizeof(struct page)=32k) fit perfectly into a 128MB section
(32k/64=128M).

> Because of this, PMDs will be left behind when depopulating sections
> since remove_pmd_table() thinks that those unused parts are still in
> use.
> 
> Fix this by marking the unused parts with PAGE_UNUSED, so memchr_inv()
> will do the right thing and will let us free the PMD when the last user
> of it is gone.
> 
> This patch is based on a similar patch by David Hildenbrand:
> 
> https://lore.kernel.org/linux-mm/20200722094558.9828-9-david@redhat.com/
> https://lore.kernel.org/linux-mm/20200722094558.9828-10-david@redhat.com/
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Reviewed-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/x86/mm/init_64.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 98 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 9ecb3c488ac8..7e8de63f02b3 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -871,7 +871,93 @@ int arch_add_memory(int nid, u64 start, u64 size,
>  	return add_pages(nid, start_pfn, nr_pages, params);
>  }
>  
> -#define PAGE_INUSE 0xFD
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +#define PAGE_UNUSED 0xFD
> +/*
> + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
> + * from unused_pmd_start to next PMD_SIZE boundary.
> + */
> +static unsigned long unused_pmd_start __meminitdata;

This whole 'unused_pmd_start' thing was unmentioned in the changelog.

I also kept reading this and thinking it was a 'pmd_t *', not a 'struct
page *'.  The naming is rather unfortunate that way.

So, is this here so that the memset()s can be avoided?  It's just an
optimization to say: "This is unused, but instead of marking it with
PAGE_UNUSED (which would be slow) I keep a pointer to it"?

> +static void __meminit vmemmap_flush_unused_pmd(void)
> +{
> +	if (!unused_pmd_start)
> +		return;
> +	/*
> +	 * Clears (unused_pmd_start, PMD_END]
> +	 */
> +	memset((void *)unused_pmd_start, PAGE_UNUSED,
> +	       ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
> +	unused_pmd_start = 0;
> +}

Oh, and this means: "stop using the unused_pmd_start optimization.  Just
memset the thing".

> +/* Returns true if the PMD is completely unused and thus it can be freed */
> +static bool __meminit vmemmap_unuse_sub_pmd(unsigned long addr, unsigned long end)
> +{
> +	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
> +
> +	vmemmap_flush_unused_pmd();

That probably needs a comment like:

	Flush the unused range cache to ensure that the memchr_inv()
	will work for the whole range.

> +	memset((void *)addr, PAGE_UNUSED, end - addr);
> +
> +	return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
> +}

Also, logically, it would make a lot of sense if you can move the actual
PMD freeing logic in here.  That way, the caller is just saying, "unuse
this PMD region", and then this takes care of the rest.  As it stands,
it's a bit weird that the caller takes care of the freeing.

> +static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
> +{
> +	/*
> +	 * As we expect to add in the same granularity as we remove, it's
> +	 * sufficient to mark only some piece used to block the memmap page from
> +	 * getting removed when removing some other adjacent memmap (just in
> +	 * case the first memmap never gets initialized e.g., because the memory
> +	 * block never gets onlined).
> +	 */
> +	memset((void *)start, 0, sizeof(struct page));
> +}
> +
> +static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
> +{
> +	/*
> +	 * We only optimize if the new used range directly follows the
> +	 * previously unused range (esp., when populating consecutive sections).
> +	 */
> +	if (unused_pmd_start == start) {
> +		if (likely(IS_ALIGNED(end, PMD_SIZE)))
> +			unused_pmd_start = 0;
> +		else
> +			unused_pmd_start = end;
> +		return;
> +	}
> +
> +	vmemmap_flush_unused_pmd();
> +	__vmemmap_use_sub_pmd(start);
> +}
> +
> +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
> +{
> +	vmemmap_flush_unused_pmd();
> +
> +	/*
> +	 * Could be our memmap page is filled with PAGE_UNUSED already from a
> +	 * previous remove.
> +	 */
> +	__vmemmap_use_sub_pmd(start);
> +
> +	/*
> +	 * Mark the unused parts of the new memmap range
> +	 */
> +	if (!IS_ALIGNED(start, PMD_SIZE))
> +		memset((void *)start, PAGE_UNUSED,
> +		       start - ALIGN_DOWN(start, PMD_SIZE));
> +	/*
> +	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
> +	 * consecutive sections. Remember for the last added PMD the last
> +	 * unused range in the populated PMD.
> +	 */
> +	if (!IS_ALIGNED(end, PMD_SIZE))
> +		unused_pmd_start = end;
> +}
> +#endif
>  
>  static void __meminit free_pagetable(struct page *page, int order)
>  {
> @@ -1006,7 +1092,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  	unsigned long next, pages = 0;
>  	pte_t *pte_base;
>  	pmd_t *pmd;
> -	void *page_addr;
>  
>  	pmd = pmd_start + pmd_index(addr);
>  	for (; addr < end; addr = next, pmd++) {
> @@ -1027,12 +1112,11 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  				spin_unlock(&init_mm.page_table_lock);
>  				pages++;
>  			} else {
> -				/* If here, we are freeing vmemmap pages. */
> -				memset((void *)addr, PAGE_INUSE, next - addr);
> -
> -				page_addr = page_address(pmd_page(*pmd));
> -				if (!memchr_inv(page_addr, PAGE_INUSE,
> -						PMD_SIZE)) {
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +				/*
> +				 * Free the PMD if the whole range is unused.
> +				 */
> +				if (vmemmap_unuse_sub_pmd(addr, next)) {
>  					free_hugepage_table(pmd_page(*pmd),
>  							    altmap);
>  
> @@ -1040,6 +1124,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  					pmd_clear(pmd);
>  					spin_unlock(&init_mm.page_table_lock);
>  				}
> +#endif
>  			}

This doesn't look like the world's longest if() statement, but it might
be nice to use the IS_ENABLED() syntax instead of an #ifdef.  I suspect
the compiler could even make quick work of the static functions that
never get called as a result.

>  			continue;
> @@ -1492,11 +1577,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
>  
>  				addr_end = addr + PMD_SIZE;
>  				p_end = p + PMD_SIZE;
> +
> +				if (!IS_ALIGNED(addr, PMD_SIZE) ||
> +				    !IS_ALIGNED(next, PMD_SIZE))
> +					vmemmap_use_new_sub_pmd(addr, next);
>  				continue;
>  			} else if (altmap)
>  				return -ENOMEM; /* no fallback */
>  		} else if (pmd_large(*pmd)) {
>  			vmemmap_verify((pte_t *)pmd, node, addr, next);
> +			vmemmap_use_sub_pmd(addr, next);
>  			continue;
>  		}
>  		if (vmemmap_populate_basepages(addr, next, node, NULL))
> 

This overall looks like a good thing to do.  The implementation is even
pretty nice and simple. But, it took me an awfully long time to figure
out what was going on.

I wonder if you could take one more pass at these and especially see if
you can better explain the use of 'unused_pmd_start'.
Dave Hansen March 4, 2021, 5:08 p.m. UTC | #2
On 3/4/21 9:02 AM, Dave Hansen wrote:
>> +#define PAGE_UNUSED 0xFD
>> +/*
>> + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
>> + * from unused_pmd_start to next PMD_SIZE boundary.
>> + */
>> +static unsigned long unused_pmd_start __meminitdata;
> This whole 'unused_pmd_start' thing was unmentioned in the changelog.

One tiny suggestion: *Sometimes* for these optimizations, it's easiest
to write the code up without it in one patch, then add the optimization
in the next patch.

It makes it 100% clear what is part of the "core" algorithm and what is
pure optimization.

I don't know if it will work here, but it might be worth taking a look.
David Hildenbrand March 5, 2021, 6:26 p.m. UTC | #3
On 04.03.21 18:08, Dave Hansen wrote:
> On 3/4/21 9:02 AM, Dave Hansen wrote:
>>> +#define PAGE_UNUSED 0xFD
>>> +/*
>>> + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
>>> + * from unused_pmd_start to next PMD_SIZE boundary.
>>> + */
>>> +static unsigned long unused_pmd_start __meminitdata;
>> This whole 'unused_pmd_start' thing was unmentioned in the changelog.
> 
> One tiny suggestion: *Sometimes* for these optimizations, it's easiest
> to write the code up without it in one patch, then add the optimization
> in the next patch.
> 
> It makes it 100% clear what is part of the "core" algorithm and what is
> pure optimization.
> 
> I don't know if it will work here, but it might be worth taking a look.
> 

For this reason the s390x part by me (see patch description) was two 
separate patches. Maybe it also makes sense to split it up here.
Oscar Salvador March 8, 2021, 6:43 p.m. UTC | #4
On Thu, Mar 04, 2021 at 09:02:36AM -0800, Dave Hansen wrote:
> On 3/1/21 12:32 AM, Oscar Salvador wrote:
> > When the size of a struct page is not multiple of 2MB, sections do
> > not span a PMD anymore and so when populating them some parts of the
> > PMD will remain unused.
> 
> Multiples of 2MB are 2MB, 4MB, 6MB, etc...
> 
> I think you meant here that 2MB must be a multiple of the 'struct page'
> size.  I don't think there are any non-power-of-2 factors of 2MB, so I
> think it's probably simpler and more accurate to say:
> 
> 	When sizeof(struct page) is not a power of 2...

Yes, I think this was a poor choice of words.
It was the other way around, that PMD must be multiple of struct page size.
"When sizeof(struct page) is not a power of 2..." sounds definitely better.

> > +static unsigned long unused_pmd_start __meminitdata;
> 
> This whole 'unused_pmd_start' thing was unmentioned in the changelog.

Sorry, I will expand some more.

> I also kept reading this and thinking it was a 'pmd_t *', not a 'struct
> page *'.  The naming is rather unfortunate that way.

Well, it is the pmd range.

> 
> So, is this here so that the memset()s can be avoided?  It's just an
> optimization to say: "This is unused, but instead of marking it with
> PAGE_UNUSED (which would be slow) I keep a pointer to it"?

Yes, it is an optimization that let us avoid a memset() in case the sections
we are adding are consetuvie.

> > +static void __meminit vmemmap_flush_unused_pmd(void)
> > +{
> > +	if (!unused_pmd_start)
> > +		return;
> > +	/*
> > +	 * Clears (unused_pmd_start, PMD_END]
> > +	 */
> > +	memset((void *)unused_pmd_start, PAGE_UNUSED,
> > +	       ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
> > +	unused_pmd_start = 0;
> > +}
> 
> Oh, and this means: "stop using the unused_pmd_start optimization.  Just
> memset the thing".

Yes, as stated above, this optimization only takes place as long as the sections
are being added consecutive.
As soon as that is not the case, we memset the last range recorded and reset
unused_pmd_start.

> > +/* Returns true if the PMD is completely unused and thus it can be freed */
> > +static bool __meminit vmemmap_unuse_sub_pmd(unsigned long addr, unsigned long end)
> > +{
> > +	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
> > +
> > +	vmemmap_flush_unused_pmd();
> 
> That probably needs a comment like:
> 
> 	Flush the unused range cache to ensure that the memchr_inv()
> 	will work for the whole range.

Sure, I will add a comment.

> 
> > +	memset((void *)addr, PAGE_UNUSED, end - addr);
> > +
> > +	return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
> > +}
> 
> Also, logically, it would make a lot of sense if you can move the actual
> PMD freeing logic in here.  That way, the caller is just saying, "unuse
> this PMD region", and then this takes care of the rest.  As it stands,
> it's a bit weird that the caller takes care of the freeing.

You mean to move the 

 free_hugepage_table(pmd_page(*pmd), altmap);
 spin_lock(&init_mm.page_table_lock);
 pmd_clear(pmd);
 spin_unlock(&init_mm.page_table_lock);

block in there?

Well, from where I see it, it is more like:

 if (is_the_range_unused())
  : if so, free everything

But I agree with you what it might make some sense to move it there.
Since I do not feel strong about this, I will move it.

> > +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> > +				/*
> > +				 * Free the PMD if the whole range is unused.
> > +				 */
> > +				if (vmemmap_unuse_sub_pmd(addr, next)) {
> >  					free_hugepage_table(pmd_page(*pmd),
> >  							    altmap);
> >  
> > @@ -1040,6 +1124,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
> >  					pmd_clear(pmd);
> >  					spin_unlock(&init_mm.page_table_lock);
> >  				}
> > +#endif
> >  			}
> 
> This doesn't look like the world's longest if() statement, but it might
> be nice to use the IS_ENABLED() syntax instead of an #ifdef.  I suspect
> the compiler could even make quick work of the static functions that
> never get called as a result.

Sure, I will replace the #ifdef with IS_ENABLED.

> 
> >  			continue;
> > @@ -1492,11 +1577,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
> >  
> >  				addr_end = addr + PMD_SIZE;
> >  				p_end = p + PMD_SIZE;
> > +
> > +				if (!IS_ALIGNED(addr, PMD_SIZE) ||
> > +				    !IS_ALIGNED(next, PMD_SIZE))
> > +					vmemmap_use_new_sub_pmd(addr, next);
> >  				continue;
> >  			} else if (altmap)
> >  				return -ENOMEM; /* no fallback */
> >  		} else if (pmd_large(*pmd)) {
> >  			vmemmap_verify((pte_t *)pmd, node, addr, next);
> > +			vmemmap_use_sub_pmd(addr, next);
> >  			continue;
> >  		}
> >  		if (vmemmap_populate_basepages(addr, next, node, NULL))
> > 
> 
> This overall looks like a good thing to do.  The implementation is even
> pretty nice and simple. But, it took me an awfully long time to figure
> out what was going on.
> 
> I wonder if you could take one more pass at these and especially see if
> you can better explain the use of 'unused_pmd_start'.

In order to ease the review, I will split the core-changes and the optimization.
So I will place the whole unused_pmd_start opimization in a different patch.

Thanks for the feedback Dave!
Oscar Salvador March 9, 2021, 8:25 a.m. UTC | #5
On Mon, Mar 08, 2021 at 07:43:30PM +0100, Oscar Salvador wrote:
> On Thu, Mar 04, 2021 at 09:02:36AM -0800, Dave Hansen wrote:
> > Also, logically, it would make a lot of sense if you can move the actual
> > PMD freeing logic in here.  That way, the caller is just saying, "unuse
> > this PMD region", and then this takes care of the rest.  As it stands,
> > it's a bit weird that the caller takes care of the freeing.
> 
> You mean to move the 
> 
>  free_hugepage_table(pmd_page(*pmd), altmap);
>  spin_lock(&init_mm.page_table_lock);
>  pmd_clear(pmd);
>  spin_unlock(&init_mm.page_table_lock);
> 
> block in there?
> 
> Well, from where I see it, it is more like:
> 
>  if (is_the_range_unused())
>   : if so, free everything
> 
> But I agree with you what it might make some sense to move it there.
> Since I do not feel strong about this, I will move it.

hi Dave,

So, after splitting this patch and re-shape it to address all the
feedback, I am still not sure about this one.
Honestly, I think the freeing logic would be better off kept in
remove_pmd_table.

The reason for me is that vmemmap_unuse_sub_pmd only 1) marks the range
to be removed as unused and 2) checks whether after that, the whole
PMD range is unused.

I think the confusion comes from the name.
"vmemmap_pmd_is_unused" might be a better fit?

What do you think? Do you feel strong about moving the log in there
regardless of the name?
Dave Hansen March 9, 2021, 2:50 p.m. UTC | #6
On 3/9/21 12:25 AM, Oscar Salvador wrote:
> 
> I think the confusion comes from the name.
> "vmemmap_pmd_is_unused" might be a better fit?
> 
> What do you think? Do you feel strong about moving the log in there
> regardless of the name?

No, not really.  The name is probably worth adjusting, but I think the
code can probably stay put.
Zi Yan March 9, 2021, 7:39 p.m. UTC | #7
On 1 Mar 2021, at 3:32, Oscar Salvador wrote:

> When the size of a struct page is not multiple of 2MB, sections do
> not span a PMD anymore and so when populating them some parts of the
> PMD will remain unused.
> Because of this, PMDs will be left behind when depopulating sections
> since remove_pmd_table() thinks that those unused parts are still in
> use.
>
> Fix this by marking the unused parts with PAGE_UNUSED, so memchr_inv()
> will do the right thing and will let us free the PMD when the last user
> of it is gone.
>
> This patch is based on a similar patch by David Hildenbrand:
>
> https://lore.kernel.org/linux-mm/20200722094558.9828-9-david@redhat.com/
> https://lore.kernel.org/linux-mm/20200722094558.9828-10-david@redhat.com/
>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Reviewed-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/x86/mm/init_64.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 98 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 9ecb3c488ac8..7e8de63f02b3 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -871,7 +871,93 @@ int arch_add_memory(int nid, u64 start, u64 size,
>  	return add_pages(nid, start_pfn, nr_pages, params);
>  }
>
> -#define PAGE_INUSE 0xFD
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +#define PAGE_UNUSED 0xFD
> +
> +/*
> + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
> + * from unused_pmd_start to next PMD_SIZE boundary.
> + */
> +static unsigned long unused_pmd_start __meminitdata;
> +
> +static void __meminit vmemmap_flush_unused_pmd(void)
> +{
> +	if (!unused_pmd_start)
> +		return;
> +	/*
> +	 * Clears (unused_pmd_start, PMD_END]
> +	 */
> +	memset((void *)unused_pmd_start, PAGE_UNUSED,
> +	       ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
> +	unused_pmd_start = 0;
> +}
> +
> +/* Returns true if the PMD is completely unused and thus it can be freed */
> +static bool __meminit vmemmap_unuse_sub_pmd(unsigned long addr, unsigned long end)
> +{
> +	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
> +
> +	vmemmap_flush_unused_pmd();
> +	memset((void *)addr, PAGE_UNUSED, end - addr);
> +
> +	return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
> +}
> +
> +static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
> +{
> +	/*
> +	 * As we expect to add in the same granularity as we remove, it's
> +	 * sufficient to mark only some piece used to block the memmap page from
> +	 * getting removed when removing some other adjacent memmap (just in
> +	 * case the first memmap never gets initialized e.g., because the memory
> +	 * block never gets onlined).
> +	 */
> +	memset((void *)start, 0, sizeof(struct page));
> +}
> +
> +static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
> +{
> +	/*
> +	 * We only optimize if the new used range directly follows the
> +	 * previously unused range (esp., when populating consecutive sections).
> +	 */
> +	if (unused_pmd_start == start) {
> +		if (likely(IS_ALIGNED(end, PMD_SIZE)))
> +			unused_pmd_start = 0;
> +		else
> +			unused_pmd_start = end;
> +		return;
> +	}
> +
> +	vmemmap_flush_unused_pmd();
> +	__vmemmap_use_sub_pmd(start);
> +}
> +
> +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
> +{
> +	vmemmap_flush_unused_pmd();
> +
> +	/*
> +	 * Could be our memmap page is filled with PAGE_UNUSED already from a
> +	 * previous remove.
> +	 */
> +	__vmemmap_use_sub_pmd(start);
> +
> +	/*
> +	 * Mark the unused parts of the new memmap range
> +	 */
> +	if (!IS_ALIGNED(start, PMD_SIZE))
> +		memset((void *)start, PAGE_UNUSED,
> +		       start - ALIGN_DOWN(start, PMD_SIZE));
> +	/*
> +	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
> +	 * consecutive sections. Remember for the last added PMD the last
> +	 * unused range in the populated PMD.
> +	 */
> +	if (!IS_ALIGNED(end, PMD_SIZE))
> +		unused_pmd_start = end;
> +}
> +#endif
>
>  static void __meminit free_pagetable(struct page *page, int order)
>  {
> @@ -1006,7 +1092,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  	unsigned long next, pages = 0;
>  	pte_t *pte_base;
>  	pmd_t *pmd;
> -	void *page_addr;
>
>  	pmd = pmd_start + pmd_index(addr);
>  	for (; addr < end; addr = next, pmd++) {
> @@ -1027,12 +1112,11 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  				spin_unlock(&init_mm.page_table_lock);
>  				pages++;
>  			} else {
> -				/* If here, we are freeing vmemmap pages. */
> -				memset((void *)addr, PAGE_INUSE, next - addr);
> -
> -				page_addr = page_address(pmd_page(*pmd));
> -				if (!memchr_inv(page_addr, PAGE_INUSE,
> -						PMD_SIZE)) {
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +				/*
> +				 * Free the PMD if the whole range is unused.
> +				 */
> +				if (vmemmap_unuse_sub_pmd(addr, next)) {
>  					free_hugepage_table(pmd_page(*pmd),
>  							    altmap);
>
> @@ -1040,6 +1124,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  					pmd_clear(pmd);
>  					spin_unlock(&init_mm.page_table_lock);
>  				}
> +#endif
>  			}
>
>  			continue;
> @@ -1492,11 +1577,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
>
>  				addr_end = addr + PMD_SIZE;
>  				p_end = p + PMD_SIZE;
> +
> +				if (!IS_ALIGNED(addr, PMD_SIZE) ||
> +				    !IS_ALIGNED(next, PMD_SIZE))
> +					vmemmap_use_new_sub_pmd(addr, next);
>  				continue;
>  			} else if (altmap)
>  				return -ENOMEM; /* no fallback */
>  		} else if (pmd_large(*pmd)) {
>  			vmemmap_verify((pte_t *)pmd, node, addr, next);
> +			vmemmap_use_sub_pmd(addr, next);

Hi Oscar,

vmemmap_use_new_sub_pmd and vmemmap_use_sub_pmd are not defined when CONFIG_SPARSEMEM_VMEMMAP
and !CONFIG_MEMORY_HOTPLUG. It leads to compilation errors.


—
Best Regards,
Yan Zi
Oscar Salvador March 9, 2021, 9:26 p.m. UTC | #8
On Tue, Mar 09, 2021 at 02:39:02PM -0500, Zi Yan wrote:
> Hi Oscar,
> 
> vmemmap_use_new_sub_pmd and vmemmap_use_sub_pmd are not defined when CONFIG_SPARSEMEM_VMEMMAP
> and !CONFIG_MEMORY_HOTPLUG. It leads to compilation errors.

Meh, yeah, that's right.

I fixed that up, I will send a v6.

Thanks for noticing!
diff mbox series

Patch

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9ecb3c488ac8..7e8de63f02b3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -871,7 +871,93 @@  int arch_add_memory(int nid, u64 start, u64 size,
 	return add_pages(nid, start_pfn, nr_pages, params);
 }
 
-#define PAGE_INUSE 0xFD
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define PAGE_UNUSED 0xFD
+
+/*
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
+ * from unused_pmd_start to next PMD_SIZE boundary.
+ */
+static unsigned long unused_pmd_start __meminitdata;
+
+static void __meminit vmemmap_flush_unused_pmd(void)
+{
+	if (!unused_pmd_start)
+		return;
+	/*
+	 * Clears (unused_pmd_start, PMD_END]
+	 */
+	memset((void *)unused_pmd_start, PAGE_UNUSED,
+	       ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
+	unused_pmd_start = 0;
+}
+
+/* Returns true if the PMD is completely unused and thus it can be freed */
+static bool __meminit vmemmap_unuse_sub_pmd(unsigned long addr, unsigned long end)
+{
+	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+	vmemmap_flush_unused_pmd();
+	memset((void *)addr, PAGE_UNUSED, end - addr);
+
+	return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
+}
+
+static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
+{
+	/*
+	 * As we expect to add in the same granularity as we remove, it's
+	 * sufficient to mark only some piece used to block the memmap page from
+	 * getting removed when removing some other adjacent memmap (just in
+	 * case the first memmap never gets initialized e.g., because the memory
+	 * block never gets onlined).
+	 */
+	memset((void *)start, 0, sizeof(struct page));
+}
+
+static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+	/*
+	 * We only optimize if the new used range directly follows the
+	 * previously unused range (esp., when populating consecutive sections).
+	 */
+	if (unused_pmd_start == start) {
+		if (likely(IS_ALIGNED(end, PMD_SIZE)))
+			unused_pmd_start = 0;
+		else
+			unused_pmd_start = end;
+		return;
+	}
+
+	vmemmap_flush_unused_pmd();
+	__vmemmap_use_sub_pmd(start);
+}
+
+static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+	vmemmap_flush_unused_pmd();
+
+	/*
+	 * Could be our memmap page is filled with PAGE_UNUSED already from a
+	 * previous remove.
+	 */
+	__vmemmap_use_sub_pmd(start);
+
+	/*
+	 * Mark the unused parts of the new memmap range
+	 */
+	if (!IS_ALIGNED(start, PMD_SIZE))
+		memset((void *)start, PAGE_UNUSED,
+		       start - ALIGN_DOWN(start, PMD_SIZE));
+	/*
+	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+	 * consecutive sections. Remember for the last added PMD the last
+	 * unused range in the populated PMD.
+	 */
+	if (!IS_ALIGNED(end, PMD_SIZE))
+		unused_pmd_start = end;
+}
+#endif
 
 static void __meminit free_pagetable(struct page *page, int order)
 {
@@ -1006,7 +1092,6 @@  remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 	unsigned long next, pages = 0;
 	pte_t *pte_base;
 	pmd_t *pmd;
-	void *page_addr;
 
 	pmd = pmd_start + pmd_index(addr);
 	for (; addr < end; addr = next, pmd++) {
@@ -1027,12 +1112,11 @@  remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
 			} else {
-				/* If here, we are freeing vmemmap pages. */
-				memset((void *)addr, PAGE_INUSE, next - addr);
-
-				page_addr = page_address(pmd_page(*pmd));
-				if (!memchr_inv(page_addr, PAGE_INUSE,
-						PMD_SIZE)) {
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+				/*
+				 * Free the PMD if the whole range is unused.
+				 */
+				if (vmemmap_unuse_sub_pmd(addr, next)) {
 					free_hugepage_table(pmd_page(*pmd),
 							    altmap);
 
@@ -1040,6 +1124,7 @@  remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 					pmd_clear(pmd);
 					spin_unlock(&init_mm.page_table_lock);
 				}
+#endif
 			}
 
 			continue;
@@ -1492,11 +1577,16 @@  static int __meminit vmemmap_populate_hugepages(unsigned long start,
 
 				addr_end = addr + PMD_SIZE;
 				p_end = p + PMD_SIZE;
+
+				if (!IS_ALIGNED(addr, PMD_SIZE) ||
+				    !IS_ALIGNED(next, PMD_SIZE))
+					vmemmap_use_new_sub_pmd(addr, next);
 				continue;
 			} else if (altmap)
 				return -ENOMEM; /* no fallback */
 		} else if (pmd_large(*pmd)) {
 			vmemmap_verify((pte_t *)pmd, node, addr, next);
+			vmemmap_use_sub_pmd(addr, next);
 			continue;
 		}
 		if (vmemmap_populate_basepages(addr, next, node, NULL))