diff mbox series

[RFC,2/3] mm/memory_hotplug: Create __shrink_pages and move it to offline_pages

Message ID 20180807133757.18352-3-osalvador@techadventures.net (mailing list archive)
State New, archived
Headers show
Series Do not touch pages in remove_memory path | expand

Commit Message

Oscar Salvador Aug. 7, 2018, 1:37 p.m. UTC
From: Oscar Salvador <osalvador@suse.de>

Currently, we decrement zone/node spanned_pages when we
__remove__ the memory.
This is not really great.

Incrementing of spanned pages is done in the online_pages() path, so
decrementing of spanned pages should be moved to offline_pages().

This, besides making the core more consistent, helps in fixing
the bug explained in the cover letter, since from now on,
we will not touch any pages in the remove_memory path.

This does all the work to accomplish this.
It removes __remove_zone() and creates __shrink_pages(), which does pretty much the same.

It also changes find_smallest/biggest_section_pfn to check for:

!online_section(ms)

instead of

!valid_section(ms)

as we offline the section in:

__offline_pages
 offline_isolated_pages
  offline_isolated_pages_cb
   __offline_isolated_pages
    offline_mem_sections

right before shrinking the pages.

Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
 arch/ia64/mm/init.c            |  4 +--
 arch/powerpc/mm/mem.c          | 11 +------
 arch/sh/mm/init.c              |  4 +--
 arch/x86/mm/init_32.c          |  4 +--
 arch/x86/mm/init_64.c          |  8 +-----
 include/linux/memory_hotplug.h |  6 ++--
 kernel/memremap.c              |  5 ++++
 mm/hmm.c                       |  2 +-
 mm/memory_hotplug.c            | 65 +++++++++++++++++++++---------------------
 mm/sparse.c                    |  4 +--
 10 files changed, 50 insertions(+), 63 deletions(-)

Comments

Jerome Glisse Aug. 7, 2018, 1:52 p.m. UTC | #1
On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> From: Oscar Salvador <osalvador@suse.de>

[...]

> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 9bd629944c91..e33555651e46 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c

[...]

>  /**
>   * __remove_pages() - remove sections of pages from a zone
> - * @zone: zone from which pages need to be removed
> + * @nid: node which pages belong to
>   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
>   * @nr_pages: number of pages to remove (must be multiple of section size)
>   * @altmap: alternative device page map or %NULL if default memmap is used
> @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
>   * sure that pages are marked reserved and zones are adjust properly by
>   * calling offline_pages().
>   */
> -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> +int __remove_pages(int nid, unsigned long phys_start_pfn,
>  		 unsigned long nr_pages, struct vmem_altmap *altmap)
>  {
>  	unsigned long i;
> @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
>  	int sections_to_remove, ret = 0;
>  
>  	/* In the ZONE_DEVICE case device driver owns the memory region */
> -	if (is_dev_zone(zone)) {
> -		if (altmap)
> -			map_offset = vmem_altmap_offset(altmap);
> -	} else {
> +	if (altmap)
> +		map_offset = vmem_altmap_offset(altmap);
> +	else {

This will break ZONE_DEVICE at least for HMM. While i think that
altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
is not true ie ZONE_DEVICE does not necessarily imply altmap. So
with the above changes you change the expected behavior. You do
need the zone to know if it is a ZONE_DEVICE. You could also lookup
one of the struct page but my understanding is that this is what
you want to avoid in the first place.

Cheers,
Jérôme
David Hildenbrand Aug. 7, 2018, 2:54 p.m. UTC | #2
On 07.08.2018 15:52, Jerome Glisse wrote:
> On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
>> From: Oscar Salvador <osalvador@suse.de>
> 
> [...]
> 
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 9bd629944c91..e33555651e46 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
> 
> [...]
> 
>>  /**
>>   * __remove_pages() - remove sections of pages from a zone
>> - * @zone: zone from which pages need to be removed
>> + * @nid: node which pages belong to
>>   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
>>   * @nr_pages: number of pages to remove (must be multiple of section size)
>>   * @altmap: alternative device page map or %NULL if default memmap is used
>> @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
>>   * sure that pages are marked reserved and zones are adjust properly by
>>   * calling offline_pages().
>>   */
>> -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
>> +int __remove_pages(int nid, unsigned long phys_start_pfn,
>>  		 unsigned long nr_pages, struct vmem_altmap *altmap)
>>  {
>>  	unsigned long i;
>> @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
>>  	int sections_to_remove, ret = 0;
>>  
>>  	/* In the ZONE_DEVICE case device driver owns the memory region */
>> -	if (is_dev_zone(zone)) {
>> -		if (altmap)
>> -			map_offset = vmem_altmap_offset(altmap);
>> -	} else {
>> +	if (altmap)
>> +		map_offset = vmem_altmap_offset(altmap);
>> +	else {
> 
> This will break ZONE_DEVICE at least for HMM. While i think that
> altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> with the above changes you change the expected behavior. You do
> need the zone to know if it is a ZONE_DEVICE. You could also lookup
> one of the struct page but my understanding is that this is what
> you want to avoid in the first place.

I wonder if we could instead forward from the callers whether we are
dealing with ZONE_DEVICE memory (is_device ...), at least that seems
feasible in hmm code. Not having looked at details yet.

> 
> Cheers,
> Jérôme
>
Michal Hocko Aug. 7, 2018, 2:59 p.m. UTC | #3
On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > From: Oscar Salvador <osalvador@suse.de>
> 
> [...]
> 
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index 9bd629944c91..e33555651e46 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> 
> [...]
> 
> >  /**
> >   * __remove_pages() - remove sections of pages from a zone
> > - * @zone: zone from which pages need to be removed
> > + * @nid: node which pages belong to
> >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> >   * @nr_pages: number of pages to remove (must be multiple of section size)
> >   * @altmap: alternative device page map or %NULL if default memmap is used
> > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> >   * sure that pages are marked reserved and zones are adjust properly by
> >   * calling offline_pages().
> >   */
> > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> >  {
> >  	unsigned long i;
> > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> >  	int sections_to_remove, ret = 0;
> >  
> >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > -	if (is_dev_zone(zone)) {
> > -		if (altmap)
> > -			map_offset = vmem_altmap_offset(altmap);
> > -	} else {
> > +	if (altmap)
> > +		map_offset = vmem_altmap_offset(altmap);
> > +	else {
> 
> This will break ZONE_DEVICE at least for HMM. While i think that
> altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> with the above changes you change the expected behavior.

Could you be more specific what is the expected behavior here?
Is this about calling release_mem_region_adjustable? Why is it not
suitable for zone device ranges?
Jerome Glisse Aug. 7, 2018, 3:18 p.m. UTC | #4
On Tue, Aug 07, 2018 at 04:59:00PM +0200, Michal Hocko wrote:
> On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > > From: Oscar Salvador <osalvador@suse.de>
> > 
> > [...]
> > 
> > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > > index 9bd629944c91..e33555651e46 100644
> > > --- a/mm/memory_hotplug.c
> > > +++ b/mm/memory_hotplug.c
> > 
> > [...]
> > 
> > >  /**
> > >   * __remove_pages() - remove sections of pages from a zone
> > > - * @zone: zone from which pages need to be removed
> > > + * @nid: node which pages belong to
> > >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> > >   * @nr_pages: number of pages to remove (must be multiple of section size)
> > >   * @altmap: alternative device page map or %NULL if default memmap is used
> > > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> > >   * sure that pages are marked reserved and zones are adjust properly by
> > >   * calling offline_pages().
> > >   */
> > > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> > >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> > >  {
> > >  	unsigned long i;
> > > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > >  	int sections_to_remove, ret = 0;
> > >  
> > >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > > -	if (is_dev_zone(zone)) {
> > > -		if (altmap)
> > > -			map_offset = vmem_altmap_offset(altmap);
> > > -	} else {
> > > +	if (altmap)
> > > +		map_offset = vmem_altmap_offset(altmap);
> > > +	else {
> > 
> > This will break ZONE_DEVICE at least for HMM. While i think that
> > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > with the above changes you change the expected behavior.
> 
> Could you be more specific what is the expected behavior here?
> Is this about calling release_mem_region_adjustable? Why is it not
> suitable for zone device ranges?

Correct, you should not call release_mem_region_adjustable(): the device
region is not part of the regular iomem resource, as it might not necessarily
be enumerated through ways known to the kernel (ie only the device driver
can discover the region and the core kernel does not know about it).

One of the issue to adding this region to iomem resource is that they
really need to be ignored by core kernel because you can not assume that
CPU can actually access them. Moreover, if CPU can access them it is
likely that CPU can not do atomic operation on them (ie what happens on
a CPU atomic instruction is undefined). So they are _special_ and only
make sense to be used in conjunction with a device driver.


Also in the case they do exist in the iomem resource it is as a PCIe BAR,
i.e. as IORESOURCE_IO (iirc), and thus release_mem_region_adjustable() would
return -EINVAL. Though nothing bad happens because of that, only a
warning message that might confuse the user.

Cheers,
Jérôme
Jerome Glisse Aug. 7, 2018, 3:19 p.m. UTC | #5
On Tue, Aug 07, 2018 at 04:54:57PM +0200, David Hildenbrand wrote:
> On 07.08.2018 15:52, Jerome Glisse wrote:
> > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> >> From: Oscar Salvador <osalvador@suse.de>
> > 
> > [...]
> > 
> >> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> >> index 9bd629944c91..e33555651e46 100644
> >> --- a/mm/memory_hotplug.c
> >> +++ b/mm/memory_hotplug.c
> > 
> > [...]
> > 
> >>  /**
> >>   * __remove_pages() - remove sections of pages from a zone
> >> - * @zone: zone from which pages need to be removed
> >> + * @nid: node which pages belong to
> >>   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> >>   * @nr_pages: number of pages to remove (must be multiple of section size)
> >>   * @altmap: alternative device page map or %NULL if default memmap is used
> >> @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> >>   * sure that pages are marked reserved and zones are adjust properly by
> >>   * calling offline_pages().
> >>   */
> >> -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> >> +int __remove_pages(int nid, unsigned long phys_start_pfn,
> >>  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> >>  {
> >>  	unsigned long i;
> >> @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> >>  	int sections_to_remove, ret = 0;
> >>  
> >>  	/* In the ZONE_DEVICE case device driver owns the memory region */
> >> -	if (is_dev_zone(zone)) {
> >> -		if (altmap)
> >> -			map_offset = vmem_altmap_offset(altmap);
> >> -	} else {
> >> +	if (altmap)
> >> +		map_offset = vmem_altmap_offset(altmap);
> >> +	else {
> > 
> > This will break ZONE_DEVICE at least for HMM. While i think that
> > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > with the above changes you change the expected behavior. You do
> > need the zone to know if it is a ZONE_DEVICE. You could also lookup
> > one of the struct page but my understanding is that this is what
> > you want to avoid in the first place.
> 
> I wonder if we could instead forward from the callers whether we are
> dealing with ZONE_DEVICE memory (is_device ...), at least that seems
> feasible in hmm code. Not having looked at details yet.
> 

Yes i believe this is doable, this add one more argument, to me it
looked like passing down the zone was good idea, i think with the
struct zone you can even remove the altmap argument.

Is there a reason why you do not want to pass down the struct zone ?

Cheers,
Jérôme
David Hildenbrand Aug. 7, 2018, 3:28 p.m. UTC | #6
On 07.08.2018 17:19, Jerome Glisse wrote:
> On Tue, Aug 07, 2018 at 04:54:57PM +0200, David Hildenbrand wrote:
>> On 07.08.2018 15:52, Jerome Glisse wrote:
>>> On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
>>>> From: Oscar Salvador <osalvador@suse.de>
>>>
>>> [...]
>>>
>>>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>>>> index 9bd629944c91..e33555651e46 100644
>>>> --- a/mm/memory_hotplug.c
>>>> +++ b/mm/memory_hotplug.c
>>>
>>> [...]
>>>
>>>>  /**
>>>>   * __remove_pages() - remove sections of pages from a zone
>>>> - * @zone: zone from which pages need to be removed
>>>> + * @nid: node which pages belong to
>>>>   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
>>>>   * @nr_pages: number of pages to remove (must be multiple of section size)
>>>>   * @altmap: alternative device page map or %NULL if default memmap is used
>>>> @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
>>>>   * sure that pages are marked reserved and zones are adjust properly by
>>>>   * calling offline_pages().
>>>>   */
>>>> -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
>>>> +int __remove_pages(int nid, unsigned long phys_start_pfn,
>>>>  		 unsigned long nr_pages, struct vmem_altmap *altmap)
>>>>  {
>>>>  	unsigned long i;
>>>> @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
>>>>  	int sections_to_remove, ret = 0;
>>>>  
>>>>  	/* In the ZONE_DEVICE case device driver owns the memory region */
>>>> -	if (is_dev_zone(zone)) {
>>>> -		if (altmap)
>>>> -			map_offset = vmem_altmap_offset(altmap);
>>>> -	} else {
>>>> +	if (altmap)
>>>> +		map_offset = vmem_altmap_offset(altmap);
>>>> +	else {
>>>
>>> This will break ZONE_DEVICE at least for HMM. While i think that
>>> altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
>>> is not true ie ZONE_DEVICE does not necessarily imply altmap. So
>>> with the above changes you change the expected behavior. You do
>>> need the zone to know if it is a ZONE_DEVICE. You could also lookup
>>> one of the struct page but my understanding is that this is what
>>> you want to avoid in the first place.
>>
>> I wonder if we could instead forward from the callers whether we are
>> dealing with ZONE_DEVICE memory (is_device ...), at least that seems
>> feasible in hmm code. Not having looked at details yet.
>>
> 
> Yes i believe this is doable, this add one more argument, to me it
> looked like passing down the zone was good idea, i think with the
> struct zone you can even remove the altmap argument.
> 
> Is there a reason why you do not want to pass down the struct zone ?

If struct pages are not initialized (for ordinary memory, because memory
was never onlined), the zone might be random, and e.g. ZONE_DEVICE
although it really isn't (or worse, a not existent zone).

The zone of ordinary memory will be decided once memory is onlined. So
the zone really should not belong into memory removal code (online in
offlining code, we would have to lie about it in case !ZONE_DEVICE here).

> 
> Cheers,
> Jérôme
>
Oscar Salvador Aug. 7, 2018, 8:48 p.m. UTC | #7
On Tue, Aug 07, 2018 at 04:54:57PM +0200, David Hildenbrand wrote:
> I wonder if we could instead forward from the callers whether we are
> dealing with ZONE_DEVICE memory (is_device ...), at least that seems
> feasible in hmm code. Not having looked at details yet.

Yes, this looks like the most straightforward way right now.
We would have to pass it from arch_remove_memory to __remove_pages though.

It is not the most elegant way, but looking at the code of devm_memremap_pages_release
and hmm_devmem_release I cannot really think of anything better.

In hmm_devmem_release it should be easy because AFAIK (unless I am missing something), hmm always works
with ZONE_DEVICE.
At least hmm_devmem_pages_create() moves the range to ZONE_DEVICE.

After looking at devm_memremap_pages(), I think it does the same:

...
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
					align_start >> PAGE_SHIFT,
					align_size >> PAGE_SHIFT, altmap);
...

So I guess it is safe to assume that arch_remove_memory/__remove_pages are called
from those functions while zone being ZONE_DEVICE.

Is that right, Jerome?

And since we know for sure that memhotplug-code cannot call it with ZONE_DEVICE,
I think this can be done easily.

Thanks
Jerome Glisse Aug. 7, 2018, 10:13 p.m. UTC | #8
On Tue, Aug 07, 2018 at 10:48:34PM +0200, Oscar Salvador wrote:
> On Tue, Aug 07, 2018 at 04:54:57PM +0200, David Hildenbrand wrote:
> > I wonder if we could instead forward from the callers whether we are
> > dealing with ZONE_DEVICE memory (is_device ...), at least that seems
> > feasible in hmm code. Not having looked at details yet.
> 
> Yes, this looks like the most straightforward way right now.
> We would have to pass it from arch_remove_memory to __remove_pages though.
> 
> It is not the most elegant way, but looking at the code of devm_memremap_pages_release
> and hmm_devmem_release I cannot really think of anything better.
> 
> In hmm_devmem_release it should be easy because AFAIK (unless I am missing something), hmm always works
> with ZONE_DEVICE.
> At least hmm_devmem_pages_create() moves the range to ZONE_DEVICE.
> 
> After looking at devm_memremap_pages(), I think it does the same:
> 
> ...
> move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> 					align_start >> PAGE_SHIFT,
> 					align_size >> PAGE_SHIFT, altmap);
> ...
> 
> So I guess it is safe to assume that arch_remove_memory/__remove_pages are called
> from those functions while zone being ZONE_DEVICE.
> 
> Is that right, Jerome?

Correct, both HMM and devm always deal with ZONE_DEVICE page. So
any call to arch_remove_memory/__remove_pages in those context
can assume ZONE_DEVICE.

> 
> And since we know for sure that memhotplug-code cannot call it with ZONE_DEVICE,
> I think this can be done easily.

This might change down road but for now this is correct. They are
talks to enumerate device memory through standard platform mechanisms
and thus the kernel might see new types of resources down the road and
maybe we will want to hotplug them directly from regular hotplug path
as ZONE_DEVICE (lot of hypothetical at this point ;)).

Cheers,
Jérôme
Michal Hocko Aug. 8, 2018, 6:47 a.m. UTC | #9
On Tue 07-08-18 11:18:10, Jerome Glisse wrote:
> On Tue, Aug 07, 2018 at 04:59:00PM +0200, Michal Hocko wrote:
> > On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> > > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > > > From: Oscar Salvador <osalvador@suse.de>
> > > 
> > > [...]
> > > 
> > > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > > > index 9bd629944c91..e33555651e46 100644
> > > > --- a/mm/memory_hotplug.c
> > > > +++ b/mm/memory_hotplug.c
> > > 
> > > [...]
> > > 
> > > >  /**
> > > >   * __remove_pages() - remove sections of pages from a zone
> > > > - * @zone: zone from which pages need to be removed
> > > > + * @nid: node which pages belong to
> > > >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> > > >   * @nr_pages: number of pages to remove (must be multiple of section size)
> > > >   * @altmap: alternative device page map or %NULL if default memmap is used
> > > > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> > > >   * sure that pages are marked reserved and zones are adjust properly by
> > > >   * calling offline_pages().
> > > >   */
> > > > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> > > >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> > > >  {
> > > >  	unsigned long i;
> > > > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > >  	int sections_to_remove, ret = 0;
> > > >  
> > > >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > > > -	if (is_dev_zone(zone)) {
> > > > -		if (altmap)
> > > > -			map_offset = vmem_altmap_offset(altmap);
> > > > -	} else {
> > > > +	if (altmap)
> > > > +		map_offset = vmem_altmap_offset(altmap);
> > > > +	else {
> > > 
> > > This will break ZONE_DEVICE at least for HMM. While i think that
> > > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > > with the above changes you change the expected behavior.
> > 
> > Could you be more specific what is the expected behavior here?
> > Is this about calling release_mem_region_adjustable? Why is it not
> > suitable for zone device ranges?
> 
> Correct, you should not call release_mem_region_adjustable() the device
> region is not part of regular iomem resource as it might not necessarily
> be enumerated through known ways to the kernel (ie only the device driver
> can discover the region and core kernel do not know about it).

If there is no region registered with the range then the call should be
a mere nop, no? So why do we have to special-case it?

[...]

> Also in the case they do exist in the iomem resource it is as a PCIe BAR,
> i.e. as IORESOURCE_IO (iirc), and thus release_mem_region_adjustable() would
> return -EINVAL. Though nothing bad happens because of that, only a
> warning message that might confuse the user.

I am not sure I have understood this correctly. Are you referring to the
current state when we would call release_mem_region_adjustable
unconditionally or the case that the resource would be added also for
zone device ranges?

If the former then I do not see any reason why we couldn't simply
refactor the code to expect a failure and drop the warning in that path.
Oscar Salvador Aug. 8, 2018, 7:38 a.m. UTC | #10
On Tue, Aug 07, 2018 at 06:13:45PM -0400, Jerome Glisse wrote:
> > And since we know for sure that memhotplug-code cannot call it with ZONE_DEVICE,
> > I think this can be done easily.
> 
> This might change down road but for now this is correct. They are
> talks to enumerate device memory through standard platform mechanisms
> and thus the kernel might see new types of resources down the road and
> maybe we will want to hotplug them directly from regular hotplug path
> as ZONE_DEVICE (lot of hypothetical at this point ;)).

Well, I think that if that happens this whole thing will become
much easier, since we will not have several paths for doing the same thing.

Another thing that I realized is that while we want to move all page operations
from the remove_memory() path to offline_pages(), this can get tricky.

Unless I am missing something, the devices from HMM and devm are not being registered
against "memory_subsys" struct, and so, they never get to call memory_subsys_offline()
and so offline_pages().

Which means that we would have to call __remove_zone() from those paths.
But this alone will not work.

find_smallest/biggest_section_pfn are two functions that are being called from

shrink_pgdat_span
and
shrink_zone_span

to adjust zone_first_pfn/node_first_pfn and the spanned pages.

Currently, find_smallest/biggest_section_pfn checks for the section to be valid,
and this is fine since we are removing those sections from the remove_memory path.

But if we want to move __remove_zone() to offline_pages(), we have to use
online_section() instead of valid_section().

This is all fine from offline_pages because the sections get offlined in:

__offline_pages
 offline_isolated_pages
  offline_isolated_pages_cb
   __offline_isolated_pages
    offline_mem_sections


But this does not happen in HMM/devm path.

I am pretty sure this is a dumb question, but why do the HMM/devm paths
not call online_pages/offline_pages?

Thanks
David Hildenbrand Aug. 8, 2018, 7:45 a.m. UTC | #11
On 08.08.2018 09:38, Oscar Salvador wrote:
> On Tue, Aug 07, 2018 at 06:13:45PM -0400, Jerome Glisse wrote:
>>> And since we know for sure that memhotplug-code cannot call it with ZONE_DEVICE,
>>> I think this can be done easily.
>>
>> This might change down road but for now this is correct. They are
>> talks to enumerate device memory through standard platform mechanisms
>> and thus the kernel might see new types of resources down the road and
>> maybe we will want to hotplug them directly from regular hotplug path
>> as ZONE_DEVICE (lot of hypothetical at this point ;)).
> 
> Well, I think that if that happens this whole thing will become
> much easier, since we will not have several paths for doing the same thing.
> 
> Another thing that I realized is that while we want to move all operation-pages
> from remove_memory() path to offline_pages(), this can get tricky.
> 
> Unless I am missing something, the devices from HMM and devm are not being registered
> against "memory_subsys" struct, and so, they never get to call memory_subsys_offline()
> and so offline_pages().
> 
> Which means that we would have to call __remove_zone() from those paths.
> But this alone will not work.

I mean, they move it to the zone ("replacing online/offlining code"), so
they should take care of removing it again.

> 
> find_smallest/biggest_section_pfn are two functions that are being called from
> 
> shrink_pgdat_span
> and
> shrink_zone_span
> 
> to adjust zone_first_pfn/node_first_pfn and the spanned pages.
> 
> Currently, find_smallest/biggest_section_pfn checks for the section to be valid,
> and this is fine since we are removing those sections from the remove_memory path.
> 
> But if we want to move __remove_zone() to offline_pages(), we have to use
> online_section() instead of valid_section().
> 
> This is all fine from offline_pages because the sections get offlined in:
> 
> __offline_pages
>  offline_isolated_pages
>   offline_isolated_pages_cb
>    __offline_isolated_pages
>     offline_mem_sections
> 
> 
> But this does not happen in HMM/devm path.
> 
> I am pretty sure this is a dumb question, but why HMM/devm path
> do not call online_pages/offline_pages?
> 
> Thanks
>
David Hildenbrand Aug. 8, 2018, 7:51 a.m. UTC | #12
On 08.08.2018 09:38, Oscar Salvador wrote:
> On Tue, Aug 07, 2018 at 06:13:45PM -0400, Jerome Glisse wrote:
>>> And since we know for sure that memhotplug-code cannot call it with ZONE_DEVICE,
>>> I think this can be done easily.
>>
>> This might change down road but for now this is correct. They are
>> talks to enumerate device memory through standard platform mechanisms
>> and thus the kernel might see new types of resources down the road and
>> maybe we will want to hotplug them directly from regular hotplug path
>> as ZONE_DEVICE (lot of hypothetical at this point ;)).
> 
> Well, I think that if that happens this whole thing will become
> much easier, since we will not have several paths for doing the same thing.
> 
> Another thing that I realized is that while we want to move all operation-pages
> from remove_memory() path to offline_pages(), this can get tricky.
> 
> Unless I am missing something, the devices from HMM and devm are not being registered
> against "memory_subsys" struct, and so, they never get to call memory_subsys_offline()
> and so offline_pages().
> 
> Which means that we would have to call __remove_zone() from those paths.
> But this alone will not work.
> 
> find_smallest/biggest_section_pfn are two functions that are being called from
> 
> shrink_pgdat_span
> and
> shrink_zone_span
> 
> to adjust zone_first_pfn/node_first_pfn and the spanned pages.
> 
> > Currently, find_smallest/biggest_section_pfn checks for the section to be valid,
> and this is fine since we are removing those sections from the remove_memory path.
> 
> But if we want to move __remove_zone() to offline_pages(), we have to use
> online_section() instead of valid_section().
> 
> This is all fine from offline_pages because the sections get offlined in:
> 
> __offline_pages
>  offline_isolated_pages
>   offline_isolated_pages_cb
>    __offline_isolated_pages
>     offline_mem_sections
> 
> 
> But this does not happen in HMM/devm path.
> 
> I am pretty sure this is a dumb question, but why HMM/devm path
> do not call online_pages/offline_pages?

I think mainly because onlining/offlining (wild guesses)

- calls memory notifiers
- works with memory blocks

(and does some more things not applicable to ZONE_DEVICE memory)

> 
> Thanks
>
Oscar Salvador Aug. 8, 2018, 7:56 a.m. UTC | #13
On Wed, Aug 08, 2018 at 09:45:37AM +0200, David Hildenbrand wrote:
> On 08.08.2018 09:38, Oscar Salvador wrote:
> > On Tue, Aug 07, 2018 at 06:13:45PM -0400, Jerome Glisse wrote:
> >>> And since we know for sure that memhotplug-code cannot call it with ZONE_DEVICE,
> >>> I think this can be done easily.
> >>
> >> This might change down road but for now this is correct. They are
> >> talks to enumerate device memory through standard platform mechanisms
> >> and thus the kernel might see new types of resources down the road and
> >> maybe we will want to hotplug them directly from regular hotplug path
> >> as ZONE_DEVICE (lot of hypothetical at this point ;)).
> > 
> > Well, I think that if that happens this whole thing will become
> > much easier, since we will not have several paths for doing the same thing.
> > 
> > Another thing that I realized is that while we want to move all operation-pages
> > from remove_memory() path to offline_pages(), this can get tricky.
> > 
> > Unless I am missing something, the devices from HMM and devm are not being registered
> > against "memory_subsys" struct, and so, they never get to call memory_subsys_offline()
> > and so offline_pages().
> > 
> > Which means that we would have to call __remove_zone() from those paths.
> > But this alone will not work.
> 
> I mean, they move it to the zone ("replacing online/offlining code"), so
> they should take care of removing it again.

Yeah, I guess so.

I mean, of course we can make this work by placing __remove_zone in devm_memremap_pages_release
and hmm_devmem_release functions and make sure to call offline_mem_sections first.
But it sounds a bit "hacky".

Thanks
Oscar Salvador Aug. 8, 2018, 8 a.m. UTC | #14
On Wed, Aug 08, 2018 at 09:51:50AM +0200, David Hildenbrand wrote:
> > I am pretty sure this is a dumb question, but why HMM/devm path
> > do not call online_pages/offline_pages?
> 
> I think mainly because onlining/offlining (wild guesses)
> 
> - calls memory notifiers
> - works with memory blocks
> 
> (and does some more things not applicable to ZONE_DEVICE memory)

Yes, you are right.
They call arch_add_memory with want_memblock being false, and so they do not
get to call hotplug_memory_register, which handles all the memory block stuff.

Thanks
David Hildenbrand Aug. 8, 2018, 8:08 a.m. UTC | #15
On 08.08.2018 09:56, Oscar Salvador wrote:
> On Wed, Aug 08, 2018 at 09:45:37AM +0200, David Hildenbrand wrote:
>> On 08.08.2018 09:38, Oscar Salvador wrote:
>>> On Tue, Aug 07, 2018 at 06:13:45PM -0400, Jerome Glisse wrote:
>>>>> And since we know for sure that memhotplug-code cannot call it with ZONE_DEVICE,
>>>>> I think this can be done easily.
>>>>
>>>> This might change down road but for now this is correct. They are
>>>> talks to enumerate device memory through standard platform mechanisms
>>>> and thus the kernel might see new types of resources down the road and
>>>> maybe we will want to hotplug them directly from regular hotplug path
>>>> as ZONE_DEVICE (lot of hypothetical at this point ;)).
>>>
>>> Well, I think that if that happens this whole thing will become
>>> much easier, since we will not have several paths for doing the same thing.
>>>
>>> Another thing that I realized is that while we want to move all operation-pages
>>> from remove_memory() path to offline_pages(), this can get tricky.
>>>
>>> Unless I am missing something, the devices from HMM and devm are not being registered
>>> against "memory_subsys" struct, and so, they never get to call memory_subsys_offline()
>>> and so offline_pages().
>>>
>>> Which means that we would have to call __remove_zone() from those paths.
>>> But this alone will not work.
>>
>> I mean, they move it to the zone ("replacing online/offlining code"), so
>> they should take of removing it again.
> 
> Yeah, I guess so.
> 
> I mean, of course we can make this work by placing __remove_zone in devm_memremap_pages_release
> and hmm_devmem_release functions and make sure to call offline_mem_sections first.
> But sounds a bit "hacky"..
> 
> Thanks
> 

Then it is maybe time to clearly distinguish both types of memory, as
they are fundamentally different when it comes to online/offline behavior.

Ordinary ram:
 add_memory ...
 online_pages ...
 offline_pages
 remove_memory

Device memory
 add_device_memory ...
 remove_device_memory

So adding/removing from the zone and stuff can be handled there.
Oscar Salvador Aug. 8, 2018, 9:45 a.m. UTC | #16
On Tue, Aug 07, 2018 at 11:18:10AM -0400, Jerome Glisse wrote:
> Correct, you should not call release_mem_region_adjustable() the device
> region is not part of regular iomem resource as it might not necessarily
> be enumerated through known ways to the kernel (ie only the device driver
> can discover the region and core kernel do not know about it).
> 
> One of the issue to adding this region to iomem resource is that they
> really need to be ignored by core kernel because you can not assume that
> CPU can actually access them. Moreover, if CPU can access them it is
> likely that CPU can not do atomic operation on them (ie what happens on
> a CPU atomic instruction is undefined). So they are _special_ and only
> make sense to be use in conjunction with a device driver.
> 
> 
> Also in the case they do exist in iomem resource it is as PCIE BAR so
> as IORESOURCE_IO (iirc) and thus release_mem_region_adjustable() would
> return -EINVAL. Thought nothing bad happens because of that, only a
> warning message that might confuse the user.

Just to see if I understand this correctly.
I guess that these regions are being registered via devm_request_mem_region() calls.
Among other callers, devm_request_mem_region() is being called from:

dax_pmem_probe
hmm_devmem_add

AFAICS from the code, those regions will inherit the flags from the parent, which is iomem_resource:

#define devm_request_mem_region(dev,start,n,name) \
	__devm_request_region(dev, &iomem_resource, (start), (n), (name))

struct resource iomem_resource = {
	.name	= "PCI mem",
	.start	= 0,
	.end	= -1,
	.flags	= IORESOURCE_MEM,
};


struct resource * __request_region()
{
	...
	...
	res->flags = resource_type(parent) | resource_ext_type(parent);
	res->flags |= IORESOURCE_BUSY | flags;
	res->desc = parent->desc;
	...
	...
}

So the regions will not be tagged as IORESOURCE_IO but IORESOURCE_MEM.
From the first glance release_mem_region_adjustable() looks like it does
more things than __release_region(), and I did not check it deeply
but maybe we can make it work.

Thanks
Oscar Salvador Aug. 8, 2018, 1:42 p.m. UTC | #17
On Wed, Aug 08, 2018 at 10:08:41AM +0200, David Hildenbrand wrote:
> Then it is maybe time to cleary distinguish both types of memory, as
> they are fundamentally different when it comes to online/offline behavior.
> 
> Ordinary ram:
>  add_memory ...
>  online_pages ...
>  offline_pages
>  remove_memory
> 
> Device memory
>  add_device_memory ...
>  remove_device_memory
> 
> So adding/removing from the zone and stuff can be handled there.

Uhm, I have been thinking about this.
Maybe we could do something like (completely untested):


== memory_hotplug code ==

/*
 * add_device_memory() - hot-add a ZONE_DEVICE memory range.
 * @nid:     node the range belongs to
 * @start:   physical start address of the range
 * @size:    size of the range in bytes
 * @altmap:  alternative device page map or %NULL if the default memmap is used
 * @mapping: true to also create the linear mapping (arch_add_memory()),
 *           false to only add the pages (add_pages())
 *
 * Returns 0 on success, a negative errno otherwise.
 */
int add_device_memory(int nid, unsigned long start, unsigned long size,
                                struct vmem_altmap *altmap, bool mapping)
{
        int ret;
        unsigned long start_pfn = PHYS_PFN(start);
        unsigned long nr_pages = size >> PAGE_SHIFT;

        mem_hotplug_begin();
        if (mapping)
                ret = arch_add_memory(nid, start, size, altmap, false);
        else
                ret = add_pages(nid, start_pfn, nr_pages, altmap, false);

        if (!ret) {
                pg_data_t *pgdata = NODE_DATA(nid);
                /* node_zones[] is an array of zones, so take the entry's address */
                struct zone *zone = &pgdata->node_zones[ZONE_DEVICE];

                online_mem_sections(start_pfn, start_pfn + nr_pages);
                move_pfn_range_to_zone(zone, start_pfn, nr_pages, altmap);
        }
        mem_hotplug_done();

        return ret;
}

/*
 * del_device_memory() - hot-remove a ZONE_DEVICE memory range.
 * @nid:     node the range belongs to
 * @start:   physical start address of the range
 * @size:    size of the range in bytes
 * @altmap:  alternative device page map or %NULL if the default memmap is used
 * @mapping: true if the linear mapping was created (arch_remove_memory()),
 *           false if only the pages were added (__remove_pages())
 *
 * Offlines the sections and shrinks the zone before tearing down the
 * mapping, mirroring what offline_pages() does for ordinary RAM.
 * Returns 0 on success, a negative errno otherwise.
 */
int del_device_memory(int nid, unsigned long start, unsigned long size,
                                struct vmem_altmap *altmap, bool mapping)
{
        int ret;
        unsigned long start_pfn = PHYS_PFN(start);
        unsigned long nr_pages = size >> PAGE_SHIFT;
        pg_data_t *pgdata = NODE_DATA(nid);
        /* node_zones[] is an array of zones, so take the entry's address */
        struct zone *zone = &pgdata->node_zones[ZONE_DEVICE];

        mem_hotplug_begin();

        offline_mem_sections(start_pfn, start_pfn + nr_pages);
        __shrink_pages(zone, start_pfn, start_pfn + nr_pages, nr_pages);

        if (mapping)
                ret = arch_remove_memory(nid, start, size, altmap);
        else
                ret = __remove_pages(nid, start_pfn, nr_pages, altmap);

        mem_hotplug_done();

        return ret;
}

===

And then, HMM/devm code could use it.

For example:

hmm_devmem_pages_create():

...
...
if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
	linear_mapping = true;
else
	linear_mapping = false;

ret = add_device_memory(nid, align_start, align_size, NULL, linear_mapping);
if (ret)
	goto error_add_memory;
...
...


hmm_devmem_release:

...
...
if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
	mapping = false;
else
	mapping = true;

del_device_memory(nid, start_pfn << PAGE_SHIFT, npages << PAGE_SHIFT,
								NULL,
								mapping);
...
...


In this way, we do not need to play tricks in HMM/devm code, we just need to
call those functions when adding/removing memory.

We would still have to figure out a way to go for the release_mem_region_adjustable() stuff though.

Thanks
Jerome Glisse Aug. 8, 2018, 4:58 p.m. UTC | #18
On Wed, Aug 08, 2018 at 08:47:58AM +0200, Michal Hocko wrote:
> On Tue 07-08-18 11:18:10, Jerome Glisse wrote:
> > On Tue, Aug 07, 2018 at 04:59:00PM +0200, Michal Hocko wrote:
> > > On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> > > > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > > > > From: Oscar Salvador <osalvador@suse.de>
> > > > 
> > > > [...]
> > > > 
> > > > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > > > > index 9bd629944c91..e33555651e46 100644
> > > > > --- a/mm/memory_hotplug.c
> > > > > +++ b/mm/memory_hotplug.c
> > > > 
> > > > [...]
> > > > 
> > > > >  /**
> > > > >   * __remove_pages() - remove sections of pages from a zone
> > > > > - * @zone: zone from which pages need to be removed
> > > > > + * @nid: node which pages belong to
> > > > >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> > > > >   * @nr_pages: number of pages to remove (must be multiple of section size)
> > > > >   * @altmap: alternative device page map or %NULL if default memmap is used
> > > > > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> > > > >   * sure that pages are marked reserved and zones are adjust properly by
> > > > >   * calling offline_pages().
> > > > >   */
> > > > > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> > > > >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> > > > >  {
> > > > >  	unsigned long i;
> > > > > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > >  	int sections_to_remove, ret = 0;
> > > > >  
> > > > >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > > > > -	if (is_dev_zone(zone)) {
> > > > > -		if (altmap)
> > > > > -			map_offset = vmem_altmap_offset(altmap);
> > > > > -	} else {
> > > > > +	if (altmap)
> > > > > +		map_offset = vmem_altmap_offset(altmap);
> > > > > +	else {
> > > > 
> > > > This will break ZONE_DEVICE at least for HMM. While i think that
> > > > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > > > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > > > with the above changes you change the expected behavior.
> > > 
> > > Could you be more specific what is the expected behavior here?
> > > Is this about calling release_mem_region_adjustable? Why does is it not
> > > suitable for zone device ranges?
> > 
> > Correct, you should not call release_mem_region_adjustable() the device
> > region is not part of regular iomem resource as it might not necessarily
> > be enumerated through known ways to the kernel (ie only the device driver
> > can discover the region and core kernel do not know about it).
> 
> If there is no region registered with the range then the call should be
> mere nop, no? So why do we have to special case?

IIRC this is because you can not release the resource ie the resource
is still owned by the device driver even if you hotremove the memory.
The device driver might still be using the resource without struct page.

> 
> [...]
> 
> > Also in the case they do exist in iomem resource it is as PCIE BAR so
> > as IORESOURCE_IO (iirc) and thus release_mem_region_adjustable() would
> > return -EINVAL. Thought nothing bad happens because of that, only a
> > warning message that might confuse the user.
> 
> I am not sure I have understood this correctly. Are you referring to the
> current state when we would call release_mem_region_adjustable
> unconditionally or the case that the resource would be added also for
> zone device ranges?
> 
> If the former then I do not see any reason why we couldn't simply
> refactor the code to expect a failure and drop the warning in that path.

Referring to newer case ie calling release_mem_region_adjustable() for
ZONE_DEVICE too. It seems i got my recollection wrong in the sense that
the resource is properly register as MEM but still we do not want to
release it because the device driver might still be using the resource
without struct page. The lifetime of the resource as memory with struct
page and the lifetime of the resource as something use by the device
driver are not tie together. The latter can outlive the former.

So when we hotremove ZONE_DEVICE we do not want to release the resource
yet just to be on safe side and avoid some other driver/kernel component
to decide to use that resource.

Cheers,
Jérôme
Jerome Glisse Aug. 8, 2018, 5:33 p.m. UTC | #19
On Wed, Aug 08, 2018 at 11:45:02AM +0200, Oscar Salvador wrote:
> On Tue, Aug 07, 2018 at 11:18:10AM -0400, Jerome Glisse wrote:
> > Correct, you should not call release_mem_region_adjustable() the device
> > region is not part of regular iomem resource as it might not necessarily
> > be enumerated through known ways to the kernel (ie only the device driver
> > can discover the region and core kernel do not know about it).
> > 
> > One of the issue to adding this region to iomem resource is that they
> > really need to be ignored by core kernel because you can not assume that
> > CPU can actually access them. Moreover, if CPU can access them it is
> > likely that CPU can not do atomic operation on them (ie what happens on
> > a CPU atomic instruction is undefined). So they are _special_ and only
> > make sense to be use in conjunction with a device driver.
> > 
> > 
> > Also in the case they do exist in iomem resource it is as PCIE BAR so
> > as IORESOURCE_IO (iirc) and thus release_mem_region_adjustable() would
> > return -EINVAL. Thought nothing bad happens because of that, only a
> > warning message that might confuse the user.
> 
> Just to see if I understand this correctly.
> I guess that these regions are being registered via devm_request_mem_region() calls.
> Among other callers, devm_request_mem_region() is being called from:
> 
> dax_pmem_probe
> hmm_devmem_add
> 
> AFAICS from the code, those regions will inherit the flags from the parent, which is iomem_resource:
> 
> #define devm_request_mem_region(dev,start,n,name) \
> 	__devm_request_region(dev, &iomem_resource, (start), (n), (name))
> 
> struct resource iomem_resource = {
> 	.name	= "PCI mem",
> 	.start	= 0,
> 	.end	= -1,
> 	.flags	= IORESOURCE_MEM,
> };
> 
> 
> struct resource * __request_region()
> {
> 	...
> 	...
> 	res->flags = resource_type(parent) | resource_ext_type(parent);
> 	res->flags |= IORESOURCE_BUSY | flags;
> 	res->desc = parent->desc;
> 	...
> 	...
> }

Yeah you right my recollection of this was wrong.

> 
> So the regions will not be tagged as IORESOURCE_IO but IORESOURCE_MEM.
> From the first glance release_mem_region_adjustable() looks like it does
> more things than __release_region(), and I did not check it deeply
> but maybe we can make it work.

The root issue here is not releasing the resource when hotremoving
the memory. The device driver still wants to keep owning the resource
after hotremove of memory. The device driver does not necessarily always
need struct page to make use of that resource.


Cheers,
Jérôme
Jerome Glisse Aug. 8, 2018, 5:55 p.m. UTC | #20
On Wed, Aug 08, 2018 at 03:42:33PM +0200, Oscar Salvador wrote:
> On Wed, Aug 08, 2018 at 10:08:41AM +0200, David Hildenbrand wrote:
> > Then it is maybe time to cleary distinguish both types of memory, as
> > they are fundamentally different when it comes to online/offline behavior.
> > 
> > Ordinary ram:
> >  add_memory ...
> >  online_pages ...
> >  offline_pages
> >  remove_memory
> > 
> > Device memory
> >  add_device_memory ...
> >  remove_device_memory
> > 
> > So adding/removing from the zone and stuff can be handled there.
> 
> Uhm, I have been thinking about this.
> Maybe we could do something like (completely untested):
> 
> 
> == memory_hotplug code ==
> 
> int add_device_memory(int nid, unsigned long start, unsigned long size,
>                                 struct vmem_altmap *altmap, bool mapping)
> {
>         int ret;
>         unsigned long start_pfn = PHYS_PFN(start);
>         unsigned long nr_pages = size >> PAGE_SHIFT;
> 
>         mem_hotplug_begin();
>         if (mapping)
>                 ret = arch_add_memory(nid, start, size, altmap, false)
>         else
>                 ret = add_pages(nid, start_pfn, nr_pages, altmap, false):
> 
>         if (!ret) {
>                 pgdata_t *pgdata = NODE_DATA(nid);
>                 struct zone *zone = pgdata->node_zones[ZONE_DEVICE];
> 
>                 online_mem_sections(start_pfn, start_pfn + nr_pages);
>                 move_pfn_range_to_zone(zone, start_pfn, nr_pages, altmap);
>         }
>         mem_hotplug_done();
> 
>         return ret;
> }
> 
> int del_device_memory(int nid, unsigned long start, unsigned long size,
>                                 struct vmem_altmap *altmap, bool mapping)
> {
>         int ret;
>         unsigned long start_pfn = PHYS_PFN(start);
>         unsigned long nr_pages = size >> PAGE_SHIFT;
>         pgdata_t *pgdata = NODE_DATA(nid);
>         struct zone *zone = pgdata->node_zones[ZONE_DEVICE];
> 
>         mem_hotplug_begin();
> 
>         offline_mem_sections(start_pfn, start_pfn + nr_pages);
>         __shrink_pages(zone, start_pfn, start_pfn + nr_pages, nr_pages);
> 
>         if (mapping)
>                 ret = arch_remove_memory(nid, start, size, altmap)
>         else
>                 ret = __remove_pages(nid, start_pfn, nr_pages, altmap)
> 
>         mem_hotplug_done();
> 
>         return ret;
> }
> 
> ===
> 
> And then, HMM/devm code could use it.
> 
> For example:
> 
> hmm_devmem_pages_create():
> 
> ...
> ...
> if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
> 	linear_mapping = true;
> else
> 	linear_mapping = false;
> 
> ret = add_device_memory(nid, align_start, align_size, NULL, linear_mapping);
> if (ret)
> 	goto error_add_memory;
> ...
> ...
> 
> 
> hmm_devmem_release:
> 
> ...
> ...
> if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
> 	mapping = false;
> else
> 	mapping = true;
> 
> del_device_memory(nid, start_pfn << PAGE_SHIFT, npages << PAGE_SHIFT,
> 								NULL,
> 								mapping);
> ...
> ...
> 
> 
> In this way, we do not need to play tricks in HMM/devm code, we just need to
> call those functions when adding/removing memory.

Note that Dan did post patches that already go in that direction (unifying
code between devm and HMM). I think they are in Andrew's queue; look for

mm: Rework hmm to use devm_memremap_pages and other fixes

> 
> We would still have to figure out a way to go for the release_mem_region_adjustable() stuff though.

Yes.

Cheers,
Jérôme
Oscar Salvador Aug. 8, 2018, 9:28 p.m. UTC | #21
On Wed, Aug 08, 2018 at 12:58:15PM -0400, Jerome Glisse wrote:
> > If the former then I do not see any reason why we couldn't simply
> > refactor the code to expect a failure and drop the warning in that path.
> 
> Referring to newer case ie calling release_mem_region_adjustable() for
> ZONE_DEVICE too. It seems i got my recollection wrong in the sense that
> the resource is properly register as MEM but still we do not want to
> release it because the device driver might still be using the resource
> without struct page. The lifetime of the resource as memory with struct
> page and the lifetime of the resource as something use by the device
> driver are not tie together. The latter can outlive the former.
> 
> So when we hotremove ZONE_DEVICE we do not want to release the resource
> yet just to be on safe side and avoid some other driver/kernel component
> to decide to use that resource.

I checked the function that hot-removes the memory in HMM code.
hmm_devmem_pages_remove(), which gets called via hmm_devmem_remove(), is in charge
of hot-removing the memory.

Then, hmm_devmem_remove() will release the resource only if the resource is not of
type IORES_DESC_DEVICE_PUBLIC_MEMORY.

So I guess that there are cases(at least in HMM) where we release the resource when
hot-removing memory, but not always.

Looking at devm code, I could not see any place where we release the resource
when hot-removing memory.

So, if we are really left with such scenario, maybe the easiest way is to pass a parameter
from those paths to arch_remove_memory()->__remove_pages() to know
if we get called from device_functions, and so skip the call to release_mem_region_adjustable.

Thanks
Oscar Salvador Aug. 8, 2018, 9:29 p.m. UTC | #22
On Wed, Aug 08, 2018 at 01:55:59PM -0400, Jerome Glisse wrote:
> Note that Dan did post patches that already go in that direction (unifying
> code between devm and HMM). I think they are in Andrew queue, looks for
> 
> mm: Rework hmm to use devm_memremap_pages and other fixes

Thanks for pointing that out.
I will take a look at that work.
Oscar Salvador Aug. 9, 2018, 7:50 a.m. UTC | #23
On Wed, Aug 08, 2018 at 11:29:08PM +0200, Oscar Salvador wrote:
> On Wed, Aug 08, 2018 at 01:55:59PM -0400, Jerome Glisse wrote:
> > Note that Dan did post patches that already go in that direction (unifying
> > code between devm and HMM). I think they are in Andrew queue, looks for
> > 
> > mm: Rework hmm to use devm_memremap_pages and other fixes
> 
> Thanks for pointing that out.
> I will take a look at that work.

Ok, I checked the patchset [1] and I think it is nice that those two (devm and HMM)
get unified.
I think it will make things easier when we have to change things for the memory-hotplug.
Actually, I think that after [2], all hot-adding memory will be handled in 
devm_memremap_pages.

What I do not see is why the patch did not make it to further RCs.

Thanks
Oscar Salvador Aug. 9, 2018, 7:52 a.m. UTC | #24
On Thu, Aug 09, 2018 at 09:50:55AM +0200, Oscar Salvador wrote:
> On Wed, Aug 08, 2018 at 11:29:08PM +0200, Oscar Salvador wrote:
> > On Wed, Aug 08, 2018 at 01:55:59PM -0400, Jerome Glisse wrote:
> > > Note that Dan did post patches that already go in that direction (unifying
> > > code between devm and HMM). I think they are in Andrew queue, looks for
> > > 
> > > mm: Rework hmm to use devm_memremap_pages and other fixes
> > 
> > Thanks for pointing that out.
> > I will take a look at that work.
> 
> Ok, I checked the patchset [1] and I think it is nice that those two (devm and HMM)
> get unified.
> I think it will make things easier when we have to change things for the memory-hotplug.
> Actually, I think that after [2], all hot-adding memory will be handled in 
> devm_memremap_pages.

Forgot to include the links:

[1] https://lkml.org/lkml/2018/6/19/108
[2] https://lkml.org/lkml/2018/6/19/110
Michal Hocko Aug. 9, 2018, 8:24 a.m. UTC | #25
On Wed 08-08-18 12:58:15, Jerome Glisse wrote:
> On Wed, Aug 08, 2018 at 08:47:58AM +0200, Michal Hocko wrote:
> > On Tue 07-08-18 11:18:10, Jerome Glisse wrote:
> > > On Tue, Aug 07, 2018 at 04:59:00PM +0200, Michal Hocko wrote:
> > > > On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> > > > > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > > > > > From: Oscar Salvador <osalvador@suse.de>
> > > > > 
> > > > > [...]
> > > > > 
> > > > > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > > > > > index 9bd629944c91..e33555651e46 100644
> > > > > > --- a/mm/memory_hotplug.c
> > > > > > +++ b/mm/memory_hotplug.c
> > > > > 
> > > > > [...]
> > > > > 
> > > > > >  /**
> > > > > >   * __remove_pages() - remove sections of pages from a zone
> > > > > > - * @zone: zone from which pages need to be removed
> > > > > > + * @nid: node which pages belong to
> > > > > >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> > > > > >   * @nr_pages: number of pages to remove (must be multiple of section size)
> > > > > >   * @altmap: alternative device page map or %NULL if default memmap is used
> > > > > > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> > > > > >   * sure that pages are marked reserved and zones are adjust properly by
> > > > > >   * calling offline_pages().
> > > > > >   */
> > > > > > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> > > > > >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> > > > > >  {
> > > > > >  	unsigned long i;
> > > > > > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > >  	int sections_to_remove, ret = 0;
> > > > > >  
> > > > > >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > > > > > -	if (is_dev_zone(zone)) {
> > > > > > -		if (altmap)
> > > > > > -			map_offset = vmem_altmap_offset(altmap);
> > > > > > -	} else {
> > > > > > +	if (altmap)
> > > > > > +		map_offset = vmem_altmap_offset(altmap);
> > > > > > +	else {
> > > > > 
> > > > > This will break ZONE_DEVICE at least for HMM. While i think that
> > > > > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > > > > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > > > > with the above changes you change the expected behavior.
> > > > 
> > > > Could you be more specific what is the expected behavior here?
> > > > Is this about calling release_mem_region_adjustable? Why does is it not
> > > > suitable for zone device ranges?
> > > 
> > > Correct, you should not call release_mem_region_adjustable() the device
> > > region is not part of regular iomem resource as it might not necessarily
> > > be enumerated through known ways to the kernel (ie only the device driver
> > > can discover the region and core kernel do not know about it).
> > 
> > If there is no region registered with the range then the call should be
> > mere nop, no? So why do we have to special case?
> 
> IIRC this is because you can not release the resource ie the resource
> is still own by the device driver even if you hotremove the memory.
> The device driver might still be using the resource without struct page.

But then it seems to be a property of a device rather than zone_device,
no? If there are devices which want to preserve the resource then they
should tell that. Doing that unconditionally for all zone_device users
seems just wrong.
Jerome Glisse Aug. 9, 2018, 2:27 p.m. UTC | #26
On Thu, Aug 09, 2018 at 10:24:15AM +0200, Michal Hocko wrote:
> On Wed 08-08-18 12:58:15, Jerome Glisse wrote:
> > On Wed, Aug 08, 2018 at 08:47:58AM +0200, Michal Hocko wrote:
> > > On Tue 07-08-18 11:18:10, Jerome Glisse wrote:
> > > > On Tue, Aug 07, 2018 at 04:59:00PM +0200, Michal Hocko wrote:
> > > > > On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> > > > > > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > > > > > > From: Oscar Salvador <osalvador@suse.de>
> > > > > > 
> > > > > > [...]
> > > > > > 
> > > > > > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > > > > > > index 9bd629944c91..e33555651e46 100644
> > > > > > > --- a/mm/memory_hotplug.c
> > > > > > > +++ b/mm/memory_hotplug.c
> > > > > > 
> > > > > > [...]
> > > > > > 
> > > > > > >  /**
> > > > > > >   * __remove_pages() - remove sections of pages from a zone
> > > > > > > - * @zone: zone from which pages need to be removed
> > > > > > > + * @nid: node which pages belong to
> > > > > > >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> > > > > > >   * @nr_pages: number of pages to remove (must be multiple of section size)
> > > > > > >   * @altmap: alternative device page map or %NULL if default memmap is used
> > > > > > > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> > > > > > >   * sure that pages are marked reserved and zones are adjust properly by
> > > > > > >   * calling offline_pages().
> > > > > > >   */
> > > > > > > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > > > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> > > > > > >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> > > > > > >  {
> > > > > > >  	unsigned long i;
> > > > > > > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > > >  	int sections_to_remove, ret = 0;
> > > > > > >  
> > > > > > >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > > > > > > -	if (is_dev_zone(zone)) {
> > > > > > > -		if (altmap)
> > > > > > > -			map_offset = vmem_altmap_offset(altmap);
> > > > > > > -	} else {
> > > > > > > +	if (altmap)
> > > > > > > +		map_offset = vmem_altmap_offset(altmap);
> > > > > > > +	else {
> > > > > > 
> > > > > > This will break ZONE_DEVICE at least for HMM. While i think that
> > > > > > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > > > > > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > > > > > with the above changes you change the expected behavior.
> > > > > 
> > > > > Could you be more specific what is the expected behavior here?
> > > > > Is this about calling release_mem_region_adjustable? Why does is it not
> > > > > suitable for zone device ranges?
> > > > 
> > > > Correct, you should not call release_mem_region_adjustable() the device
> > > > region is not part of regular iomem resource as it might not necessarily
> > > > be enumerated through known ways to the kernel (ie only the device driver
> > > > can discover the region and core kernel do not know about it).
> > > 
> > > If there is no region registered with the range then the call should be
> > > mere nop, no? So why do we have to special case?
> > 
> > IIRC this is because you can not release the resource ie the resource
> > is still own by the device driver even if you hotremove the memory.
> > The device driver might still be using the resource without struct page.
> 
> But then it seems to be a property of a device rather than zone_device,
> no? If there are devices which want to preserve the resource then they
> should tell that. Doing that unconditionally for all zone_device users
> seems just wrong.

I am fine with changing that; I did not do that, and at the time I did
not have any strong feeling on that matter.

Cheers,
Jérôme
Michal Hocko Aug. 9, 2018, 3:09 p.m. UTC | #27
On Thu 09-08-18 10:27:09, Jerome Glisse wrote:
> On Thu, Aug 09, 2018 at 10:24:15AM +0200, Michal Hocko wrote:
> > On Wed 08-08-18 12:58:15, Jerome Glisse wrote:
> > > On Wed, Aug 08, 2018 at 08:47:58AM +0200, Michal Hocko wrote:
> > > > On Tue 07-08-18 11:18:10, Jerome Glisse wrote:
> > > > > On Tue, Aug 07, 2018 at 04:59:00PM +0200, Michal Hocko wrote:
> > > > > > On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> > > > > > > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > > > > > > > From: Oscar Salvador <osalvador@suse.de>
> > > > > > > 
> > > > > > > [...]
> > > > > > > 
> > > > > > > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > > > > > > > index 9bd629944c91..e33555651e46 100644
> > > > > > > > --- a/mm/memory_hotplug.c
> > > > > > > > +++ b/mm/memory_hotplug.c
> > > > > > > 
> > > > > > > [...]
> > > > > > > 
> > > > > > > >  /**
> > > > > > > >   * __remove_pages() - remove sections of pages from a zone
> > > > > > > > - * @zone: zone from which pages need to be removed
> > > > > > > > + * @nid: node which pages belong to
> > > > > > > >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> > > > > > > >   * @nr_pages: number of pages to remove (must be multiple of section size)
> > > > > > > >   * @altmap: alternative device page map or %NULL if default memmap is used
> > > > > > > > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> > > > > > > >   * sure that pages are marked reserved and zones are adjust properly by
> > > > > > > >   * calling offline_pages().
> > > > > > > >   */
> > > > > > > > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > > > > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> > > > > > > >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> > > > > > > >  {
> > > > > > > >  	unsigned long i;
> > > > > > > > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > > > >  	int sections_to_remove, ret = 0;
> > > > > > > >  
> > > > > > > >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > > > > > > > -	if (is_dev_zone(zone)) {
> > > > > > > > -		if (altmap)
> > > > > > > > -			map_offset = vmem_altmap_offset(altmap);
> > > > > > > > -	} else {
> > > > > > > > +	if (altmap)
> > > > > > > > +		map_offset = vmem_altmap_offset(altmap);
> > > > > > > > +	else {
> > > > > > > 
> > > > > > > This will break ZONE_DEVICE at least for HMM. While i think that
> > > > > > > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > > > > > > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > > > > > > with the above changes you change the expected behavior.
> > > > > > 
> > > > > > Could you be more specific what is the expected behavior here?
> > > > > > Is this about calling release_mem_region_adjustable? Why does is it not
> > > > > > suitable for zone device ranges?
> > > > > 
> > > > > Correct, you should not call release_mem_region_adjustable() the device
> > > > > region is not part of regular iomem resource as it might not necessarily
> > > > > be enumerated through known ways to the kernel (ie only the device driver
> > > > > can discover the region and core kernel do not know about it).
> > > > 
> > > > If there is no region registered with the range then the call should be
> > > > mere nop, no? So why do we have to special case?
> > > 
> > > IIRC this is because you can not release the resource ie the resource
> > > is still own by the device driver even if you hotremove the memory.
> > > The device driver might still be using the resource without struct page.
> > 
> > But then it seems to be a property of a device rather than zone_device,
> > no? If there are devices which want to preserve the resource then they
> > should tell that. Doing that unconditionally for all zone_device users
> > seems just wrong.
> 
> I am fine with changing that, i did not do that and at the time i did
> not have any feeling on that matter.

I would really prefer to be explicit about these requirements rather
than having subtle side effects quite deep in the memory hotplug code
and checks for zone device sprinkled at places for special handling.
Jerome Glisse Aug. 9, 2018, 4:58 p.m. UTC | #28
On Thu, Aug 09, 2018 at 05:09:50PM +0200, Michal Hocko wrote:
> On Thu 09-08-18 10:27:09, Jerome Glisse wrote:
> > On Thu, Aug 09, 2018 at 10:24:15AM +0200, Michal Hocko wrote:
> > > On Wed 08-08-18 12:58:15, Jerome Glisse wrote:
> > > > On Wed, Aug 08, 2018 at 08:47:58AM +0200, Michal Hocko wrote:
> > > > > On Tue 07-08-18 11:18:10, Jerome Glisse wrote:
> > > > > > On Tue, Aug 07, 2018 at 04:59:00PM +0200, Michal Hocko wrote:
> > > > > > > On Tue 07-08-18 09:52:21, Jerome Glisse wrote:
> > > > > > > > On Tue, Aug 07, 2018 at 03:37:56PM +0200, osalvador@techadventures.net wrote:
> > > > > > > > > From: Oscar Salvador <osalvador@suse.de>
> > > > > > > > 
> > > > > > > > [...]
> > > > > > > > 
> > > > > > > > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > > > > > > > > index 9bd629944c91..e33555651e46 100644
> > > > > > > > > --- a/mm/memory_hotplug.c
> > > > > > > > > +++ b/mm/memory_hotplug.c
> > > > > > > > 
> > > > > > > > [...]
> > > > > > > > 
> > > > > > > > >  /**
> > > > > > > > >   * __remove_pages() - remove sections of pages from a zone
> > > > > > > > > - * @zone: zone from which pages need to be removed
> > > > > > > > > + * @nid: node which pages belong to
> > > > > > > > >   * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
> > > > > > > > >   * @nr_pages: number of pages to remove (must be multiple of section size)
> > > > > > > > >   * @altmap: alternative device page map or %NULL if default memmap is used
> > > > > > > > > @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
> > > > > > > > >   * sure that pages are marked reserved and zones are adjust properly by
> > > > > > > > >   * calling offline_pages().
> > > > > > > > >   */
> > > > > > > > > -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > > > > > +int __remove_pages(int nid, unsigned long phys_start_pfn,
> > > > > > > > >  		 unsigned long nr_pages, struct vmem_altmap *altmap)
> > > > > > > > >  {
> > > > > > > > >  	unsigned long i;
> > > > > > > > > @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
> > > > > > > > >  	int sections_to_remove, ret = 0;
> > > > > > > > >  
> > > > > > > > >  	/* In the ZONE_DEVICE case device driver owns the memory region */
> > > > > > > > > -	if (is_dev_zone(zone)) {
> > > > > > > > > -		if (altmap)
> > > > > > > > > -			map_offset = vmem_altmap_offset(altmap);
> > > > > > > > > -	} else {
> > > > > > > > > +	if (altmap)
> > > > > > > > > +		map_offset = vmem_altmap_offset(altmap);
> > > > > > > > > +	else {
> > > > > > > > 
> > > > > > > > This will break ZONE_DEVICE at least for HMM. While i think that
> > > > > > > > altmap -> ZONE_DEVICE (ie altmap imply ZONE_DEVICE) the reverse
> > > > > > > > is not true ie ZONE_DEVICE does not necessarily imply altmap. So
> > > > > > > > with the above changes you change the expected behavior.
> > > > > > > 
> > > > > > > Could you be more specific what is the expected behavior here?
> > > > > > > Is this about calling release_mem_region_adjustable? Why does is it not
> > > > > > > suitable for zone device ranges?
> > > > > > 
> > > > > > Correct, you should not call release_mem_region_adjustable() the device
> > > > > > region is not part of regular iomem resource as it might not necessarily
> > > > > > be enumerated through known ways to the kernel (ie only the device driver
> > > > > > can discover the region and core kernel do not know about it).
> > > > > 
> > > > > If there is no region registered with the range then the call should be
> > > > > mere nop, no? So why do we have to special case?
> > > > 
> > > > IIRC this is because you can not release the resource ie the resource
> > > > is still own by the device driver even if you hotremove the memory.
> > > > The device driver might still be using the resource without struct page.
> > > 
> > > But then it seems to be a property of a device rather than zone_device,
> > > no? If there are devices which want to preserve the resource then they
> > > should tell that. Doing that unconditionally for all zone_device users
> > > seems just wrong.
> > 
> > I am fine with changing that, i did not do that and at the time i did
> > not have any feeling on that matter.
> 
> I would really prefer to be explicit about these requirements rather
> than having subtle side effects quite deep in the memory hotplug code
> and checks for zone device sprinkled at places for special handling.

I agree, i never thought about that before. Looking at existing resource
management i think the simplest solution would be to use a refcount on the
resources instead of the IORESOURCE_BUSY flags.

So when you release resource as part of hotremove you would only dec the
refcount and a resource is not busy only when refcount is zero.

Just the idea i had in mind. Right now i am working on other thing, Oscar
is this something you would like to work on ? Feel free to come up with
something better than my first idea :)

Cheers,
Jérôme
Oscar Salvador Aug. 9, 2018, 8:50 p.m. UTC | #29
On Thu, Aug 09, 2018 at 12:58:21PM -0400, Jerome Glisse wrote:
> > I would really prefer to be explicit about these requirements rather
> > than having subtle side effects quite deep in the memory hotplug code
> > and checks for zone device sprinkled at places for special handling.
> 
> I agree, i never thought about that before. Looking at existing resource
> management i think the simplest solution would be to use a refcount on the
> resources instead of the IORESOURCE_BUSY flags.
> 
> So when you release resource as part of hotremove you would only dec the
> refcount and a resource is not busy only when refcount is zero.
> 
> Just the idea i had in mind. Right now i am working on other thing, Oscar
> is this something you would like to work on ? Feel free to come up with
> something better than my first idea :)

Hi Jerome,

Definitely it would be something I am interested in working on.
Let me think a bit about this and see if I can come up with something.

Thanks
Oscar Salvador Aug. 16, 2018, 2:58 p.m. UTC | #30
On Thu, Aug 09, 2018 at 12:58:21PM -0400, Jerome Glisse wrote:
> I agree, i never thought about that before. Looking at existing resource
> management i think the simplest solution would be to use a refcount on the
> resources instead of the IORESOURCE_BUSY flags.
> 
> So when you release resource as part of hotremove you would only dec the
> refcount and a resource is not busy only when refcount is zero.
> 
> Just the idea i had in mind. Right now i am working on other thing, Oscar
> is this something you would like to work on ? Feel free to come up with
> something better than my first idea :)

So, I thought a bit about this.
First I talked a bit with Jerome about the refcount idea.
The problem with reconverting this to refcount is that it is too intrusive,
and I think it is not really needed.

I then thought about defining a new flag, something like

#define IORESOURCE_NO_HOTREMOVE	xxx

but we ran out of bits for the flag field.

I then thought about doing something like:

struct resource {
        resource_size_t start;
        resource_size_t end;
        const char *name;
        unsigned long flags;
        unsigned long desc;
        struct resource *parent, *sibling, *child;
#ifdef CONFIG_MEMORY_HOTREMOVE
        bool device_managed;
#endif
};

but it is just too awful, not needed, and bytes consuming.

The only idea I had left is:

register_memory_resource(), which defines a new resource for the added memory-chunk
is only called from add_memory().
This function is only being hit when we add memory-chunks.

HMM/devm gets the resources their own way, calling devm_request_mem_region().

So resources that are requested from HMM/devm, have the following flags:

 (IORESOURCE_MEM|IORESOURCE_BUSY)

while resources that are requested via mem-hotplug have:

 (IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY)

IORESOURCE_SYSTEM_RAM = (IORESOURCE_MEM|IORESOURCE_SYSRAM)


release_mem_region_adjustable() is only being called from hot-remove path, so
unless I am mistaken, all resources hitting that path should match IORESOURCE_SYSTEM_RAM.

That leaves me with the idea that we could check for the resource->flags to contain IORESOURCE_SYSRAM,
as I think it is only being set for memory-chunks that are added via memory-hot-add path.

In case it is not, we know that that resource belongs to HMM/devm, so we can back off since
they take care of releasing the resource via devm_release_mem_region.

I am working on a RFC v2 containing this, but, Jerome, could you confirm above assumption, please?

Of course, ideas/suggestions are also welcome.

Thanks
Jerome Glisse Aug. 16, 2018, 5:32 p.m. UTC | #31
On Thu, Aug 16, 2018 at 04:58:49PM +0200, Oscar Salvador wrote:
> On Thu, Aug 09, 2018 at 12:58:21PM -0400, Jerome Glisse wrote:
> > I agree, i never thought about that before. Looking at existing resource
> > management i think the simplest solution would be to use a refcount on the
> > resources instead of the IORESOURCE_BUSY flags.
> > 
> > So when you release resource as part of hotremove you would only dec the
> > refcount and a resource is not busy only when refcount is zero.
> > 
> > Just the idea i had in mind. Right now i am working on other thing, Oscar
> > is this something you would like to work on ? Feel free to come up with
> > something better than my first idea :)
> 
> So, I thought a bit about this.
> First I talked a bit with Jerome about the refcount idea.
> The problem with reconverting this to refcount is that it is too intrusive,
> and I think it is not really needed.
> 
> I then thought about defining a new flag, something like
> 
> #define IORESOURCE_NO_HOTREMOVE	xxx
> 
> but we ran out of bits for the flag field.
> 
> I then thought about doing something like:
> 
> struct resource {
>         resource_size_t start;
>         resource_size_t end;
>         const char *name;
>         unsigned long flags;
>         unsigned long desc;
>         struct resource *parent, *sibling, *child;
> #ifdef CONFIG_MEMORY_HOTREMOVE
>         bool device_managed;
> #endif
> };
> 
> but it is just too awful, not needed, and bytes consuming.

Agree the above is ugly.

> 
> The only idea I had left is:
> 
> register_memory_resource(), which defines a new resource for the added memory-chunk
> is only called from add_memory().
> This function is only being hit when we add memory-chunks.
> 
> HMM/devm gets the resources their own way, calling devm_request_mem_region().
> 
> So resources that are requested from HMM/devm, have the following flags:
> 
>  (IORESOURCE_MEM|IORESOURCE_BUSY)
> 
> while resources that are requested via mem-hotplug have:
> 
>  (IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY)
> 
> IORESOURCE_SYSTEM_RAM = (IORESOURCE_MEM|IORESOURCE_SYSRAM)
> 
> 
> release_mem_region_adjustable() is only being called from hot-remove path, so
> unless I am mistaken, all resources hitting that path should match IORESOURCE_SYSTEM_RAM.
> 
> That leaves me with the idea that we could check for the resource->flags to contain IORESOURCE_SYSRAM,
> as I think it is only being set for memory-chunks that are added via memory-hot-add path.
> 
> In case it is not, we know that that resource belongs to HMM/devm, so we can back off since
> they take care of releasing the resource via devm_release_mem_region.
> 
> I am working on a RFC v2 containing this, but, Jerome, could you confirm above assumption, please?

I think you nailed it. I am not 100% sure about devm as I have not
followed closely how persistent memory can be reported by ACPI. But
I am pretty sure it should never end up as SYSRAM.

Thank you for scratching your head on this :)

Cheers,
Jérôme
diff mbox series

Patch

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index bc5e15045cee..0029c4d3f574 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -664,11 +664,9 @@  int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct zone *zone;
 	int ret;
 
-	zone = page_zone(pfn_to_page(start_pfn));
-	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+	ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
 	if (ret)
 		pr_warn("%s: Problem encountered in __remove_pages() as"
 			" ret=%d\n", __func__,  ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9e17d57a9948..12879191bc6b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -143,18 +143,9 @@  int __meminit arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altma
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page;
 	int ret;
 
-	/*
-	 * If we have an altmap then we need to skip over any reserved PFNs
-	 * when querying the zone.
-	 */
-	page = pfn_to_page(start_pfn);
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
-
-	ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
+	ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
 	if (ret)
 		return ret;
 
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 55c740ab861b..4f183857b522 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -458,11 +458,9 @@  int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct zone *zone;
 	int ret;
 
-	zone = page_zone(pfn_to_page(start_pfn));
-	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+	ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
 	if (unlikely(ret))
 		pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
 			ret);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9fa503f2f56c..88909e88c873 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -865,10 +865,8 @@  int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct zone *zone;
 
-	zone = page_zone(pfn_to_page(start_pfn));
-	return __remove_pages(zone, start_pfn, nr_pages, altmap);
+	return __remove_pages(nid, start_pfn, nr_pages, altmap);
 }
 #endif
 #endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 26728df07072..1aad5ea2e274 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1152,15 +1152,9 @@  int __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *a
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page = pfn_to_page(start_pfn);
-	struct zone *zone;
 	int ret;
 
-	/* With altmap the first mapped page is offset from @start */
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
-	zone = page_zone(page);
-	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+	ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
 	WARN_ON_ONCE(ret);
 	kernel_physical_mapping_remove(start, start + size);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c68b03fd87bd..eff460ba3ab1 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -109,8 +109,10 @@  static inline bool movable_node_is_enabled(void)
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern int arch_remove_memory(int nid, u64 start, u64 size,
 		struct vmem_altmap *altmap);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
+extern int __remove_pages(int nid, unsigned long start_pfn,
 	unsigned long nr_pages, struct vmem_altmap *altmap);
+extern void shrink_pages(struct zone *zone, unsigned long start_pfn,
+					unsigned long nr_pages);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /* reasonably generic interface to expand the physical pages */
@@ -333,7 +335,7 @@  extern bool is_memblock_offlined(struct memory_block *mem);
 extern void remove_memory(int nid, u64 start, u64 size);
 extern int sparse_add_one_section(struct pglist_data *pgdat,
 		unsigned long start_pfn, struct vmem_altmap *altmap);
-extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+extern void sparse_remove_one_section(int nid, struct mem_section *ms,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7a832b844f24..2057af5c1c4f 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -121,6 +121,7 @@  static void devm_memremap_pages_release(void *data)
 	struct resource *res = &pgmap->res;
 	resource_size_t align_start, align_size;
 	unsigned long pfn;
+	struct page *page;
 	int nid;
 
 	for_each_device_pfn(pfn, pgmap)
@@ -138,6 +139,10 @@  static void devm_memremap_pages_release(void *data)
 	nid = dev_to_node(dev);
 
 	mem_hotplug_begin();
+	page = pfn_to_page(pfn);
+	if (pgmap->altmap_valid)
+		page += vmem_altmap_offset(&pgmap->altmap);
+	shrink_pages(page_zone(page), pfn, align_size >> PAGE_SHIFT);
 	arch_remove_memory(nid, align_start, align_size, pgmap->altmap_valid ?
 			&pgmap->altmap : NULL);
 	kasan_remove_zero_shadow(__va(align_start), align_size);
diff --git a/mm/hmm.c b/mm/hmm.c
index 21787e480b4a..b2f2dcadb5fb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1012,7 +1012,7 @@  static void hmm_devmem_release(struct device *dev, void *data)
 
 	mem_hotplug_begin();
 	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
-		__remove_pages(zone, start_pfn, npages, NULL);
+		__remove_pages(nid, start_pfn, npages, NULL);
 	else
 		arch_remove_memory(nid, start_pfn << PAGE_SHIFT,
 				   npages << PAGE_SHIFT, NULL);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9bd629944c91..e33555651e46 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -320,12 +320,10 @@  static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
 				     unsigned long start_pfn,
 				     unsigned long end_pfn)
 {
-	struct mem_section *ms;
-
 	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
-		ms = __pfn_to_section(start_pfn);
+		struct mem_section *ms = __pfn_to_section(start_pfn);
 
-		if (unlikely(!valid_section(ms)))
+		if (unlikely(!online_section(ms)))
 			continue;
 
 		if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -345,15 +343,14 @@  static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
 				    unsigned long start_pfn,
 				    unsigned long end_pfn)
 {
-	struct mem_section *ms;
 	unsigned long pfn;
 
 	/* pfn is the end pfn of a memory section. */
 	pfn = end_pfn - 1;
 	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
-		ms = __pfn_to_section(pfn);
+		struct mem_section *ms = __pfn_to_section(pfn);
 
-		if (unlikely(!valid_section(ms)))
+		if (unlikely(!online_section(ms)))
 			continue;
 
 		if (unlikely(pfn_to_nid(pfn) != nid))
@@ -415,7 +412,7 @@  static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
 		ms = __pfn_to_section(pfn);
 
-		if (unlikely(!valid_section(ms)))
+		if (unlikely(!online_section(ms)))
 			continue;
 
 		if (page_zone(pfn_to_page(pfn)) != zone)
@@ -483,7 +480,7 @@  static void shrink_pgdat_span(struct pglist_data *pgdat,
 	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
 		ms = __pfn_to_section(pfn);
 
-		if (unlikely(!valid_section(ms)))
+		if (unlikely(!online_section(ms)))
 			continue;
 
 		if (pfn_to_nid(pfn) != nid)
@@ -502,19 +499,32 @@  static void shrink_pgdat_span(struct pglist_data *pgdat,
 	pgdat->node_spanned_pages = 0;
 }
 
-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+static void __shrink_pages(struct zone *zone, unsigned long start_pfn,
+						unsigned long end_pfn,
+						unsigned long offlined_pages)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
 	unsigned long flags;
+	unsigned long pfn;
 
-	pgdat_resize_lock(zone->zone_pgdat, &flags);
-	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
-	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
-	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+	clear_zone_contiguous(zone);
+	pgdat_resize_lock(pgdat, &flags);
+	for(pfn = start_pfn; pfn < end_pfn; pfn += nr_pages) {
+		shrink_zone_span(zone, pfn, pfn + nr_pages);
+		shrink_pgdat_span(pgdat, pfn, pfn + nr_pages);
+	}
+	pgdat->node_present_pages -= offlined_pages;
+	pgdat_resize_unlock(pgdat, &flags);
+	set_zone_contiguous(zone);
 }
 
-static int __remove_section(struct zone *zone, struct mem_section *ms,
+void shrink_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages)
+{
+	__shrink_pages(zone, start_pfn, start_pfn + nr_pages, nr_pages);
+}
+
+static int __remove_section(int nid, struct mem_section *ms,
 		unsigned long map_offset, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn;
@@ -530,15 +540,14 @@  static int __remove_section(struct zone *zone, struct mem_section *ms,
 
 	scn_nr = __section_nr(ms);
 	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
-	__remove_zone(zone, start_pfn);
 
-	sparse_remove_one_section(zone, ms, map_offset, altmap);
+	sparse_remove_one_section(nid, ms, map_offset, altmap);
 	return 0;
 }
 
 /**
  * __remove_pages() - remove sections of pages from a zone
- * @zone: zone from which pages need to be removed
+ * @nid: node which pages belong to
  * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
  * @nr_pages: number of pages to remove (must be multiple of section size)
  * @altmap: alternative device page map or %NULL if default memmap is used
@@ -548,7 +557,7 @@  static int __remove_section(struct zone *zone, struct mem_section *ms,
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+int __remove_pages(int nid, unsigned long phys_start_pfn,
 		 unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	unsigned long i;
@@ -556,10 +565,9 @@  int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	int sections_to_remove, ret = 0;
 
 	/* In the ZONE_DEVICE case device driver owns the memory region */
-	if (is_dev_zone(zone)) {
-		if (altmap)
-			map_offset = vmem_altmap_offset(altmap);
-	} else {
+	if (altmap)
+		map_offset = vmem_altmap_offset(altmap);
+	else {
 		resource_size_t start, size;
 
 		start = phys_start_pfn << PAGE_SHIFT;
@@ -575,8 +583,6 @@  int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 		}
 	}
 
-	clear_zone_contiguous(zone);
-
 	/*
 	 * We can only remove entire sections
 	 */
@@ -587,15 +593,13 @@  int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	for (i = 0; i < sections_to_remove; i++) {
 		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
 
-		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
+		ret = __remove_section(nid, __pfn_to_section(pfn), map_offset,
 				altmap);
 		map_offset = 0;
 		if (ret)
 			break;
 	}
 
-	set_zone_contiguous(zone);
-
 	return ret;
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -1595,7 +1599,6 @@  static int __ref __offline_pages(unsigned long start_pfn,
 	unsigned long pfn, nr_pages;
 	long offlined_pages;
 	int ret, node;
-	unsigned long flags;
 	unsigned long valid_start, valid_end;
 	struct zone *zone;
 	struct memory_notify arg;
@@ -1667,9 +1670,7 @@  static int __ref __offline_pages(unsigned long start_pfn,
 	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
 	zone->present_pages -= offlined_pages;
 
-	pgdat_resize_lock(zone->zone_pgdat, &flags);
-	zone->zone_pgdat->node_present_pages -= offlined_pages;
-	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+	__shrink_pages(zone, valid_start, valid_end, offlined_pages);
 
 	init_per_zone_wmark_min();
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 10b07eea9a6e..016020bd20b5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -766,12 +766,12 @@  static void free_section_usemap(struct page *memmap, unsigned long *usemap,
 		free_map_bootmem(memmap);
 }
 
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+void sparse_remove_one_section(int nid, struct mem_section *ms,
 		unsigned long map_offset, struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
 	unsigned long *usemap = NULL, flags;
-	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct pglist_data *pgdat = NODE_DATA(nid);
 
 	pgdat_resize_lock(pgdat, &flags);
 	if (ms->section_mem_map) {