[v5,4/4] mm: Defer ZONE_DEVICE page initialization to the point where we init pgmap
diff mbox series

Message ID 20180925202053.3576.66039.stgit@localhost.localdomain
State New, archived
Headers show
Series
  • Address issues slowing persistent memory initialization
Related show

Commit Message

Alexander Duyck Sept. 25, 2018, 8:21 p.m. UTC
The ZONE_DEVICE pages were being initialized in two locations. One was with
the memory_hotplug lock held and another was outside of that lock. The
problem with this is that it was nearly doubling the memory initialization
time. Instead of doing this twice, once while holding a global lock and
once without, I am opting to defer the initialization to the one outside of
the lock. This allows us to avoid serializing the overhead for memory init
and we can instead focus on per-node init times.

One issue I encountered is that devm_memremap_pages and
hmm_devmem_pages_create were initializing only the pgmap field the same
way. One wasn't initializing hmm_data, and the other was initializing it to
a poison value. Since this is something that is exposed to the driver in
the case of hmm I am opting for a third option and just initializing
hmm_data to 0 since this is going to be exposed to unknown third party
drivers.

Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---

v4: Moved memmap_init_zone_device to below memmap_init_zone to avoid
    merge conflicts with other changes in the kernel.
v5: No change

 include/linux/mm.h |    2 +
 kernel/memremap.c  |   24 +++++---------
 mm/hmm.c           |   12 ++++---
 mm/page_alloc.c    |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 107 insertions(+), 23 deletions(-)

Comments

Michal Hocko Sept. 26, 2018, 7:55 a.m. UTC | #1
On Tue 25-09-18 13:21:24, Alexander Duyck wrote:
> The ZONE_DEVICE pages were being initialized in two locations. One was with
> the memory_hotplug lock held and another was outside of that lock. The
> problem with this is that it was nearly doubling the memory initialization
> time. Instead of doing this twice, once while holding a global lock and
> once without, I am opting to defer the initialization to the one outside of
> the lock. This allows us to avoid serializing the overhead for memory init
> and we can instead focus on per-node init times.
> 
> One issue I encountered is that devm_memremap_pages and
> hmm_devmmem_pages_create were initializing only the pgmap field the same
> way. One wasn't initializing hmm_data, and the other was initializing it to
> a poison value. Since this is something that is exposed to the driver in
> the case of hmm I am opting for a third option and just initializing
> hmm_data to 0 since this is going to be exposed to unknown third party
> drivers.

Why cannot you pull move_pfn_range_to_zone out of the hotplug lock? In
other words why are you making zone device even more special in the
generic hotplug code when it already has its own means to initialize the
pfn range by calling move_pfn_range_to_zone. Not to mention the code
duplication.

That being said I really dislike this patch.

> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
> 
> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
>     merge conflicts with other changes in the kernel.
> v5: No change
> 
>  include/linux/mm.h |    2 +
>  kernel/memremap.c  |   24 +++++---------
>  mm/hmm.c           |   12 ++++---
>  mm/page_alloc.c    |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  4 files changed, 107 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 06d7d7576f8d..7312fb78ef31 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page)
>  {
>  	return page_zonenum(page) == ZONE_DEVICE;
>  }
> +extern void memmap_init_zone_device(struct zone *, unsigned long,
> +				    unsigned long, struct dev_pagemap *);
>  #else
>  static inline bool is_zone_device_page(const struct page *page)
>  {
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 5b8600d39931..d0c32e473f82 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	struct vmem_altmap *altmap = pgmap->altmap_valid ?
>  			&pgmap->altmap : NULL;
>  	struct resource *res = &pgmap->res;
> -	unsigned long pfn, pgoff, order;
> +	struct dev_pagemap *conflict_pgmap;
>  	pgprot_t pgprot = PAGE_KERNEL;
> +	unsigned long pgoff, order;
>  	int error, nid, is_ram;
> -	struct dev_pagemap *conflict_pgmap;
>  
>  	align_start = res->start & ~(SECTION_SIZE - 1);
>  	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
> @@ -256,19 +256,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	if (error)
>  		goto err_add_memory;
>  
> -	for_each_device_pfn(pfn, pgmap) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		/*
> -		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
> -		 * pointer.  It is a bug if a ZONE_DEVICE page is ever
> -		 * freed or placed on a driver-private list.  Seed the
> -		 * storage with LIST_POISON* values.
> -		 */
> -		list_del(&page->lru);
> -		page->pgmap = pgmap;
> -		percpu_ref_get(pgmap->ref);
> -	}
> +	/*
> +	 * Initialization of the pages has been deferred until now in order
> +	 * to allow us to do the work while not holding the hotplug lock.
> +	 */
> +	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> +				align_start >> PAGE_SHIFT,
> +				align_size >> PAGE_SHIFT, pgmap);
>  
>  	devm_add_action(dev, devm_memremap_pages_release, pgmap);
>  
> diff --git a/mm/hmm.c b/mm/hmm.c
> index c968e49f7a0c..774d684fa2b4 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
>  	resource_size_t key, align_start, align_size, align_end;
>  	struct device *device = devmem->device;
>  	int ret, nid, is_ram;
> -	unsigned long pfn;
>  
>  	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
>  	align_size = ALIGN(devmem->resource->start +
> @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
>  				align_size >> PAGE_SHIFT, NULL);
>  	mem_hotplug_done();
>  
> -	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
> -		struct page *page = pfn_to_page(pfn);
> +	/*
> +	 * Initialization of the pages has been deferred until now in order
> +	 * to allow us to do the work while not holding the hotplug lock.
> +	 */
> +	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> +				align_start >> PAGE_SHIFT,
> +				align_size >> PAGE_SHIFT, &devmem->pagemap);
>  
> -		page->pgmap = &devmem->pagemap;
> -	}
>  	return 0;
>  
>  error_add_memory:
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 926ad3083b28..7ec0997ded39 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5489,12 +5489,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  	if (highest_memmap_pfn < end_pfn - 1)
>  		highest_memmap_pfn = end_pfn - 1;
>  
> +#ifdef CONFIG_ZONE_DEVICE
>  	/*
>  	 * Honor reservation requested by the driver for this ZONE_DEVICE
> -	 * memory
> +	 * memory. We limit the total number of pages to initialize to just
> +	 * those that might contain the memory mapping. We will defer the
> +	 * ZONE_DEVICE page initialization until after we have released
> +	 * the hotplug lock.
>  	 */
> -	if (altmap && start_pfn == altmap->base_pfn)
> -		start_pfn += altmap->reserve;
> +	if (zone == ZONE_DEVICE) {
> +		if (!altmap)
> +			return;
> +
> +		if (start_pfn == altmap->base_pfn)
> +			start_pfn += altmap->reserve;
> +		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
> +	}
> +#endif
>  
>  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>  		/*
> @@ -5538,6 +5549,81 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  	}
>  }
>  
> +#ifdef CONFIG_ZONE_DEVICE
> +void __ref memmap_init_zone_device(struct zone *zone,
> +				   unsigned long start_pfn,
> +				   unsigned long size,
> +				   struct dev_pagemap *pgmap)
> +{
> +	unsigned long pfn, end_pfn = start_pfn + size;
> +	struct pglist_data *pgdat = zone->zone_pgdat;
> +	unsigned long zone_idx = zone_idx(zone);
> +	unsigned long start = jiffies;
> +	int nid = pgdat->node_id;
> +
> +	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
> +		return;
> +
> +	/*
> +	 * The call to memmap_init_zone should have already taken care
> +	 * of the pages reserved for the memmap, so we can just jump to
> +	 * the end of that region and start processing the device pages.
> +	 */
> +	if (pgmap->altmap_valid) {
> +		struct vmem_altmap *altmap = &pgmap->altmap;
> +
> +		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
> +		size = end_pfn - start_pfn;
> +	}
> +
> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +		struct page *page = pfn_to_page(pfn);
> +
> +		__init_single_page(page, pfn, zone_idx, nid);
> +
> +		/*
> +		 * Mark page reserved as it will need to wait for onlining
> +		 * phase for it to be fully associated with a zone.
> +		 *
> +		 * We can use the non-atomic __set_bit operation for setting
> +		 * the flag as we are still initializing the pages.
> +		 */
> +		__SetPageReserved(page);
> +
> +		/*
> +		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
> +		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
> +		 * page is ever freed or placed on a driver-private list.
> +		 */
> +		page->pgmap = pgmap;
> +		page->hmm_data = 0;
> +
> +		/*
> +		 * Mark the block movable so that blocks are reserved for
> +		 * movable at startup. This will force kernel allocations
> +		 * to reserve their blocks rather than leaking throughout
> +		 * the address space during boot when many long-lived
> +		 * kernel allocations are made.
> +		 *
> +		 * bitmap is created for zone's valid pfn range. but memmap
> +		 * can be created for invalid pages (for alignment)
> +		 * check here not to call set_pageblock_migratetype() against
> +		 * pfn out of zone.
> +		 *
> +		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
> +		 * because this is done early in sparse_add_one_section
> +		 */
> +		if (!(pfn & (pageblock_nr_pages - 1))) {
> +			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
> +			cond_resched();
> +		}
> +	}
> +
> +	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
> +		size, jiffies_to_msecs(jiffies - start));
> +}
> +
> +#endif
>  static void __meminit zone_init_free_lists(struct zone *zone)
>  {
>  	unsigned int order, t;
>
Alexander Duyck Sept. 26, 2018, 6:25 p.m. UTC | #2
On 9/26/2018 12:55 AM, Michal Hocko wrote:
> On Tue 25-09-18 13:21:24, Alexander Duyck wrote:
>> The ZONE_DEVICE pages were being initialized in two locations. One was with
>> the memory_hotplug lock held and another was outside of that lock. The
>> problem with this is that it was nearly doubling the memory initialization
>> time. Instead of doing this twice, once while holding a global lock and
>> once without, I am opting to defer the initialization to the one outside of
>> the lock. This allows us to avoid serializing the overhead for memory init
>> and we can instead focus on per-node init times.
>>
>> One issue I encountered is that devm_memremap_pages and
>> hmm_devmmem_pages_create were initializing only the pgmap field the same
>> way. One wasn't initializing hmm_data, and the other was initializing it to
>> a poison value. Since this is something that is exposed to the driver in
>> the case of hmm I am opting for a third option and just initializing
>> hmm_data to 0 since this is going to be exposed to unknown third party
>> drivers.
> 
> Why cannot you pull move_pfn_range_to_zone out of the hotplug lock? In
> other words why are you making zone device even more special in the
> generic hotplug code when it already has its own means to initialize the
> pfn range by calling move_pfn_range_to_zone. Not to mention the code
> duplication.

So there were a few things I wasn't sure we could pull outside of the 
hotplug lock. One specific example is the bits related to resizing the 
pgdat and zone. I wanted to avoid pulling those bits outside of the 
hotplug lock.

The other bit that I left inside the hot-plug lock with this approach 
was the initialization of the pages that contain the vmemmap.

> That being said I really dislike this patch.

In my mind this was a patch that "killed two birds with one stone". I 
had two issues to address, the first one being the fact that we were 
performing the memmap_init_zone while holding the hotplug lock, and the 
other being the loop that was going through and initializing pgmap in 
the hmm and memremap calls essentially added another 20 seconds 
(measured for 3TB of memory per node) to the init time. With this patch 
I was able to cut my init time per node by that 20 seconds, and then 
made it so that we could scale as we added nodes as they could run in 
parallel.

With that said I am open to suggestions if you still feel like I need to 
follow this up with some additional work. I just want to avoid 
introducing any regressions in regards to functionality or performance.

Thanks.

- Alex
Dan Williams Sept. 26, 2018, 6:52 p.m. UTC | #3
On Wed, Sep 26, 2018 at 11:25 AM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
>
>
> On 9/26/2018 12:55 AM, Michal Hocko wrote:
> > On Tue 25-09-18 13:21:24, Alexander Duyck wrote:
> >> The ZONE_DEVICE pages were being initialized in two locations. One was with
> >> the memory_hotplug lock held and another was outside of that lock. The
> >> problem with this is that it was nearly doubling the memory initialization
> >> time. Instead of doing this twice, once while holding a global lock and
> >> once without, I am opting to defer the initialization to the one outside of
> >> the lock. This allows us to avoid serializing the overhead for memory init
> >> and we can instead focus on per-node init times.
> >>
> >> One issue I encountered is that devm_memremap_pages and
> >> hmm_devmmem_pages_create were initializing only the pgmap field the same
> >> way. One wasn't initializing hmm_data, and the other was initializing it to
> >> a poison value. Since this is something that is exposed to the driver in
> >> the case of hmm I am opting for a third option and just initializing
> >> hmm_data to 0 since this is going to be exposed to unknown third party
> >> drivers.
> >
> > Why cannot you pull move_pfn_range_to_zone out of the hotplug lock? In
> > other words why are you making zone device even more special in the
> > generic hotplug code when it already has its own means to initialize the
> > pfn range by calling move_pfn_range_to_zone. Not to mention the code
> > duplication.
>
> So there were a few things I wasn't sure we could pull outside of the
> hotplug lock. One specific example is the bits related to resizing the
> pgdat and zone. I wanted to avoid pulling those bits outside of the
> hotplug lock.
>
> The other bit that I left inside the hot-plug lock with this approach
> was the initialization of the pages that contain the vmemmap.
>
> > That being said I really dislike this patch.
>
> In my mind this was a patch that "killed two birds with one stone". I
> had two issues to address, the first one being the fact that we were
> performing the memmap_init_zone while holding the hotplug lock, and the
> other being the loop that was going through and initializing pgmap in
> the hmm and memremap calls essentially added another 20 seconds
> (measured for 3TB of memory per node) to the init time. With this patch
> I was able to cut my init time per node by that 20 seconds, and then
> made it so that we could scale as we added nodes as they could run in
> parallel.

Yeah, at the very least there is no reason for devm_memremap_pages()
to do another loop through all pages, the core should handle this, but
cleaning up the scope of the hotplug lock is needed.

> With that said I am open to suggestions if you still feel like I need to
> follow this up with some additional work. I just want to avoid
> introducing any regressions in regards to functionality or performance.

Could we push the hotplug lock deeper to the places that actually need
it? What I found with my initial investigation is that we don't even
need the hotplug lock for the vmemmap initialization with this patch
[1].

Alternatively it seems the hotplug lock wants to synchronize changes
to the zone and the page init work. If the hotplug lock was an rwsem
the zone changes would be a write lock, but the init work could be
done as a read lock to allow parallelism. I.e. still provide a sync
point to be able to assert that no hotplug work is in-flight will
holding the write lock, but otherwise allow threads that are touching
independent parts of the memmap to run at the same time.

[1]: https://patchwork.kernel.org/patch/10527229/ just focus on the
mm/sparse-vmemmap.c changes at the end.
Michal Hocko Sept. 27, 2018, 11:09 a.m. UTC | #4
On Wed 26-09-18 11:25:37, Alexander Duyck wrote:
> 
> 
> On 9/26/2018 12:55 AM, Michal Hocko wrote:
> > On Tue 25-09-18 13:21:24, Alexander Duyck wrote:
> > > The ZONE_DEVICE pages were being initialized in two locations. One was with
> > > the memory_hotplug lock held and another was outside of that lock. The
> > > problem with this is that it was nearly doubling the memory initialization
> > > time. Instead of doing this twice, once while holding a global lock and
> > > once without, I am opting to defer the initialization to the one outside of
> > > the lock. This allows us to avoid serializing the overhead for memory init
> > > and we can instead focus on per-node init times.
> > > 
> > > One issue I encountered is that devm_memremap_pages and
> > > hmm_devmmem_pages_create were initializing only the pgmap field the same
> > > way. One wasn't initializing hmm_data, and the other was initializing it to
> > > a poison value. Since this is something that is exposed to the driver in
> > > the case of hmm I am opting for a third option and just initializing
> > > hmm_data to 0 since this is going to be exposed to unknown third party
> > > drivers.
> > 
> > Why cannot you pull move_pfn_range_to_zone out of the hotplug lock? In
> > other words why are you making zone device even more special in the
> > generic hotplug code when it already has its own means to initialize the
> > pfn range by calling move_pfn_range_to_zone. Not to mention the code
> > duplication.
> 
> So there were a few things I wasn't sure we could pull outside of the
> hotplug lock. One specific example is the bits related to resizing the pgdat
> and zone. I wanted to avoid pulling those bits outside of the hotplug lock.

Why would that be a problem. There are dedicated locks for resizing.

> The other bit that I left inside the hot-plug lock with this approach was
> the initialization of the pages that contain the vmemmap.

Again, why this is needed?

> > That being said I really dislike this patch.
> 
> In my mind this was a patch that "killed two birds with one stone". I had
> two issues to address, the first one being the fact that we were performing
> the memmap_init_zone while holding the hotplug lock, and the other being the
> loop that was going through and initializing pgmap in the hmm and memremap
> calls essentially added another 20 seconds (measured for 3TB of memory per
> node) to the init time. With this patch I was able to cut my init time per
> node by that 20 seconds, and then made it so that we could scale as we added
> nodes as they could run in parallel.
> 
> With that said I am open to suggestions if you still feel like I need to
> follow this up with some additional work. I just want to avoid introducing
> any regressions in regards to functionality or performance.

Yes, I really do prefer this to be done properly rather than tweak it
around because of uncertainties.
Michal Hocko Sept. 27, 2018, 11:20 a.m. UTC | #5
On Wed 26-09-18 11:52:56, Dan Williams wrote:
[...]
> Could we push the hotplug lock deeper to the places that actually need
> it? What I found with my initial investigation is that we don't even
> need the hotplug lock for the vmemmap initialization with this patch
> [1].

Yes, the scope of the hotplug lock should be evaluated and _documented_.
Then we can build on top.
Oscar Salvador Sept. 27, 2018, 12:25 p.m. UTC | #6
On Thu, Sep 27, 2018 at 01:09:26PM +0200, Michal Hocko wrote:
> > So there were a few things I wasn't sure we could pull outside of the
> > hotplug lock. One specific example is the bits related to resizing the pgdat
> > and zone. I wanted to avoid pulling those bits outside of the hotplug lock.
> 
> Why would that be a problem. There are dedicated locks for resizing.

True is that move_pfn_range_to_zone() manages the locks for pgdat/zone resizing,
but it also takes care of calling init_currently_empty_zone() in case the zone is empty.
Could not that be a problem if we take move_pfn_range_to_zone() out of the lock?
Oscar Salvador Sept. 27, 2018, 12:32 p.m. UTC | #7
On Wed, Sep 26, 2018 at 11:25:37AM -0700, Alexander Duyck wrote:
> With that said I am open to suggestions if you still feel like I need to
> follow this up with some additional work. I just want to avoid introducing
> any regressions in regards to functionality or performance.

Hi Alexander,

the problem I see is that devm/hmm is using some of the memory-hotplug 
features, but their paths are becoming more and more diverged with changes
like this, and that is sometimes a problem when we need to change
something in the generic memory-hotplug code.

E.g: I am trying to fix two issues in the memory-hotplug where we can
access stale pages if we hot-remove memory before online it.
That was not so difficult to fix, but I really struggled with the exceptions
that HMM/devm represent in this regard, for instance, regarding the resources.

The RFCv2 can be found here [1] https://patchwork.kernel.org/patch/10569083/
And the initial discussion with Jerome Glisse can be found here [2].

So it would be great to stick to the memory-hotplug path as much as possible,
otherwise when a problem arises, we need to think how we can workaround
HMM/devm.

[1] https://patchwork.kernel.org/patch/10569083/
[2] https://patchwork.kernel.org/patch/10558725/
Michal Hocko Sept. 27, 2018, 1:13 p.m. UTC | #8
On Thu 27-09-18 14:25:37, Oscar Salvador wrote:
> On Thu, Sep 27, 2018 at 01:09:26PM +0200, Michal Hocko wrote:
> > > So there were a few things I wasn't sure we could pull outside of the
> > > hotplug lock. One specific example is the bits related to resizing the pgdat
> > > and zone. I wanted to avoid pulling those bits outside of the hotplug lock.
> > 
> > Why would that be a problem. There are dedicated locks for resizing.
> 
> True is that move_pfn_range_to_zone() manages the locks for pgdat/zone resizing,
> but it also takes care of calling init_currently_empty_zone() in case the zone is empty.
> Could not that be a problem if we take move_pfn_range_to_zone() out of the lock?

I would have to double check but is the hotplug lock really serializing
access to the state initialized by init_currently_empty_zone? E.g.
zone_start_pfn is a nice example of a state that is used outside of the
lock. zone's free lists are similar. So do we really need the hoptlug
lock? And more broadly, what is the hotplug lock supposed to
serialize in general? A proper documentation would surely help to answer
these questions. There is way too much of "do not touch this code and
just make my particular hack" mindset which made the whole memory
hotplug a giant pile of mess. We really should start with some proper
engineering here finally.
Oscar Salvador Sept. 27, 2018, 2:50 p.m. UTC | #9
On Thu, Sep 27, 2018 at 03:13:29PM +0200, Michal Hocko wrote:
> On Thu 27-09-18 14:25:37, Oscar Salvador wrote:
> > On Thu, Sep 27, 2018 at 01:09:26PM +0200, Michal Hocko wrote:
> > > > So there were a few things I wasn't sure we could pull outside of the
> > > > hotplug lock. One specific example is the bits related to resizing the pgdat
> > > > and zone. I wanted to avoid pulling those bits outside of the hotplug lock.
> > > 
> > > Why would that be a problem. There are dedicated locks for resizing.
> > 
> > True is that move_pfn_range_to_zone() manages the locks for pgdat/zone resizing,
> > but it also takes care of calling init_currently_empty_zone() in case the zone is empty.
> > Could not that be a problem if we take move_pfn_range_to_zone() out of the lock?
> 
> I would have to double check but is the hotplug lock really serializing
> access to the state initialized by init_currently_empty_zone? E.g.
> zone_start_pfn is a nice example of a state that is used outside of the
> lock. zone's free lists are similar. So do we really need the hoptlug
> lock? And more broadly, what does the hotplug lock is supposed to
> serialize in general. A proper documentation would surely help to answer
> these questions. There is way too much of "do not touch this code and
> just make my particular hack" mindset which made the whole memory
> hotplug a giant pile of mess. We really should start with some proper
> engineering here finally.

CC David

David has been looking into this lately, he even has updated memory-hotplug.txt
with some more documentation about the locking aspect [1].
And with this change [2], the hotplug lock has been moved
to the online/offline_pages.

From what I see (I might be wrong), the hotplug lock is there
to serialize the online/offline operations.

In online_pages, we do (among other things):

a) initialize the zone and its pages, and link them to the zone
b) re-adjust zone/pgdat nr of pages (present, spanned, managed)
b) check if the node changes in regard of N_MEMORY, N_HIGH_MEMORY or N_NORMAL_MEMORY.
c) fire notifiers
d) rebuild the zonelists in case we got a new zone
e) online memory sections and free the pages to the buddy allocator
f) wake up kswapd/kcompactd in case we got a new node

while in offline_pages we do the opposite.

Hotplug lock here serializes the operations as a whole, online and offline memory,
so they do not step on each other's feet.

Having said that, we might be able to move some of those operations out of the hotplug lock.
The device_hotplug_lock coming from every memblock (which is taken in device_online/device_offline) should protect
us against some operations being made on the same memblock (e.g: touching the same pages).

For the given case about move_pfn_range_to_zone() I have my doubts.
The resizing of the pgdat/zone is already serialized, so no discussion there.

Then we have memmap_init_zone. I __think__ that that should not worry us because those pages
belong to a memblock, and device_online/device_offline is serialized.
HMM/devm is different here as they do not handle memblocks.

And then we have init_currently_empty_zone.
Assuming that the shrinking of pages/removal of the zone is finally brought to
the offline stage (where it should be), I am not sure if we can somehow
race with an offline operation there if we do not hold the hotplug lock.

[1] https://patchwork.kernel.org/patch/10617715/
[2] https://patchwork.kernel.org/patch/10617705/
David Hildenbrand Sept. 27, 2018, 3:41 p.m. UTC | #10
On 27/09/2018 16:50, Oscar Salvador wrote:
> On Thu, Sep 27, 2018 at 03:13:29PM +0200, Michal Hocko wrote:
>> On Thu 27-09-18 14:25:37, Oscar Salvador wrote:
>>> On Thu, Sep 27, 2018 at 01:09:26PM +0200, Michal Hocko wrote:
>>>>> So there were a few things I wasn't sure we could pull outside of the
>>>>> hotplug lock. One specific example is the bits related to resizing the pgdat
>>>>> and zone. I wanted to avoid pulling those bits outside of the hotplug lock.
>>>>
>>>> Why would that be a problem. There are dedicated locks for resizing.
>>>
>>> True is that move_pfn_range_to_zone() manages the locks for pgdat/zone resizing,
>>> but it also takes care of calling init_currently_empty_zone() in case the zone is empty.
>>> Could not that be a problem if we take move_pfn_range_to_zone() out of the lock?
>>
>> I would have to double check but is the hotplug lock really serializing
>> access to the state initialized by init_currently_empty_zone? E.g.
>> zone_start_pfn is a nice example of a state that is used outside of the
>> lock. zone's free lists are similar. So do we really need the hoptlug
>> lock? And more broadly, what does the hotplug lock is supposed to
>> serialize in general. A proper documentation would surely help to answer
>> these questions. There is way too much of "do not touch this code and
>> just make my particular hack" mindset which made the whole memory
>> hotplug a giant pile of mess. We really should start with some proper
>> engineering here finally.
> 
> CC David
> 
> David has been looking into this lately, he even has updated memory-hotplug.txt
> with some more documentation about the locking aspect [1].
> And with this change [2], the hotplug lock has been moved
> to the online/offline_pages.
> 
> From what I see (I might be wrong), the hotplug lock is there
> to serialize the online/offline operations.

mem_hotplug_lock is especially relevant for users of
get_online_mems/put_online_mems. Whatever affects them, you can't move
out of the lock.

Everything else is theoretically serialized via device_hotplug_lock now.

> 
> In online_pages, we do (among other things):
> 
> a) initialize the zone and its pages, and link them to the zone
> b) re-adjust zone/pgdat nr of pages (present, spanned, managed)
> b) check if the node changes in regard of N_MEMORY, N_HIGH_MEMORY or N_NORMAL_MEMORY.
> c) fire notifiers
> d) rebuild the zonelists in case we got a new zone
> e) online memory sections and free the pages to the buddy allocator
> f) wake up kswapd/kcompactd in case we got a new node
> 
> while in offline_pages we do the opposite.
> 
> Hotplug lock here serializes the operations as a whole, online and offline memory,
> so they do not step on each other's feet.
> 
> Having said that, we might be able to move some of those operations out of the hotplug lock.
> The device_hotplug_lock coming from every memblock (which is taken in device_online/device_offline) should protect
> us against some operations being made on the same memblock (e.g: touching the same pages).

Yes, very right.
Oscar Salvador Sept. 28, 2018, 8:12 a.m. UTC | #11
On Thu, Sep 27, 2018 at 03:13:29PM +0200, Michal Hocko wrote:
> I would have to double check but is the hotplug lock really serializing
> access to the state initialized by init_currently_empty_zone? E.g.
> zone_start_pfn is a nice example of a state that is used outside of the
> lock. zone's free lists are similar. So do we really need the hoptlug
> lock? And more broadly, what does the hotplug lock is supposed to
> serialize in general. A proper documentation would surely help to answer
> these questions. There is way too much of "do not touch this code and
> just make my particular hack" mindset which made the whole memory
> hotplug a giant pile of mess. We really should start with some proper
> engineering here finally.

* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path.  But, it is written
* quite infrequently.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/end(). Any reader who can't tolerant drift of
* present_pages should get_online_mems() to get a stable value.

IIUC, looks like zone_start_pfn should be envolved with
zone_span_writelock/zone_span_writeunlock, and since zone_start_pfn is changed
in init_currently_empty_zone, I guess that the whole function should be within
that lock.

So, a blind shot, but could we do something like the following?

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 898e1f816821..49f87252f1b1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -764,14 +764,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
        int nid = pgdat->node_id;
        unsigned long flags;
 
-       if (zone_is_empty(zone))
-               init_currently_empty_zone(zone, start_pfn, nr_pages);
-
        clear_zone_contiguous(zone);
 
        /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
        pgdat_resize_lock(pgdat, &flags);
        zone_span_writelock(zone);
+       if (zone_is_empty(zone))
+               init_currently_empty_zone(zone, start_pfn, nr_pages);
        resize_zone_range(zone, start_pfn, nr_pages);
        zone_span_writeunlock(zone);
        resize_pgdat_range(pgdat, start_pfn, nr_pages);

Then, we could take move_pfn_range_to_zone out of the hotplug lock.

Although I am not sure about leaving memmap_init_zone unprotected.
For the normal memory, that is not a problem since the memblock's lock
protects us from touching the same pages at the same time in online/offline_pages,
but for HMM/devm the story is different.

I am totally unaware of HMM/devm, so I am not sure if its protected somehow.
e.g: what happens if devm_memremap_pages and devm_memremap_pages_release are running
at the same time for the same memory-range (with the assumption that the hotplug-lock
does not protect move_pfn_range_to_zone anymore).
Oscar Salvador Sept. 28, 2018, 8:44 a.m. UTC | #12
On Fri, Sep 28, 2018 at 10:12:24AM +0200, Oscar Salvador wrote:
> Although I am not sure about leaving memmap_init_zone unprotected.
> For the normal memory, that is not a problem since the memblock's lock
> protects us from touching the same pages at the same time in online/offline_pages,
> but for HMM/devm the story is different.
> 
> I am totally unaware of HMM/devm, so I am not sure if its protected somehow.
> e.g: what happens if devm_memremap_pages and devm_memremap_pages_release are running
> at the same time for the same memory-range (with the assumption that the hotplug-lock
> does not protect move_pfn_range_to_zone anymore).

I guess that this could not happen since the device is not linked to devm_memremap_pages_release
until the end with:

devm_add_action(dev, devm_memremap_pages_release, pgmap)
Dan Williams Sept. 28, 2018, 3:50 p.m. UTC | #13
On Fri, Sep 28, 2018 at 1:45 AM Oscar Salvador
<osalvador@techadventures.net> wrote:
>
> On Fri, Sep 28, 2018 at 10:12:24AM +0200, Oscar Salvador wrote:
> > Although I am not sure about leaving memmap_init_zone unprotected.
> > For the normal memory, that is not a problem since the memblock's lock
> > protects us from touching the same pages at the same time in online/offline_pages,
> > but for HMM/devm the story is different.
> >
> > I am totally unaware of HMM/devm, so I am not sure if its protected somehow.
> > e.g: what happens if devm_memremap_pages and devm_memremap_pages_release are running
> > at the same time for the same memory-range (with the assumption that the hotplug-lock
> > does not protect move_pfn_range_to_zone anymore).
>
> I guess that this could not happen since the device is not linked to devm_memremap_pages_release
> until the end with:
>
> devm_add_action(dev, devm_memremap_pages_release, pgmap)

It's a bug if devm_memremap_pages and devm_memremap_pages_release are
running simultaneously for the same range. This is enforced by the
requirement that the caller has done a successful request_region() on
the range before the call to map pages.
Dan Williams Oct. 8, 2018, 9:01 p.m. UTC | #14
On Tue, Sep 25, 2018 at 1:29 PM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> The ZONE_DEVICE pages were being initialized in two locations. One was with
> the memory_hotplug lock held and another was outside of that lock. The
> problem with this is that it was nearly doubling the memory initialization
> time. Instead of doing this twice, once while holding a global lock and
> once without, I am opting to defer the initialization to the one outside of
> the lock. This allows us to avoid serializing the overhead for memory init
> and we can instead focus on per-node init times.
>
> One issue I encountered is that devm_memremap_pages and
> hmm_devmmem_pages_create were initializing only the pgmap field the same
> way. One wasn't initializing hmm_data, and the other was initializing it to
> a poison value. Since this is something that is exposed to the driver in
> the case of hmm I am opting for a third option and just initializing
> hmm_data to 0 since this is going to be exposed to unknown third party
> drivers.
>
> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
>
> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
>     merge conflicts with other changes in the kernel.
> v5: No change

This patch appears to cause a regression in the "create.sh" unit test
in the ndctl test suite.

I tried to reproduce on -next with:

2302f5ee215e mm: defer ZONE_DEVICE page initialization to the point
where we init pgmap

...but -next does not even boot for me at that commit.

Here is a warning signature that precedes a hang with this patch
applied against v4.19-rc6:

percpu ref (blk_queue_usage_counter_release) <= 0 (-1530626) after
switching to atomic
WARNING: CPU: 24 PID: 7346 at lib/percpu-refcount.c:155
percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
CPU: 24 PID: 7346 Comm: modprobe Tainted: G           OE     4.19.0-rc6+ #2458
[..]
RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
[..]
Call Trace:
 <IRQ>
 ? percpu_ref_reinit+0x140/0x140
 rcu_process_callbacks+0x273/0x880
 __do_softirq+0xd2/0x428
 irq_exit+0xf6/0x100
 smp_apic_timer_interrupt+0xa2/0x220
 apic_timer_interrupt+0xf/0x20
 </IRQ>
RIP: 0010:lock_acquire+0xb8/0x1a0
[..]
 ? __put_page+0x55/0x150
 ? __put_page+0x55/0x150
 __put_page+0x83/0x150
 ? __put_page+0x55/0x150
 devm_memremap_pages_release+0x194/0x250
 release_nodes+0x17c/0x2c0
 device_release_driver_internal+0x1a2/0x250
 driver_detach+0x3a/0x70
 bus_remove_driver+0x58/0xd0
 __x64_sys_delete_module+0x13f/0x200
 ? trace_hardirqs_off_thunk+0x1a/0x1c
 do_syscall_64+0x60/0x210
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Alexander Duyck Oct. 8, 2018, 9:38 p.m. UTC | #15
On 10/8/2018 2:01 PM, Dan Williams wrote:
> On Tue, Sep 25, 2018 at 1:29 PM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
>>
>> The ZONE_DEVICE pages were being initialized in two locations. One was with
>> the memory_hotplug lock held and another was outside of that lock. The
>> problem with this is that it was nearly doubling the memory initialization
>> time. Instead of doing this twice, once while holding a global lock and
>> once without, I am opting to defer the initialization to the one outside of
>> the lock. This allows us to avoid serializing the overhead for memory init
>> and we can instead focus on per-node init times.
>>
>> One issue I encountered is that devm_memremap_pages and
>> hmm_devmmem_pages_create were initializing only the pgmap field the same
>> way. One wasn't initializing hmm_data, and the other was initializing it to
>> a poison value. Since this is something that is exposed to the driver in
>> the case of hmm I am opting for a third option and just initializing
>> hmm_data to 0 since this is going to be exposed to unknown third party
>> drivers.
>>
>> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>> ---
>>
>> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
>>      merge conflicts with other changes in the kernel.
>> v5: No change
> 
> This patch appears to cause a regression in the "create.sh" unit test
> in the ndctl test suite.

So all you had to do is run the create.sh script to see the issue? I 
just want to confirm there isn't any additional information needed 
before I try chasing this down.

> I tried to reproduce on -next with:
> 
> 2302f5ee215e mm: defer ZONE_DEVICE page initialization to the point
> where we init pgmap
> 
> ...but -next does not even boot for me at that commit.

What version of -next? There are a couple of patches probably needed 
depending on which version you are trying to boot.

> Here is a warning signature that proceeds a hang with this patch
> applied against v4.19-rc6:
> 
> percpu ref (blk_queue_usage_counter_release) <= 0 (-1530626) after
> switching to atomic
> WARNING: CPU: 24 PID: 7346 at lib/percpu-refcount.c:155
> percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
> CPU: 24 PID: 7346 Comm: modprobe Tainted: G           OE     4.19.0-rc6+ #2458
> [..]
> RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
> [..]
> Call Trace:
>   <IRQ>
>   ? percpu_ref_reinit+0x140/0x140
>   rcu_process_callbacks+0x273/0x880
>   __do_softirq+0xd2/0x428
>   irq_exit+0xf6/0x100
>   smp_apic_timer_interrupt+0xa2/0x220
>   apic_timer_interrupt+0xf/0x20
>   </IRQ>
> RIP: 0010:lock_acquire+0xb8/0x1a0
> [..]
>   ? __put_page+0x55/0x150
>   ? __put_page+0x55/0x150
>   __put_page+0x83/0x150
>   ? __put_page+0x55/0x150
>   devm_memremap_pages_release+0x194/0x250
>   release_nodes+0x17c/0x2c0
>   device_release_driver_internal+0x1a2/0x250
>   driver_detach+0x3a/0x70
>   bus_remove_driver+0x58/0xd0
>   __x64_sys_delete_module+0x13f/0x200
>   ? trace_hardirqs_off_thunk+0x1a/0x1c
>   do_syscall_64+0x60/0x210
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 

So it looks like we are tearing down memory when this is triggered. Do 
we know if this is at the end of the test or if this is running in 
parallel with anything?
Dan Williams Oct. 8, 2018, 10 p.m. UTC | #16
On Mon, Oct 8, 2018 at 2:48 PM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> On 10/8/2018 2:01 PM, Dan Williams wrote:
> > On Tue, Sep 25, 2018 at 1:29 PM Alexander Duyck
> > <alexander.h.duyck@linux.intel.com> wrote:
> >>
> >> The ZONE_DEVICE pages were being initialized in two locations. One was with
> >> the memory_hotplug lock held and another was outside of that lock. The
> >> problem with this is that it was nearly doubling the memory initialization
> >> time. Instead of doing this twice, once while holding a global lock and
> >> once without, I am opting to defer the initialization to the one outside of
> >> the lock. This allows us to avoid serializing the overhead for memory init
> >> and we can instead focus on per-node init times.
> >>
> >> One issue I encountered is that devm_memremap_pages and
> >> hmm_devmmem_pages_create were initializing only the pgmap field the same
> >> way. One wasn't initializing hmm_data, and the other was initializing it to
> >> a poison value. Since this is something that is exposed to the driver in
> >> the case of hmm I am opting for a third option and just initializing
> >> hmm_data to 0 since this is going to be exposed to unknown third party
> >> drivers.
> >>
> >> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
> >> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >> ---
> >>
> >> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
> >>      merge conflicts with other changes in the kernel.
> >> v5: No change
> >
> > This patch appears to cause a regression in the "create.sh" unit test
> > in the ndctl test suite.
>
> So all you had to do is run the create.sh script to see the issue? I
> just want to confirm there isn't any additional information needed
> before I try chasing this down.

From the ndctl source tree run:

    make -j TESTS="create.sh" check

...the readme has some more setup instructions:
https://github.com/pmem/ndctl/blob/master/README.md

0day has sometimes run this test suite automatically, but we need to
get that more robust because setting up this environment is a bit of a
hoop to jump through with the need to setup the nfit_test module.

> > I tried to reproduce on -next with:
> >
> > 2302f5ee215e mm: defer ZONE_DEVICE page initialization to the point
> > where we init pgmap
> >
> > ...but -next does not even boot for me at that commit.
>
> What version of -next? There are a couple of patches probably needed
> depending on which version you are trying to boot.

Today's -next, but backed up to that above commit. I was also seeing
CONFIG_DEBUG_LIST spamming the logs, and a crash in the crypto layer.

> > Here is a warning signature that proceeds a hang with this patch
> > applied against v4.19-rc6:
> >
> > percpu ref (blk_queue_usage_counter_release) <= 0 (-1530626) after
> > switching to atomic
> > WARNING: CPU: 24 PID: 7346 at lib/percpu-refcount.c:155
> > percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
> > CPU: 24 PID: 7346 Comm: modprobe Tainted: G           OE     4.19.0-rc6+ #2458
> > [..]
> > RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
> > [..]
> > Call Trace:
> >   <IRQ>
> >   ? percpu_ref_reinit+0x140/0x140
> >   rcu_process_callbacks+0x273/0x880
> >   __do_softirq+0xd2/0x428
> >   irq_exit+0xf6/0x100
> >   smp_apic_timer_interrupt+0xa2/0x220
> >   apic_timer_interrupt+0xf/0x20
> >   </IRQ>
> > RIP: 0010:lock_acquire+0xb8/0x1a0
> > [..]
> >   ? __put_page+0x55/0x150
> >   ? __put_page+0x55/0x150
> >   __put_page+0x83/0x150
> >   ? __put_page+0x55/0x150
> >   devm_memremap_pages_release+0x194/0x250
> >   release_nodes+0x17c/0x2c0
> >   device_release_driver_internal+0x1a2/0x250
> >   driver_detach+0x3a/0x70
> >   bus_remove_driver+0x58/0xd0
> >   __x64_sys_delete_module+0x13f/0x200
> >   ? trace_hardirqs_off_thunk+0x1a/0x1c
> >   do_syscall_64+0x60/0x210
> >   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> >
>
> So it looks like we are tearing down memory when this is triggered. Do
> we know if this is at the end of the test or if this is running in
> parallel with anything?

Should not be running in parallel with anything; this test is
performing a series of namespace setup and teardown events.

Wait, where did the call to "percpu_ref_get()" go? I think that's the bug.
Alexander Duyck Oct. 8, 2018, 10:07 p.m. UTC | #17
On 10/8/2018 3:00 PM, Dan Williams wrote:
> On Mon, Oct 8, 2018 at 2:48 PM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
>>
>> On 10/8/2018 2:01 PM, Dan Williams wrote:
>>> On Tue, Sep 25, 2018 at 1:29 PM Alexander Duyck
>>> <alexander.h.duyck@linux.intel.com> wrote:
>>>>
>>>> The ZONE_DEVICE pages were being initialized in two locations. One was with
>>>> the memory_hotplug lock held and another was outside of that lock. The
>>>> problem with this is that it was nearly doubling the memory initialization
>>>> time. Instead of doing this twice, once while holding a global lock and
>>>> once without, I am opting to defer the initialization to the one outside of
>>>> the lock. This allows us to avoid serializing the overhead for memory init
>>>> and we can instead focus on per-node init times.
>>>>
>>>> One issue I encountered is that devm_memremap_pages and
>>>> hmm_devmmem_pages_create were initializing only the pgmap field the same
>>>> way. One wasn't initializing hmm_data, and the other was initializing it to
>>>> a poison value. Since this is something that is exposed to the driver in
>>>> the case of hmm I am opting for a third option and just initializing
>>>> hmm_data to 0 since this is going to be exposed to unknown third party
>>>> drivers.
>>>>
>>>> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
>>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>>> ---
>>>>
>>>> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
>>>>       merge conflicts with other changes in the kernel.
>>>> v5: No change
>>>
>>> This patch appears to cause a regression in the "create.sh" unit test
>>> in the ndctl test suite.
>>
>> So all you had to do is run the create.sh script to see the issue? I
>> just want to confirm there isn't any additional information needed
>> before I try chasing this down.
> 
>  From the ndctl source tree run:
> 
>      make -j TESTS="create.sh" check
> 
> ...the readme has some more setup instructions:
> https://github.com/pmem/ndctl/blob/master/README.md
> 
> 0day has sometimes run this test suite automatically, but we need to
> get that more robust because setting up this environment is a bit of a
> hoop to jump through with the need to setup the nfit_test module.
> 
>>> I tried to reproduce on -next with:
>>>
>>> 2302f5ee215e mm: defer ZONE_DEVICE page initialization to the point
>>> where we init pgmap
>>>
>>> ...but -next does not even boot for me at that commit.
>>
>> What version of -next? There are a couple of patches probably needed
>> depending on which version you are trying to boot.
> 
> Today's -next, but backed up to that above commit. I was also seeing
> CONFIG_DEBUG_LIST spamming the logs, and a crash in the crypto layer.
> 
>>> Here is a warning signature that proceeds a hang with this patch
>>> applied against v4.19-rc6:
>>>
>>> percpu ref (blk_queue_usage_counter_release) <= 0 (-1530626) after
>>> switching to atomic
>>> WARNING: CPU: 24 PID: 7346 at lib/percpu-refcount.c:155
>>> percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
>>> CPU: 24 PID: 7346 Comm: modprobe Tainted: G           OE     4.19.0-rc6+ #2458
>>> [..]
>>> RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
>>> [..]
>>> Call Trace:
>>>    <IRQ>
>>>    ? percpu_ref_reinit+0x140/0x140
>>>    rcu_process_callbacks+0x273/0x880
>>>    __do_softirq+0xd2/0x428
>>>    irq_exit+0xf6/0x100
>>>    smp_apic_timer_interrupt+0xa2/0x220
>>>    apic_timer_interrupt+0xf/0x20
>>>    </IRQ>
>>> RIP: 0010:lock_acquire+0xb8/0x1a0
>>> [..]
>>>    ? __put_page+0x55/0x150
>>>    ? __put_page+0x55/0x150
>>>    __put_page+0x83/0x150
>>>    ? __put_page+0x55/0x150
>>>    devm_memremap_pages_release+0x194/0x250
>>>    release_nodes+0x17c/0x2c0
>>>    device_release_driver_internal+0x1a2/0x250
>>>    driver_detach+0x3a/0x70
>>>    bus_remove_driver+0x58/0xd0
>>>    __x64_sys_delete_module+0x13f/0x200
>>>    ? trace_hardirqs_off_thunk+0x1a/0x1c
>>>    do_syscall_64+0x60/0x210
>>>    entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>>
>>
>> So it looks like we are tearing down memory when this is triggered. Do
>> we know if this is at the end of the test or if this is running in
>> parallel with anything?
> 
> Should not be running in parallel with anything this test is
> performing a series of namespace setup and teardown events.
> 
> Wait, where did the call to "percpu_ref_get()" go? I think that's the bug.

Actually I think you are probably right. Do you want to get that or 
should I. Should be a quick patch since you could probably just add a 
call to percpu_ref_get_many to hold a reference for each page in the 
range of device pages before calling memmap_init_zone_device.

- Alex
Alexander Duyck Oct. 8, 2018, 10:36 p.m. UTC | #18
On 10/8/2018 3:00 PM, Dan Williams wrote:
> On Mon, Oct 8, 2018 at 2:48 PM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
>>
>> On 10/8/2018 2:01 PM, Dan Williams wrote:
>>> On Tue, Sep 25, 2018 at 1:29 PM Alexander Duyck
>>> <alexander.h.duyck@linux.intel.com> wrote:
>>>>
>>>> The ZONE_DEVICE pages were being initialized in two locations. One was with
>>>> the memory_hotplug lock held and another was outside of that lock. The
>>>> problem with this is that it was nearly doubling the memory initialization
>>>> time. Instead of doing this twice, once while holding a global lock and
>>>> once without, I am opting to defer the initialization to the one outside of
>>>> the lock. This allows us to avoid serializing the overhead for memory init
>>>> and we can instead focus on per-node init times.
>>>>
>>>> One issue I encountered is that devm_memremap_pages and
>>>> hmm_devmmem_pages_create were initializing only the pgmap field the same
>>>> way. One wasn't initializing hmm_data, and the other was initializing it to
>>>> a poison value. Since this is something that is exposed to the driver in
>>>> the case of hmm I am opting for a third option and just initializing
>>>> hmm_data to 0 since this is going to be exposed to unknown third party
>>>> drivers.
>>>>
>>>> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
>>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>>> ---
>>>>
>>>> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
>>>>       merge conflicts with other changes in the kernel.
>>>> v5: No change
>>>
>>> This patch appears to cause a regression in the "create.sh" unit test
>>> in the ndctl test suite.
>>
>> So all you had to do is run the create.sh script to see the issue? I
>> just want to confirm there isn't any additional information needed
>> before I try chasing this down.
> 
>  From the ndctl source tree run:
> 
>      make -j TESTS="create.sh" check
> 
> ...the readme has some more setup instructions:
> https://github.com/pmem/ndctl/blob/master/README.md
> 
> 0day has sometimes run this test suite automatically, but we need to
> get that more robust because setting up this environment is a bit of a
> hoop to jump through with the need to setup the nfit_test module.
> 
>>> I tried to reproduce on -next with:
>>>
>>> 2302f5ee215e mm: defer ZONE_DEVICE page initialization to the point
>>> where we init pgmap
>>>
>>> ...but -next does not even boot for me at that commit.
>>
>> What version of -next? There are a couple of patches probably needed
>> depending on which version you are trying to boot.
> 
> Today's -next, but backed up to that above commit. I was also seeing
> CONFIG_DEBUG_LIST spamming the logs, and a crash in the crypto layer.
> 
>>> Here is a warning signature that proceeds a hang with this patch
>>> applied against v4.19-rc6:
>>>
>>> percpu ref (blk_queue_usage_counter_release) <= 0 (-1530626) after
>>> switching to atomic
>>> WARNING: CPU: 24 PID: 7346 at lib/percpu-refcount.c:155
>>> percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
>>> CPU: 24 PID: 7346 Comm: modprobe Tainted: G           OE     4.19.0-rc6+ #2458
>>> [..]
>>> RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
>>> [..]
>>> Call Trace:
>>>    <IRQ>
>>>    ? percpu_ref_reinit+0x140/0x140
>>>    rcu_process_callbacks+0x273/0x880
>>>    __do_softirq+0xd2/0x428
>>>    irq_exit+0xf6/0x100
>>>    smp_apic_timer_interrupt+0xa2/0x220
>>>    apic_timer_interrupt+0xf/0x20
>>>    </IRQ>
>>> RIP: 0010:lock_acquire+0xb8/0x1a0
>>> [..]
>>>    ? __put_page+0x55/0x150
>>>    ? __put_page+0x55/0x150
>>>    __put_page+0x83/0x150
>>>    ? __put_page+0x55/0x150
>>>    devm_memremap_pages_release+0x194/0x250
>>>    release_nodes+0x17c/0x2c0
>>>    device_release_driver_internal+0x1a2/0x250
>>>    driver_detach+0x3a/0x70
>>>    bus_remove_driver+0x58/0xd0
>>>    __x64_sys_delete_module+0x13f/0x200
>>>    ? trace_hardirqs_off_thunk+0x1a/0x1c
>>>    do_syscall_64+0x60/0x210
>>>    entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>>
>>
>> So it looks like we are tearing down memory when this is triggered. Do
>> we know if this is at the end of the test or if this is running in
>> parallel with anything?
> 
> Should not be running in parallel with anything this test is
> performing a series of namespace setup and teardown events.
> 
> Wait, where did the call to "percpu_ref_get()" go? I think that's the bug.
> 

I have a reproduction on my system now as well. I should have a patch 
ready to go for it in the next hour or so.

Thanks.

- Alex
Dan Williams Oct. 8, 2018, 10:59 p.m. UTC | #19
On Mon, Oct 8, 2018 at 3:37 PM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
>
>
> On 10/8/2018 3:00 PM, Dan Williams wrote:
> > On Mon, Oct 8, 2018 at 2:48 PM Alexander Duyck
> > <alexander.h.duyck@linux.intel.com> wrote:
> >>
> >> On 10/8/2018 2:01 PM, Dan Williams wrote:
> >>> On Tue, Sep 25, 2018 at 1:29 PM Alexander Duyck
> >>> <alexander.h.duyck@linux.intel.com> wrote:
> >>>>
> >>>> The ZONE_DEVICE pages were being initialized in two locations. One was with
> >>>> the memory_hotplug lock held and another was outside of that lock. The
> >>>> problem with this is that it was nearly doubling the memory initialization
> >>>> time. Instead of doing this twice, once while holding a global lock and
> >>>> once without, I am opting to defer the initialization to the one outside of
> >>>> the lock. This allows us to avoid serializing the overhead for memory init
> >>>> and we can instead focus on per-node init times.
> >>>>
> >>>> One issue I encountered is that devm_memremap_pages and
> >>>> hmm_devmmem_pages_create were initializing only the pgmap field the same
> >>>> way. One wasn't initializing hmm_data, and the other was initializing it to
> >>>> a poison value. Since this is something that is exposed to the driver in
> >>>> the case of hmm I am opting for a third option and just initializing
> >>>> hmm_data to 0 since this is going to be exposed to unknown third party
> >>>> drivers.
> >>>>
> >>>> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
> >>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >>>> ---
> >>>>
> >>>> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
> >>>>       merge conflicts with other changes in the kernel.
> >>>> v5: No change
> >>>
> >>> This patch appears to cause a regression in the "create.sh" unit test
> >>> in the ndctl test suite.
> >>
> >> So all you had to do is run the create.sh script to see the issue? I
> >> just want to confirm there isn't any additional information needed
> >> before I try chasing this down.
> >
> >  From the ndctl source tree run:
> >
> >      make -j TESTS="create.sh" check
> >
> > ...the readme has some more setup instructions:
> > https://github.com/pmem/ndctl/blob/master/README.md
> >
> > 0day has sometimes run this test suite automatically, but we need to
> > get that more robust because setting up this environment is a bit of a
> > hoop to jump through with the need to setup the nfit_test module.
> >
> >>> I tried to reproduce on -next with:
> >>>
> >>> 2302f5ee215e mm: defer ZONE_DEVICE page initialization to the point
> >>> where we init pgmap
> >>>
> >>> ...but -next does not even boot for me at that commit.
> >>
> >> What version of -next? There are a couple of patches probably needed
> >> depending on which version you are trying to boot.
> >
> > Today's -next, but backed up to that above commit. I was also seeing
> > CONFIG_DEBUG_LIST spamming the logs, and a crash in the crypto layer.
> >
> >>> Here is a warning signature that proceeds a hang with this patch
> >>> applied against v4.19-rc6:
> >>>
> >>> percpu ref (blk_queue_usage_counter_release) <= 0 (-1530626) after
> >>> switching to atomic
> >>> WARNING: CPU: 24 PID: 7346 at lib/percpu-refcount.c:155
> >>> percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
> >>> CPU: 24 PID: 7346 Comm: modprobe Tainted: G           OE     4.19.0-rc6+ #2458
> >>> [..]
> >>> RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x1f7/0x200
> >>> [..]
> >>> Call Trace:
> >>>    <IRQ>
> >>>    ? percpu_ref_reinit+0x140/0x140
> >>>    rcu_process_callbacks+0x273/0x880
> >>>    __do_softirq+0xd2/0x428
> >>>    irq_exit+0xf6/0x100
> >>>    smp_apic_timer_interrupt+0xa2/0x220
> >>>    apic_timer_interrupt+0xf/0x20
> >>>    </IRQ>
> >>> RIP: 0010:lock_acquire+0xb8/0x1a0
> >>> [..]
> >>>    ? __put_page+0x55/0x150
> >>>    ? __put_page+0x55/0x150
> >>>    __put_page+0x83/0x150
> >>>    ? __put_page+0x55/0x150
> >>>    devm_memremap_pages_release+0x194/0x250
> >>>    release_nodes+0x17c/0x2c0
> >>>    device_release_driver_internal+0x1a2/0x250
> >>>    driver_detach+0x3a/0x70
> >>>    bus_remove_driver+0x58/0xd0
> >>>    __x64_sys_delete_module+0x13f/0x200
> >>>    ? trace_hardirqs_off_thunk+0x1a/0x1c
> >>>    do_syscall_64+0x60/0x210
> >>>    entry_SYSCALL_64_after_hwframe+0x49/0xbe
> >>>
> >>
> >> So it looks like we are tearing down memory when this is triggered. Do
> >> we know if this is at the end of the test or if this is running in
> >> parallel with anything?
> >
> > Should not be running in parallel with anything this test is
> > performing a series of namespace setup and teardown events.
> >
> > Wait, where did the call to "percpu_ref_get()" go? I think that's the bug.
> >
>
> I have a reproduction on my system now as well. I should have a patch
> ready to go for it in the next hour or so.
>

Nice! Thanks for jumping on this, and I like the "get_many" optimization.
Zhang, Yi Oct. 9, 2018, 5 p.m. UTC | #20
On 2018-09-25 at 13:21:24 -0700, Alexander Duyck wrote:
> The ZONE_DEVICE pages were being initialized in two locations. One was with
> the memory_hotplug lock held and another was outside of that lock. The
> problem with this is that it was nearly doubling the memory initialization
> time. Instead of doing this twice, once while holding a global lock and
> once without, I am opting to defer the initialization to the one outside of
> the lock. This allows us to avoid serializing the overhead for memory init
> and we can instead focus on per-node init times.
> 
> One issue I encountered is that devm_memremap_pages and
> hmm_devmmem_pages_create were initializing only the pgmap field the same
> way. One wasn't initializing hmm_data, and the other was initializing it to
> a poison value. Since this is something that is exposed to the driver in
> the case of hmm I am opting for a third option and just initializing
> hmm_data to 0 since this is going to be exposed to unknown third party
> drivers.
> 
> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
> 
> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
>     merge conflicts with other changes in the kernel.
> v5: No change
> 
>  include/linux/mm.h |    2 +
>  kernel/memremap.c  |   24 +++++---------
>  mm/hmm.c           |   12 ++++---
>  mm/page_alloc.c    |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  4 files changed, 107 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 06d7d7576f8d..7312fb78ef31 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page)
>  {
>  	return page_zonenum(page) == ZONE_DEVICE;
>  }
> +extern void memmap_init_zone_device(struct zone *, unsigned long,
> +				    unsigned long, struct dev_pagemap *);
>  #else
>  static inline bool is_zone_device_page(const struct page *page)
>  {
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 5b8600d39931..d0c32e473f82 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	struct vmem_altmap *altmap = pgmap->altmap_valid ?
>  			&pgmap->altmap : NULL;
>  	struct resource *res = &pgmap->res;
> -	unsigned long pfn, pgoff, order;
> +	struct dev_pagemap *conflict_pgmap;
>  	pgprot_t pgprot = PAGE_KERNEL;
> +	unsigned long pgoff, order;
>  	int error, nid, is_ram;
> -	struct dev_pagemap *conflict_pgmap;
>  
>  	align_start = res->start & ~(SECTION_SIZE - 1);
>  	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
> @@ -256,19 +256,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	if (error)
>  		goto err_add_memory;
>  
> -	for_each_device_pfn(pfn, pgmap) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		/*
> -		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
> -		 * pointer.  It is a bug if a ZONE_DEVICE page is ever
> -		 * freed or placed on a driver-private list.  Seed the
> -		 * storage with LIST_POISON* values.
> -		 */
> -		list_del(&page->lru);
> -		page->pgmap = pgmap;
> -		percpu_ref_get(pgmap->ref);
> -	}
> +	/*
> +	 * Initialization of the pages has been deferred until now in order
> +	 * to allow us to do the work while not holding the hotplug lock.
> +	 */
> +	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> +				align_start >> PAGE_SHIFT,
> +				align_size >> PAGE_SHIFT, pgmap);
>  
>  	devm_add_action(dev, devm_memremap_pages_release, pgmap);
>  
> diff --git a/mm/hmm.c b/mm/hmm.c
> index c968e49f7a0c..774d684fa2b4 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
>  	resource_size_t key, align_start, align_size, align_end;
>  	struct device *device = devmem->device;
>  	int ret, nid, is_ram;
> -	unsigned long pfn;
>  
>  	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
>  	align_size = ALIGN(devmem->resource->start +
> @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
>  				align_size >> PAGE_SHIFT, NULL);
>  	mem_hotplug_done();
>  
> -	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
> -		struct page *page = pfn_to_page(pfn);
> +	/*
> +	 * Initialization of the pages has been deferred until now in order
> +	 * to allow us to do the work while not holding the hotplug lock.
> +	 */
> +	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> +				align_start >> PAGE_SHIFT,
> +				align_size >> PAGE_SHIFT, &devmem->pagemap);
>  
> -		page->pgmap = &devmem->pagemap;
> -	}
>  	return 0;
>  
>  error_add_memory:
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 926ad3083b28..7ec0997ded39 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5489,12 +5489,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  	if (highest_memmap_pfn < end_pfn - 1)
>  		highest_memmap_pfn = end_pfn - 1;
>  
> +#ifdef CONFIG_ZONE_DEVICE
>  	/*
>  	 * Honor reservation requested by the driver for this ZONE_DEVICE
> -	 * memory
> +	 * memory. We limit the total number of pages to initialize to just
> +	 * those that might contain the memory mapping. We will defer the
> +	 * ZONE_DEVICE page initialization until after we have released
> +	 * the hotplug lock.
>  	 */
> -	if (altmap && start_pfn == altmap->base_pfn)
> -		start_pfn += altmap->reserve;
> +	if (zone == ZONE_DEVICE) {
> +		if (!altmap)
> +			return;
> +
> +		if (start_pfn == altmap->base_pfn)
> +			start_pfn += altmap->reserve;
> +		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
> +	}
> +#endif
>  
>  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>  		/*
> @@ -5538,6 +5549,81 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  	}
>  }
>  
> +#ifdef CONFIG_ZONE_DEVICE
> +void __ref memmap_init_zone_device(struct zone *zone,
> +				   unsigned long start_pfn,
> +				   unsigned long size,
> +				   struct dev_pagemap *pgmap)
> +{
> +	unsigned long pfn, end_pfn = start_pfn + size;
> +	struct pglist_data *pgdat = zone->zone_pgdat;
> +	unsigned long zone_idx = zone_idx(zone);
> +	unsigned long start = jiffies;
> +	int nid = pgdat->node_id;
> +
> +	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
> +		return;
> +
> +	/*
> +	 * The call to memmap_init_zone should have already taken care
> +	 * of the pages reserved for the memmap, so we can just jump to
> +	 * the end of that region and start processing the device pages.
> +	 */
> +	if (pgmap->altmap_valid) {
> +		struct vmem_altmap *altmap = &pgmap->altmap;
> +
> +		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
> +		size = end_pfn - start_pfn;
> +	}
> +
> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +		struct page *page = pfn_to_page(pfn);
> +
> +		__init_single_page(page, pfn, zone_idx, nid);
> +
> +		/*
> +		 * Mark page reserved as it will need to wait for onlining
> +		 * phase for it to be fully associated with a zone.
> +		 *
> +		 * We can use the non-atomic __set_bit operation for setting
> +		 * the flag as we are still initializing the pages.
> +		 */
> +		__SetPageReserved(page);

So we need to keep the page reserved flag held while memory is onlining.
But after it is onlined, do we need to clear the reserved flag for the DEV/FS
DAX memory type?
@Dan, What will going on with this?

Regards
Yi.

> +
> +		/*
> +		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
> +		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
> +		 * page is ever freed or placed on a driver-private list.
> +		 */
> +		page->pgmap = pgmap;
> +		page->hmm_data = 0;
> +
> +		/*
> +		 * Mark the block movable so that blocks are reserved for
> +		 * movable at startup. This will force kernel allocations
> +		 * to reserve their blocks rather than leaking throughout
> +		 * the address space during boot when many long-lived
> +		 * kernel allocations are made.
> +		 *
> +		 * bitmap is created for zone's valid pfn range. but memmap
> +		 * can be created for invalid pages (for alignment)
> +		 * check here not to call set_pageblock_migratetype() against
> +		 * pfn out of zone.
> +		 *
> +		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
> +		 * because this is done early in sparse_add_one_section
> +		 */
> +		if (!(pfn & (pageblock_nr_pages - 1))) {
> +			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
> +			cond_resched();
> +		}
> +	}
> +
> +	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
> +		size, jiffies_to_msecs(jiffies - start));
> +}
> +
> +#endif
>  static void __meminit zone_init_free_lists(struct zone *zone)
>  {
>  	unsigned int order, t;
> 
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
Dan Williams Oct. 9, 2018, 6:04 p.m. UTC | #21
On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang <yi.z.zhang@linux.intel.com> wrote:
>
> On 2018-09-25 at 13:21:24 -0700, Alexander Duyck wrote:
> > The ZONE_DEVICE pages were being initialized in two locations. One was with
> > the memory_hotplug lock held and another was outside of that lock. The
> > problem with this is that it was nearly doubling the memory initialization
> > time. Instead of doing this twice, once while holding a global lock and
> > once without, I am opting to defer the initialization to the one outside of
> > the lock. This allows us to avoid serializing the overhead for memory init
> > and we can instead focus on per-node init times.
> >
> > One issue I encountered is that devm_memremap_pages and
> > hmm_devmmem_pages_create were initializing only the pgmap field the same
> > way. One wasn't initializing hmm_data, and the other was initializing it to
> > a poison value. Since this is something that is exposed to the driver in
> > the case of hmm I am opting for a third option and just initializing
> > hmm_data to 0 since this is going to be exposed to unknown third party
> > drivers.
> >
> > Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > ---
> >
> > v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
> >     merge conflicts with other changes in the kernel.
> > v5: No change
> >
> >  include/linux/mm.h |    2 +
> >  kernel/memremap.c  |   24 +++++---------
> >  mm/hmm.c           |   12 ++++---
> >  mm/page_alloc.c    |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> >  4 files changed, 107 insertions(+), 23 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 06d7d7576f8d..7312fb78ef31 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page)
> >  {
> >       return page_zonenum(page) == ZONE_DEVICE;
> >  }
> > +extern void memmap_init_zone_device(struct zone *, unsigned long,
> > +                                 unsigned long, struct dev_pagemap *);
> >  #else
> >  static inline bool is_zone_device_page(const struct page *page)
> >  {
> > diff --git a/kernel/memremap.c b/kernel/memremap.c
> > index 5b8600d39931..d0c32e473f82 100644
> > --- a/kernel/memremap.c
> > +++ b/kernel/memremap.c
> > @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
> >       struct vmem_altmap *altmap = pgmap->altmap_valid ?
> >                       &pgmap->altmap : NULL;
> >       struct resource *res = &pgmap->res;
> > -     unsigned long pfn, pgoff, order;
> > +     struct dev_pagemap *conflict_pgmap;
> >       pgprot_t pgprot = PAGE_KERNEL;
> > +     unsigned long pgoff, order;
> >       int error, nid, is_ram;
> > -     struct dev_pagemap *conflict_pgmap;
> >
> >       align_start = res->start & ~(SECTION_SIZE - 1);
> >       align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
> > @@ -256,19 +256,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
> >       if (error)
> >               goto err_add_memory;
> >
> > -     for_each_device_pfn(pfn, pgmap) {
> > -             struct page *page = pfn_to_page(pfn);
> > -
> > -             /*
> > -              * ZONE_DEVICE pages union ->lru with a ->pgmap back
> > -              * pointer.  It is a bug if a ZONE_DEVICE page is ever
> > -              * freed or placed on a driver-private list.  Seed the
> > -              * storage with LIST_POISON* values.
> > -              */
> > -             list_del(&page->lru);
> > -             page->pgmap = pgmap;
> > -             percpu_ref_get(pgmap->ref);
> > -     }
> > +     /*
> > +      * Initialization of the pages has been deferred until now in order
> > +      * to allow us to do the work while not holding the hotplug lock.
> > +      */
> > +     memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> > +                             align_start >> PAGE_SHIFT,
> > +                             align_size >> PAGE_SHIFT, pgmap);
> >
> >       devm_add_action(dev, devm_memremap_pages_release, pgmap);
> >
> > diff --git a/mm/hmm.c b/mm/hmm.c
> > index c968e49f7a0c..774d684fa2b4 100644
> > --- a/mm/hmm.c
> > +++ b/mm/hmm.c
> > @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
> >       resource_size_t key, align_start, align_size, align_end;
> >       struct device *device = devmem->device;
> >       int ret, nid, is_ram;
> > -     unsigned long pfn;
> >
> >       align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
> >       align_size = ALIGN(devmem->resource->start +
> > @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
> >                               align_size >> PAGE_SHIFT, NULL);
> >       mem_hotplug_done();
> >
> > -     for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
> > -             struct page *page = pfn_to_page(pfn);
> > +     /*
> > +      * Initialization of the pages has been deferred until now in order
> > +      * to allow us to do the work while not holding the hotplug lock.
> > +      */
> > +     memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> > +                             align_start >> PAGE_SHIFT,
> > +                             align_size >> PAGE_SHIFT, &devmem->pagemap);
> >
> > -             page->pgmap = &devmem->pagemap;
> > -     }
> >       return 0;
> >
> >  error_add_memory:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 926ad3083b28..7ec0997ded39 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -5489,12 +5489,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> >       if (highest_memmap_pfn < end_pfn - 1)
> >               highest_memmap_pfn = end_pfn - 1;
> >
> > +#ifdef CONFIG_ZONE_DEVICE
> >       /*
> >        * Honor reservation requested by the driver for this ZONE_DEVICE
> > -      * memory
> > +      * memory. We limit the total number of pages to initialize to just
> > +      * those that might contain the memory mapping. We will defer the
> > +      * ZONE_DEVICE page initialization until after we have released
> > +      * the hotplug lock.
> >        */
> > -     if (altmap && start_pfn == altmap->base_pfn)
> > -             start_pfn += altmap->reserve;
> > +     if (zone == ZONE_DEVICE) {
> > +             if (!altmap)
> > +                     return;
> > +
> > +             if (start_pfn == altmap->base_pfn)
> > +                     start_pfn += altmap->reserve;
> > +             end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
> > +     }
> > +#endif
> >
> >       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> >               /*
> > @@ -5538,6 +5549,81 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> >       }
> >  }
> >
> > +#ifdef CONFIG_ZONE_DEVICE
> > +void __ref memmap_init_zone_device(struct zone *zone,
> > +                                unsigned long start_pfn,
> > +                                unsigned long size,
> > +                                struct dev_pagemap *pgmap)
> > +{
> > +     unsigned long pfn, end_pfn = start_pfn + size;
> > +     struct pglist_data *pgdat = zone->zone_pgdat;
> > +     unsigned long zone_idx = zone_idx(zone);
> > +     unsigned long start = jiffies;
> > +     int nid = pgdat->node_id;
> > +
> > +     if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
> > +             return;
> > +
> > +     /*
> > +      * The call to memmap_init_zone should have already taken care
> > +      * of the pages reserved for the memmap, so we can just jump to
> > +      * the end of that region and start processing the device pages.
> > +      */
> > +     if (pgmap->altmap_valid) {
> > +             struct vmem_altmap *altmap = &pgmap->altmap;
> > +
> > +             start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
> > +             size = end_pfn - start_pfn;
> > +     }
> > +
> > +     for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> > +             struct page *page = pfn_to_page(pfn);
> > +
> > +             __init_single_page(page, pfn, zone_idx, nid);
> > +
> > +             /*
> > +              * Mark page reserved as it will need to wait for onlining
> > +              * phase for it to be fully associated with a zone.
> > +              *
> > +              * We can use the non-atomic __set_bit operation for setting
> > +              * the flag as we are still initializing the pages.
> > +              */
> > +             __SetPageReserved(page);
>
> So we need to hold the page reserved flag while memory onlining.
> But after onlined, Do we neeed to clear the reserved flag in the DEV/FS
> DAX memory type?
> @Dan, What will going on with this?
>

That comment is incorrect, device-pages are never onlined. So I think
we can just skip that call to __SetPageReserved() unless the memory
range is MEMORY_DEVICE_{PRIVATE,PUBLIC}.
Alexander Duyck Oct. 9, 2018, 8:26 p.m. UTC | #22
On 10/9/2018 11:04 AM, Dan Williams wrote:
> On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang <yi.z.zhang@linux.intel.com> wrote:
>>
>> On 2018-09-25 at 13:21:24 -0700, Alexander Duyck wrote:
>>> The ZONE_DEVICE pages were being initialized in two locations. One was with
>>> the memory_hotplug lock held and another was outside of that lock. The
>>> problem with this is that it was nearly doubling the memory initialization
>>> time. Instead of doing this twice, once while holding a global lock and
>>> once without, I am opting to defer the initialization to the one outside of
>>> the lock. This allows us to avoid serializing the overhead for memory init
>>> and we can instead focus on per-node init times.
>>>
>>> One issue I encountered is that devm_memremap_pages and
>>> hmm_devmmem_pages_create were initializing only the pgmap field the same
>>> way. One wasn't initializing hmm_data, and the other was initializing it to
>>> a poison value. Since this is something that is exposed to the driver in
>>> the case of hmm I am opting for a third option and just initializing
>>> hmm_data to 0 since this is going to be exposed to unknown third party
>>> drivers.
>>>
>>> Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>> ---
>>>
>>> v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid
>>>      merge conflicts with other changes in the kernel.
>>> v5: No change
>>>
>>>   include/linux/mm.h |    2 +
>>>   kernel/memremap.c  |   24 +++++---------
>>>   mm/hmm.c           |   12 ++++---
>>>   mm/page_alloc.c    |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>   4 files changed, 107 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 06d7d7576f8d..7312fb78ef31 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page)
>>>   {
>>>        return page_zonenum(page) == ZONE_DEVICE;
>>>   }
>>> +extern void memmap_init_zone_device(struct zone *, unsigned long,
>>> +                                 unsigned long, struct dev_pagemap *);
>>>   #else
>>>   static inline bool is_zone_device_page(const struct page *page)
>>>   {
>>> diff --git a/kernel/memremap.c b/kernel/memremap.c
>>> index 5b8600d39931..d0c32e473f82 100644
>>> --- a/kernel/memremap.c
>>> +++ b/kernel/memremap.c
>>> @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>>>        struct vmem_altmap *altmap = pgmap->altmap_valid ?
>>>                        &pgmap->altmap : NULL;
>>>        struct resource *res = &pgmap->res;
>>> -     unsigned long pfn, pgoff, order;
>>> +     struct dev_pagemap *conflict_pgmap;
>>>        pgprot_t pgprot = PAGE_KERNEL;
>>> +     unsigned long pgoff, order;
>>>        int error, nid, is_ram;
>>> -     struct dev_pagemap *conflict_pgmap;
>>>
>>>        align_start = res->start & ~(SECTION_SIZE - 1);
>>>        align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
>>> @@ -256,19 +256,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>>>        if (error)
>>>                goto err_add_memory;
>>>
>>> -     for_each_device_pfn(pfn, pgmap) {
>>> -             struct page *page = pfn_to_page(pfn);
>>> -
>>> -             /*
>>> -              * ZONE_DEVICE pages union ->lru with a ->pgmap back
>>> -              * pointer.  It is a bug if a ZONE_DEVICE page is ever
>>> -              * freed or placed on a driver-private list.  Seed the
>>> -              * storage with LIST_POISON* values.
>>> -              */
>>> -             list_del(&page->lru);
>>> -             page->pgmap = pgmap;
>>> -             percpu_ref_get(pgmap->ref);
>>> -     }
>>> +     /*
>>> +      * Initialization of the pages has been deferred until now in order
>>> +      * to allow us to do the work while not holding the hotplug lock.
>>> +      */
>>> +     memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>>> +                             align_start >> PAGE_SHIFT,
>>> +                             align_size >> PAGE_SHIFT, pgmap);
>>>
>>>        devm_add_action(dev, devm_memremap_pages_release, pgmap);
>>>
>>> diff --git a/mm/hmm.c b/mm/hmm.c
>>> index c968e49f7a0c..774d684fa2b4 100644
>>> --- a/mm/hmm.c
>>> +++ b/mm/hmm.c
>>> @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
>>>        resource_size_t key, align_start, align_size, align_end;
>>>        struct device *device = devmem->device;
>>>        int ret, nid, is_ram;
>>> -     unsigned long pfn;
>>>
>>>        align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
>>>        align_size = ALIGN(devmem->resource->start +
>>> @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
>>>                                align_size >> PAGE_SHIFT, NULL);
>>>        mem_hotplug_done();
>>>
>>> -     for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
>>> -             struct page *page = pfn_to_page(pfn);
>>> +     /*
>>> +      * Initialization of the pages has been deferred until now in order
>>> +      * to allow us to do the work while not holding the hotplug lock.
>>> +      */
>>> +     memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>>> +                             align_start >> PAGE_SHIFT,
>>> +                             align_size >> PAGE_SHIFT, &devmem->pagemap);
>>>
>>> -             page->pgmap = &devmem->pagemap;
>>> -     }
>>>        return 0;
>>>
>>>   error_add_memory:
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 926ad3083b28..7ec0997ded39 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -5489,12 +5489,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>>>        if (highest_memmap_pfn < end_pfn - 1)
>>>                highest_memmap_pfn = end_pfn - 1;
>>>
>>> +#ifdef CONFIG_ZONE_DEVICE
>>>        /*
>>>         * Honor reservation requested by the driver for this ZONE_DEVICE
>>> -      * memory
>>> +      * memory. We limit the total number of pages to initialize to just
>>> +      * those that might contain the memory mapping. We will defer the
>>> +      * ZONE_DEVICE page initialization until after we have released
>>> +      * the hotplug lock.
>>>         */
>>> -     if (altmap && start_pfn == altmap->base_pfn)
>>> -             start_pfn += altmap->reserve;
>>> +     if (zone == ZONE_DEVICE) {
>>> +             if (!altmap)
>>> +                     return;
>>> +
>>> +             if (start_pfn == altmap->base_pfn)
>>> +                     start_pfn += altmap->reserve;
>>> +             end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
>>> +     }
>>> +#endif
>>>
>>>        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>>                /*
>>> @@ -5538,6 +5549,81 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>>>        }
>>>   }
>>>
>>> +#ifdef CONFIG_ZONE_DEVICE
>>> +void __ref memmap_init_zone_device(struct zone *zone,
>>> +                                unsigned long start_pfn,
>>> +                                unsigned long size,
>>> +                                struct dev_pagemap *pgmap)
>>> +{
>>> +     unsigned long pfn, end_pfn = start_pfn + size;
>>> +     struct pglist_data *pgdat = zone->zone_pgdat;
>>> +     unsigned long zone_idx = zone_idx(zone);
>>> +     unsigned long start = jiffies;
>>> +     int nid = pgdat->node_id;
>>> +
>>> +     if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
>>> +             return;
>>> +
>>> +     /*
>>> +      * The call to memmap_init_zone should have already taken care
>>> +      * of the pages reserved for the memmap, so we can just jump to
>>> +      * the end of that region and start processing the device pages.
>>> +      */
>>> +     if (pgmap->altmap_valid) {
>>> +             struct vmem_altmap *altmap = &pgmap->altmap;
>>> +
>>> +             start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
>>> +             size = end_pfn - start_pfn;
>>> +     }
>>> +
>>> +     for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>> +             struct page *page = pfn_to_page(pfn);
>>> +
>>> +             __init_single_page(page, pfn, zone_idx, nid);
>>> +
>>> +             /*
>>> +              * Mark page reserved as it will need to wait for onlining
>>> +              * phase for it to be fully associated with a zone.
>>> +              *
>>> +              * We can use the non-atomic __set_bit operation for setting
>>> +              * the flag as we are still initializing the pages.
>>> +              */
>>> +             __SetPageReserved(page);
>>
>> So we need to hold the page reserved flag while memory onlining.
>> But after onlined, Do we neeed to clear the reserved flag in the DEV/FS
>> DAX memory type?
>> @Dan, What will going on with this?
>>
> 
> That comment is incorrect, device-pages are never onlined. So I think
> we can just skip that call to __SetPageReserved() unless the memory
> range is MEMORY_DEVICE_{PRIVATE,PUBLIC}.
> 

When pages are "onlined" via __free_pages_boot_core they clear the 
reserved bit, that is the reason for the comment. The reserved bit is 
meant to indicate that the page cannot be swapped out or moved based on 
the description of the bit.

I would think with that being the case we still probably need the call 
to __SetPageReserved to set the bit with the expectation that it will 
not be cleared for device-pages since the pages are not onlined. 
Removing the call to __SetPageReserved would probably introduce a number 
of regressions as there are multiple spots that use the reserved bit to 
determine if a page can be swapped out to disk, mapped as system memory, 
or migrated.

Thanks.

- Alex
Dan Williams Oct. 9, 2018, 9:19 p.m. UTC | #23
On Tue, Oct 9, 2018 at 1:34 PM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> On 10/9/2018 11:04 AM, Dan Williams wrote:
> > On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang <yi.z.zhang@linux.intel.com> wrote:
[..]
> > That comment is incorrect, device-pages are never onlined. So I think
> > we can just skip that call to __SetPageReserved() unless the memory
> > range is MEMORY_DEVICE_{PRIVATE,PUBLIC}.
> >
>
> When pages are "onlined" via __free_pages_boot_core they clear the
> reserved bit, that is the reason for the comment. The reserved bit is
> meant to indicate that the page cannot be swapped out or moved based on
> the description of the bit.

...but ZONE_DEVICE pages are never onlined so I would expect
memmap_init_zone_device() to know that detail.

> I would think with that being the case we still probably need the call
> to __SetPageReserved to set the bit with the expectation that it will
> not be cleared for device-pages since the pages are not onlined.
> Removing the call to __SetPageReserved would probably introduce a number
> of regressions as there are multiple spots that use the reserved bit to
> determine if a page can be swapped out to disk, mapped as system memory,
> or migrated.

Right, this is what Yi is working on... the PageReserved flag is
problematic for KVM. Auditing those locations it seems as long as we
teach hibernation to avoid ZONE_DEVICE ranges we can safely not set
the reserved flag for DAX pages. What I'm trying to avoid is a local
KVM hack to check for DAX pages when the Reserved flag is not
otherwise needed.
Michal Hocko Oct. 10, 2018, 9:58 a.m. UTC | #24
On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
[...]
> I would think with that being the case we still probably need the call to
> __SetPageReserved to set the bit with the expectation that it will not be
> cleared for device-pages since the pages are not onlined. Removing the call
> to __SetPageReserved would probably introduce a number of regressions as
> there are multiple spots that use the reserved bit to determine if a page
> can be swapped out to disk, mapped as system memory, or migrated.

PageReserved is meant to tell any potential pfn walkers that might get
to this struct page to back off and not touch it. Even though
ZONE_DEVICE doesn't online pages in traditional sense it makes those
pages available for further use so the page reserved bit should be
cleared.
Zhang, Yi Oct. 10, 2018, 12:52 p.m. UTC | #25
On 2018-10-09 at 14:19:32 -0700, Dan Williams wrote:
> On Tue, Oct 9, 2018 at 1:34 PM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
> >
> > On 10/9/2018 11:04 AM, Dan Williams wrote:
> > > On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang <yi.z.zhang@linux.intel.com> wrote:
> [..]
> > > That comment is incorrect, device-pages are never onlined. So I think
> > > we can just skip that call to __SetPageReserved() unless the memory
> > > range is MEMORY_DEVICE_{PRIVATE,PUBLIC}.
> > >
> >
> > When pages are "onlined" via __free_pages_boot_core they clear the
> > reserved bit, that is the reason for the comment. The reserved bit is
> > meant to indicate that the page cannot be swapped out or moved based on
> > the description of the bit.
> 
> ...but ZONE_DEVICE pages are never onlined so I would expect
> memmap_init_zone_device() to know that detail.
> 
> > I would think with that being the case we still probably need the call
> > to __SetPageReserved to set the bit with the expectation that it will
> > not be cleared for device-pages since the pages are not onlined.
> > Removing the call to __SetPageReserved would probably introduce a number
> > of regressions as there are multiple spots that use the reserved bit to
> > determine if a page can be swapped out to disk, mapped as system memory,
> > or migrated.

Another thing: it seems the page init / set-reserved work has already been
done in move_pfn_range_to_zone
    |-->memmap_init_zone
    	|-->for_each_page_in_pfn
		|-->__init_single_page
		|-->SetPageReserved

Why haven't we removed this redundant initialization in memmap_init_zone?

Correct me if I missed something.

> 
> Right, this is what Yi is working on... the PageReserved flag is
> problematic for KVM. Auditing those locations it seems as long as we
> teach hibernation to avoid ZONE_DEVICE ranges we can safely not set
> the reserved flag for DAX pages. What I'm trying to avoid is a local
> KVM hack to check for DAX pages when the Reserved flag is not
> otherwise needed.
Thanks Dan. Provide the patch link.

https://lore.kernel.org/lkml/cover.1536342881.git.yi.z.zhang@linux.intel.com



> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
Alexander Duyck Oct. 10, 2018, 3:27 p.m. UTC | #26
On 10/10/2018 5:52 AM, Yi Zhang wrote:
> On 2018-10-09 at 14:19:32 -0700, Dan Williams wrote:
>> On Tue, Oct 9, 2018 at 1:34 PM Alexander Duyck
>> <alexander.h.duyck@linux.intel.com> wrote:
>>>
>>> On 10/9/2018 11:04 AM, Dan Williams wrote:
>>>> On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang <yi.z.zhang@linux.intel.com> wrote:
>> [..]
>>>> That comment is incorrect, device-pages are never onlined. So I think
>>>> we can just skip that call to __SetPageReserved() unless the memory
>>>> range is MEMORY_DEVICE_{PRIVATE,PUBLIC}.
>>>>
>>>
>>> When pages are "onlined" via __free_pages_boot_core they clear the
>>> reserved bit, that is the reason for the comment. The reserved bit is
>>> meant to indicate that the page cannot be swapped out or moved based on
>>> the description of the bit.
>>
>> ...but ZONE_DEVICE pages are never onlined so I would expect
>> memmap_init_zone_device() to know that detail.
>>
>>> I would think with that being the case we still probably need the call
>>> to __SetPageReserved to set the bit with the expectation that it will
>>> not be cleared for device-pages since the pages are not onlined.
>>> Removing the call to __SetPageReserved would probably introduce a number
>>> of regressions as there are multiple spots that use the reserved bit to
>>> determine if a page can be swapped out to disk, mapped as system memory,
>>> or migrated.
> 
> Another things, it seems page_init/set_reserved already been done in the
> move_pfn_range_to_zone
>      |-->memmap_init_zone
>      	|-->for_each_page_in_pfn
> 		|-->__init_single_page
> 		|-->SetPageReserved
> 
> Why we haven't remove these redundant initial in memmap_init_zone?
> 
> Correct me if I missed something.

In this case it isn't redundant as only the vmmemmap pages are 
initialized in memmap_init_zone now. So all of the pages that are going 
to be used as device pages are not initialized until the call to 
memmap_init_zone_device. What I did is split the initialization of the 
pages into two parts in order to allow us to initialize the pages 
outside of the hotplug lock.

>>
>> Right, this is what Yi is working on... the PageReserved flag is
>> problematic for KVM. Auditing those locations it seems as long as we
>> teach hibernation to avoid ZONE_DEVICE ranges we can safely not set
>> the reserved flag for DAX pages. What I'm trying to avoid is a local
>> KVM hack to check for DAX pages when the Reserved flag is not
>> otherwise needed.
> Thanks Dan. Provide the patch link.
> 
> https://lore.kernel.org/lkml/cover.1536342881.git.yi.z.zhang@linux.intel.com

So it looks like your current logic is just working around the bit then 
since it just allows for reserved DAX pages.
Alexander Duyck Oct. 10, 2018, 4:39 p.m. UTC | #27
On 10/10/2018 2:58 AM, Michal Hocko wrote:
> On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
> [...]
>> I would think with that being the case we still probably need the call to
>> __SetPageReserved to set the bit with the expectation that it will not be
>> cleared for device-pages since the pages are not onlined. Removing the call
>> to __SetPageReserved would probably introduce a number of regressions as
>> there are multiple spots that use the reserved bit to determine if a page
>> can be swapped out to disk, mapped as system memory, or migrated.
> 
> PageReserved is meant to tell any potential pfn walkers that might get
> to this struct page to back off and not touch it. Even though
> ZONE_DEVICE doesn't online pages in traditional sense it makes those
> pages available for further use so the page reserved bit should be
> cleared.

So from what I can tell that isn't necessarily the case. Specifically if 
the pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both 
are special cases where the memory may not be accessible to the CPU or 
cannot be pinned in order to allow for eviction.

The specific case that Dan and Yi are referring to is for the type 
MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting 
the reserved bit. Part of me wants to say that we should wait and clear 
the bit later, but that would end up just adding time back to 
initialization. At this point I would consider the change more of a 
follow-up optimization rather than a fix though since this is tailoring 
things specifically for DAX versus the other ZONE_DEVICE types.
Michal Hocko Oct. 10, 2018, 5:24 p.m. UTC | #28
On Wed 10-10-18 09:39:08, Alexander Duyck wrote:
> On 10/10/2018 2:58 AM, Michal Hocko wrote:
> > On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
> > [...]
> > > I would think with that being the case we still probably need the call to
> > > __SetPageReserved to set the bit with the expectation that it will not be
> > > cleared for device-pages since the pages are not onlined. Removing the call
> > > to __SetPageReserved would probably introduce a number of regressions as
> > > there are multiple spots that use the reserved bit to determine if a page
> > > can be swapped out to disk, mapped as system memory, or migrated.
> > 
> > PageReserved is meant to tell any potential pfn walkers that might get
> > to this struct page to back off and not touch it. Even though
> > ZONE_DEVICE doesn't online pages in traditional sense it makes those
> > pages available for further use so the page reserved bit should be
> > cleared.
> 
> So from what I can tell that isn't necessarily the case. Specifically if the
> pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both are
> special cases where the memory may not be accessible to the CPU or cannot be
> pinned in order to allow for eviction.

Could you give me an example please?

> The specific case that Dan and Yi are referring to is for the type
> MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting the
> reserved bit. Part of me wants to say that we should wait and clear the bit
> later, but that would end up just adding time back to initialization. At
> this point I would consider the change more of a follow-up optimization
> rather than a fix though since this is tailoring things specifically for DAX
> versus the other ZONE_DEVICE types.

I thought I have already made it clear that these zone device hacks are
not acceptable to the generic hotplug code. If the current reserve bit
handling is not correct then give us a specific reason for that and we
can start thinking about the proper fix.
Alexander Duyck Oct. 10, 2018, 5:39 p.m. UTC | #29
On 10/10/2018 10:24 AM, Michal Hocko wrote:
> On Wed 10-10-18 09:39:08, Alexander Duyck wrote:
>> On 10/10/2018 2:58 AM, Michal Hocko wrote:
>>> On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
>>> [...]
>>>> I would think with that being the case we still probably need the call to
>>>> __SetPageReserved to set the bit with the expectation that it will not be
>>>> cleared for device-pages since the pages are not onlined. Removing the call
>>>> to __SetPageReserved would probably introduce a number of regressions as
>>>> there are multiple spots that use the reserved bit to determine if a page
>>>> can be swapped out to disk, mapped as system memory, or migrated.
>>>
>>> PageReserved is meant to tell any potential pfn walkers that might get
>>> to this struct page to back off and not touch it. Even though
>>> ZONE_DEVICE doesn't online pages in traditional sense it makes those
>>> pages available for further use so the page reserved bit should be
>>> cleared.
>>
>> So from what I can tell that isn't necessarily the case. Specifically if the
>> pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both are
>> special cases where the memory may not be accessible to the CPU or cannot be
>> pinned in order to allow for eviction.
> 
> Could you give me an example please?

Honestly I am getting a bit beyond my depth here so maybe Dan could 
explain better. I am basing the above comment on Dan's earlier comment 
in this thread combined with the comment that explains the "memory_type" 
field for the pgmap:
https://elixir.bootlin.com/linux/v4.19-rc7/source/include/linux/memremap.h#L28

>> The specific case that Dan and Yi are referring to is for the type
>> MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting the
>> reserved bit. Part of me wants to say that we should wait and clear the bit
>> later, but that would end up just adding time back to initialization. At
>> this point I would consider the change more of a follow-up optimization
>> rather than a fix though since this is tailoring things specifically for DAX
>> versus the other ZONE_DEVICE types.
> 
> I thought I have already made it clear that these zone device hacks are
> not acceptable to the generic hotplug code. If the current reserve bit
> handling is not correct then give us a specific reason for that and we
> can start thinking about the proper fix.

I might have misunderstood your earlier comment then. I thought you were 
saying that we shouldn't bother with setting the reserved bit. Now it 
sounds like you were thinking more along the lines of what I was here in 
my comment where I thought the bit should be cleared later in some code 
specifically related to DAX when it is exposing it for use to userspace 
or KVM.
Michal Hocko Oct. 10, 2018, 5:53 p.m. UTC | #30
On Wed 10-10-18 10:39:01, Alexander Duyck wrote:
> 
> 
> On 10/10/2018 10:24 AM, Michal Hocko wrote:
> > On Wed 10-10-18 09:39:08, Alexander Duyck wrote:
> > > On 10/10/2018 2:58 AM, Michal Hocko wrote:
> > > > On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
> > > > [...]
> > > > > I would think with that being the case we still probably need the call to
> > > > > __SetPageReserved to set the bit with the expectation that it will not be
> > > > > cleared for device-pages since the pages are not onlined. Removing the call
> > > > > to __SetPageReserved would probably introduce a number of regressions as
> > > > > there are multiple spots that use the reserved bit to determine if a page
> > > > > can be swapped out to disk, mapped as system memory, or migrated.
> > > > 
> > > > PageReserved is meant to tell any potential pfn walkers that might get
> > > > to this struct page to back off and not touch it. Even though
> > > > ZONE_DEVICE doesn't online pages in traditional sense it makes those
> > > > pages available for further use so the page reserved bit should be
> > > > cleared.
> > > 
> > > So from what I can tell that isn't necessarily the case. Specifically if the
> > > pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both are
> > > special cases where the memory may not be accessible to the CPU or cannot be
> > > pinned in order to allow for eviction.
> > 
> > Could you give me an example please?
> 
> Honestly I am getting a bit beyond my depth here so maybe Dan could explain
> better. I am basing the above comment on Dan's earlier comment in this
> thread combined with the comment that explains the "memory_type" field for
> the pgmap:
> https://elixir.bootlin.com/linux/v4.19-rc7/source/include/linux/memremap.h#L28
> 
> > > The specific case that Dan and Yi are referring to is for the type
> > > MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting the
> > > reserved bit. Part of me wants to say that we should wait and clear the bit
> > > later, but that would end up just adding time back to initialization. At
> > > this point I would consider the change more of a follow-up optimization
> > > rather than a fix though since this is tailoring things specifically for DAX
> > > versus the other ZONE_DEVICE types.
> > 
> > I thought I have already made it clear that these zone device hacks are
> > not acceptable to the generic hotplug code. If the current reserve bit
> > handling is not correct then give us a specific reason for that and we
> > can start thinking about the proper fix.
> 
> I might have misunderstood your earlier comment then. I thought you were
> saying that we shouldn't bother with setting the reserved bit. Now it sounds
> like you were thinking more along the lines of what I was here in my comment
> where I thought the bit should be cleared later in some code specifically
> related to DAX when it is exposing it for use to userspace or KVM.

I was referring to my earlier comment that if you need to do something
about struct page initialization (move_pfn_range_to_zone) outside of the
lock (with the appropriate ground work that is needed) rather than
pulling more zone device hacks into the generic hotplug code [1]

[1] http://lkml.kernel.org/r/20180926075540.GD6278@dhcp22.suse.cz
Alexander Duyck Oct. 10, 2018, 6:13 p.m. UTC | #31
On 10/10/2018 10:53 AM, Michal Hocko wrote:
> On Wed 10-10-18 10:39:01, Alexander Duyck wrote:
>>
>>
>> On 10/10/2018 10:24 AM, Michal Hocko wrote:
>>> On Wed 10-10-18 09:39:08, Alexander Duyck wrote:
>>>> On 10/10/2018 2:58 AM, Michal Hocko wrote:
>>>>> On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
>>>>> [...]
>>>>>> I would think with that being the case we still probably need the call to
>>>>>> __SetPageReserved to set the bit with the expectation that it will not be
>>>>>> cleared for device-pages since the pages are not onlined. Removing the call
>>>>>> to __SetPageReserved would probably introduce a number of regressions as
>>>>>> there are multiple spots that use the reserved bit to determine if a page
>>>>>> can be swapped out to disk, mapped as system memory, or migrated.
>>>>>
>>>>> PageReserved is meant to tell any potential pfn walkers that might get
>>>>> to this struct page to back off and not touch it. Even though
>>>>> ZONE_DEVICE doesn't online pages in traditional sense it makes those
>>>>> pages available for further use so the page reserved bit should be
>>>>> cleared.
>>>>
>>>> So from what I can tell that isn't necessarily the case. Specifically if the
>>>> pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both are
>>>> special cases where the memory may not be accessible to the CPU or cannot be
>>>> pinned in order to allow for eviction.
>>>
>>> Could you give me an example please?
>>
>> Honestly I am getting a bit beyond my depth here so maybe Dan could explain
>> better. I am basing the above comment on Dan's earlier comment in this
>> thread combined with the comment that explains the "memory_type" field for
>> the pgmap:
>> https://elixir.bootlin.com/linux/v4.19-rc7/source/include/linux/memremap.h#L28
>>
>>>> The specific case that Dan and Yi are referring to is for the type
>>>> MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting the
>>>> reserved bit. Part of me wants to say that we should wait and clear the bit
>>>> later, but that would end up just adding time back to initialization. At
>>>> this point I would consider the change more of a follow-up optimization
>>>> rather than a fix though since this is tailoring things specifically for DAX
>>>> versus the other ZONE_DEVICE types.
>>>
>>> I thought I have already made it clear that these zone device hacks are
>>> not acceptable to the generic hotplug code. If the current reserve bit
>>> handling is not correct then give us a specific reason for that and we
>>> can start thinking about the proper fix.
>>
>> I might have misunderstood your earlier comment then. I thought you were
>> saying that we shouldn't bother with setting the reserved bit. Now it sounds
>> like you were thinking more along the lines of what I was here in my comment
>> where I thought the bit should be cleared later in some code specifically
>> related to DAX when it is exposing it for use to userspace or KVM.
> 
> I was referring to my earlier comment that if you need to do something
> about struct page initialization (move_pfn_range_to_zone) outside of the
> lock (with the appropriate ground work that is needed) rather than
> pulling more zone device hacks into the generic hotplug code [1]
> 
> [1] http://lkml.kernel.org/r/20180926075540.GD6278@dhcp22.suse.cz

The only issue is if we don't pull the code together we are looking at a 
massive increase in initialization times. So for example just the loop 
that was originally there that was setting the pgmap and resetting the 
LRU prev pointer was adding an additional 20+ seconds per node with 3TB 
allocated per node. That essentially doubles the initialization time.

How would you recommend I address that? Currently it is a few extra 
lines in the memmap_init_zone_device function. Eventually I was planning 
to combine the memmap_init_zone hotplug functionality and 
memmap_init_zone_device core functionality into a single function called 
__init_pageblock[1] and then reuse that functionality for deferred page 
init as well.

[1] 
http://lkml.kernel.org/r/20181005151224.17473.53398.stgit@localhost.localdomain
Dan Williams Oct. 10, 2018, 6:18 p.m. UTC | #32
On Wed, Oct 10, 2018 at 10:30 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Wed 10-10-18 09:39:08, Alexander Duyck wrote:
> > On 10/10/2018 2:58 AM, Michal Hocko wrote:
> > > On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
> > > [...]
> > > > I would think with that being the case we still probably need the call to
> > > > __SetPageReserved to set the bit with the expectation that it will not be
> > > > cleared for device-pages since the pages are not onlined. Removing the call
> > > > to __SetPageReserved would probably introduce a number of regressions as
> > > > there are multiple spots that use the reserved bit to determine if a page
> > > > can be swapped out to disk, mapped as system memory, or migrated.
> > >
> > > PageReserved is meant to tell any potential pfn walkers that might get
> > > to this struct page to back off and not touch it. Even though
> > > ZONE_DEVICE doesn't online pages in traditional sense it makes those
> > > pages available for further use so the page reserved bit should be
> > > cleared.
> >
> > So from what I can tell that isn't necessarily the case. Specifically if the
> > pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both are
> > special cases where the memory may not be accessible to the CPU or cannot be
> > pinned in order to allow for eviction.
>
> Could you give me an example please?
>
> > The specific case that Dan and Yi are referring to is for the type
> > MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting the
> > reserved bit. Part of me wants to say that we should wait and clear the bit
> > later, but that would end up just adding time back to initialization. At
> > this point I would consider the change more of a follow-up optimization
> > rather than a fix though since this is tailoring things specifically for DAX
> > versus the other ZONE_DEVICE types.
>
> I thought I have already made it clear that these zone device hacks are
> not acceptable to the generic hotplug code. If the current reserve bit
> handling is not correct then give us a specific reason for that and we
> can start thinking about the proper fix.
>

Right, so we're in a situation where a hack is needed for KVM's
current interpretation of the Reserved flag relative to dax mapped
pages. I'm arguing to push that knowledge / handling as deep as
possible into the core rather than hack the leaf implementations like
KVM, i.e. disable the Reserved flag for all non-MEMORY_DEVICE_*
ZONE_DEVICE types.

Here is the KVM thread about why they need a change:

    https://lkml.org/lkml/2018/9/7/552

...and where I pushed back on a KVM-local hack:

    https://lkml.org/lkml/2018/9/19/154
Michal Hocko Oct. 10, 2018, 6:52 p.m. UTC | #33
On Wed 10-10-18 10:39:01, Alexander Duyck wrote:
> On 10/10/2018 10:24 AM, Michal Hocko wrote:
[...]
> > I thought I have already made it clear that these zone device hacks are
> > not acceptable to the generic hotplug code. If the current reserve bit
> > handling is not correct then give us a specific reason for that and we
> > can start thinking about the proper fix.
> 
> I might have misunderstood your earlier comment then. I thought you were
> saying that we shouldn't bother with setting the reserved bit. Now it sounds
> like you were thinking more along the lines of what I was here in my comment
> where I thought the bit should be cleared later in some code specifically
> related to DAX when it is exposing it for use to userspace or KVM.

It seems I managed to confuse myself completely. Sorry, it's been a long
day and I am sick so the brain doesn't work all that well. I will get
back to this tomorrow or on Friday with a fresh brain.

My recollection was that we do clear the reserved bit in
move_pfn_range_to_zone and we indeed do in __init_single_page. But then
we set the bit back right afterwards. This seems to be the case since
d0dc12e86b319 which reorganized the code. I have to study this some more
obviously.
Zhang, Yi Oct. 11, 2018, 8:17 a.m. UTC | #34
On 2018-10-10 at 08:27:58 -0700, Alexander Duyck wrote:
> 
> 
> On 10/10/2018 5:52 AM, Yi Zhang wrote:
> >On 2018-10-09 at 14:19:32 -0700, Dan Williams wrote:
> >>On Tue, Oct 9, 2018 at 1:34 PM Alexander Duyck
> >><alexander.h.duyck@linux.intel.com> wrote:
> >>>
> >>>On 10/9/2018 11:04 AM, Dan Williams wrote:
> >>>>On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang <yi.z.zhang@linux.intel.com> wrote:
> >>[..]
> >>>>That comment is incorrect, device-pages are never onlined. So I think
> >>>>we can just skip that call to __SetPageReserved() unless the memory
> >>>>range is MEMORY_DEVICE_{PRIVATE,PUBLIC}.
> >>>>
> >>>
> >>>When pages are "onlined" via __free_pages_boot_core they clear the
> >>>reserved bit, that is the reason for the comment. The reserved bit is
> >>>meant to indicate that the page cannot be swapped out or moved based on
> >>>the description of the bit.
> >>
> >>...but ZONE_DEVICE pages are never onlined so I would expect
> >>memmap_init_zone_device() to know that detail.
> >>
> >>>I would think with that being the case we still probably need the call
> >>>to __SetPageReserved to set the bit with the expectation that it will
> >>>not be cleared for device-pages since the pages are not onlined.
> >>>Removing the call to __SetPageReserved would probably introduce a number
> >>>of regressions as there are multiple spots that use the reserved bit to
> >>>determine if a page can be swapped out to disk, mapped as system memory,
> >>>or migrated.
> >
> >Another things, it seems page_init/set_reserved already been done in the
> >move_pfn_range_to_zone
> >     |-->memmap_init_zone
> >     	|-->for_each_page_in_pfn
> >		|-->__init_single_page
> >		|-->SetPageReserved
> >
> >Why we haven't remove these redundant initial in memmap_init_zone?
> >
> >Correct me if I missed something.
> 
> In this case it isn't redundant as only the vmmemmap pages are initialized
> in memmap_init_zone now. So all of the pages that are going to be used as
> device pages are not initialized until the call to memmap_init_zone_device.
> What I did is split the initialization of the pages into two parts in order
> to allow us to initialize the pages outside of the hotplug lock.
Ah.. I saw that. Thanks for the explanation — so that means we only need to
care about the device pages' reserved flag, and I plan to remove that.
> 
> >>
> >>Right, this is what Yi is working on... the PageReserved flag is
> >>problematic for KVM. Auditing those locations it seems as long as we
> >>teach hibernation to avoid ZONE_DEVICE ranges we can safely not set
> >>the reserved flag for DAX pages. What I'm trying to avoid is a local
> >>KVM hack to check for DAX pages when the Reserved flag is not
> >>otherwise needed.
> >Thanks Dan. Provide the patch link.
> >
> >https://lore.kernel.org/lkml/cover.1536342881.git.yi.z.zhang@linux.intel.com
> 
> So it looks like your current logic is just working around the bit then
> since it just allows for reserved DAX pages.
> 
> 
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
Zhang, Yi Oct. 11, 2018, 8:39 a.m. UTC | #35
On 2018-10-10 at 11:18:49 -0700, Dan Williams wrote:
> On Wed, Oct 10, 2018 at 10:30 AM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Wed 10-10-18 09:39:08, Alexander Duyck wrote:
> > > On 10/10/2018 2:58 AM, Michal Hocko wrote:
> > > > On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
> > > > [...]
> > > > > I would think with that being the case we still probably need the call to
> > > > > __SetPageReserved to set the bit with the expectation that it will not be
> > > > > cleared for device-pages since the pages are not onlined. Removing the call
> > > > > to __SetPageReserved would probably introduce a number of regressions as
> > > > > there are multiple spots that use the reserved bit to determine if a page
> > > > > can be swapped out to disk, mapped as system memory, or migrated.
> > > >
> > > > PageReserved is meant to tell any potential pfn walkers that might get
> > > > to this struct page to back off and not touch it. Even though
> > > > ZONE_DEVICE doesn't online pages in traditional sense it makes those
> > > > pages available for further use so the page reserved bit should be
> > > > cleared.
> > >
> > > So from what I can tell that isn't necessarily the case. Specifically if the
> > > pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both are
> > > special cases where the memory may not be accessible to the CPU or cannot be
> > > pinned in order to allow for eviction.
> >
> > Could you give me an example please?
> >
> > > The specific case that Dan and Yi are referring to is for the type
> > > MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting the
> > > reserved bit. Part of me wants to say that we should wait and clear the bit
> > > later, but that would end up just adding time back to initialization. At
> > > this point I would consider the change more of a follow-up optimization
> > > rather than a fix though since this is tailoring things specifically for DAX
> > > versus the other ZONE_DEVICE types.
> >
> > I thought I have already made it clear that these zone device hacks are
> > not acceptable to the generic hotplug code. If the current reserve bit
> > handling is not correct then give us a specific reason for that and we
> > can start thinking about the proper fix.
> >
> 
> Right, so we're in a situation where a hack is needed for KVM's
> current interpretation of the Reserved flag relative to dax mapped
> pages. I'm arguing to push that knowledge / handling as deep as
> possible into the core rather than hack the leaf implementations like
> KVM, i.e. disable the Reserved flag for all non-MEMORY_DEVICE_*
> ZONE_DEVICE types.
> 
> Here is the KVM thread about why they need a change:
> 
>     https://lkml.org/lkml/2018/9/7/552
> 
> ...and where I pushed back on a KVM-local hack:
> 
>     https://lkml.org/lkml/2018/9/19/154
Yeah, Thank Dan, I think I can going on with something like this:

@@ -5589,6 +5589,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		struct page *page = pfn_to_page(pfn);
 
 		__init_single_page(page, pfn, zone_idx, nid);
+		/* Could we move this a little bit earlier as I can
+		 * direct use is_dax_page(page), or something else?
+		 */
+		page->pgmap = pgmap;
 
 		/*
 		 * Mark page reserved as it will need to wait for onlining
@@ -5597,14 +5598,14 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		 * We can use the non-atomic __set_bit operation for setting
 		 * the flag as we are still initializing the pages.
 		 */
-		__SetPageReserved(page);
+		 if(!is_dax_page(page))
+			__SetPageReserved(page);
 
 		/*
 		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
 		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
 		 * page is ever freed or placed on a driver-private list.
 		 */
-		page->pgmap = pgmap;
 		page->hmm_data = 0;


After Alex's patch merged.
Michal Hocko Oct. 11, 2018, 8:55 a.m. UTC | #36
On Wed 10-10-18 20:52:42, Michal Hocko wrote:
[...]
> My recollection was that we do clear the reserved bit in
> move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> we set the bit back right afterwards. This seems to be the case since
> d0dc12e86b319 which reorganized the code. I have to study this some more
> obviously.

so my recollection was wrong and d0dc12e86b319 hasn't really changed
much because __init_single_page wouldn't zero out the struct page for
the hotplug contex. A comment in move_pfn_range_to_zone explains that we
want the reserved bit because pfn walkers already do see the pfn range
and the page is not fully associated with the zone until it is onlined.

I am thinking that we might be overzealous here. With the full state
initialized we shouldn't actually care. pfn_to_online_page should return
NULL regardless of the reserved bit and normal pfn walkers shouldn't
touch pages they do not recognize and a plain page with ref. count 1
doesn't tell much to anybody. So I _suspect_ that we can simply drop the
reserved bit setting here.

Regarding the post initialization required by devm_memremap_pages and
potentially others. Can we update the altmap which is already a way how
to get alternative struct pages by a constructor which we could call
from memmap_init_zone and do the post initialization? This would reduce
the additional loop in the caller while it would still fit the overall
design of the altmap and the core hotplug doesn't have to know anything
about DAX or whatever needs a special treatment.

Does that make any sense?
Alexander Duyck Oct. 11, 2018, 3:38 p.m. UTC | #37
On 10/11/2018 1:39 AM, Yi Zhang wrote:
> On 2018-10-10 at 11:18:49 -0700, Dan Williams wrote:
>> On Wed, Oct 10, 2018 at 10:30 AM Michal Hocko <mhocko@kernel.org> wrote:
>>>
>>> On Wed 10-10-18 09:39:08, Alexander Duyck wrote:
>>>> On 10/10/2018 2:58 AM, Michal Hocko wrote:
>>>>> On Tue 09-10-18 13:26:41, Alexander Duyck wrote:
>>>>> [...]
>>>>>> I would think with that being the case we still probably need the call to
>>>>>> __SetPageReserved to set the bit with the expectation that it will not be
>>>>>> cleared for device-pages since the pages are not onlined. Removing the call
>>>>>> to __SetPageReserved would probably introduce a number of regressions as
>>>>>> there are multiple spots that use the reserved bit to determine if a page
>>>>>> can be swapped out to disk, mapped as system memory, or migrated.
>>>>>
>>>>> PageReserved is meant to tell any potential pfn walkers that might get
>>>>> to this struct page to back off and not touch it. Even though
>>>>> ZONE_DEVICE doesn't online pages in traditional sense it makes those
>>>>> pages available for further use so the page reserved bit should be
>>>>> cleared.
>>>>
>>>> So from what I can tell that isn't necessarily the case. Specifically if the
>>>> pagemap type is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_PUBLIC both are
>>>> special cases where the memory may not be accessible to the CPU or cannot be
>>>> pinned in order to allow for eviction.
>>>
>>> Could you give me an example please?
>>>
>>>> The specific case that Dan and Yi are referring to is for the type
>>>> MEMORY_DEVICE_FS_DAX. For that type I could probably look at not setting the
>>>> reserved bit. Part of me wants to say that we should wait and clear the bit
>>>> later, but that would end up just adding time back to initialization. At
>>>> this point I would consider the change more of a follow-up optimization
>>>> rather than a fix though since this is tailoring things specifically for DAX
>>>> versus the other ZONE_DEVICE types.
>>>
>>> I thought I have already made it clear that these zone device hacks are
>>> not acceptable to the generic hotplug code. If the current reserve bit
>>> handling is not correct then give us a specific reason for that and we
>>> can start thinking about the proper fix.
>>>
>>
>> Right, so we're in a situation where a hack is needed for KVM's
>> current interpretation of the Reserved flag relative to dax mapped
>> pages. I'm arguing to push that knowledge / handling as deep as
>> possible into the core rather than hack the leaf implementations like
>> KVM, i.e. disable the Reserved flag for all non-MEMORY_DEVICE_*
>> ZONE_DEVICE types.
>>
>> Here is the KVM thread about why they need a change:
>>
>>      https://lkml.org/lkml/2018/9/7/552
>>
>> ...and where I pushed back on a KVM-local hack:
>>
>>      https://lkml.org/lkml/2018/9/19/154
> Yeah, Thank Dan, I think I can going on with something like this:
> 
> @@ -5589,6 +5589,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
>   		struct page *page = pfn_to_page(pfn);
>   
>   		__init_single_page(page, pfn, zone_idx, nid);
> +		/* Could we move this a little bit earlier as I can
> +		 * direct use is_dax_page(page), or something else?
> +		 */
> +		page->pgmap = pgmap;
>   
>   		/*
>   		 * Mark page reserved as it will need to wait for onlining
> @@ -5597,14 +5598,14 @@ void __ref memmap_init_zone_device(struct zone *zone,
>   		 * We can use the non-atomic __set_bit operation for setting
>   		 * the flag as we are still initializing the pages.
>   		 */
> -		__SetPageReserved(page);
> +		 if(!is_dax_page(page))
> +			__SetPageReserved(page);
>   
>   		/*
>   		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
>   		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
>   		 * page is ever freed or placed on a driver-private list.
>   		 */
> -		page->pgmap = pgmap;
>   		page->hmm_data = 0;
> 
> 
> After Alex's patch merged.

So I am not a huge fan of splitting up the pgmap init from the hmm_data, 
but I suppose this is just for your proof-of-concept?

I already have another patch set outstanding that may actually make this 
change easier[1]. What I could do is add the logic there based on the 
pgmap.type as an additional patch since I pass a boolean to determine if 
I am setting the reserved bit or not.

[1] 
https://lore.kernel.org/lkml/20181005151006.17473.83040.stgit@localhost.localdomain/
Alexander Duyck Oct. 11, 2018, 5:38 p.m. UTC | #38
On 10/11/2018 1:55 AM, Michal Hocko wrote:
> On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> [...]
>> My recollection was that we do clear the reserved bit in
>> move_pfn_range_to_zone and we indeed do in __init_single_page. But then
>> we set the bit back right afterwards. This seems to be the case since
>> d0dc12e86b319 which reorganized the code. I have to study this some more
>> obviously.
> 
> so my recollection was wrong and d0dc12e86b319 hasn't really changed
> much because __init_single_page wouldn't zero out the struct page for
> the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> want the reserved bit because pfn walkers already do see the pfn range
> and the page is not fully associated with the zone until it is onlined.
> 
> I am thinking that we might be overzealous here. With the full state
> initialized we shouldn't actually care. pfn_to_online_page should return
> NULL regardless of the reserved bit and normal pfn walkers shouldn't
> touch pages they do not recognize and a plain page with ref. count 1
> doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> reserved bit setting here.

So this has me a bit hesitant to want to just drop the bit entirely. If 
nothing else I think I may want to make that a patch unto itself so that 
if we aren't going to set it we just drop it there. That way if it does 
cause issues we can bisect it to that patch and pinpoint the cause.

> Regarding the post initialization required by devm_memremap_pages and
> potentially others. Can we update the altmap which is already a way how
> to get alternative struct pages by a constructor which we could call
> from memmap_init_zone and do the post initialization? This would reduce
> the additional loop in the caller while it would still fit the overall
> design of the altmap and the core hotplug doesn't have to know anything
> about DAX or whatever needs a special treatment.
> 
> Does that make any sense?

I think the only thing that is currently using the altmap is the 
ZONE_DEVICE memory init. Specifically I think it is only really used by 
the devm_memremap_pages version of things, and then only under certain 
circumstances. Also the HMM driver doesn't pass an altmap. What we would 
really need are non-ZONE_DEVICE users of the altmap to really justify 
sticking with that as the preferred argument to pass.

For those two functions it currently makes much more sense to pass the 
dev_pagemap pointer and then reference the altmap from there. Otherwise 
we are likely starting to look at something that would be more of a 
dirty hack where we are passing a unused altmap in order to get to the 
dev_pagemap so that we could populate the page.
Dan Williams Oct. 11, 2018, 6:22 p.m. UTC | #39
On Thu, Oct 11, 2018 at 10:39 AM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> On 10/11/2018 1:55 AM, Michal Hocko wrote:
> > On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> > [...]
> >> My recollection was that we do clear the reserved bit in
> >> move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> >> we set the bit back right afterwards. This seems to be the case since
> >> d0dc12e86b319 which reorganized the code. I have to study this some more
> >> obviously.
> >
> > so my recollection was wrong and d0dc12e86b319 hasn't really changed
> > much because __init_single_page wouldn't zero out the struct page for
> > the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> > want the reserved bit because pfn walkers already do see the pfn range
> > and the page is not fully associated with the zone until it is onlined.
> >
> > I am thinking that we might be overzealous here. With the full state
> > initialized we shouldn't actually care. pfn_to_online_page should return
> > NULL regardless of the reserved bit and normal pfn walkers shouldn't
> > touch pages they do not recognize and a plain page with ref. count 1
> > doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> > reserved bit setting here.
>
> So this has me a bit hesitant to want to just drop the bit entirely. If
> nothing else I think I may wan to make that a patch onto itself so that
> if we aren't going to set it we just drop it there. That way if it does
> cause issues we can bisect it to that patch and pinpoint the cause.
>
> > Regarding the post initialization required by devm_memremap_pages and
> > potentially others. Can we update the altmap which is already a way how
> > to get alternative struct pages by a constructor which we could call
> > from memmap_init_zone and do the post initialization? This would reduce
> > the additional loop in the caller while it would still fit the overall
> > design of the altmap and the core hotplug doesn't have to know anything
> > about DAX or whatever needs a special treatment.
> >
> > Does that make any sense?
>
> I think the only thing that is currently using the altmap is the
> ZONE_DEVICE memory init. Specifically I think it is only really used by
> the devm_memremap_pages version of things, and then only under certain
> circumstances. Also the HMM driver doesn't pass an altmap. What we would
> really need is a non-ZONE_DEVICE users of the altmap to really justify
> sticking with that as the preferred argument to pass.

Right, the altmap is optional. It's only there to direct the memmap
array to be allocated from the memory-range being hot-added vs a
dynamic page-allocator allocation from System-RAM.

> For those two functions it currently makes much more sense to pass the
> dev_pagemap pointer and then reference the altmap from there. Otherwise
> we are likely starting to look at something that would be more of a
> dirty hack where we are passing a unused altmap in order to get to the
> dev_pagemap so that we could populate the page.

Yeah, we can't rely on the altmap, it's marked invalid in many cases.
Michal Hocko Oct. 17, 2018, 7:52 a.m. UTC | #40
On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
> On 10/11/2018 1:55 AM, Michal Hocko wrote:
> > On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> > [...]
> > > My recollection was that we do clear the reserved bit in
> > > move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> > > we set the bit back right afterwards. This seems to be the case since
> > > d0dc12e86b319 which reorganized the code. I have to study this some more
> > > obviously.
> > 
> > so my recollection was wrong and d0dc12e86b319 hasn't really changed
> > much because __init_single_page wouldn't zero out the struct page for
> > the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> > want the reserved bit because pfn walkers already do see the pfn range
> > and the page is not fully associated with the zone until it is onlined.
> > 
> > I am thinking that we might be overzealous here. With the full state
> > initialized we shouldn't actually care. pfn_to_online_page should return
> > NULL regardless of the reserved bit and normal pfn walkers shouldn't
> > touch pages they do not recognize and a plain page with ref. count 1
> > doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> > reserved bit setting here.
> 
> So this has me a bit hesitant to want to just drop the bit entirely. If
> nothing else I think I may wan to make that a patch onto itself so that if
> we aren't going to set it we just drop it there. That way if it does cause
> issues we can bisect it to that patch and pinpoint the cause.

Yes, a patch on its own makes sense for bisectability.

> > Regarding the post initialization required by devm_memremap_pages and
> > potentially others. Can we update the altmap which is already a way how
> > to get alternative struct pages by a constructor which we could call
> > from memmap_init_zone and do the post initialization? This would reduce
> > the additional loop in the caller while it would still fit the overall
> > design of the altmap and the core hotplug doesn't have to know anything
> > about DAX or whatever needs a special treatment.
> > 
> > Does that make any sense?
> 
> I think the only thing that is currently using the altmap is the ZONE_DEVICE
> memory init. Specifically I think it is only really used by the
> devm_memremap_pages version of things, and then only under certain
> circumstances. Also the HMM driver doesn't pass an altmap. What we would
> really need is a non-ZONE_DEVICE users of the altmap to really justify
> sticking with that as the preferred argument to pass.

I am not aware of any upstream HMM user so I am not sure what are the
expectations there. But I thought that ZONE_DEVICE users use altmap. If
that is not generally true then we certainly have to think about a
better interface.
 
> For those two functions it currently makes much more sense to pass the
> dev_pagemap pointer and then reference the altmap from there. Otherwise we
> are likely starting to look at something that would be more of a dirty hack
> where we are passing a unused altmap in order to get to the dev_pagemap so
> that we could populate the page.

If dev_pagemap is a general abstraction then I agree.
Alexander Duyck Oct. 17, 2018, 3:02 p.m. UTC | #41
On 10/17/2018 12:52 AM, Michal Hocko wrote:
> On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
>> On 10/11/2018 1:55 AM, Michal Hocko wrote:
>>> On Wed 10-10-18 20:52:42, Michal Hocko wrote:
>>> [...]
>>>> My recollection was that we do clear the reserved bit in
>>>> move_pfn_range_to_zone and we indeed do in __init_single_page. But then
>>>> we set the bit back right afterwards. This seems to be the case since
>>>> d0dc12e86b319 which reorganized the code. I have to study this some more
>>>> obviously.
>>>
>>> so my recollection was wrong and d0dc12e86b319 hasn't really changed
>>> much because __init_single_page wouldn't zero out the struct page for
>>> the hotplug contex. A comment in move_pfn_range_to_zone explains that we
>>> want the reserved bit because pfn walkers already do see the pfn range
>>> and the page is not fully associated with the zone until it is onlined.
>>>
>>> I am thinking that we might be overzealous here. With the full state
>>> initialized we shouldn't actually care. pfn_to_online_page should return
>>> NULL regardless of the reserved bit and normal pfn walkers shouldn't
>>> touch pages they do not recognize and a plain page with ref. count 1
>>> doesn't tell much to anybody. So I _suspect_ that we can simply drop the
>>> reserved bit setting here.
>>
>> So this has me a bit hesitant to want to just drop the bit entirely. If
>> nothing else I think I may wan to make that a patch onto itself so that if
>> we aren't going to set it we just drop it there. That way if it does cause
>> issues we can bisect it to that patch and pinpoint the cause.
> 
> Yes a patch on its own make sense for bisectability.

For now I think I am going to back off of this. There is a bunch of 
other changes that need to happen in order for us to make this work. As 
far as I can tell there are several places that are relying on this 
reserved bit.

>>> Regarding the post initialization required by devm_memremap_pages and
>>> potentially others. Can we update the altmap which is already a way how
>>> to get alternative struct pages by a constructor which we could call
>>> from memmap_init_zone and do the post initialization? This would reduce
>>> the additional loop in the caller while it would still fit the overall
>>> design of the altmap and the core hotplug doesn't have to know anything
>>> about DAX or whatever needs a special treatment.
>>>
>>> Does that make any sense?
>>
>> I think the only thing that is currently using the altmap is the ZONE_DEVICE
>> memory init. Specifically I think it is only really used by the
>> devm_memremap_pages version of things, and then only under certain
>> circumstances. Also the HMM driver doesn't pass an altmap. What we would
>> really need is a non-ZONE_DEVICE users of the altmap to really justify
>> sticking with that as the preferred argument to pass.
> 
> I am not aware of any upstream HMM user so I am not sure what are the
> expectations there. But I thought that ZONE_DEVICE users use altmap. If
> that is not generally true then we certainly have to think about a
> better interface.

I'm just basing my statement on the use of the move_pfn_range_to_zone 
call. The only caller that is actually passing the altmap is 
devm_memremap_pages and if I understand things correctly that is only 
used when we want to store the vmemmap on the same memory that we just
hotplugged.

That is why it made more sense to me to just create a ZONE_DEVICE 
specific function for handling the page initialization because the one 
value I do have to pass is the dev_pagemap in both HMM and memremap 
case, and that has the altmap already embedded inside of it.

>> For those two functions it currently makes much more sense to pass the
>> dev_pagemap pointer and then reference the altmap from there. Otherwise we
>> are likely starting to look at something that would be more of a dirty hack
>> where we are passing a unused altmap in order to get to the dev_pagemap so
>> that we could populate the page.
> 
> If dev_pagemap is a general abstraction then I agree.

Well so far HMM and the memremap code have both agreed to use that 
structure to store the metadata for ZONE_DEVICE mappings, and at this 
point we are already looking at 3 different memory types being stored 
within that zone as we already have the private, public, and DAX memory 
types all using this structure.
Michal Hocko Oct. 29, 2018, 2:12 p.m. UTC | #42
On Wed 17-10-18 08:02:20, Alexander Duyck wrote:
> On 10/17/2018 12:52 AM, Michal Hocko wrote:
> > On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
> > > On 10/11/2018 1:55 AM, Michal Hocko wrote:
> > > > On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> > > > [...]
> > > > > My recollection was that we do clear the reserved bit in
> > > > > move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> > > > > we set the bit back right afterwards. This seems to be the case since
> > > > > d0dc12e86b319 which reorganized the code. I have to study this some more
> > > > > obviously.
> > > > 
> > > > so my recollection was wrong and d0dc12e86b319 hasn't really changed
> > > > much because __init_single_page wouldn't zero out the struct page for
> > > > the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> > > > want the reserved bit because pfn walkers already do see the pfn range
> > > > and the page is not fully associated with the zone until it is onlined.
> > > > 
> > > > I am thinking that we might be overzealous here. With the full state
> > > > initialized we shouldn't actually care. pfn_to_online_page should return
> > > > NULL regardless of the reserved bit and normal pfn walkers shouldn't
> > > > touch pages they do not recognize and a plain page with ref. count 1
> > > > doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> > > > reserved bit setting here.
> > > 
> > > So this has me a bit hesitant to want to just drop the bit entirely. If
> > > nothing else I think I may wan to make that a patch onto itself so that if
> > > we aren't going to set it we just drop it there. That way if it does cause
> > > issues we can bisect it to that patch and pinpoint the cause.
> > 
> > Yes a patch on its own make sense for bisectability.
> 
> For now I think I am going to back off of this. There is a bunch of other
> changes that need to happen in order for us to make this work. As far as I
> can tell there are several places that are relying on this reserved bit.

Please be more specific. Unless I misremember, I have added this
PageReserved just to be sure (f1dd2cd13c4bb) because pages were just
half initialized back then. I am not aware anybody is depending on this.
If there is somebody then be explicit about that. The last thing I want
to see is to preserve a cargo cult and build a design around it.

> > > > Regarding the post initialization required by devm_memremap_pages and
> > > > potentially others. Can we update the altmap which is already a way how
> > > > to get alternative struct pages by a constructor which we could call
> > > > from memmap_init_zone and do the post initialization? This would reduce
> > > > the additional loop in the caller while it would still fit the overall
> > > > design of the altmap and the core hotplug doesn't have to know anything
> > > > about DAX or whatever needs a special treatment.
> > > > 
> > > > Does that make any sense?
> > > 
> > > I think the only thing that is currently using the altmap is the ZONE_DEVICE
> > > memory init. Specifically I think it is only really used by the
> > > devm_memremap_pages version of things, and then only under certain
> > > circumstances. Also the HMM driver doesn't pass an altmap. What we would
> > > really need is a non-ZONE_DEVICE users of the altmap to really justify
> > > sticking with that as the preferred argument to pass.
> > 
> > I am not aware of any upstream HMM user so I am not sure what are the
> > expectations there. But I thought that ZONE_DEVICE users use altmap. If
> > that is not generally true then we certainly have to think about a
> > better interface.
> 
> I'm just basing my statement on the use of the move_pfn_range_to_zone call.
> The only caller that is actually passing the altmap is devm_memremap_pages
> and if I understand things correctly that is only used when we want to stare
> the vmmemmap on the same memory that we just hotplugged.

Yes, and that is what I've called as allocator callback earlier.

> That is why it made more sense to me to just create a ZONE_DEVICE specific
> function for handling the page initialization because the one value I do
> have to pass is the dev_pagemap in both HMM and memremap case, and that has
> the altmap already embedded inside of it.

And I have argued that this is a wrong approach to the problem. If you
need a very specific struct page initialization then create a init
(constructor) callback.
Dan Williams Oct. 29, 2018, 3:49 p.m. UTC | #43
On Wed, Oct 17, 2018 at 8:02 AM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> On 10/17/2018 12:52 AM, Michal Hocko wrote:
> > On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
> >> On 10/11/2018 1:55 AM, Michal Hocko wrote:
> >>> On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> >>> [...]
> >>>> My recollection was that we do clear the reserved bit in
> >>>> move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> >>>> we set the bit back right afterwards. This seems to be the case since
> >>>> d0dc12e86b319 which reorganized the code. I have to study this some more
> >>>> obviously.
> >>>
> >>> so my recollection was wrong and d0dc12e86b319 hasn't really changed
> >>> much because __init_single_page wouldn't zero out the struct page for
> >>> the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> >>> want the reserved bit because pfn walkers already do see the pfn range
> >>> and the page is not fully associated with the zone until it is onlined.
> >>>
> >>> I am thinking that we might be overzealous here. With the full state
> >>> initialized we shouldn't actually care. pfn_to_online_page should return
> >>> NULL regardless of the reserved bit and normal pfn walkers shouldn't
> >>> touch pages they do not recognize and a plain page with ref. count 1
> >>> doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> >>> reserved bit setting here.
> >>
> >> So this has me a bit hesitant to want to just drop the bit entirely. If
> >> nothing else I think I may wan to make that a patch onto itself so that if
> >> we aren't going to set it we just drop it there. That way if it does cause
> >> issues we can bisect it to that patch and pinpoint the cause.
> >
> > Yes a patch on its own make sense for bisectability.
>
> For now I think I am going to back off of this. There is a bunch of
> other changes that need to happen in order for us to make this work. As
> far as I can tell there are several places that are relying on this
> reserved bit.

When David Hildenbrand and I looked it was only the hibernation code
that we thought needed changing. We either need to audit the removal
or go back to adding a special case hack for kvm because this is a
blocking issue for them.

What do you see beyond the hibernation change?
Michal Hocko Oct. 29, 2018, 3:56 p.m. UTC | #44
On Mon 29-10-18 08:49:46, Dan Williams wrote:
> On Wed, Oct 17, 2018 at 8:02 AM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
> >
> > On 10/17/2018 12:52 AM, Michal Hocko wrote:
> > > On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
> > >> On 10/11/2018 1:55 AM, Michal Hocko wrote:
> > >>> On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> > >>> [...]
> > >>>> My recollection was that we do clear the reserved bit in
> > >>>> move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> > >>>> we set the bit back right afterwards. This seems to be the case since
> > >>>> d0dc12e86b319 which reorganized the code. I have to study this some more
> > >>>> obviously.
> > >>>
> > >>> so my recollection was wrong and d0dc12e86b319 hasn't really changed
> > >>> much because __init_single_page wouldn't zero out the struct page for
> > >>> the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> > >>> want the reserved bit because pfn walkers already do see the pfn range
> > >>> and the page is not fully associated with the zone until it is onlined.
> > >>>
> > >>> I am thinking that we might be overzealous here. With the full state
> > >>> initialized we shouldn't actually care. pfn_to_online_page should return
> > >>> NULL regardless of the reserved bit and normal pfn walkers shouldn't
> > >>> touch pages they do not recognize and a plain page with ref. count 1
> > >>> doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> > >>> reserved bit setting here.
> > >>
> > >> So this has me a bit hesitant to want to just drop the bit entirely. If
> > >> nothing else I think I may wan to make that a patch onto itself so that if
> > >> we aren't going to set it we just drop it there. That way if it does cause
> > >> issues we can bisect it to that patch and pinpoint the cause.
> > >
> > > Yes a patch on its own make sense for bisectability.
> >
> > For now I think I am going to back off of this. There is a bunch of
> > other changes that need to happen in order for us to make this work. As
> > far as I can tell there are several places that are relying on this
> > reserved bit.
> 
> When David Hildebrand and I looked it was only the hibernation code
> that we thought needed changing.

More details please?
Alexander Duyck Oct. 29, 2018, 3:59 p.m. UTC | #45
On Mon, 2018-10-29 at 15:12 +0100, Michal Hocko wrote:
> On Wed 17-10-18 08:02:20, Alexander Duyck wrote:
> > On 10/17/2018 12:52 AM, Michal Hocko wrote:
> > > On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
> > > > On 10/11/2018 1:55 AM, Michal Hocko wrote:
> > > > > On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> > > > > [...]
> > > > > > My recollection was that we do clear the reserved bit in
> > > > > > move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> > > > > > we set the bit back right afterwards. This seems to be the case since
> > > > > > d0dc12e86b319 which reorganized the code. I have to study this some more
> > > > > > obviously.
> > > > > 
> > > > > so my recollection was wrong and d0dc12e86b319 hasn't really changed
> > > > > much because __init_single_page wouldn't zero out the struct page for
> > > > > the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> > > > > want the reserved bit because pfn walkers already do see the pfn range
> > > > > and the page is not fully associated with the zone until it is onlined.
> > > > > 
> > > > > I am thinking that we might be overzealous here. With the full state
> > > > > initialized we shouldn't actually care. pfn_to_online_page should return
> > > > > NULL regardless of the reserved bit and normal pfn walkers shouldn't
> > > > > touch pages they do not recognize and a plain page with ref. count 1
> > > > > doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> > > > > reserved bit setting here.
> > > > 
> > > > So this has me a bit hesitant to want to just drop the bit entirely. If
> > > > nothing else I think I may wan to make that a patch onto itself so that if
> > > > we aren't going to set it we just drop it there. That way if it does cause
> > > > issues we can bisect it to that patch and pinpoint the cause.
> > > 
> > > Yes a patch on its own make sense for bisectability.
> > 
> > For now I think I am going to back off of this. There is a bunch of other
> > changes that need to happen in order for us to make this work. As far as I
> > can tell there are several places that are relying on this reserved bit.
> 
> Please be more specific. Unless I misremember, I have added this
> PageReserved just to be sure (f1dd2cd13c4bb) because pages where just
> half initialized back then. I am not aware anybody is depending on this.
> If there is somebody then be explicit about that. The last thing I want
> to see is to preserve a cargo cult and build a design around it.

It is mostly just a matter of going through and auditing all the
places that are using PageReserved to identify pages that they aren't
supposed to touch for whatever reason.

From what I can tell the issue appears to be the fact that the reserved
bit is used to identify if a region of memory is "online" or "offline".
So for example the call "online_pages_range" doesn't invoke the
online_page_callback unless the first pfn in the range is marked as
reserved.

Another example Dan had pointed out was the saveable_page function in
kernel/power/snapshot.c.

> > > > > Regarding the post initialization required by devm_memremap_pages and
> > > > > potentially others. Can we update the altmap which is already a way how
> > > > > to get alternative struct pages by a constructor which we could call
> > > > > from memmap_init_zone and do the post initialization? This would reduce
> > > > > the additional loop in the caller while it would still fit the overall
> > > > > design of the altmap and the core hotplug doesn't have to know anything
> > > > > about DAX or whatever needs a special treatment.
> > > > > 
> > > > > Does that make any sense?
> > > > 
> > > > I think the only thing that is currently using the altmap is the ZONE_DEVICE
> > > > memory init. Specifically I think it is only really used by the
> > > > devm_memremap_pages version of things, and then only under certain
> > > > circumstances. Also the HMM driver doesn't pass an altmap. What we would
> > > > really need is a non-ZONE_DEVICE users of the altmap to really justify
> > > > sticking with that as the preferred argument to pass.
> > > 
> > > I am not aware of any upstream HMM user so I am not sure what are the
> > > expectations there. But I thought that ZONE_DEVICE users use altmap. If
> > > that is not generally true then we certainly have to think about a
> > > better interface.
> > 
> > I'm just basing my statement on the use of the move_pfn_range_to_zone call.
> > The only caller that is actually passing the altmap is devm_memremap_pages
> > and if I understand things correctly that is only used when we want to stare
> > the vmmemmap on the same memory that we just hotplugged.
> 
> Yes, and that is what I've called as allocator callback earlier.

I am really not a fan of the callback approach. It just means we will
have to do everything multiple times in terms of initialization.

> > That is why it made more sense to me to just create a ZONE_DEVICE specific
> > function for handling the page initialization because the one value I do
> > have to pass is the dev_pagemap in both HMM and memremap case, and that has
> > the altmap already embedded inside of it.
> 
> And I have argued that this is a wrong approach to the problem. If you
> need a very specific struct page initialization then create a init
> (constructor) callback.

The callback solution just ends up being more expensive because we lose
multiple layers of possible optimization. By putting everything into one
initialization function we are able to let the compiler go through and
optimize things to the point where we are essentially just doing
something akin to one bit memcpy/memset where we are able to construct
one set of page values and write that to every single page we have to
initialize within a given page block.

My concern is that we are going to see a 2-4x regression if I were to
update the current patches I have to improve init performance in order
to achieve the purity of the page initialization functions that you are
looking for. I feel we are much better off having one function that can
handle all cases and do so with high performance, than trying to
construct a set of functions that end up having to reinitialize the
same memory from the previous step and end up with us wasting cycles
and duplicating overhead in multiple spots.

In my mind the memmap_init_zone_device function is essentially just
bringing the pages "online" after they have been mapped. That is why I
am thinking it is probably okay to clear the reserved bit for the DAX
pages at least. Once we have a hard go/no-go on Dan's patches that were
consolidating the HMM functionality we could look at seeing if we need
to move some functions around and what we can do to make it so that all
the ZONE_DEVICE code can be moved as far out of the generic page init
as possible while still maintaining reasonable initialization
performance.
Michal Hocko Oct. 29, 2018, 4:35 p.m. UTC | #46
On Mon 29-10-18 08:59:34, Alexander Duyck wrote:
> On Mon, 2018-10-29 at 15:12 +0100, Michal Hocko wrote:
> > On Wed 17-10-18 08:02:20, Alexander Duyck wrote:
> > > On 10/17/2018 12:52 AM, Michal Hocko wrote:
> > > > On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
> > > > > On 10/11/2018 1:55 AM, Michal Hocko wrote:
> > > > > > On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> > > > > > [...]
> > > > > > > My recollection was that we do clear the reserved bit in
> > > > > > > move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> > > > > > > we set the bit back right afterwards. This seems to be the case since
> > > > > > > d0dc12e86b319 which reorganized the code. I have to study this some more
> > > > > > > obviously.
> > > > > > 
> > > > > > so my recollection was wrong and d0dc12e86b319 hasn't really changed
> > > > > > much because __init_single_page wouldn't zero out the struct page for
> > > > > > the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> > > > > > want the reserved bit because pfn walkers already do see the pfn range
> > > > > > and the page is not fully associated with the zone until it is onlined.
> > > > > > 
> > > > > > I am thinking that we might be overzealous here. With the full state
> > > > > > initialized we shouldn't actually care. pfn_to_online_page should return
> > > > > > NULL regardless of the reserved bit and normal pfn walkers shouldn't
> > > > > > touch pages they do not recognize and a plain page with ref. count 1
> > > > > > doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> > > > > > reserved bit setting here.
> > > > > 
> > > > > So this has me a bit hesitant to want to just drop the bit entirely. If
> > > > > nothing else I think I may wan to make that a patch onto itself so that if
> > > > > we aren't going to set it we just drop it there. That way if it does cause
> > > > > issues we can bisect it to that patch and pinpoint the cause.
> > > > 
> > > > Yes a patch on its own make sense for bisectability.
> > > 
> > > For now I think I am going to back off of this. There is a bunch of other
> > > changes that need to happen in order for us to make this work. As far as I
> > > can tell there are several places that are relying on this reserved bit.
> > 
> > Please be more specific. Unless I misremember, I have added this
> > PageReserved just to be sure (f1dd2cd13c4bb) because pages where just
> > half initialized back then. I am not aware anybody is depending on this.
> > If there is somebody then be explicit about that. The last thing I want
> > to see is to preserve a cargo cult and build a design around it.
> 
> It is mostly just a matter of going through and auditing all the
> places that are using PageReserved to identify pages that they aren't
> supposed to touch for whatever reason.
>
> From what I can tell the issue appears to be the fact that the reserved
> bit is used to identify if a region of memory is "online" or "offline".

No, this is wrong. pfn_to_online_page does that. PageReserved has
nothing to do with online vs. offline status. It merely says that you
shouldn't touch the page unless you own it. Sure we might have a few
places relying on it but nothing should really depend the reserved bit
check from the MM hotplug POV.

> So for example the call "online_pages_range" doesn't invoke the
> online_page_callback unless the first pfn in the range is marked as
> reserved.

Yes and there is no fundamental reason to do that. We can easily check
the online status without that.

> Another example Dan had pointed out was the saveable_page function in
> kernel/power/snapshot.c.

Use pfn_to_online_page there.

> > > > > > Regarding the post initialization required by devm_memremap_pages and
> > > > > > potentially others. Can we update the altmap which is already a way how
> > > > > > to get alternative struct pages by a constructor which we could call
> > > > > > from memmap_init_zone and do the post initialization? This would reduce
> > > > > > the additional loop in the caller while it would still fit the overall
> > > > > > design of the altmap and the core hotplug doesn't have to know anything
> > > > > > about DAX or whatever needs a special treatment.
> > > > > > 
> > > > > > Does that make any sense?
> > > > > 
> > > > > I think the only thing that is currently using the altmap is the ZONE_DEVICE
> > > > > memory init. Specifically I think it is only really used by the
> > > > > devm_memremap_pages version of things, and then only under certain
> > > > > circumstances. Also the HMM driver doesn't pass an altmap. What we would
> > > > > really need is a non-ZONE_DEVICE users of the altmap to really justify
> > > > > sticking with that as the preferred argument to pass.
> > > > 
> > > > I am not aware of any upstream HMM user so I am not sure what are the
> > > > expectations there. But I thought that ZONE_DEVICE users use altmap. If
> > > > that is not generally true then we certainly have to think about a
> > > > better interface.
> > > 
> > > I'm just basing my statement on the use of the move_pfn_range_to_zone call.
> > > The only caller that is actually passing the altmap is devm_memremap_pages
> > > and if I understand things correctly that is only used when we want to stare
> > > the vmmemmap on the same memory that we just hotplugged.
> > 
> > Yes, and that is what I've called as allocator callback earlier.
> 
> I am really not a fan of the callback approach. It just means we will
> have to do everything multiple times in terms of initialization.

I do not follow. Could you elaborate?

> > > That is why it made more sense to me to just create a ZONE_DEVICE specific
> > > function for handling the page initialization because the one value I do
> > > have to pass is the dev_pagemap in both HMM and memremap case, and that has
> > > the altmap already embedded inside of it.
> > 
> > And I have argued that this is a wrong approach to the problem. If you
> > need a very specific struct page initialization then create a init
> > (constructor) callback.
> 
> The callback solution just ends up being more expensive because we lose
> multiple layers of possible optimization. By putting everything into on
> initization function we are able to let the compiler go through and
> optimize things to the point where we are essentially just doing
> something akin to one bit memcpy/memset where we are able to construct
> one set of page values and write that to every single page we have to
> initialize within a given page block.

You are already doing per-page initialization so I fail to see a larger
unit to operate on.

> My concern is that we are going to see a 2-4x regression if I were to
> update the current patches I have to improve init performance in order
> to achieve the purity of the page initilization functions that you are
> looking for. I feel we are much better off having one function that can
> handle all cases and do so with high performance, than trying to
> construct a set of functions that end up having to reinitialize the
> same memory from the previous step and end up with us wasting cycles
> and duplicating overhead in multiple spots.

The memory hotplug is just one pile of unmaintainable mess mostly because
of this kind of attitude. You just care about _your_ particular usecase
and not a wee bit beyond that.
Alexander Duyck Oct. 29, 2018, 5:01 p.m. UTC | #47
On Mon, 2018-10-29 at 17:35 +0100, Michal Hocko wrote:
> On Mon 29-10-18 08:59:34, Alexander Duyck wrote:
> > On Mon, 2018-10-29 at 15:12 +0100, Michal Hocko wrote:
> > > On Wed 17-10-18 08:02:20, Alexander Duyck wrote:
> > > > On 10/17/2018 12:52 AM, Michal Hocko wrote:
> > > > > On Thu 11-10-18 10:38:39, Alexander Duyck wrote:
> > > > > > On 10/11/2018 1:55 AM, Michal Hocko wrote:
> > > > > > > On Wed 10-10-18 20:52:42, Michal Hocko wrote:
> > > > > > > [...]
> > > > > > > > My recollection was that we do clear the reserved bit in
> > > > > > > > move_pfn_range_to_zone and we indeed do in __init_single_page. But then
> > > > > > > > we set the bit back right afterwards. This seems to be the case since
> > > > > > > > d0dc12e86b319 which reorganized the code. I have to study this some more
> > > > > > > > obviously.
> > > > > > > 
> > > > > > > so my recollection was wrong and d0dc12e86b319 hasn't really changed
> > > > > > > much because __init_single_page wouldn't zero out the struct page for
> > > > > > > the hotplug contex. A comment in move_pfn_range_to_zone explains that we
> > > > > > > want the reserved bit because pfn walkers already do see the pfn range
> > > > > > > and the page is not fully associated with the zone until it is onlined.
> > > > > > > 
> > > > > > > I am thinking that we might be overzealous here. With the full state
> > > > > > > initialized we shouldn't actually care. pfn_to_online_page should return
> > > > > > > NULL regardless of the reserved bit and normal pfn walkers shouldn't
> > > > > > > touch pages they do not recognize and a plain page with ref. count 1
> > > > > > > doesn't tell much to anybody. So I _suspect_ that we can simply drop the
> > > > > > > reserved bit setting here.
> > > > > > 
> > > > > > So this has me a bit hesitant to want to just drop the bit entirely. If
> > > > > > nothing else I think I may wan to make that a patch onto itself so that if
> > > > > > we aren't going to set it we just drop it there. That way if it does cause
> > > > > > issues we can bisect it to that patch and pinpoint the cause.
> > > > > 
> > > > > Yes a patch on its own make sense for bisectability.
> > > > 
> > > > For now I think I am going to back off of this. There is a bunch of other
> > > > changes that need to happen in order for us to make this work. As far as I
> > > > can tell there are several places that are relying on this reserved bit.
> > > 
> > > Please be more specific. Unless I misremember, I have added this
> > > PageReserved just to be sure (f1dd2cd13c4bb) because pages where just
> > > half initialized back then. I am not aware anybody is depending on this.
> > > If there is somebody then be explicit about that. The last thing I want
> > > to see is to preserve a cargo cult and build a design around it.
> > 
> > It is mostly just a matter of going through and auditing all the
> > places that are using PageReserved to identify pages that they aren't
> > supposed to touch for whatever reason.
> > 
> > From what I can tell the issue appears to be the fact that the reserved
> > bit is used to identify if a region of memory is "online" or "offline".
> 
> No, this is wrong. pfn_to_online_page does that. PageReserved has
> nothing to do with online vs. offline status. It merely says that you
> shouldn't touch the page unless you own it. Sure we might have a few
> places relying on it but nothing should really depend on the reserved bit
> check from the MM hotplug POV.
> 
> > So for example the call "online_pages_range" doesn't invoke the
> > online_page_callback unless the first pfn in the range is marked as
> > reserved.
> 
> Yes and there is no fundamental reason to do that. We can easily check
> the online status without that.
> 
> > Another example Dan had pointed out was the saveable_page function in
> > kernel/power/snapshot.c.
> 
> Use pfn_to_online_page there.

Right. Which getting back to my original point, there is a bunch of
other changes that need to happen in order for us to make this work. I
am going to end up with yet another patch set to clean up all the spots
that are using PageReserved that shouldn't be before I can get to the
point of not setting that bit.

> > > > > > > Regarding the post initialization required by devm_memremap_pages and
> > > > > > > potentially others. Can we update the altmap which is already a way how
> > > > > > > to get alternative struct pages by a constructor which we could call
> > > > > > > from memmap_init_zone and do the post initialization? This would reduce
> > > > > > > the additional loop in the caller while it would still fit the overall
> > > > > > > design of the altmap and the core hotplug doesn't have to know anything
> > > > > > > about DAX or whatever needs a special treatment.
> > > > > > > 
> > > > > > > Does that make any sense?
> > > > > > 
> > > > > > I think the only thing that is currently using the altmap is the ZONE_DEVICE
> > > > > > memory init. Specifically I think it is only really used by the
> > > > > > devm_memremap_pages version of things, and then only under certain
> > > > > > circumstances. Also the HMM driver doesn't pass an altmap. What we would
> > > > > > really need is a non-ZONE_DEVICE users of the altmap to really justify
> > > > > > sticking with that as the preferred argument to pass.
> > > > > 
> > > > > I am not aware of any upstream HMM user so I am not sure what are the
> > > > > expectations there. But I thought that ZONE_DEVICE users use altmap. If
> > > > > that is not generally true then we certainly have to think about a
> > > > > better interface.
> > > > 
> > > > I'm just basing my statement on the use of the move_pfn_range_to_zone call.
> > > > The only caller that is actually passing the altmap is devm_memremap_pages
> > > > and if I understand things correctly that is only used when we want to store
> > > > the vmemmap on the same memory that we just hotplugged.
> > > 
> > > Yes, and that is what I've called as allocator callback earlier.
> > 
> > I am really not a fan of the callback approach. It just means we will
> > have to do everything multiple times in terms of initialization.
> 
> I do not follow. Could you elaborate?

So there end up being a few different issues with constructors. First
in my mind is that it means we have to initialize the region of memory
and cannot assume what the constructors are going to do for us. As a
result we will have to initialize the LRU pointers, and then overwrite
them with the pgmap and hmm_data. I am generally not a fan of that as
the next patch set I have gets rid of most of the redundancy we already
had in the writes where we were memsetting everything to 0, then
writing the values, and then taking care of the reserved bit and
pgmap/hmm_data fields. Dealing with the init serially like that is just
slow.

Another complication is retpoline making function pointers just more
expensive in general. I know in the networking area we have been
dealing with this for a while as even the DMA ops have been a pain
there.

> > > > That is why it made more sense to me to just create a ZONE_DEVICE specific
> > > > function for handling the page initialization because the one value I do
> > > > have to pass is the dev_pagemap in both HMM and memremap case, and that has
> > > > the altmap already embedded inside of it.
> > > 
> > > And I have argued that this is a wrong approach to the problem. If you
> > > need a very specific struct page initialization then create a init
> > > (constructor) callback.
> > 
> > The callback solution just ends up being more expensive because we lose
> > multiple layers of possible optimization. By putting everything into one
> > initialization function we are able to let the compiler go through and
> > optimize things to the point where we are essentially just doing
> > something akin to one big memcpy/memset where we are able to construct
> > one set of page values and write that to every single page we have to
> > initialize within a given page block.
> 
> You are already doing per-page initialization so I fail to see a larger
> unit to operate on.

I have a patch that makes it so that we can work at a pageblock level
since all of the variables with the exception of only the LRU and page
address fields can be precomputed. Doing that is one of the ways I was
able to reduce page init to 1/3 to 1/4 of the time it was taking
otherwise in the case of deferred page init.

> > My concern is that we are going to see a 2-4x regression if I were to
> > update the current patches I have to improve init performance in order
> > to achieve the purity of the page initialization functions that you are
> > looking for. I feel we are much better off having one function that can
> > handle all cases and do so with high performance, than trying to
> > construct a set of functions that end up having to reinitialize the
> > same memory from the previous step and end up with us wasting cycles
> > and duplicating overhead in multiple spots.
> 
> The memory hotplug is just one pile of unmaintainable mess mostly because
> of this kind of attitude. You just care about _your_ particular usecase
> and not a wee bit beyond that.

I care about all of it. What I am proposing is a solution that works
out best for all memory init. That is why my follow-on patch set had
also improved standard deferred memory initialization.

What I am concerned with is that your insistance that we cannot have
any ZONE_DEVICE information initialized in the generic page init code
is really just you focusing on _your_ particular use case and ignoring
everything else.

The fact is I already gave up quite a bit. With the follow-on patch set
I am only getting it down to about 18s for my 3TB init case. I could
have gotten it down to 12s - 15s, but that would have required moving
the memset and that optimization would have hurt other cases. I opted
not to do that because I wanted to keep the solution generic and as a
net benefit for all cases.
Michal Hocko Oct. 29, 2018, 5:24 p.m. UTC | #48
On Mon 29-10-18 10:01:28, Alexander Duyck wrote:
> On Mon, 2018-10-29 at 17:35 +0100, Michal Hocko wrote:
> > On Mon 29-10-18 08:59:34, Alexander Duyck wrote:
[...]
> > > So for example the call "online_pages_range" doesn't invoke the
> > > online_page_callback unless the first pfn in the range is marked as
> > > reserved.
> > 
> > Yes and there is no fundamental reason to do that. We can easily check
> > the online status without that.
> > 
> > > Another example Dan had pointed out was the saveable_page function in
> > > kernel/power/snapshot.c.
> > 
> > Use pfn_to_online_page there.
> 
> Right. Which getting back to my original point, there is a bunch of
> other changes that need to happen in order for us to make this work.

Which is a standard process of upstreaming stuff. My main point was that
the reason why I've added SetPageReserved was a safety net because I
knew that different code paths would back off on PageReserved while they
wouldn't on a partially initialized structure. Now that you really want
to prevent setting this bit for performance reasons then it makes sense
to revisit that earlier decision and get rid of it rather than build on
top of it and duplicate and special case the low level hotplug init
code.

> I am going to end up with yet another patch set to clean up all the
> spots that are using PageReserved that shouldn't be before I can get
> to the point of not setting that bit.

This would be highly appreciated. There are not that many PageReserved
checks.

> > > > > > > > Regarding the post initialization required by devm_memremap_pages and
> > > > > > > > potentially others. Can we update the altmap which is already a way how
> > > > > > > > to get alternative struct pages by a constructor which we could call
> > > > > > > > from memmap_init_zone and do the post initialization? This would reduce
> > > > > > > > the additional loop in the caller while it would still fit the overall
> > > > > > > > design of the altmap and the core hotplug doesn't have to know anything
> > > > > > > > about DAX or whatever needs a special treatment.
> > > > > > > > 
> > > > > > > > Does that make any sense?
> > > > > > > 
> > > > > > > I think the only thing that is currently using the altmap is the ZONE_DEVICE
> > > > > > > memory init. Specifically I think it is only really used by the
> > > > > > > devm_memremap_pages version of things, and then only under certain
> > > > > > > circumstances. Also the HMM driver doesn't pass an altmap. What we would
> > > > > > > really need is a non-ZONE_DEVICE users of the altmap to really justify
> > > > > > > sticking with that as the preferred argument to pass.
> > > > > > 
> > > > > > I am not aware of any upstream HMM user so I am not sure what are the
> > > > > > expectations there. But I thought that ZONE_DEVICE users use altmap. If
> > > > > > that is not generally true then we certainly have to think about a
> > > > > > better interface.
> > > > > 
> > > > > I'm just basing my statement on the use of the move_pfn_range_to_zone call.
> > > > > The only caller that is actually passing the altmap is devm_memremap_pages
> > > > > and if I understand things correctly that is only used when we want to store
> > > > > the vmemmap on the same memory that we just hotplugged.
> > > > 
> > > > Yes, and that is what I've called as allocator callback earlier.
> > > 
> > > I am really not a fan of the callback approach. It just means we will
> > > have to do everything multiple times in terms of initialization.
> > 
> > I do not follow. Could you elaborate?
> 
> So there end up being a few different issues with constructors. First
> in my mind is that it means we have to initialize the region of memory
> and cannot assume what the constructors are going to do for us. As a
> result we will have to initialize the LRU pointers, and then overwrite
> them with the pgmap and hmm_data.

Why would we do that? What really prevents you from making a fully
customized constructor?

> I am generally not a fan of that as
> the next patch set I have gets rid of most of the redundancy we already
> had in the writes where we were memsetting everything to 0, then
> writing the values, and then taking care of the reserved bit and
> pgmap/hmm_data fields. Dealing with the init serially like that is just
> slow.
> 
> Another complication is retpoline making function pointers just more
> expensive in general. I know in the networking area we have been
> dealing with this for a while as even the DMA ops have been a pain
> there.

We use callbacks all over the kernel and in hot paths as well. This is
far from anything reminding a hot path AFAICT. It can be time consuming
because we have to touch each and every page but that is a fundamental
thing to do. We cannot simply batch the large part of the initialization
to multiple pages at a time.

Have you measured a potential retpoline overhead to have some actual
numbers that would confirm this is just too expensive? Or how much of
performance are we talking about here.

> > > > > That is why it made more sense to me to just create a ZONE_DEVICE specific
> > > > > function for handling the page initialization because the one value I do
> > > > > have to pass is the dev_pagemap in both HMM and memremap case, and that has
> > > > > the altmap already embedded inside of it.
> > > > 
> > > > And I have argued that this is a wrong approach to the problem. If you
> > > > need a very specific struct page initialization then create a init
> > > > (constructor) callback.
> > > 
> > > The callback solution just ends up being more expensive because we lose
> > > multiple layers of possible optimization. By putting everything into one
> > > initialization function we are able to let the compiler go through and
> > > optimize things to the point where we are essentially just doing
> > > something akin to one big memcpy/memset where we are able to construct
> > > one set of page values and write that to every single page we have to
> > > initialize within a given page block.
> > 
> > You are already doing per-page initialization so I fail to see a larger
> > unit to operate on.
> 
> I have a patch that makes it so that we can work at a pageblock level
> since all of the variables with the exception of only the LRU and page
> address fields can be precomputed. Doing that is one of the ways I was
> able to reduce page init to 1/3 to 1/4 of the time it was taking
> otherwise in the case of deferred page init.

You still have to call set_page_links for each page. But let's assume we
can do initialization per larger units. Nothing really prevents us from
hiding that in a constructor as well.
Dan Williams Oct. 29, 2018, 5:34 p.m. UTC | #49
On Mon, Oct 29, 2018 at 10:24 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 29-10-18 10:01:28, Alexander Duyck wrote:
> > On Mon, 2018-10-29 at 17:35 +0100, Michal Hocko wrote:
[..]
> > > You are already doing per-page initialization so I fail to see a larger
> > > unit to operate on.
> >
> > I have a patch that makes it so that we can work at a pageblock level
> > since all of the variables with the exception of only the LRU and page
> > address fields can be precomputed. Doing that is one of the ways I was
> > able to reduce page init to 1/3 to 1/4 of the time it was taking
> > otherwise in the case of deferred page init.
>
> You still have to call set_page_links for each page. But let's assume we
> can do initialization per larger units. Nothing really prevent to hide
> that into constructor as well.

A constructor / indirect function call makes sense when there are
multiple sub-classes of object initialization, on the table I only see
3 cases: typical hotplug, base ZONE_DEVICE, ZONE_DEVICE + HMM. I think
we can look to move the HMM special casing out of line, then we're
down to 2. Even at 3 cases we're better off open-coding than a
constructor for such a low number of sub-cases to handle. I do not
foresee more cases arriving, so I struggle to see what the constructor
buys us in terms of code readability / maintainability?
Alexander Duyck Oct. 29, 2018, 5:42 p.m. UTC | #50
On Mon, 2018-10-29 at 18:24 +0100, Michal Hocko wrote:
> On Mon 29-10-18 10:01:28, Alexander Duyck wrote:
> > On Mon, 2018-10-29 at 17:35 +0100, Michal Hocko wrote:
> > > On Mon 29-10-18 08:59:34, Alexander Duyck wrote:
> 
> [...]
> > > > So for example the call "online_pages_range" doesn't invoke the
> > > > online_page_callback unless the first pfn in the range is marked as
> > > > reserved.
> > > 
> > > Yes and there is no fundamental reason to do that. We can easily check
> > > the online status without that.
> > > 
> > > > Another example Dan had pointed out was the saveable_page function in
> > > > kernel/power/snapshot.c.
> > > 
> > > Use pfn_to_online_page there.
> > 
> > Right. Which getting back to my original point, there is a bunch of
> > other changes that need to happen in order for us to make this work.
> 
> Which is a standard process of upstreaming stuff. My main point was that
> the reason why I've added SetPageReserved was a safety net because I
> knew that different code paths would back off on PageReserved while they
> wouldn't on a partially initialized structure. Now that you really want
> to prevent setting this bit for performance reasons then it makes sense
> to revisit that earlier decision and get rid of it rather than build on
> top of it and duplicate and special case the low level hotplug init
> code.

Just to clarify not setting the reserved bit isn't so much a
performance optimization as it is a functional issue. My understanding
is that they cannot get the DAX pages through KVM currently as it
doesn't recognize them as being "online". So in going through and
cleaning up the checks for the reserved bit this problem will likely
solve itself.

> > I am going to end up with yet another patch set to clean up all the
> > spots that are using PageReserved that shouldn't be before I can get
> > to the point of not setting that bit.
> 
> This would be highly appreciated. There are not that many PageReserved
> checks.

Yeah, not many, only about 3 dozen. It should only take me a week or
two to get them all sorted.

> > > > > > > > > Regarding the post initialization required by devm_memremap_pages and
> > > > > > > > > potentially others. Can we update the altmap which is already a way how
> > > > > > > > > to get alternative struct pages by a constructor which we could call
> > > > > > > > > from memmap_init_zone and do the post initialization? This would reduce
> > > > > > > > > the additional loop in the caller while it would still fit the overall
> > > > > > > > > design of the altmap and the core hotplug doesn't have to know anything
> > > > > > > > > about DAX or whatever needs a special treatment.
> > > > > > > > > 
> > > > > > > > > Does that make any sense?
> > > > > > > > 
> > > > > > > > I think the only thing that is currently using the altmap is the ZONE_DEVICE
> > > > > > > > memory init. Specifically I think it is only really used by the
> > > > > > > > devm_memremap_pages version of things, and then only under certain
> > > > > > > > circumstances. Also the HMM driver doesn't pass an altmap. What we would
> > > > > > > > really need is a non-ZONE_DEVICE users of the altmap to really justify
> > > > > > > > sticking with that as the preferred argument to pass.
> > > > > > > 
> > > > > > > I am not aware of any upstream HMM user so I am not sure what are the
> > > > > > > expectations there. But I thought that ZONE_DEVICE users use altmap. If
> > > > > > > that is not generally true then we certainly have to think about a
> > > > > > > better interface.
> > > > > > 
> > > > > > I'm just basing my statement on the use of the move_pfn_range_to_zone call.
> > > > > > The only caller that is actually passing the altmap is devm_memremap_pages
> > > > > > and if I understand things correctly that is only used when we want to store
> > > > > > the vmemmap on the same memory that we just hotplugged.
> > > > > 
> > > > > Yes, and that is what I've called as allocator callback earlier.
> > > > 
> > > > I am really not a fan of the callback approach. It just means we will
> > > > have to do everything multiple times in terms of initialization.
> > > 
> > > I do not follow. Could you elaborate?
> > 
> > So there end up being a few different issues with constructors. First
> > in my mind is that it means we have to initialize the region of memory
> > and cannot assume what the constructors are going to do for us. As a
> > result we will have to initialize the LRU pointers, and then overwrite
> > them with the pgmap and hmm_data.
> 
> Why we would do that? What does really prevent you from making a fully
> customized constructor?

It is more an argument of complexity. Do I just pass a single pointer
and write that value, or the LRU values in init, or do I have to pass a
function pointer, some abstracted data, and then call said function
pointer while passing the page and the abstracted data?

> > I am generally not a fan of that as
> > the next patch set I have gets rid of most of the redundancy we already
> > had in the writes where we were memsetting everything to 0, then
> > writing the values, and then taking care of the reserved bit and
> > pgmap/hmm_data fields. Dealing with the init serially like that is just
> > slow.
> > 
> > Another complication is retpoline making function pointers just more
> > expensive in general. I know in the networking area we have been
> > dealing with this for a while as even the DMA ops have been a pain
> > there.
> 
> We use callbacks all over the kernel and in hot paths as well. This is
> far from anything reminding a hot path AFAICT. It can be time consuming
> because we have to touch each and every page but that is a fundamental
> thing to do. We cannot simply batch the large part of the initialization
> to multiple pages at the time.
> 
> Have you measured a potential retpoline overhead to have some actual
> numbers that would confirm this is just too expensive? Or how much of
> performance are we talking about here.

So admittedly my experience is somewhat anecdotal. However I know for
example with XDP in the networking stack we saw some tests drop from
13Mpps to 6Mpps from just the retpoline workaround being turned on. My
concern is us seeing something like that since with the
__init_pageblock function I had introduced I had reduce the memory init
down to a very tight loop, and I am concerned that having to call a
function pointer in the middle of that with the retpoline overhead is
just going to bring the memory initialization to a crawl.

If I have to implement the code to verify the slowdown I will, but I
really feel like it is just going to be time wasted since we have seen
this in other spots within the kernel.

> > > > > > That is why it made more sense to me to just create a ZONE_DEVICE specific
> > > > > > function for handling the page initialization because the one value I do
> > > > > > have to pass is the dev_pagemap in both HMM and memremap case, and that has
> > > > > > the altmap already embedded inside of it.
> > > > > 
> > > > > And I have argued that this is a wrong approach to the problem. If you
> > > > > need a very specific struct page initialization then create a init
> > > > > (constructor) callback.
> > > > 
> > > > The callback solution just ends up being more expensive because we lose
> > > > multiple layers of possible optimization. By putting everything into one
> > > > initialization function we are able to let the compiler go through and
> > > > optimize things to the point where we are essentially just doing
> > > > something akin to one big memcpy/memset where we are able to construct
> > > > one set of page values and write that to every single page we have to
> > > > initialize within a given page block.
> > > 
> > > You are already doing per-page initialization so I fail to see a larger
> > > unit to operate on.
> > 
> > I have a patch that makes it so that we can work at a pageblock level
> > since all of the variables with the exception of only the LRU and page
> > address fields can be precomputed. Doing that is one of the ways I was
> > able to reduce page init to 1/3 to 1/4 of the time it was taking
> > otherwise in the case of deferred page init.
> 
> You still have to call set_page_links for each page. But let's assume we
> can do initialization per larger units. Nothing really prevent to hide
> that into constructor as well.

The set_page_links function actually executes outside of the main loop
in the case of __init_pageblock. With the change of the memset call to
the multiple assignments of 0 the value can be pre-computed per
pageblock and then just written per page. That is why in my latest
version of the patch set I had folded the reserved bit into the
set_page_links function so that it could be either set or unset at
almost no additional cost to the page initialization.
Michal Hocko Oct. 29, 2018, 5:45 p.m. UTC | #51
On Mon 29-10-18 10:34:22, Dan Williams wrote:
> On Mon, Oct 29, 2018 at 10:24 AM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Mon 29-10-18 10:01:28, Alexander Duyck wrote:
> > > On Mon, 2018-10-29 at 17:35 +0100, Michal Hocko wrote:
> [..]
> > > > You are already doing per-page initialization so I fail to see a larger
> > > > unit to operate on.
> > >
> > > I have a patch that makes it so that we can work at a pageblock level
> > > since all of the variables with the exception of only the LRU and page
> > > address fields can be precomputed. Doing that is one of the ways I was
> > > able to reduce page init to 1/3 to 1/4 of the time it was taking
> > > otherwise in the case of deferred page init.
> >
> > You still have to call set_page_links for each page. But let's assume we
> > can do initialization per larger units. Nothing really prevent to hide
> > that into constructor as well.
> 
> A constructor / indirect function call makes sense when there are
> multiple sub-classes of object initialization, on the table I only see
> 3 cases: typical hotplug, base ZONE_DEVICE, ZONE_DEVICE + HMM. I think
> we can look to move the HMM special casing out of line, then we're
> down to 2. Even at 3 cases we're better off open-coding than a
> constructor for such a low number of sub-cases to handle. I do not
> foresee more cases arriving, so I struggle to see what the constructor
> buys us in terms of code readability / maintainability?

I haven't dreamed of ZONE_DEVICE and HMM few years back. But anyway,
let me note that I am not in love with callbacks. I find them to be a
useful abstraction. I can be convinced (by numbers) that special casing
inside the core hotplug code is really beneficial. But let's do that at
a single place.

All I am arguing against throughout this thread is the
memmap_init_zone_device and the whole code duplication just because zone
device need something special.
Michal Hocko Oct. 29, 2018, 6:18 p.m. UTC | #52
On Mon 29-10-18 10:42:33, Alexander Duyck wrote:
> On Mon, 2018-10-29 at 18:24 +0100, Michal Hocko wrote:
> > On Mon 29-10-18 10:01:28, Alexander Duyck wrote:
[...]
> > > So there end up being a few different issues with constructors. First
> > > in my mind is that it means we have to initialize the region of memory
> > > and cannot assume what the constructors are going to do for us. As a
> > > result we will have to initialize the LRU pointers, and then overwrite
> > > them with the pgmap and hmm_data.
> > 
> > Why we would do that? What does really prevent you from making a fully
> > customized constructor?
> 
> It is more an argument of complexity. Do I just pass a single pointer
> and write that value, or the LRU values in init, or do I have to pass a
> function pointer, some abstracted data, and then call said function
> pointer while passing the page and the abstracted data?

I thought you said that pgmap is the current common denominator for
zone device users. I really do not see what is the problem to do
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89d2a2ab3fe6..9105a4ed2c96 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5516,7 +5516,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 
 not_early:
 		page = pfn_to_page(pfn);
-		__init_single_page(page, pfn, zone, nid);
+		if (pgmap && pgmap->init_page)
+			pgmap->init_page(page, pfn, zone, nid, pgmap);
+		else
+			__init_single_page(page, pfn, zone, nid);
 		if (context == MEMMAP_HOTPLUG)
 			SetPageReserved(page);
 
that would require to replace altmap throughout the call chain and
replace it by pgmap. Altmap could be then renamed to something more
clear
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89d2a2ab3fe6..048e4cc72fdf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5474,8 +5474,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	 * Honor reservation requested by the driver for this ZONE_DEVICE
 	 * memory
 	 */
-	if (altmap && start_pfn == altmap->base_pfn)
-		start_pfn += altmap->reserve;
+	if (pgmap && pgmap->get_memmap)
+		start_pfn = pgmap->get_memmap(pgmap, start_pfn);
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		/*

[...]

> If I have to implement the code to verify the slowdown I will, but I
> really feel like it is just going to be time wasted since we have seen
> this in other spots within the kernel.

Please try to understand that I am not trying to force you write some
artificial benchmarks. All I really do care about is that we have sane
interfaces with reasonable performance. Especially for one-off things
in relatively slow paths. I fully recognize that ZONE_DEVICE begs for a
better integration but really, try to go incremental and try to unify
the code first and microptimize on top. Is that way too much to ask for?

Anyway we have gone into details while the primary problem here was that
the hotplug lock doesn't scale AFAIR. And my question was why cannot we
pull move_pfn_range_to_zone and what has to be done to achieve that.
That is a fundamental thing to address first. Then you can microptimize
on top.
Alexander Duyck Oct. 29, 2018, 7:59 p.m. UTC | #53
On Mon, 2018-10-29 at 19:18 +0100, Michal Hocko wrote:
> On Mon 29-10-18 10:42:33, Alexander Duyck wrote:
> > On Mon, 2018-10-29 at 18:24 +0100, Michal Hocko wrote:
> > > On Mon 29-10-18 10:01:28, Alexander Duyck wrote:
> 
> [...]
> > > > So there end up being a few different issues with constructors. First
> > > > in my mind is that it means we have to initialize the region of memory
> > > > and cannot assume what the constructors are going to do for us. As a
> > > > result we will have to initialize the LRU pointers, and then overwrite
> > > > them with the pgmap and hmm_data.
> > > 
> > > Why we would do that? What does really prevent you from making a fully
> > > customized constructor?
> > 
> > It is more an argument of complexity. Do I just pass a single pointer
> > and write that value, or the LRU values in init, or do I have to pass a
> > function pointer, some abstracted data, and then call said function
> > pointer while passing the page and the abstracted data?
> 
> I though you have said that pgmap is the current common denominator for
> zone device users. I really do not see what is the problem to do
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 89d2a2ab3fe6..9105a4ed2c96 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5516,7 +5516,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  
>  not_early:
>  		page = pfn_to_page(pfn);
> -		__init_single_page(page, pfn, zone, nid);
> +		if (pgmap && pgmap->init_page)
> +			pgmap->init_page(page, pfn, zone, nid, pgmap);
> +		else
> +			__init_single_page(page, pfn, zone, nid);
>  		if (context == MEMMAP_HOTPLUG)
>  			SetPageReserved(page);
>  
> that would require to replace altmap throughout the call chain and
> replace it by pgmap. Altmap could be then renamed to something more
> clear

So as I had pointed out earlier doing per-page init is much slower than
initializing pages in bulk. Ideally I would like to see us separate the
memmap_init_zone function into two pieces, one section for handling
hotplug and another for the everything else case. As is the fact that
you have to jump over a bunch of tests for the "not_early" case is
quite ugly in my opinion.

I could probably take your patch and test it. I'm suspecting this is
going to be a significant slow-down in general as the indirect function
pointer stuff is probably going to come into play.

The "init_page" function in this case is going to end up being much
more complex than it really needs to be in this design as well since I
have to get the altmap and figure out if the page was used for vmemmap
storage or is an actual DAX page. I might just see if I could add an
additional test for the pfn being before the end of the vmemmap in the
case of pgmap being present.

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 89d2a2ab3fe6..048e4cc72fdf 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5474,8 +5474,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  	 * Honor reservation requested by the driver for this ZONE_DEVICE
>  	 * memory
>  	 */
> -	if (altmap && start_pfn == altmap->base_pfn)
> -		start_pfn += altmap->reserve;
> +	if (pgmap && pgmap->get_memmap)
> +		start_pfn = pgmap->get_memmap(pgmap, start_pfn);
>  
>  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>  		/*
> 
> [...]

The only reason why I hadn't bothered with these bits is that I was
actually trying to leave this generic since I thought I had seen other
discussions about hotplug scenarios where memory may want to change
where the vmemmap is initialized other than just the case of
ZONE_DEVICE pages. So I was thinking at some point we may see altmap
without the pgmap.

> > If I have to implement the code to verify the slowdown I will, but I
> > really feel like it is just going to be time wasted since we have seen
> > this in other spots within the kernel.
> 
> Please try to understand that I am not trying to force you write some
> artificial benchmarks. All I really do care about is that we have sane
> interfaces with reasonable performance. Especially for one-off things
> in relattively slow paths. I fully recognize that ZONE_DEVICE begs for a
> better integration but really, try to go incremental and try to unify
> the code first and microptimize on top. Is that way too much to ask for?

No, but the patches I had already proposed I thought were heading in
that direction. I had unified memmap_init_zone,
memmap_init_zone_device, and the deferred page initialization onto a
small set of functions and all had improved performance as a result.

> Anyway we have gone into details while the primary problem here was that
> the hotplug lock doesn't scale AFAIR. And my question was why cannot we
> pull move_pfn_range_to_zone and what has to be done to achieve that.
> That is a fundamental thing to address first. Then you can microptimize
> on top.

Yes, the hotplug lock was part of the original issue. However that
starts to drift into the area I believe Oscar was working on as a part
of his patch set in encapsulating the move_pfn_range_to_zone and other
calls that were contained in the hotplug lock into their own functions.

Most of the changes I have in my follow-on patch set can work
regardless of how we deal with the lock issue. I just feel like what
you are pushing for is going to be a massive patch set by the time we
are done and I really need to be able to work this a piece at a time. 

The patches Andrew pushed addressed the immediate issue so that now
systems with nvdimm/DAX memory can at least initialize quick enough
that systemd doesn't refuse to mount the root file system due to a
timeout. The next patch set I have refactors things to reduce code and
allow us to reuse some of the hotplug code for the deferred page init, 
https://lore.kernel.org/lkml/20181017235043.17213.92459.stgit@localhost.localdomain/
. After that I was planning to work on dealing with the PageReserved
flag and trying to get that sorted out.

I was hoping to wait until after Dan's HMM patches and Oscar's changes
had been sorted before I get into any further refactor of this specific
code.
Michal Hocko Oct. 30, 2018, 6:29 a.m. UTC | #54
On Mon 29-10-18 12:59:11, Alexander Duyck wrote:
> On Mon, 2018-10-29 at 19:18 +0100, Michal Hocko wrote:
[...]

I will try to get to your other points later.

> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 89d2a2ab3fe6..048e4cc72fdf 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -5474,8 +5474,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> >  	 * Honor reservation requested by the driver for this ZONE_DEVICE
> >  	 * memory
> >  	 */
> > -	if (altmap && start_pfn == altmap->base_pfn)
> > -		start_pfn += altmap->reserve;
> > +	if (pgmap && pgmap->get_memmap)
> > +		start_pfn = pgmap->get_memmap(pgmap, start_pfn);
> >  
> >  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> >  		/*
> > 
> > [...]
> 
> The only reason why I hadn't bothered with these bits is that I was
> actually trying to leave this generic since I thought I had seen other
> discussions about hotplug scenerios where memory may want to change
> where the vmmemmap is initialized other than just the case of
> ZONE_DEVICE pages. So I was thinking at some point we may see altmap
> without the pgmap.

I wanted to abuse altmap to allocate struct pages from the physical
range to be added. In that case I would abstract the
allocation/initialization part of pgmap into a more abstract type.
Something trivially to be done without affecting end users of the
hotplug API.

[...]
> > Anyway we have gone into details while the primary problem here was that
> > the hotplug lock doesn't scale AFAIR. And my question was why cannot we
> > pull move_pfn_range_to_zone and what has to be done to achieve that.
> > That is a fundamental thing to address first. Then you can microptimize
> > on top.
> 
> Yes, the hotplug lock was part of the original issue. However that
> starts to drift into the area I believe Oscar was working on as a part
> of his patch set in encapsulating the move_pfn_range_to_zone and other
> calls that were contained in the hotplug lock into their own functions.

Well, I would really love to keep the external API as simple as
possible. That means that we need arch_add_memory/add_pages and 
move_pfn_range_to_zone to associate pages with a zone. The hotplug lock
should be preferably hidden from callers of those two and ideally it
shouldn't be a global lock. We should be good with a range lock.
 
> The patches Andrew pushed addressed the immediate issue so that now
> systems with nvdimm/DAX memory can at least initialize quick enough
> that systemd doesn't refuse to mount the root file system due to a
> timeout.

This is about the first time you actually mention that. I have re-read
the cover letter and all changelogs of patches in this series. Unless I
have missed something there is nothing about real users hitting issues
out there. nvdimm is still considered a toy because there is no real HW
that users can play with.

And hence my complains about half baked solutions rushed in just to fix
a performance regression. I can certainly understand that a pressing
problem might justify to rush things a bit but this should be always
carefully justified.

> The next patch set I have refactors things to reduce code and
> allow us to reuse some of the hotplug code for the deferred page init, 
> https://lore.kernel.org/lkml/20181017235043.17213.92459.stgit@localhost.localdomain/
> . After that I was planning to work on dealing with the PageReserved
> flag and trying to get that sorted out.
> 
> I was hoping to wait until after Dan's HMM patches and Oscar's changes
> had been sorted before I get into any further refactor of this specific
> code.

Yes there is quite a lot going on here. I would really appreciate if we
all sit and actually try to come up with something robust rather than
hack here and there. I haven't yet seen your follow up series completely
so maybe you are indeed heading the correct direction.
Dan Williams Oct. 30, 2018, 6:55 a.m. UTC | #55
On Mon, Oct 29, 2018 at 11:29 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 29-10-18 12:59:11, Alexander Duyck wrote:
> > On Mon, 2018-10-29 at 19:18 +0100, Michal Hocko wrote:
[..]
> > The patches Andrew pushed addressed the immediate issue so that now
> > systems with nvdimm/DAX memory can at least initialize quick enough
> > that systemd doesn't refuse to mount the root file system due to a
> > timeout.
>
> This is about the first time you actually mention that. I have re-read
> the cover letter and all changelogs of patches in this serious. Unless I
> have missed something there is nothing about real users hitting issues
> out there. nvdimm is still considered a toy because there is no real HW
> users can play with.

Yes, you have missed something, because that's incorrect. There's been
public articles about these parts sampling since May.

    https://www.anandtech.com/show/12828/intel-launches-optane-dimms-up-to-512gb-apache-pass-is-here

That testing identified this initialization performance problem and
thankfully got it addressed in time for the current merge window.
Oscar Salvador Oct. 30, 2018, 8:05 a.m. UTC | #56
> Yes, the hotplug lock was part of the original issue. However that
> starts to drift into the area I believe Oscar was working on as a part
> of his patch set in encapsulating the move_pfn_range_to_zone and other
> calls that were contained in the hotplug lock into their own functions.


While reworking it for my patchset, I thought that we can move
move_pfn_range_to_zone
out of hotplug lock.
But then I __think__ we would have to move init_currently_empty_zone() within
the span lock as zone->zone_start_pfn is being touched there.
At least that is what the zone locking rules say about it.

Since I saw that Dan was still reworking his patchset about unify HMM/devm code,
I just took one step back and I went simpler [1].
The main reason for backing off was I felt a bit demotivated due to
the lack of feedback,
and I did not want to interfere either with your work or Dan's work.
Plus I also was unsure about some other things like whether it is ok calling
kasan_add_zero_shadow/kasan_remove_zero_shadow out of the lock.
So I decided to make less changes in regard of HMM/devm.

Unfortunately, I did not get a lot of feedback there yet.
Just some reviews from David and a confirmation that fixes one of the
issues Jonathan reported [2].

>
> I was hoping to wait until after Dan's HMM patches and Oscar's changes
> had been sorted before I get into any further refactor of this specific
> code.


I plan to ping the series, but I wanted to give more time to people
since we are in the merge window now.

[1] https://patchwork.kernel.org/cover/10642049/
[2] https://patchwork.kernel.org/patch/10642057/#22275173
Michal Hocko Oct. 30, 2018, 8:17 a.m. UTC | #57
On Mon 29-10-18 23:55:12, Dan Williams wrote:
> On Mon, Oct 29, 2018 at 11:29 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Mon 29-10-18 12:59:11, Alexander Duyck wrote:
> > > On Mon, 2018-10-29 at 19:18 +0100, Michal Hocko wrote:
> [..]
> > > The patches Andrew pushed addressed the immediate issue so that now
> > > systems with nvdimm/DAX memory can at least initialize quick enough
> > > that systemd doesn't refuse to mount the root file system due to a
> > > timeout.
> >
> > This is about the first time you actually mention that. I have re-read
> > the cover letter and all changelogs of patches in this serious. Unless I
> > have missed something there is nothing about real users hitting issues
> > out there. nvdimm is still considered a toy because there is no real HW
> > users can play with.
> 
> Yes, you have missed something, because that's incorrect. There's been
> public articles about these parts sampling since May.
> 
>     https://www.anandtech.com/show/12828/intel-launches-optane-dimms-up-to-512gb-apache-pass-is-here

indeed!

> That testing identified this initialization performance problem and
> thankfully got it addressed in time for the current merge window.

And I still cannot see a word about that in changelogs.
Dan Williams Oct. 30, 2018, 3:57 p.m. UTC | #58
On Tue, Oct 30, 2018 at 1:18 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 29-10-18 23:55:12, Dan Williams wrote:
> > On Mon, Oct 29, 2018 at 11:29 PM Michal Hocko <mhocko@kernel.org> wrote:
[..]
> > That testing identified this initialization performance problem and
> > thankfully got it addressed in time for the current merge window.
>
> And I still cannot see a word about that in changelogs.

True, I would have preferred that commit 966cf44f637e "mm: defer
ZONE_DEVICE page initialization to the point where we init pgmap"
include some notes about the scaling advantages afforded by not
serializing memmap_init_zone() work. I think this information got
distributed across several patch series because removing the lock was
not sufficient by itself, Alex went further to also rework the
physical socket affinity of the nvdimm sub-system's async
initialization threads.

As the code gets refactored further it's a chance to add commentary on
the scaling expectations of the design.

Patch
diff mbox series

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 06d7d7576f8d..7312fb78ef31 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -848,6 +848,8 @@  static inline bool is_zone_device_page(const struct page *page)
 {
 	return page_zonenum(page) == ZONE_DEVICE;
 }
+extern void memmap_init_zone_device(struct zone *, unsigned long,
+				    unsigned long, struct dev_pagemap *);
 #else
 static inline bool is_zone_device_page(const struct page *page)
 {
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5b8600d39931..d0c32e473f82 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -175,10 +175,10 @@  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	struct vmem_altmap *altmap = pgmap->altmap_valid ?
 			&pgmap->altmap : NULL;
 	struct resource *res = &pgmap->res;
-	unsigned long pfn, pgoff, order;
+	struct dev_pagemap *conflict_pgmap;
 	pgprot_t pgprot = PAGE_KERNEL;
+	unsigned long pgoff, order;
 	int error, nid, is_ram;
-	struct dev_pagemap *conflict_pgmap;
 
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -256,19 +256,13 @@  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	if (error)
 		goto err_add_memory;
 
-	for_each_device_pfn(pfn, pgmap) {
-		struct page *page = pfn_to_page(pfn);
-
-		/*
-		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
-		 * pointer.  It is a bug if a ZONE_DEVICE page is ever
-		 * freed or placed on a driver-private list.  Seed the
-		 * storage with LIST_POISON* values.
-		 */
-		list_del(&page->lru);
-		page->pgmap = pgmap;
-		percpu_ref_get(pgmap->ref);
-	}
+	/*
+	 * Initialization of the pages has been deferred until now in order
+	 * to allow us to do the work while not holding the hotplug lock.
+	 */
+	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+				align_start >> PAGE_SHIFT,
+				align_size >> PAGE_SHIFT, pgmap);
 
 	devm_add_action(dev, devm_memremap_pages_release, pgmap);
 
diff --git a/mm/hmm.c b/mm/hmm.c
index c968e49f7a0c..774d684fa2b4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1024,7 +1024,6 @@  static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
 	resource_size_t key, align_start, align_size, align_end;
 	struct device *device = devmem->device;
 	int ret, nid, is_ram;
-	unsigned long pfn;
 
 	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
 	align_size = ALIGN(devmem->resource->start +
@@ -1109,11 +1108,14 @@  static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
 				align_size >> PAGE_SHIFT, NULL);
 	mem_hotplug_done();
 
-	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
-		struct page *page = pfn_to_page(pfn);
+	/*
+	 * Initialization of the pages has been deferred until now in order
+	 * to allow us to do the work while not holding the hotplug lock.
+	 */
+	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+				align_start >> PAGE_SHIFT,
+				align_size >> PAGE_SHIFT, &devmem->pagemap);
 
-		page->pgmap = &devmem->pagemap;
-	}
 	return 0;
 
 error_add_memory:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 926ad3083b28..7ec0997ded39 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5489,12 +5489,23 @@  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	if (highest_memmap_pfn < end_pfn - 1)
 		highest_memmap_pfn = end_pfn - 1;
 
+#ifdef CONFIG_ZONE_DEVICE
 	/*
 	 * Honor reservation requested by the driver for this ZONE_DEVICE
-	 * memory
+	 * memory. We limit the total number of pages to initialize to just
+	 * those that might contain the memory mapping. We will defer the
+	 * ZONE_DEVICE page initialization until after we have released
+	 * the hotplug lock.
 	 */
-	if (altmap && start_pfn == altmap->base_pfn)
-		start_pfn += altmap->reserve;
+	if (zone == ZONE_DEVICE) {
+		if (!altmap)
+			return;
+
+		if (start_pfn == altmap->base_pfn)
+			start_pfn += altmap->reserve;
+		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+	}
+#endif
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		/*
@@ -5538,6 +5549,81 @@  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	}
 }
 
+#ifdef CONFIG_ZONE_DEVICE
+void __ref memmap_init_zone_device(struct zone *zone,
+				   unsigned long start_pfn,
+				   unsigned long size,
+				   struct dev_pagemap *pgmap)
+{
+	unsigned long pfn, end_pfn = start_pfn + size;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	unsigned long zone_idx = zone_idx(zone);
+	unsigned long start = jiffies;
+	int nid = pgdat->node_id;
+
+	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+		return;
+
+	/*
+	 * The call to memmap_init_zone should have already taken care
+	 * of the pages reserved for the memmap, so we can just jump to
+	 * the end of that region and start processing the device pages.
+	 */
+	if (pgmap->altmap_valid) {
+		struct vmem_altmap *altmap = &pgmap->altmap;
+
+		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+		size = end_pfn - start_pfn;
+	}
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		__init_single_page(page, pfn, zone_idx, nid);
+
+		/*
+		 * Mark page reserved as it will need to wait for onlining
+		 * phase for it to be fully associated with a zone.
+		 *
+		 * We can use the non-atomic __set_bit operation for setting
+		 * the flag as we are still initializing the pages.
+		 */
+		__SetPageReserved(page);
+
+		/*
+		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
+		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
+		 * page is ever freed or placed on a driver-private list.
+		 */
+		page->pgmap = pgmap;
+		page->hmm_data = 0;
+
+		/*
+		 * Mark the block movable so that blocks are reserved for
+		 * movable at startup. This will force kernel allocations
+		 * to reserve their blocks rather than leaking throughout
+		 * the address space during boot when many long-lived
+		 * kernel allocations are made.
+		 *
+		 * bitmap is created for zone's valid pfn range. but memmap
+		 * can be created for invalid pages (for alignment)
+		 * check here not to call set_pageblock_migratetype() against
+		 * pfn out of zone.
+		 *
+		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+		 * because this is done early in sparse_add_one_section
+		 */
+		if (!(pfn & (pageblock_nr_pages - 1))) {
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+			cond_resched();
+		}
+	}
+
+	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+		size, jiffies_to_msecs(jiffies - start));
+}
+
+#endif
 static void __meminit zone_init_free_lists(struct zone *zone)
 {
 	unsigned int order, t;