[v7] mm/page_alloc.c: memory_hotplug: free pages as higher order

Message ID 1546578076-31716-1-git-send-email-arunks@codeaurora.org (mailing list archive)
State New, archived
Series [v7] mm/page_alloc.c: memory_hotplug: free pages as higher order

Commit Message

Arun KS Jan. 4, 2019, 5:01 a.m. UTC
When pages are freed at a higher order, the time the buddy allocator
spends coalescing pages can be reduced.  With a section size of 256MB,
the hot add latency of a single section improves from 50-60 ms to less
than 1 ms, a 60-fold improvement.  Modify the external providers of the
online callback to align with the change.

Signed-off-by: Arun KS <arunks@codeaurora.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
---
Changes since v6:
- Rebased to 4.20
- Changelog updated.
- No improvement seen on arm64, hence dropped the removal of prefetch.

Changes since v5:
- Rebased to 4.20-rc1.
- Changelog updated.

Changes since v4:
- As suggested by Michal Hocko,
- Simplify logic in online_pages_blocks() by using get_order().
- Separate out removal of prefetch from __free_pages_core().

Changes since v3:
- Renamed _free_pages_boot_core -> __free_pages_core.
- Removed prefetch from __free_pages_core.
- Removed xen_online_page().

Changes since v2:
- Reuse code from __free_pages_boot_core().

Changes since v1:
- Removed prefetch().

Changes since RFC:
- Rebase.
- As suggested by Michal Hocko, remove pages_per_block.
- Modified external providers of online_page_callback.

v6: https://lore.kernel.org/patchwork/patch/1007253/
v5: https://lore.kernel.org/patchwork/patch/995739/
v4: https://lore.kernel.org/patchwork/patch/995111/
v3: https://lore.kernel.org/patchwork/patch/992348/
v2: https://lore.kernel.org/patchwork/patch/991363/
v1: https://lore.kernel.org/patchwork/patch/989445/
RFC: https://lore.kernel.org/patchwork/patch/984754/

---
 drivers/hv/hv_balloon.c        |  6 ++++--
 drivers/xen/balloon.c          | 23 +++++++++++++++--------
 include/linux/memory_hotplug.h |  2 +-
 mm/internal.h                  |  1 +
 mm/memory_hotplug.c            | 42 ++++++++++++++++++++++++++++++------------
 mm/page_alloc.c                |  8 ++++----
 6 files changed, 55 insertions(+), 27 deletions(-)
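
For a concrete sense of the claimed speed-up, the following standalone
userspace sketch (not kernel code; it assumes 4K pages and
MAX_ORDER == 11, and reimplements get_order()) counts how many callback
invocations the order-clamping loop added by this patch,
online_pages_blocks(), needs for one 256MB section:

#include <stdio.h>

#define PAGE_SHIFT      12              /* assume 4K pages */
#define MAX_ORDER       11              /* common x86/arm64 value */
#define PFN_PHYS(x)     ((unsigned long long)(x) << PAGE_SHIFT)

/* minimal stand-in for the kernel's get_order() */
static int get_order(unsigned long long size)
{
        int order = 0;

        size = (size - 1) >> PAGE_SHIFT;
        while (size) {
                size >>= 1;
                order++;
        }
        return order;
}

int main(void)
{
        unsigned long start = 0, end = 65536;   /* one 256MB section, in pfns */
        int calls = 0;

        while (start < end) {
                int order = get_order(PFN_PHYS(end) - PFN_PHYS(start));

                if (order > MAX_ORDER - 1)
                        order = MAX_ORDER - 1;
                start += 1UL << order;
                calls++;
        }
        printf("callback invocations: %d\n", calls);     /* prints 64 */
        return 0;
}

Freeing 64 order-10 blocks instead of 65536 single pages is where the
reduction in buddy coalescing work, and hence the latency win, comes
from.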

Comments

Alexander Duyck Jan. 8, 2019, 5:56 p.m. UTC | #1
On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> When pages are freed at a higher order, the time the buddy allocator
> spends coalescing pages can be reduced.  With a section size of 256MB,
> the hot add latency of a single section improves from 50-60 ms to less
> than 1 ms, a 60-fold improvement.  Modify the external providers of the
> online callback to align with the change.
> 
> Signed-off-by: Arun KS <arunks@codeaurora.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Reviewed-by: Oscar Salvador <osalvador@suse.de>
> ---
> Changes since v6:
> - Rebased to 4.20
> - Changelog updated.
> - No improvement seen on arm64, hence dropped the removal of prefetch.
> 
> Changes since v5:
> - Rebased to 4.20-rc1.
> - Changelog updated.
> 
> Changes since v4:
> - As suggested by Michal Hocko,
> - Simplify logic in online_pages_blocks() by using get_order().
> - Separate out removal of prefetch from __free_pages_core().
> 
> Changes since v3:
> - Renamed _free_pages_boot_core -> __free_pages_core.
> - Removed prefetch from __free_pages_core.
> - Removed xen_online_page().
> 
> Changes since v2:
> - Reuse code from __free_pages_boot_core().
> 
> Changes since v1:
> - Removed prefetch().
> 
> Changes since RFC:
> - Rebase.
> - As suggested by Michal Hocko, remove pages_per_block.
> - Modified external providers of online_page_callback.
> 
> v6: https://lore.kernel.org/patchwork/patch/1007253/
> v5: https://lore.kernel.org/patchwork/patch/995739/
> v4: https://lore.kernel.org/patchwork/patch/995111/
> v3: https://lore.kernel.org/patchwork/patch/992348/
> v2: https://lore.kernel.org/patchwork/patch/991363/
> v1: https://lore.kernel.org/patchwork/patch/989445/
> RFC: https://lore.kernel.org/patchwork/patch/984754/
> 
> ---
>  drivers/hv/hv_balloon.c        |  6 ++++--
>  drivers/xen/balloon.c          | 23 +++++++++++++++--------
>  include/linux/memory_hotplug.h |  2 +-
>  mm/internal.h                  |  1 +
>  mm/memory_hotplug.c            | 42 ++++++++++++++++++++++++++++++------------
>  mm/page_alloc.c                |  8 ++++----
>  6 files changed, 55 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 5301fef..211f3fe 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
>  	}
>  }
>  
> -static void hv_online_page(struct page *pg)
> +static int hv_online_page(struct page *pg, unsigned int order)
>  {
>  	struct hv_hotadd_state *has;
>  	unsigned long flags;
> @@ -783,10 +783,12 @@ static void hv_online_page(struct page *pg)
>  		if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
>  			continue;
>  
> -		hv_page_online_one(has, pg);
> +		hv_bring_pgs_online(has, pfn, (1UL << order));
>  		break;
>  	}
>  	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
> +	return 0;
>  }
>  
>  static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
> index ceb5048..95f888f 100644
> --- a/drivers/xen/balloon.c
> +++ b/drivers/xen/balloon.c
> @@ -345,8 +345,8 @@ static enum bp_state reserve_additional_memory(void)
>  
>  	/*
>  	 * add_memory_resource() will call online_pages() which in its turn
> -	 * will call xen_online_page() callback causing deadlock if we don't
> -	 * release balloon_mutex here. Unlocking here is safe because the
> +	 * will call xen_bring_pgs_online() callback causing deadlock if we
> +	 * don't release balloon_mutex here. Unlocking here is safe because the
>  	 * callers drop the mutex before trying again.
>  	 */
>  	mutex_unlock(&balloon_mutex);
> @@ -369,15 +369,22 @@ static enum bp_state reserve_additional_memory(void)
>  	return BP_ECANCELED;
>  }
>  
> -static void xen_online_page(struct page *page)
> +static int xen_bring_pgs_online(struct page *pg, unsigned int order)
>  {
> -	__online_page_set_limits(page);
> +	unsigned long i, size = (1 << order);
> +	unsigned long start_pfn = page_to_pfn(pg);
> +	struct page *p;
>  
> +	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
>  	mutex_lock(&balloon_mutex);
> -
> -	__balloon_append(page);
> -
> +	for (i = 0; i < size; i++) {
> +		p = pfn_to_page(start_pfn + i);
> +		__online_page_set_limits(p);
> +		__balloon_append(p);
> +	}
>  	mutex_unlock(&balloon_mutex);
> +
> +	return 0;
>  }
>  
>  static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
> @@ -702,7 +709,7 @@ static int __init balloon_init(void)
>  	balloon_stats.max_retry_count = RETRY_UNLIMITED;
>  
>  #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> -	set_online_page_callback(&xen_online_page);
> +	set_online_page_callback(&xen_bring_pgs_online);
>  	register_memory_notifier(&xen_memory_nb);
>  	register_sysctl_table(xen_root);
>  #endif
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 07da5c6..d56bfba 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -87,7 +87,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
>  	unsigned long *valid_start, unsigned long *valid_end);
>  extern void __offline_isolated_pages(unsigned long, unsigned long);
>  
> -typedef void (*online_page_callback_t)(struct page *page);
> +typedef int (*online_page_callback_t)(struct page *page, unsigned int order);
>  
>  extern int set_online_page_callback(online_page_callback_t callback);
>  extern int restore_online_page_callback(online_page_callback_t callback);
> diff --git a/mm/internal.h b/mm/internal.h
> index f4a7bb0..536bc2a 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
>  extern int __isolate_free_page(struct page *page, unsigned int order);
>  extern void memblock_free_pages(struct page *page, unsigned long pfn,
>  					unsigned int order);
> +extern void __free_pages_core(struct page *page, unsigned int order);
>  extern void prep_compound_page(struct page *page, unsigned int order);
>  extern void post_alloc_hook(struct page *page, unsigned int order,
>  					gfp_t gfp_flags);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index b9a667d..0ea0eb1 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -47,7 +47,7 @@
>   * and restore_online_page_callback() for generic callback restore.
>   */
>  
> -static void generic_online_page(struct page *page);
> +static int generic_online_page(struct page *page, unsigned int order);
>  
>  static online_page_callback_t online_page_callback = generic_online_page;
>  static DEFINE_MUTEX(online_page_callback_lock);
> @@ -656,26 +656,44 @@ void __online_page_free(struct page *page)
>  }
>  EXPORT_SYMBOL_GPL(__online_page_free);
>  
> -static void generic_online_page(struct page *page)
> +static int generic_online_page(struct page *page, unsigned int order)
>  {
> -	__online_page_set_limits(page);
> -	__online_page_increment_counters(page);
> -	__online_page_free(page);
> +	__free_pages_core(page, order);
> +	totalram_pages_add(1UL << order);
> +#ifdef CONFIG_HIGHMEM
> +	if (PageHighMem(page))
> +		totalhigh_pages_add(1UL << order);
> +#endif
> +	return 0;
> +}
> +
> +static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
> +{
> +	unsigned long end = start + nr_pages;
> +	int order, ret, onlined_pages = 0;
> +
> +	while (start < end) {
> +		order = min(MAX_ORDER - 1,
> +			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
> +
> +		ret = (*online_page_callback)(pfn_to_page(start), order);
> +		if (!ret)
> +			onlined_pages += (1UL << order);
> +		else if (ret > 0)
> +			onlined_pages += ret;
> +
> +		start += (1UL << order);
> +	}
> +	return onlined_pages;
>  }
>  
>  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>  			void *arg)
>  {
> -	unsigned long i;
>  	unsigned long onlined_pages = *(unsigned long *)arg;
> -	struct page *page;
>  
>  	if (PageReserved(pfn_to_page(start_pfn)))
> -		for (i = 0; i < nr_pages; i++) {
> -			page = pfn_to_page(start_pfn + i);
> -			(*online_page_callback)(page);
> -			onlined_pages++;
> -		}
> +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);

Shouldn't this be a "+=" instead of an "="? It seems like you are going
to lose your count otherwise.

>  
>  	online_mem_sections(start_pfn, start_pfn + nr_pages);
>  
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index cde5dac..f51a920 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1303,7 +1303,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
>  	local_irq_restore(flags);
>  }
>  
> -static void __init __free_pages_boot_core(struct page *page, unsigned int order)
> +void __free_pages_core(struct page *page, unsigned int order)
>  {
>  	unsigned int nr_pages = 1 << order;
>  	struct page *p = page;
> @@ -1382,7 +1382,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
>  {
>  	if (early_page_uninitialised(pfn))
>  		return;
> -	return __free_pages_boot_core(page, order);
> +	__free_pages_core(page, order);
>  }
>  
>  /*
> @@ -1472,14 +1472,14 @@ static void __init deferred_free_range(unsigned long pfn,
>  	if (nr_pages == pageblock_nr_pages &&
>  	    (pfn & (pageblock_nr_pages - 1)) == 0) {
>  		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
> -		__free_pages_boot_core(page, pageblock_order);
> +		__free_pages_core(page, pageblock_order);
>  		return;
>  	}
>  
>  	for (i = 0; i < nr_pages; i++, page++, pfn++) {
>  		if ((pfn & (pageblock_nr_pages - 1)) == 0)
>  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
> -		__free_pages_boot_core(page, 0);
> +		__free_pages_core(page, 0);
>  	}
>  }
>
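
The issue Alexander points out is easiest to see in a standalone sketch
(a userspace model; the two RAM ranges below are made up):
walk_system_ram_range() may invoke the callback once per contiguous
System RAM range, so online_pages_range() must add to *arg rather than
overwrite it:

#include <stdio.h>

/* simplified online_pages_range(): the '+=' is the point at issue */
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
                              void *arg)
{
        *(unsigned long *)arg += nr_pages;  /* '=' would drop earlier counts */
        return 0;
}

int main(void)
{
        unsigned long onlined_pages = 0;

        /* two hypothetical contiguous RAM ranges within a single request */
        online_pages_range(0x80000, 32768, &onlined_pages);
        online_pages_range(0x88000, 32768, &onlined_pages);
        printf("%lu\n", onlined_pages); /* 65536 with '+=', 32768 with '=' */
        return 0;
}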
Michal Hocko Jan. 8, 2019, 6:13 p.m. UTC | #2
On Tue 08-01-19 09:56:09, Alexander Duyck wrote:
> On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
[...]
> >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> >  			void *arg)
> >  {
> > -	unsigned long i;
> >  	unsigned long onlined_pages = *(unsigned long *)arg;
> > -	struct page *page;
> >  
> >  	if (PageReserved(pfn_to_page(start_pfn)))
> > -		for (i = 0; i < nr_pages; i++) {
> > -			page = pfn_to_page(start_pfn + i);
> > -			(*online_page_callback)(page);
> > -			onlined_pages++;
> > -		}
> > +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
> 
> Shouldn't this be a "+=" instead of an "="? It seems like you are going
> to lose your count otherwise.

You are right of course. I should have noticed during the review.
Thanks!
Alexander Duyck Jan. 8, 2019, 6:40 p.m. UTC | #3
On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> When pages are freed at a higher order, the time the buddy allocator
> spends coalescing pages can be reduced.  With a section size of 256MB,
> the hot add latency of a single section improves from 50-60 ms to less
> than 1 ms, a 60-fold improvement.  Modify the external providers of the
> online callback to align with the change.
> 
> Signed-off-by: Arun KS <arunks@codeaurora.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Reviewed-by: Oscar Salvador <osalvador@suse.de>

After running into my initial issue I actually had a few more questions
about this patch.

> [...]
> +static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
> +{
> +	unsigned long end = start + nr_pages;
> +	int order, ret, onlined_pages = 0;
> +
> +	while (start < end) {
> +		order = min(MAX_ORDER - 1,
> +			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
> +
> +		ret = (*online_page_callback)(pfn_to_page(start), order);
> +		if (!ret)
> +			onlined_pages += (1UL << order);
> +		else if (ret > 0)
> +			onlined_pages += ret;
> +
> +		start += (1UL << order);
> +	}
> +	return onlined_pages;
>  }
>  

Should the limit for this really be MAX_ORDER - 1 or should it be
pageblock_order? In some cases this will be the same value, but I seem
to recall that for x86 MAX_ORDER can be several times larger than
pageblock_order.
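
For reference, on x86-64 with 4K pages the two limits differ by one
order (a sketch of the relevant constants, assuming CONFIG_HUGETLB_PAGE,
where pageblock_order is HUGETLB_PAGE_ORDER):

/*
 * x86-64 with 4K pages:
 *
 *   pageblock_order = PMD_SHIFT - PAGE_SHIFT = 21 - 12 = 9   (2MB)
 *   MAX_ORDER - 1                            = 11 - 1  = 10  (4MB)
 *
 * so one order-(MAX_ORDER - 1) free spans two pageblocks.
 */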

>  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>  			void *arg)
>  {
> -	unsigned long i;
>  	unsigned long onlined_pages = *(unsigned long *)arg;
> -	struct page *page;
>  
>  	if (PageReserved(pfn_to_page(start_pfn)))

I'm not sure we even really need this check. Getting back to the
discussion I have been having with Michal in regards to the need for
the DAX pages to not have the reserved bit cleared I was originally
wondering if we could replace this check with a call to
online_section_nr since the section shouldn't be online until we set
the bit below in online_mem_sections.

However after doing some further digging it looks like this could
probably be dropped entirely since we only call this function from
online_pages and that function is only called by memory_block_action if
pages_correctly_probed returns true. However pages_correctly_probed
should return false if any of the sections contained in the page range
is already online.

> -		for (i = 0; i < nr_pages; i++) {
> -			page = pfn_to_page(start_pfn + i);
> -			(*online_page_callback)(page);
> -			onlined_pages++;
> -		}
> +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
>  
>  	online_mem_sections(start_pfn, start_pfn + nr_pages);
>
Michal Hocko Jan. 8, 2019, 8:04 p.m. UTC | #4
On Tue 08-01-19 10:40:18, Alexander Duyck wrote:
> On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> > When pages are freed at a higher order, the time the buddy allocator
> > spends coalescing pages can be reduced.  With a section size of 256MB,
> > the hot add latency of a single section improves from 50-60 ms to less
> > than 1 ms, a 60-fold improvement.  Modify the external providers of
> > the online callback to align with the change.
> > 
> > Signed-off-by: Arun KS <arunks@codeaurora.org>
> > Acked-by: Michal Hocko <mhocko@suse.com>
> > Reviewed-by: Oscar Salvador <osalvador@suse.de>
> 
> After running into my initial issue I actually had a few more questions
> about this patch.
> 
> > [...]
> > +static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
> > +{
> > +	unsigned long end = start + nr_pages;
> > +	int order, ret, onlined_pages = 0;
> > +
> > +	while (start < end) {
> > +		order = min(MAX_ORDER - 1,
> > +			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
> > +
> > +		ret = (*online_page_callback)(pfn_to_page(start), order);
> > +		if (!ret)
> > +			onlined_pages += (1UL << order);
> > +		else if (ret > 0)
> > +			onlined_pages += ret;
> > +
> > +		start += (1UL << order);
> > +	}
> > +	return onlined_pages;
> >  }
> >  
> 
> Should the limit for this really be MAX_ORDER - 1 or should it be
> pageblock_order? In some cases this will be the same value, but I seem
> to recall that for x86 MAX_ORDER can be several times larger than
> pageblock_order.

Does it make any difference when we are in fact trying to online nr_pages
and we clamp to it properly?

> >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> >  			void *arg)
> >  {
> > -	unsigned long i;
> >  	unsigned long onlined_pages = *(unsigned long *)arg;
> > -	struct page *page;
> >  
> >  	if (PageReserved(pfn_to_page(start_pfn)))
> 
> I'm not sure we even really need this check. Getting back to the
> discussion I have been having with Michal in regards to the need for
> the DAX pages to not have the reserved bit cleared I was originally
> wondering if we could replace this check with a call to
> online_section_nr since the section shouldn't be online until we set
> the bit below in online_mem_sections.
> 
> However after doing some further digging it looks like this could
> probably be dropped entirely since we only call this function from
> online_pages and that function is only called by memory_block_action if
> pages_correctly_probed returns true. However pages_correctly_probed
> should return false if any of the sections contained in the page range
> is already online.

Yes, you are right, but I guess it would be better to address this in a
separate patch that deals with PageReserved manipulation in general.
I do not think we want to remove the check silently. People who might be
interested in backporting this for whatever reason might scratch their
heads wondering why the test is no longer needed.
Alexander Duyck Jan. 8, 2019, 9:53 p.m. UTC | #5
On Tue, 2019-01-08 at 21:04 +0100, Michal Hocko wrote:
> On Tue 08-01-19 10:40:18, Alexander Duyck wrote:
> > On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> > > When pages are freed at a higher order, the time the buddy allocator
> > > spends coalescing pages can be reduced.  With a section size of
> > > 256MB, the hot add latency of a single section improves from 50-60 ms
> > > to less than 1 ms, a 60-fold improvement.  Modify the external
> > > providers of the online callback to align with the change.
> > > 
> > > Signed-off-by: Arun KS <arunks@codeaurora.org>
> > > Acked-by: Michal Hocko <mhocko@suse.com>
> > > Reviewed-by: Oscar Salvador <osalvador@suse.de>
> > 
> > After running into my initial issue I actually had a few more questions
> > about this patch.
> > 
> > > [...]
> > > +static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
> > > +{
> > > +	unsigned long end = start + nr_pages;
> > > +	int order, ret, onlined_pages = 0;
> > > +
> > > +	while (start < end) {
> > > +		order = min(MAX_ORDER - 1,
> > > +			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
> > > +
> > > +		ret = (*online_page_callback)(pfn_to_page(start), order);
> > > +		if (!ret)
> > > +			onlined_pages += (1UL << order);
> > > +		else if (ret > 0)
> > > +			onlined_pages += ret;
> > > +
> > > +		start += (1UL << order);
> > > +	}
> > > +	return onlined_pages;
> > >  }
> > >  
> > 
> > Should the limit for this really be MAX_ORDER - 1 or should it be
> > pageblock_order? In some cases this will be the same value, but I seem
> > to recall that for x86 MAX_ORDER can be several times larger than
> > pageblock_order.
> 
> Does it make any difference when we are in fact trying to online nr_pages
> and we clamp to it properly?

I'm not entirely sure if it does or not.

What I notice looking through the code though is that there are a
number of checks for the pageblock migrate type. There ends up being
checks in __free_one_page, free_one_page, and __free_pages_ok all
related to this. It might be moot since we are starting with an offline
section, but I just brought this up because I know in the case of
deferred page init we were limiting ourselves to pageblock_order and I
wasn't sure if there was some specific reason for doing that.

> > >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> > >  			void *arg)
> > >  {
> > > -	unsigned long i;
> > >  	unsigned long onlined_pages = *(unsigned long *)arg;
> > > -	struct page *page;
> > >  
> > >  	if (PageReserved(pfn_to_page(start_pfn)))
> > 
> > I'm not sure we even really need this check. Getting back to the
> > discussion I have been having with Michal in regards to the need for
> > the DAX pages to not have the reserved bit cleared I was originally
> > wondering if we could replace this check with a call to
> > online_section_nr since the section shouldn't be online until we set
> > the bit below in online_mem_sections.
> > 
> > However after doing some further digging it looks like this could
> > probably be dropped entirely since we only call this function from
> > online_pages and that function is only called by memory_block_action if
> > pages_correctly_probed returns true. However pages_correctly_probed
> > should return false if any of the sections contained in the page range
> > is already online.
> 
> Yes, you are right, but I guess it would be better to address this in a
> separate patch that deals with PageReserved manipulation in general.
> I do not think we want to remove the check silently. People who might be
> interested in backporting this for whatever reason might scratch their
> heads wondering why the test is no longer needed.

Yeah I am already working on that, it is what led me to review this
patch. Just thought I would bring it up since it would make it possible
to essentially reduce the size and/or need for a new function.
Alexander Duyck Jan. 8, 2019, 10:17 p.m. UTC | #6
On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> When pages are freed at a higher order, the time the buddy allocator
> spends coalescing pages can be reduced.  With a section size of 256MB,
> the hot add latency of a single section improves from 50-60 ms to less
> than 1 ms, a 60-fold improvement.  Modify the external providers of the
> online callback to align with the change.
> 
> Signed-off-by: Arun KS <arunks@codeaurora.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Reviewed-by: Oscar Salvador <osalvador@suse.de>

Sorry, ended up encountering a couple more things that have me a bit
confused.

[...]

> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 5301fef..211f3fe 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
>  	}
>  }
>  
> -static void hv_online_page(struct page *pg)
> +static int hv_online_page(struct page *pg, unsigned int order)
>  {
>  	struct hv_hotadd_state *has;
>  	unsigned long flags;
> @@ -783,10 +783,12 @@ static void hv_online_page(struct page *pg)
>  		if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
>  			continue;
>  
> -		hv_page_online_one(has, pg);
> +		hv_bring_pgs_online(has, pfn, (1UL << order));
>  		break;
>  	}
>  	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
> +	return 0;
>  }
>  
>  static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)

So the question I have is why was a return value added to these
functions? They were previously void types and now they are int. What
is the return value expected other than 0?

> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
> index ceb5048..95f888f 100644
> --- a/drivers/xen/balloon.c
> +++ b/drivers/xen/balloon.c
> @@ -345,8 +345,8 @@ static enum bp_state reserve_additional_memory(void)
>  
>  	/*
>  	 * add_memory_resource() will call online_pages() which in its turn
> -	 * will call xen_online_page() callback causing deadlock if we don't
> -	 * release balloon_mutex here. Unlocking here is safe because the
> +	 * will call xen_bring_pgs_online() callback causing deadlock if we
> +	 * don't release balloon_mutex here. Unlocking here is safe because the
>  	 * callers drop the mutex before trying again.
>  	 */
>  	mutex_unlock(&balloon_mutex);
> @@ -369,15 +369,22 @@ static enum bp_state reserve_additional_memory(void)
>  	return BP_ECANCELED;
>  }
>  
> -static void xen_online_page(struct page *page)
> +static int xen_bring_pgs_online(struct page *pg, unsigned int order)

Why did we rename this function? I see it was added as a new function
in v3, however in v4 we ended up replacing it completely. So why not
just keep the same name and make it easier for us to identify that this
is the Xen version of the XXX_online_pages callback?

[...]

> +static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
> +{
> +	unsigned long end = start + nr_pages;
> +	int order, ret, onlined_pages = 0;
> +
> +	while (start < end) {
> +		order = min(MAX_ORDER - 1,
> +			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
> +
> +		ret = (*online_page_callback)(pfn_to_page(start), order);
> +		if (!ret)
> +			onlined_pages += (1UL << order);
> +		else if (ret > 0)
> +			onlined_pages += ret;
> +

So if the ret > 0 it is supposed to represent how many pages were
onlined within a given block? What if the ret was negative? Really I am
not a fan of adding a return value to the online functions unless we
specifically document what the expected return values are supposed to
be. If we don't have any return values other than 0 there isn't much
point in having one anyway.
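
For what it is worth, the contract implied by the online_pages_blocks()
hunk above appears to be the following (a reading of the patch, not
documentation taken from it):

/*
 * Implied semantics of the int return, reconstructed from the caller:
 *   0   - the whole 2^order block was onlined; the caller counts
 *         1UL << order pages
 *   > 0 - only 'ret' pages of the block were onlined; the caller
 *         counts 'ret'
 *   < 0 - nothing is counted, though the caller still advances
 *         'start' past the block
 */
typedef int (*online_page_callback_t)(struct page *page, unsigned int order);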
Arun KS Jan. 9, 2019, 5:58 a.m. UTC | #7
On 2019-01-08 23:43, Michal Hocko wrote:
> On Tue 08-01-19 09:56:09, Alexander Duyck wrote:
>> On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> [...]
>> >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>> >  			void *arg)
>> >  {
>> > -	unsigned long i;
>> >  	unsigned long onlined_pages = *(unsigned long *)arg;
>> > -	struct page *page;
>> >
>> >  	if (PageReserved(pfn_to_page(start_pfn)))
>> > -		for (i = 0; i < nr_pages; i++) {
>> > -			page = pfn_to_page(start_pfn + i);
>> > -			(*online_page_callback)(page);
>> > -			onlined_pages++;
>> > -		}
>> > +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
>> 
>> Shouldn't this be a "+=" instead of an "="? It seems like you are 
>> going
>> to lose your count otherwise.
> 
> You are right of course. I should have noticed during the review.
> Thanks!

I think we don't need to. The caller function is setting onlined_pages = 
0 before calling online_pages_range().
And there are no other references to online_pages_range other than from
online_pages().

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/memory_hotplug.c?h=v5.0-rc1#n845

int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int 
online_type)
{
         unsigned long flags;
         unsigned long onlined_pages = 0;

Regards,
Arun
Arun KS Jan. 9, 2019, 6:21 a.m. UTC | #8
On 2019-01-09 03:47, Alexander Duyck wrote:
> On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
>> When pages are freed at a higher order, the time the buddy allocator
>> spends coalescing pages can be reduced.  With a section size of 256MB,
>> the hot add latency of a single section improves from 50-60 ms to less
>> than 1 ms, a 60-fold improvement.  Modify the external providers of
>> the online callback to align with the change.
>> 
>> Signed-off-by: Arun KS <arunks@codeaurora.org>
>> Acked-by: Michal Hocko <mhocko@suse.com>
>> Reviewed-by: Oscar Salvador <osalvador@suse.de>
> 
> Sorry, ended up encountering a couple more things that have me a bit
> confused.
> 
> [...]
> 
>> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
>> index 5301fef..211f3fe 100644
>> --- a/drivers/hv/hv_balloon.c
>> +++ b/drivers/hv/hv_balloon.c
>> @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start, 
>> unsigned long size,
>>  	}
>>  }
>> 
>> -static void hv_online_page(struct page *pg)
>> +static int hv_online_page(struct page *pg, unsigned int order)
>>  {
>>  	struct hv_hotadd_state *has;
>>  	unsigned long flags;
>> @@ -783,10 +783,12 @@ static void hv_online_page(struct page *pg)
>>  		if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
>>  			continue;
>> 
>> -		hv_page_online_one(has, pg);
>> +		hv_bring_pgs_online(has, pfn, (1UL << order));
>>  		break;
>>  	}
>>  	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
>> +
>> +	return 0;
>>  }
>> 
>>  static int pfn_covered(unsigned long start_pfn, unsigned long 
>> pfn_cnt)
> 
> So the question I have is why was a return value added to these
> functions? They were previously void types and now they are int. What
> is the return value expected other than 0?

Earlier, with a void return, there was no way for arch code to deny
onlining of a particular page. By using an int as the return type, we
can implement this. On one of the boards I was using, there are some
pages which should not be onlined because they are used for other
purposes (like a secure trust zone or hypervisor).
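
A hypothetical sketch of such a deny-capable callback (every name below
is invented, and it assumes generic_online_page() were made available
for callbacks to fall back on):

/* Hypothetical board callback: refuse pfns owned by a secure region. */
static int board_online_page(struct page *page, unsigned int order)
{
        unsigned long pfn = page_to_pfn(page);

        /* secure_start_pfn/secure_end_pfn: assumed board-specific bounds */
        if (pfn < secure_end_pfn && pfn + (1UL << order) > secure_start_pfn)
                return -EPERM;  /* deny onlining of this block */

        return generic_online_page(page, order);
}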

> 
>> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
>> index ceb5048..95f888f 100644
>> --- a/drivers/xen/balloon.c
>> +++ b/drivers/xen/balloon.c
>> @@ -345,8 +345,8 @@ static enum bp_state 
>> reserve_additional_memory(void)
>> 
>>  	/*
>>  	 * add_memory_resource() will call online_pages() which in its turn
>> -	 * will call xen_online_page() callback causing deadlock if we don't
>> -	 * release balloon_mutex here. Unlocking here is safe because the
>> +	 * will call xen_bring_pgs_online() callback causing deadlock if we
>> +	 * don't release balloon_mutex here. Unlocking here is safe because 
>> the
>>  	 * callers drop the mutex before trying again.
>>  	 */
>>  	mutex_unlock(&balloon_mutex);
>> @@ -369,15 +369,22 @@ static enum bp_state 
>> reserve_additional_memory(void)
>>  	return BP_ECANCELED;
>>  }
>> 
>> -static void xen_online_page(struct page *page)
>> +static int xen_bring_pgs_online(struct page *pg, unsigned int order)
> 
> Why did we rename this function? I see it was added as a new function
> in v3, however in v4 we ended up replacing it completely. So why not
> just keep the same name and make it easier for us to identify that this
> is the Xen version of the XXX_online_pages callback?

Point taken. Will send a patch.

> 
> [...]
> 
>> +static int online_pages_blocks(unsigned long start, unsigned long 
>> nr_pages)
>> +{
>> +	unsigned long end = start + nr_pages;
>> +	int order, ret, onlined_pages = 0;
>> +
>> +	while (start < end) {
>> +		order = min(MAX_ORDER - 1,
>> +			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
>> +
>> +		ret = (*online_page_callback)(pfn_to_page(start), order);
>> +		if (!ret)
>> +			onlined_pages += (1UL << order);
>> +		else if (ret > 0)
>> +			onlined_pages += ret;
>> +
> 
> So if the ret > 0 it is supposed to represent how many pages were
> onlined within a given block? What if the ret was negative? Really I am
> not a fan of adding a return value to the online functions unless we
> specifically document what the expected return values are supposed to
> be. If we don't have any return values other than 0 there isn't much
> point in having one anyway.

I'll document this.

Regards,
Arun
Michal Hocko Jan. 9, 2019, 7:37 a.m. UTC | #9
On Wed 09-01-19 11:28:52, Arun KS wrote:
> On 2019-01-08 23:43, Michal Hocko wrote:
> > On Tue 08-01-19 09:56:09, Alexander Duyck wrote:
> > > On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> > [...]
> > > >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> > > >  			void *arg)
> > > >  {
> > > > -	unsigned long i;
> > > >  	unsigned long onlined_pages = *(unsigned long *)arg;
> > > > -	struct page *page;
> > > >
> > > >  	if (PageReserved(pfn_to_page(start_pfn)))
> > > > -		for (i = 0; i < nr_pages; i++) {
> > > > -			page = pfn_to_page(start_pfn + i);
> > > > -			(*online_page_callback)(page);
> > > > -			onlined_pages++;
> > > > -		}
> > > > +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
> > > 
> > > Shouldn't this be a "+=" instead of an "="? It seems like you are
> > > going
> > > to lose your count otherwise.
> > 
> > You are right of course. I should have noticed during the review.
> > Thanks!
> 
> I think we don't need to. The caller function is setting onlined_pages = 0
> before calling online_pages_range().
> And there are no other references to online_pages_range other than from
> online_pages().

Are you missing that we accumulate onlined_pages via
	*(unsigned long *)arg = onlined_pages;
in online_pages_range?
Arun KS Jan. 9, 2019, 8:28 a.m. UTC | #10
On 2019-01-09 13:07, Michal Hocko wrote:
> On Wed 09-01-19 11:28:52, Arun KS wrote:
>> On 2019-01-08 23:43, Michal Hocko wrote:
>> > On Tue 08-01-19 09:56:09, Alexander Duyck wrote:
>> > > On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
>> > [...]
>> > > >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>> > > >  			void *arg)
>> > > >  {
>> > > > -	unsigned long i;
>> > > >  	unsigned long onlined_pages = *(unsigned long *)arg;
>> > > > -	struct page *page;
>> > > >
>> > > >  	if (PageReserved(pfn_to_page(start_pfn)))
>> > > > -		for (i = 0; i < nr_pages; i++) {
>> > > > -			page = pfn_to_page(start_pfn + i);
>> > > > -			(*online_page_callback)(page);
>> > > > -			onlined_pages++;
>> > > > -		}
>> > > > +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
>> > >
>> > > Shouldn't this be a "+=" instead of an "="? It seems like you are
>> > > going
>> > > to lose your count otherwise.
>> >
>> > You are right of course. I should have noticed during the review.
>> > Thanks!
>> 
>> I think we don't need to. The caller function is setting onlined_pages 
>> = 0
>> before calling online_pages_range().
>> And there are no other reference to online_pages_range other than from
>> online_pages().
> 
> Are you missing that we accumulate onlined_pages via
> 	*(unsigned long *)arg = onlined_pages;
> in online_pages_range?

In my testing I didn't find any problem. To match the code being 
replaced and to avoid any corner cases, it is better to use +=
Will update the patch.

Regards,
Arun
Michal Hocko Jan. 9, 2019, 8:40 a.m. UTC | #11
On Wed 09-01-19 13:58:50, Arun KS wrote:
> On 2019-01-09 13:07, Michal Hocko wrote:
> > On Wed 09-01-19 11:28:52, Arun KS wrote:
> > > On 2019-01-08 23:43, Michal Hocko wrote:
> > > > On Tue 08-01-19 09:56:09, Alexander Duyck wrote:
> > > > > On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> > > > [...]
> > > > > >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> > > > > >  			void *arg)
> > > > > >  {
> > > > > > -	unsigned long i;
> > > > > >  	unsigned long onlined_pages = *(unsigned long *)arg;
> > > > > > -	struct page *page;
> > > > > >
> > > > > >  	if (PageReserved(pfn_to_page(start_pfn)))
> > > > > > -		for (i = 0; i < nr_pages; i++) {
> > > > > > -			page = pfn_to_page(start_pfn + i);
> > > > > > -			(*online_page_callback)(page);
> > > > > > -			onlined_pages++;
> > > > > > -		}
> > > > > > +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
> > > > >
> > > > > Shouldn't this be a "+=" instead of an "="? It seems like you are
> > > > > going
> > > > > to lose your count otherwise.
> > > >
> > > > You are right of course. I should have noticed during the review.
> > > > Thanks!
> > > 
> > > I think we don't need to. The caller function is setting
> > > onlined_pages = 0
> > > before calling online_pages_range().
> > > And there are no other references to online_pages_range other than from
> > > online_pages().
> > 
> > Are you missing that we accumulate onlined_pages via
> > 	*(unsigned long *)arg = onlined_pages;
> > in online_pages_range?
> 
> In my testing I didn't find any problem. To match the code being replaced
> and to avoid any corner cases, it is better to use +=
> Will update the patch.

Have you checked that the number of present pages in both the zone and
the node is correct? I fail to see how it could be.
Arun KS Jan. 9, 2019, 10:42 a.m. UTC | #12
On 2019-01-09 14:10, Michal Hocko wrote:
> On Wed 09-01-19 13:58:50, Arun KS wrote:
>> On 2019-01-09 13:07, Michal Hocko wrote:
>> > On Wed 09-01-19 11:28:52, Arun KS wrote:
>> > > On 2019-01-08 23:43, Michal Hocko wrote:
>> > > > On Tue 08-01-19 09:56:09, Alexander Duyck wrote:
>> > > > > On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
>> > > > [...]
>> > > > > >  static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>> > > > > >  			void *arg)
>> > > > > >  {
>> > > > > > -	unsigned long i;
>> > > > > >  	unsigned long onlined_pages = *(unsigned long *)arg;
>> > > > > > -	struct page *page;
>> > > > > >
>> > > > > >  	if (PageReserved(pfn_to_page(start_pfn)))
>> > > > > > -		for (i = 0; i < nr_pages; i++) {
>> > > > > > -			page = pfn_to_page(start_pfn + i);
>> > > > > > -			(*online_page_callback)(page);
>> > > > > > -			onlined_pages++;
>> > > > > > -		}
>> > > > > > +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
>> > > > >
>> > > > > Shouldn't this be a "+=" instead of an "="? It seems like you are
>> > > > > going
>> > > > > to lose your count otherwise.
>> > > >
>> > > > You are right of course. I should have noticed during the review.
>> > > > Thanks!
>> > >
>> > > I think we don't need to. The caller function is setting
>> > > onlined_pages = 0
>> > > before calling online_pages_range().
>> > > And there are no other references to online_pages_range other than from
>> > > online_pages().
>> >
>> > Are you missing that we accumulate onlined_pages via
>> > 	*(unsigned long *)arg = onlined_pages;
>> > in online_pages_range?
>> 
>> In my testing I didn't find any problem. To match the code being 
>> replaced
>> and to avoid any corner cases, it is better to use +=
>> Will update the patch.
> 
> Have you checked that the number of present pages in both the zone and
> the node is correct? I fail to see how it could be.

Yes, they are showing the correct values.

Previous output of cat /proc/zoneinfo:

Node 0, zone   Normal
   pages free     65492
         min      300
         low      375
         high     450
         spanned  65536
         present  65536
         managed  65536

Output after hotadd:

Node 0, zone   Normal
   pages free     129970
         min      518
         low      649
         high     780
         spanned  983040
         present  131072
         managed  131072

I added prints in online_pages_range function.
It will be called once per online of a section and the arg value is 
always set to 0 while entering online_pages_range.

/sys/devices/system/memory # echo online > memory16/state
[   52.956558] online_pages_range start_pfn = 100000 nr_pages = 65536 
arg = 0
[   52.964104] Built 1 zonelists, mobility grouping on.  Total pages: 
187367
[   52.964828] Policy zone: Normal

But still I'll change to += to match with the previous code.

Regards,
Arun
Michal Hocko Jan. 9, 2019, 10:57 a.m. UTC | #13
On Wed 09-01-19 16:12:48, Arun KS wrote:
[...]
> It will be called once per online of a section and the arg value is always
> set to 0 while entering online_pages_range.

You are right that this will be the case in the simplest scenario.
But the point is that the callback can be called several times from
walk_system_ram_range and then your current code wouldn't work properly.
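
The call site Michal refers to looks like this (abridged from
online_pages() in mm/memory_hotplug.c, v5.0-rc1):

        unsigned long onlined_pages = 0;

        /* walk_system_ram_range() invokes online_pages_range() once per
         * contiguous System RAM chunk inside [pfn, pfn + nr_pages) */
        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);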
Arun KS Jan. 9, 2019, 11:06 a.m. UTC | #14
On 2019-01-09 16:27, Michal Hocko wrote:
> On Wed 09-01-19 16:12:48, Arun KS wrote:
> [...]
>> It will be called once per online of a section and the arg value is 
>> always
>> set to 0 while entering online_pages_range.
> 
> You are right that this will be the case in the simplest scenario.
> But the point is that the callback can be called several times from
> walk_system_ram_range and then your current code wouldn't work 
> properly.

Thanks. Will use +=

Regards,
Arun
Alexander Duyck Jan. 9, 2019, 4:09 p.m. UTC | #15
On Wed, 2019-01-09 at 11:51 +0530, Arun KS wrote:
> On 2019-01-09 03:47, Alexander Duyck wrote:
> > On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
> > > When pages are freed at a higher order, the time the buddy allocator
> > > spends coalescing pages can be reduced.  With a section size of
> > > 256MB, the hot add latency of a single section improves from 50-60 ms
> > > to less than 1 ms, a 60-fold improvement.  Modify the external
> > > providers of the online callback to align with the change.
> > > 
> > > Signed-off-by: Arun KS <arunks@codeaurora.org>
> > > Acked-by: Michal Hocko <mhocko@suse.com>
> > > Reviewed-by: Oscar Salvador <osalvador@suse.de>
> > 
> > Sorry, ended up encountering a couple more things that have me a bit
> > confused.
> > 
> > [...]
> > 
> > > diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> > > index 5301fef..211f3fe 100644
> > > --- a/drivers/hv/hv_balloon.c
> > > +++ b/drivers/hv/hv_balloon.c
> > > @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start, 
> > > unsigned long size,
> > >  	}
> > >  }
> > > 
> > > -static void hv_online_page(struct page *pg)
> > > +static int hv_online_page(struct page *pg, unsigned int order)
> > >  {
> > >  	struct hv_hotadd_state *has;
> > >  	unsigned long flags;
> > > @@ -783,10 +783,12 @@ static void hv_online_page(struct page *pg)
> > >  		if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
> > >  			continue;
> > > 
> > > -		hv_page_online_one(has, pg);
> > > +		hv_bring_pgs_online(has, pfn, (1UL << order));
> > >  		break;
> > >  	}
> > >  	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> > > +
> > > +	return 0;
> > >  }
> > > 
> > >  static int pfn_covered(unsigned long start_pfn, unsigned long 
> > > pfn_cnt)
> > 
> > So the question I have is why was a return value added to these
> > functions? They were previously void types and now they are int. What
> > is the return value expected other than 0?
> 
> Earlier, with a void return, there was no way for arch code to deny
> onlining of a particular page. By using an int as the return type, we
> can implement this. On one of the boards I was using, there are some
> pages which should not be onlined because they are used for other
> purposes (like a secure trust zone or hypervisor).

So where is the code using that? I don't see any functions in the
kernel that are returning anything other than 0. Maybe you should hold
off on changing the return type and make that a separate patch to be
enabled when you add the new functions that can return non-zero values.

That way if someone wants to backport this they are just getting the
bits needed to enable the improved hot-plug times without adding the
extra overhead for changing the return type.
Andrew Morton Jan. 9, 2019, 6:56 p.m. UTC | #16
On Wed, 09 Jan 2019 16:36:36 +0530 Arun KS <arunks@codeaurora.org> wrote:

> On 2019-01-09 16:27, Michal Hocko wrote:
> > On Wed 09-01-19 16:12:48, Arun KS wrote:
> > [...]
> >> It will be called once per online of a section and the arg value is 
> >> always
> >> set to 0 while entering online_pages_range.
> > 
> > You are right that this will be the case in the simplest scenario.
> > But the point is that the callback can be called several times from
> > walk_system_ram_range and then your current code wouldn't work 
> > properly.
> 
> Thanks. Will use +=

The v8 patch
https://lore.kernel.org/lkml/1547032395-24582-1-git-send-email-arunks@codeaurora.org/T/#u

(which you apparently sent 7 minutes after typing the above) still has

 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
 			void *arg)
 {
-	unsigned long i;
 	unsigned long onlined_pages = *(unsigned long *)arg;
-	struct page *page;
 
 	if (PageReserved(pfn_to_page(start_pfn)))
-		for (i = 0; i < nr_pages; i++) {
-			page = pfn_to_page(start_pfn + i);
-			(*online_page_callback)(page);
-			onlined_pages++;
-		}
+		onlined_pages = online_pages_blocks(start_pfn, nr_pages);


Even then the code makes no sense.

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long onlined_pages = *(unsigned long *)arg;

	if (PageReserved(pfn_to_page(start_pfn)))
		onlined_pages += online_pages_blocks(start_pfn, nr_pages);

	online_mem_sections(start_pfn, start_pfn + nr_pages);

	*(unsigned long *)arg += onlined_pages;
	return 0;
}

Either the final assignment should be

	*(unsigned long *)arg = onlined_pages;

or the initialization should be

	unsigned long onlined_pages = 0;



This is becoming a tad tiresome and I'd prefer not to have to check up
on such things.  Can we please get this right?
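
Applying the second option gives one self-consistent version of the
function (a sketch of the corrected hunk, not a quote from a posted
revision):

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
                        void *arg)
{
        unsigned long onlined_pages = 0;

        if (PageReserved(pfn_to_page(start_pfn)))
                onlined_pages += online_pages_blocks(start_pfn, nr_pages);

        online_mem_sections(start_pfn, start_pfn + nr_pages);

        *(unsigned long *)arg += onlined_pages;
        return 0;
}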
Arun KS Jan. 10, 2019, 4:39 a.m. UTC | #17
On 2019-01-09 21:39, Alexander Duyck wrote:
> On Wed, 2019-01-09 at 11:51 +0530, Arun KS wrote:
>> On 2019-01-09 03:47, Alexander Duyck wrote:
>> > On Fri, 2019-01-04 at 10:31 +0530, Arun KS wrote:
>> > > When pages are freed at a higher order, the time the buddy
>> > > allocator spends coalescing pages can be reduced.  With a section
>> > > size of 256MB, the hot add latency of a single section improves
>> > > from 50-60 ms to less than 1 ms, a 60-fold improvement.  Modify the
>> > > external providers of the online callback to align with the change.
>> > >
>> > > Signed-off-by: Arun KS <arunks@codeaurora.org>
>> > > Acked-by: Michal Hocko <mhocko@suse.com>
>> > > Reviewed-by: Oscar Salvador <osalvador@suse.de>
>> >
>> > Sorry, ended up encountering a couple more things that have me a bit
>> > confused.
>> >
>> > [...]
>> >
>> > > diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
>> > > index 5301fef..211f3fe 100644
>> > > --- a/drivers/hv/hv_balloon.c
>> > > +++ b/drivers/hv/hv_balloon.c
>> > > @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start,
>> > > unsigned long size,
>> > >  	}
>> > >  }
>> > >
>> > > -static void hv_online_page(struct page *pg)
>> > > +static int hv_online_page(struct page *pg, unsigned int order)
>> > >  {
>> > >  	struct hv_hotadd_state *has;
>> > >  	unsigned long flags;
>> > > @@ -783,10 +783,12 @@ static void hv_online_page(struct page *pg)
>> > >  		if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
>> > >  			continue;
>> > >
>> > > -		hv_page_online_one(has, pg);
>> > > +		hv_bring_pgs_online(has, pfn, (1UL << order));
>> > >  		break;
>> > >  	}
>> > >  	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
>> > > +
>> > > +	return 0;
>> > >  }
>> > >
>> > >  static int pfn_covered(unsigned long start_pfn, unsigned long
>> > > pfn_cnt)
>> >
>> > So the question I have is why was a return value added to these
>> > functions? They were previously void types and now they are int. What
>> > is the return value expected other than 0?
>> 
>> Earlier, with a void return, there was no way for arch code to deny
>> onlining of a particular page. By using an int as the return type, we
>> can implement this. On one of the boards I was using, there are some
>> pages which should not be onlined because they are used for other
>> purposes (like a secure trust zone or hypervisor).
> 
> So where is the code using that? I don't see any functions in the
> kernel that are returning anything other than 0. Maybe you should hold
> off on changing the return type and make that a separate patch to be
> enabled when you add the new functions that can return non-zero values.
> 
> That way if someone wants to backport this they are just getting the
> bits needed to enable the improved hot-plug times without adding the
> extra overhead for changing the return type.

The implementation was in our downstream code. I thought this might be 
useful for someone else in similar situations.
Considering the above-mentioned reasons, I'll drop the change to the
return type.

Regards,
Arun
Arun KS Jan. 10, 2019, 5:06 a.m. UTC | #18
On 2019-01-10 00:26, Andrew Morton wrote:
> On Wed, 09 Jan 2019 16:36:36 +0530 Arun KS <arunks@codeaurora.org> 
> wrote:
> 
>> On 2019-01-09 16:27, Michal Hocko wrote:
>> > On Wed 09-01-19 16:12:48, Arun KS wrote:
>> > [...]
>> >> It will be called once per online of a section and the arg value is
>> >> always
>> >> set to 0 while entering online_pages_range.
>> >
>> > You are right that this will be the case in the simplest scenario.
>> > But the point is that the callback can be called several times from
>> > walk_system_ram_range and then your current code wouldn't work
>> > properly.
>> 
>> Thanks. Will use +=
> 
> The v8 patch
> https://lore.kernel.org/lkml/1547032395-24582-1-git-send-email-arunks@codeaurora.org/T/#u
> 
> (which you apparently sent 7 minutes after typing the above) still has
> 
>  static int online_pages_range(unsigned long start_pfn, unsigned long 
> nr_pages,
>  			void *arg)
>  {
> -	unsigned long i;
>  	unsigned long onlined_pages = *(unsigned long *)arg;
> -	struct page *page;
> 
>  	if (PageReserved(pfn_to_page(start_pfn)))
> -		for (i = 0; i < nr_pages; i++) {
> -			page = pfn_to_page(start_pfn + i);
> -			(*online_page_callback)(page);
> -			onlined_pages++;
> -		}
> +		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
> 
> 
> Even then the code makes no sense.
> 
> static int online_pages_range(unsigned long start_pfn, unsigned long 
> nr_pages,
> 			void *arg)
> {
> 	unsigned long onlined_pages = *(unsigned long *)arg;
> 
> 	if (PageReserved(pfn_to_page(start_pfn)))
> 		onlined_pages += online_pages_blocks(start_pfn, nr_pages);
> 
> 	online_mem_sections(start_pfn, start_pfn + nr_pages);
> 
> 	*(unsigned long *)arg += onlined_pages;
> 	return 0;
> }
> 
> Either the final assignment should be
> 
> 	*(unsigned long *)arg = onlined_pages;
> 
> or the initialization should be
> 
> 	unsigned long onlined_pages = 0;
> 
> 
> 
> This is becoming a tad tiresome and I'd prefer not to have to check up
> on such things.  Can we please get this right?

Sorry about that. Will fix it.

Regards,
Arun

Patch

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 5301fef..211f3fe 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -771,7 +771,7 @@  static void hv_mem_hot_add(unsigned long start, unsigned long size,
 	}
 }
 
-static void hv_online_page(struct page *pg)
+static int hv_online_page(struct page *pg, unsigned int order)
 {
 	struct hv_hotadd_state *has;
 	unsigned long flags;
@@ -783,10 +783,12 @@  static void hv_online_page(struct page *pg)
 		if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
 			continue;
 
-		hv_page_online_one(has, pg);
+		hv_bring_pgs_online(has, pfn, (1UL << order));
 		break;
 	}
 	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
+
+	return 0;
 }
 
 static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index ceb5048..95f888f 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -345,8 +345,8 @@  static enum bp_state reserve_additional_memory(void)
 
 	/*
 	 * add_memory_resource() will call online_pages() which in its turn
-	 * will call xen_online_page() callback causing deadlock if we don't
-	 * release balloon_mutex here. Unlocking here is safe because the
+	 * will call xen_bring_pgs_online() callback causing deadlock if we
+	 * don't release balloon_mutex here. Unlocking here is safe because the
 	 * callers drop the mutex before trying again.
 	 */
 	mutex_unlock(&balloon_mutex);
@@ -369,15 +369,22 @@  static enum bp_state reserve_additional_memory(void)
 	return BP_ECANCELED;
 }
 
-static void xen_online_page(struct page *page)
+static int xen_bring_pgs_online(struct page *pg, unsigned int order)
 {
-	__online_page_set_limits(page);
+	unsigned long i, size = (1 << order);
+	unsigned long start_pfn = page_to_pfn(pg);
+	struct page *p;
 
+	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
 	mutex_lock(&balloon_mutex);
-
-	__balloon_append(page);
-
+	for (i = 0; i < size; i++) {
+		p = pfn_to_page(start_pfn + i);
+		__online_page_set_limits(p);
+		__balloon_append(p);
+	}
 	mutex_unlock(&balloon_mutex);
+
+	return 0;
 }
 
 static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
@@ -702,7 +709,7 @@  static int __init balloon_init(void)
 	balloon_stats.max_retry_count = RETRY_UNLIMITED;
 
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-	set_online_page_callback(&xen_online_page);
+	set_online_page_callback(&xen_bring_pgs_online);
 	register_memory_notifier(&xen_memory_nb);
 	register_sysctl_table(xen_root);
 #endif
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 07da5c6..d56bfba 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -87,7 +87,7 @@  extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 	unsigned long *valid_start, unsigned long *valid_end);
 extern void __offline_isolated_pages(unsigned long, unsigned long);
 
-typedef void (*online_page_callback_t)(struct page *page);
+typedef int (*online_page_callback_t)(struct page *page, unsigned int order);
 
 extern int set_online_page_callback(online_page_callback_t callback);
 extern int restore_online_page_callback(online_page_callback_t callback);
diff --git a/mm/internal.h b/mm/internal.h
index f4a7bb0..536bc2a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -163,6 +163,7 @@  static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
 extern int __isolate_free_page(struct page *page, unsigned int order);
 extern void memblock_free_pages(struct page *page, unsigned long pfn,
 					unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned int order);
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9a667d..0ea0eb1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -47,7 +47,7 @@ 
  * and restore_online_page_callback() for generic callback restore.
  */
 
-static void generic_online_page(struct page *page);
+static int generic_online_page(struct page *page, unsigned int order);
 
 static online_page_callback_t online_page_callback = generic_online_page;
 static DEFINE_MUTEX(online_page_callback_lock);
@@ -656,26 +656,44 @@  void __online_page_free(struct page *page)
 }
 EXPORT_SYMBOL_GPL(__online_page_free);
 
-static void generic_online_page(struct page *page)
+static int generic_online_page(struct page *page, unsigned int order)
 {
-	__online_page_set_limits(page);
-	__online_page_increment_counters(page);
-	__online_page_free(page);
+	__free_pages_core(page, order);
+	totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+	if (PageHighMem(page))
+		totalhigh_pages_add(1UL << order);
+#endif
+	return 0;
+}
+
+static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
+{
+	unsigned long end = start + nr_pages;
+	int order, ret, onlined_pages = 0;
+
+	while (start < end) {
+		order = min(MAX_ORDER - 1,
+			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
+
+		ret = (*online_page_callback)(pfn_to_page(start), order);
+		if (!ret)
+			onlined_pages += (1UL << order);
+		else if (ret > 0)
+			onlined_pages += ret;
+
+		start += (1UL << order);
+	}
+	return onlined_pages;
 }
 
 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
 			void *arg)
 {
-	unsigned long i;
 	unsigned long onlined_pages = *(unsigned long *)arg;
-	struct page *page;
 
 	if (PageReserved(pfn_to_page(start_pfn)))
-		for (i = 0; i < nr_pages; i++) {
-			page = pfn_to_page(start_pfn + i);
-			(*online_page_callback)(page);
-			onlined_pages++;
-		}
+		onlined_pages = online_pages_blocks(start_pfn, nr_pages);
 
 	online_mem_sections(start_pfn, start_pfn + nr_pages);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac..f51a920 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1303,7 +1303,7 @@  static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
 	struct page *p = page;
@@ -1382,7 +1382,7 @@  void __init memblock_free_pages(struct page *page, unsigned long pfn,
 {
 	if (early_page_uninitialised(pfn))
 		return;
-	return __free_pages_boot_core(page, order);
+	__free_pages_core(page, order);
 }
 
 /*
@@ -1472,14 +1472,14 @@  static void __init deferred_free_range(unsigned long pfn,
 	if (nr_pages == pageblock_nr_pages &&
 	    (pfn & (pageblock_nr_pages - 1)) == 0) {
 		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		__free_pages_boot_core(page, pageblock_order);
+		__free_pages_core(page, pageblock_order);
 		return;
 	}
 
 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
 		if ((pfn & (pageblock_nr_pages - 1)) == 0)
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		__free_pages_boot_core(page, 0);
+		__free_pages_core(page, 0);
 	}
 }