
[v35,1/5] mm: support to get hints of free page blocks

Message ID 1531215067-35472-2-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived

Commit Message

Wang, Wei W July 10, 2018, 9:31 a.m. UTC
This patch adds support for getting free page blocks from a free page list.
The physical addresses of the blocks are stored in a list of buffers
passed in by the caller. The obtained free page blocks are hints about
free pages, because there is no guarantee that they are still on the free
page list after the function returns.

One example use of this patch is to accelerate live migration by skipping
the transfer of free pages reported from the guest. A popular method used
by the hypervisor to track which parts of memory are written during live
migration is to write-protect all the guest memory. So, those pages that
are hinted as free pages but are written after this function returns will
be captured by the hypervisor, and they will be added to the next round of
memory transfer.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  3 ++
 mm/page_alloc.c    | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

Comments

Wang, Wei W July 10, 2018, 10:16 a.m. UTC | #1
On Tuesday, July 10, 2018 5:31 PM, Wang, Wei W wrote:
> Subject: [PATCH v35 1/5] mm: support to get hints of free page blocks
> 
> This patch adds support for getting free page blocks from a free page list.
> The physical addresses of the blocks are stored in a list of buffers passed
> in by the caller. The obtained free page blocks are hints about free pages,
> because there is no guarantee that they are still on the free page list after the
> function returns.
> 
> One example use of this patch is to accelerate live migration by skipping the
> transfer of free pages reported from the guest. A popular method used by
> the hypervisor to track which parts of memory are written during live migration
> is to write-protect all the guest memory. So, those pages that are hinted as
> free pages but are written after this function returns will be captured by the
> hypervisor, and they will be added to the next round of memory transfer.
> 
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> ---
>  include/linux/mm.h |  3 ++
>  mm/page_alloc.c    | 98
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 101 insertions(+)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h index a0fbb9f..5ce654f
> 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2007,6 +2007,9 @@ extern void free_area_init(unsigned long *
> zones_size);  extern void free_area_init_node(int nid, unsigned long *
> zones_size,
>  		unsigned long zone_start_pfn, unsigned long *zholes_size);
> extern void free_initmem(void);
> +unsigned long max_free_page_blocks(int order); int
> +get_from_free_page_list(int order, struct list_head *pages,
> +			    unsigned int size, unsigned long *loaded_num);
> 
>  /*
>   * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1521100..b67839b
> 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5043,6 +5043,104 @@ void show_free_areas(unsigned int filter,
> nodemask_t *nodemask)
>  	show_swap_cache_info();
>  }
> 
> +/**
> + * max_free_page_blocks - estimate the max number of free page blocks
> + * @order: the order of the free page blocks to estimate
> + *
> + * This function gives a rough estimation of the possible maximum
> +number of
> + * free page blocks a free list may have. The estimation works on an
> +assumption
> + * that all the system pages are on that list.
> + *
> + * Context: Any context.
> + *
> + * Return: The largest number of free page blocks that the free list can have.
> + */
> +unsigned long max_free_page_blocks(int order) {
> +	return totalram_pages / (1 << order);
> +}
> +EXPORT_SYMBOL_GPL(max_free_page_blocks);
> +
> +/**
> + * get_from_free_page_list - get hints of free pages from a free page
> +list
> + * @order: the order of the free page list to check
> + * @pages: the list of page blocks used as buffers to load the
> +addresses
> + * @size: the size of each buffer in bytes
> + * @loaded_num: the number of addresses loaded to the buffers
> + *
> + * This function offers hints about free pages. The addresses of free
> +page
> + * blocks are stored to the list of buffers passed from the caller.
> +There is
> + * no guarantee that the obtained free pages are still on the free page
> +list
> + * after the function returns. pfn_to_page on the obtained free pages
> +is
> + * strongly discouraged and if there is an absolute need for that, make
> +sure
> + * to contact MM people to discuss potential problems.
> + *
> + * The addresses are currently stored to a buffer in little endian.
> +This
> + * avoids the overhead of converting endianness by the caller who needs
> +data
> + * in the little endian format. Big endian support can be added on
> +demand in
> + * the future.
> + *
> + * Context: Process context.
> + *
> + * Return: 0 if all the free page block addresses are stored to the buffers;
> + *         -ENOSPC if the buffers are not sufficient to store all the
> + *         addresses; or -EINVAL if an unexpected argument is received (e.g.
> + *         incorrect @order, empty buffer list).
> + */
> +int get_from_free_page_list(int order, struct list_head *pages,
> +			    unsigned int size, unsigned long *loaded_num) {


Hi Linus,

We took your original suggestion - passing in pre-allocated buffers to load the addresses (we now use a list of pre-allocated page blocks as buffers). Hope that suggestion is still acceptable (the advantage of this method was explained here: https://lkml.org/lkml/2018/6/28/184).
Looking forward to your feedback. Thanks.

Best,
Wei
Linus Torvalds July 10, 2018, 5:33 p.m. UTC | #2
NAK.

On Tue, Jul 10, 2018 at 2:56 AM Wei Wang <wei.w.wang@intel.com> wrote:
>
> +
> +       buf_page = list_first_entry_or_null(pages, struct page, lru);
> +       if (!buf_page)
> +               return -EINVAL;
> +       buf = (__le64 *)page_address(buf_page);

Stop this garbage.

Why the hell would you pass in some crazy "list of pages" that uses
that lru list?

That's just insane shit.

Just pass in an array to fill in. No idiotic games like this with
odd list entries (what's the locking?) and crazy casting to

So if you want an array of page addresses, pass that in as such. If
you want to do it in a page, do it with

    u64 *array = page_address(page);
    int nr = PAGE_SIZE / sizeof(u64);

and now you pass that array in to the thing. None of this completely
insane crazy crap interfaces.

Plus, I still haven't heard an explanation for why you want so many
pages in the first place, and why you want anything but MAX_ORDER-1.

So no. This kind of unnecessarily complex code with completely insane
calling interfaces does not make it into the VM layer.

Maybe that crazy "let's pass a chain of pages that uses the lru list"
makes sense to the virtio-balloon code. But you need to understand
that it makes ZERO conceptual sense to anybody else. And the core VM
code is about a million times more important than the balloon code in
this case, so you had better make the interface make sense to *it*.

               Linus
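
For illustration, a minimal sketch of the flat-array shape being asked for here; the function name and exact signature are hypothetical, distilled from the loop in the posted patch, and the hint-only semantics are unchanged:

    /*
     * Fill @buf with up to @nr physical addresses of free page blocks of
     * the given @order. Returns the number of entries written. Purely a
     * hint: the blocks may be allocated again as soon as the zone lock is
     * dropped.
     */
    static unsigned int get_free_page_hints(int order, __le64 *buf,
                                            unsigned int nr)
    {
        struct zone *zone;
        struct page *page;
        enum migratetype mt;
        unsigned int filled = 0;

        for_each_populated_zone(zone) {
            spin_lock_irq(&zone->lock);
            for (mt = 0; mt < MIGRATE_TYPES; mt++) {
                list_for_each_entry(page,
                        &zone->free_area[order].free_list[mt], lru) {
                    if (filled == nr) {
                        spin_unlock_irq(&zone->lock);
                        return filled;
                    }
                    buf[filled++] =
                        cpu_to_le64(page_to_pfn(page) << PAGE_SHIFT);
                }
            }
            spin_unlock_irq(&zone->lock);
        }
        return filled;
    }

A caller could then do exactly what is suggested above, e.g. pass in page_address(page) as the array and PAGE_SIZE / sizeof(__le64) as the entry count.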
Wang, Wei W July 11, 2018, 1:28 a.m. UTC | #3
On 07/11/2018 01:33 AM, Linus Torvalds wrote:
> NAK.
>
> On Tue, Jul 10, 2018 at 2:56 AM Wei Wang <wei.w.wang@intel.com> wrote:
>> +
>> +       buf_page = list_first_entry_or_null(pages, struct page, lru);
>> +       if (!buf_page)
>> +               return -EINVAL;
>> +       buf = (__le64 *)page_address(buf_page);
> Stop this garbage.
>
> Why the hell would you pass in some crazy "list of pages" that uses
> that lru list?
>
> That's just insane shit.
>
> Just pass in an array to fill in. No idiotic games like this with
> odd list entries (what's the locking?) and crazy casting to
>
> So if you want an array of page addresses, pass that in as such. If
> you want to do it in a page, do it with
>
>      u64 *array = page_address(page);
>      int nr = PAGE_SIZE / sizeof(u64);
>
> and now you pass that array in to the thing. None of this completely
> insane crazy crap interfaces.
>
> Plus, I still haven't heard an explanation for why you want so many
> pages in the first place, and why you want anything but MAX_ORDER-1.

Sorry for missing that explanation.
We only load the addresses of the "MAX_ORDER-1" blocks into the array. The 
max size of the array that can be allocated by kmalloc is 
KMALLOC_MAX_SIZE (i.e. 4MB on x86). With that max array, we can load 
"4MB / sizeof(u64)" addresses of "MAX_ORDER-1" blocks, that is, at most 2TB 
of free memory. We thought about removing that 2TB limitation by passing 
in multiple such max arrays (a list of them).

But 2TB has been enough for our use cases so far, and we agree it would be 
better to have a simpler API in the first place. So I plan to go back to 
the previous version of just passing in one simple array 
(https://lkml.org/lkml/2018/6/15/21) if there are no objections.

Best,
Wei
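
For reference, the arithmetic behind the 2TB figure (assuming 4KB base pages and MAX_ORDER-1 = 10 on x86, i.e. 4MB blocks):

    entries in a 4MB array        = 4MB / sizeof(u64) = 524,288
    size of one MAX_ORDER-1 block = 1024 pages * 4KB  = 4MB
    memory covered by one array   = 524,288 * 4MB     = 2TB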
Linus Torvalds July 11, 2018, 1:44 a.m. UTC | #4
On Tue, Jul 10, 2018 at 6:24 PM Wei Wang <wei.w.wang@intel.com> wrote:
>
> We only get addresses of the "MAX_ORDER-1" blocks into the array. The
> max size of the array that could be allocated by kmalloc is
> KMALLOC_MAX_SIZE (i.e. 4MB on x86). With that max array, we could load
> "4MB / sizeof(u64)" addresses of "MAX_ORDER-1" blocks, that is, 2TB free
> memory at most. We thought about removing that 2TB limitation by passing
> in multiple such max arrays (a list of them).

No.

Stop this already.

You're doing everything wrong.

If the array has to describe *all* memory you will ever free, then you
have already lost.

Just do it in chunks.

I don't want the VM code to even fill in that big of an array anyway -
this all happens under the zone lock, and you're walking a list that
is bad for caching anyway.

So plan on an interface that allows _incremental_ freeing, because any
plan that starts with "I worry that maybe two TERABYTES of memory
isn't big enough" is so broken that it's laughable.

That was what I tried to encourage with actually removing the pages
from the page list. That would be an _incremental_ interface. You can
remove MAX_ORDER-1 pages one by one (or a hundred at a time), and mark
them free for ballooning that way. And if you still feel you have tons
of free memory, just continue removing more pages from the free list.

Notice? Incremental. Not "I want to have a crazy array that is enough
to hold 2TB at one time".

So here's the rule:

 - make it a simple array interface

 - make the array *small*. Not megabytes. Kilobytes. Because if you're
filling in megabytes worth of free pointers while holding the zone
lock, you're doing something wrong.

 - design the interface so that you do not *need* to have this crazy
"all or nothing" approach.

See what I'm trying to push for. Think "low latency". Think "small
arrays". Think "simple and straightforward interfaces".

At no point should you ever worry about "2TB". Never.

           Linus
Michael S. Tsirkin July 11, 2018, 4 a.m. UTC | #5
On Tue, Jul 10, 2018 at 10:33:08AM -0700, Linus Torvalds wrote:
> NAK.
> 
> On Tue, Jul 10, 2018 at 2:56 AM Wei Wang <wei.w.wang@intel.com> wrote:
> >
> > +
> > +       buf_page = list_first_entry_or_null(pages, struct page, lru);
> > +       if (!buf_page)
> > +               return -EINVAL;
> > +       buf = (__le64 *)page_address(buf_page);
> 
> Stop this garbage.
> 
> Why the hell would you pass in some crazy "list of pages" that uses
> that lru list?
> 
> That's just insane shit.
> 
> Just pass in an array to fill in.
> No idiotic games like this with
> odd list entries (what's the locking?) and crazy casting to
> 
> So if you want an array of page addresses, pass that in as such. If
> you want to do it in a page, do it with
> 
>     u64 *array = page_address(page);
>     int nr = PAGE_SIZE / sizeof(u64);
> 
> and now you pass that array in to the thing. None of this completely
> insane crazy crap interfaces.

The question was raised of what to do if there are so many free
MAX_ORDER pages that their addresses don't fit in a single MAX_ORDER
page. Yes, only a huge guest would trigger that, but it seems
theoretically possible.

I guess an array of arrays then?

An alternative suggestion was not to pass an array at all, but
instead to peel enough pages off the list to contain
all the free entries. Maybe that's too hacky.


> 
> Plus, I still haven't heard an explanation for why you want so many
> pages in the first place, and why you want anything but MAX_ORDER-1.
> 
> So no. This kind of unnecessarily complex code with completely insane
> calling interfaces does not make it into the VM layer.
> 
> Maybe that crazy "let's pass a chain of pages that uses the lru list"
> makes sense to the virtio-balloon code. But you need to understand
> that it makes ZERO conceptual sense to anybody else. And the core VM
> code is about a million times more important than the balloon code in
> this case, so you had better make the interface make sense to *it*.
> 
>                Linus
Michael S. Tsirkin July 11, 2018, 4:04 a.m. UTC | #6
On Wed, Jul 11, 2018 at 07:00:37AM +0300, Michael S. Tsirkin wrote:
> On Tue, Jul 10, 2018 at 10:33:08AM -0700, Linus Torvalds wrote:
> > NAK.
> > 
> > On Tue, Jul 10, 2018 at 2:56 AM Wei Wang <wei.w.wang@intel.com> wrote:
> > >
> > > +
> > > +       buf_page = list_first_entry_or_null(pages, struct page, lru);
> > > +       if (!buf_page)
> > > +               return -EINVAL;
> > > +       buf = (__le64 *)page_address(buf_page);
> > 
> > Stop this garbage.
> > 
> > Why the hell would you pass in some crazy "list of pages" that uses
> > that lru list?
> > 
> > That's just insane shit.
> > 
> > Just pass in an array to fill in.
> > No idiotic games like this with
> > odd list entries (what's the locking?) and crazy casting to
> > 
> > So if you want an array of page addresses, pass that in as such. If
> > you want to do it in a page, do it with
> > 
> >     u64 *array = page_address(page);
> >     int nr = PAGE_SIZE / sizeof(u64);
> > 
> > and now you pass that array in to the thing. None of this completely
> > insane crazy crap interfaces.
> 
> Question was raised what to do if there are so many free
> MAX_ORDER pages that their addresses don't fit in a single MAX_ORDER
> page.

Oh you answered already, I spoke too soon. Nevermind, pls ignore me.
Michal Hocko July 11, 2018, 9:21 a.m. UTC | #7
On Tue 10-07-18 18:44:34, Linus Torvalds wrote:
[...]
> That was what I tried to encourage with actually removing the pages
> from the page list. That would be an _incremental_ interface. You can
> remove MAX_ORDER-1 pages one by one (or a hundred at a time), and mark
> them free for ballooning that way. And if you still feel you have tons
> of free memory, just continue removing more pages from the free list.

We already have an interface for that. alloc_pages(GFP_NOWAIT, MAX_ORDER -1).
So why do we need any array based interface?
Wang, Wei W July 11, 2018, 10:52 a.m. UTC | #8
On 07/11/2018 05:21 PM, Michal Hocko wrote:
> On Tue 10-07-18 18:44:34, Linus Torvalds wrote:
> [...]
>> That was what I tried to encourage with actually removing the pages
>> from the page list. That would be an _incremental_ interface. You can
>> remove MAX_ORDER-1 pages one by one (or a hundred at a time), and mark
>> them free for ballooning that way. And if you still feel you have tons
>> of free memory, just continue removing more pages from the free list.
> We already have an interface for that. alloc_pages(GFP_NOWAIT, MAX_ORDER -1).
> So why do we need any array based interface?

Yes, I'm trying to get free pages directly via alloc_pages, so there 
will be no new mm APIs.
I plan to let free page allocation stop when the remaining system free 
memory becomes close to min_free_kbytes (prevent swapping).


Best,
Wei
Michal Hocko July 11, 2018, 11:09 a.m. UTC | #9
On Wed 11-07-18 18:52:45, Wei Wang wrote:
> On 07/11/2018 05:21 PM, Michal Hocko wrote:
> > On Tue 10-07-18 18:44:34, Linus Torvalds wrote:
> > [...]
> > > That was what I tried to encourage with actually removing the pages
> > > from the page list. That would be an _incremental_ interface. You can
> > > remove MAX_ORDER-1 pages one by one (or a hundred at a time), and mark
> > > them free for ballooning that way. And if you still feel you have tons
> > > of free memory, just continue removing more pages from the free list.
> > We already have an interface for that. alloc_pages(GFP_NOWAIT, MAX_ORDER -1).
> > So why do we need any array based interface?
> 
> Yes, I'm trying to get free pages directly via alloc_pages, so there will be
> no new mm APIs.

OK. The above was just a rough example. In fact you would need a more
complex gfp mask. I assume you want to balloon only memory directly
usable by the kernel, so it will be
	(GFP_KERNEL | __GFP_NOWARN) & ~__GFP_RECLAIM

> I plan to let free page allocation stop when the remaining system free
> memory becomes close to min_free_kbytes (prevent swapping).

~__GFP_RECLAIM will make sure you can allocate as long as there is any
memory without reclaim. It will not even poke the kswapd to do the
background work. So I do not think you would need much more than that.

But let me note that I am not really convinced how this (or previous)
approach will really work in most workloads. We tend to cache heavily so
there is rarely any memory free.
Wang, Wei W July 11, 2018, 1:55 p.m. UTC | #10
On Wednesday, July 11, 2018 7:10 PM, Michal Hocko wrote:
> On Wed 11-07-18 18:52:45, Wei Wang wrote:
> > On 07/11/2018 05:21 PM, Michal Hocko wrote:
> > > On Tue 10-07-18 18:44:34, Linus Torvalds wrote:
> > > [...]
> > > > That was what I tried to encourage with actually removing the
> > > > pages from the page list. That would be an _incremental_
> > > > interface. You can remove MAX_ORDER-1 pages one by one (or a
> > > > hundred at a time), and mark them free for ballooning that way.
> > > > And if you still feel you have tons of free memory, just continue
> removing more pages from the free list.
> > > We already have an interface for that. alloc_pages(GFP_NOWAIT,
> MAX_ORDER -1).
> > > So why do we need any array based interface?
> >
> > Yes, I'm trying to get free pages directly via alloc_pages, so there
> > will be no new mm APIs.
> 
> OK. The above was just a rough example. In fact you would need a more
> complex gfp mask. I assume you only want to balloon only memory directly
> usable by the kernel so it will be
> 	(GFP_KERNEL | __GFP_NOWARN) & ~__GFP_RECLAIM

Sounds good to me, thanks.

> 
> > I plan to let free page allocation stop when the remaining system free
> > memory becomes close to min_free_kbytes (prevent swapping).
> 
> ~__GFP_RECLAIM will make sure you can allocate as long as there is any
> memory without reclaim. It will not even poke the kswapd to do the
> background work. So I do not think you would need much more than that.

"close to min_free_kbytes" - I meant when doing the allocations, we intentionally reserve some small amount of memory, e.g. 2 free page blocks of "MAX_ORDER - 1". So when other applications happen to do some allocation, they may easily get some from the reserved memory left on the free list. Without that reserved memory, other allocation may cause the system free memory below the WMARK[MIN], and kswapd would start to do swapping. This is actually just a small optimization to reduce the probability of causing swapping (nice to have, but not mandatary because we will allocate free page blocks one by one).

 > But let me note that I am not really convinced how this (or previous)
> approach will really work in most workloads. We tend to cache heavily so
> there is rarely any memory free.

With less free memory, the improvement becomes smaller, but it should still be better than no optimization. For example, the Linux build workload causes 4~5 GB (out of 8GB) of memory to be used as page cache at the final stage, and there is still a ~44% reduction in live migration time.

Since we have many cloud customers interested in this feature, I think we can let them test the usefulness.

Best,
Wei
Michal Hocko July 11, 2018, 2:38 p.m. UTC | #11
On Wed 11-07-18 13:55:15, Wang, Wei W wrote:
> On Wednesday, July 11, 2018 7:10 PM, Michal Hocko wrote:
> > On Wed 11-07-18 18:52:45, Wei Wang wrote:
> > > On 07/11/2018 05:21 PM, Michal Hocko wrote:
> > > > On Tue 10-07-18 18:44:34, Linus Torvalds wrote:
> > > > [...]
> > > > > That was what I tried to encourage with actually removing the
> > > > > pages from the page list. That would be an _incremental_
> > > > > interface. You can remove MAX_ORDER-1 pages one by one (or a
> > > > > hundred at a time), and mark them free for ballooning that way.
> > > > > And if you still feel you have tons of free memory, just continue
> > removing more pages from the free list.
> > > > We already have an interface for that. alloc_pages(GFP_NOWAIT,
> > MAX_ORDER -1).
> > > > So why do we need any array based interface?
> > >
> > > Yes, I'm trying to get free pages directly via alloc_pages, so there
> > > will be no new mm APIs.
> > 
> > OK. The above was just a rough example. In fact you would need a more
> > complex gfp mask. I assume you only want to balloon only memory directly
> > usable by the kernel so it will be
> > 	(GFP_KERNEL | __GFP_NOWARN) & ~__GFP_RECLAIM
> 
> Sounds good to me, thanks.
> 
> > 
> > > I plan to let free page allocation stop when the remaining system free
> > > memory becomes close to min_free_kbytes (prevent swapping).
> > 
> > ~__GFP_RECLAIM will make sure you can allocate as long as there is any
> > memory without reclaim. It will not even poke the kswapd to do the
> > background work. So I do not think you would need much more than that.
> 
> "close to min_free_kbytes" - I meant when doing the allocations, we
> intentionally reserve some small amount of memory, e.g. 2 free page
> blocks of "MAX_ORDER - 1". So when other applications happen to do
> some allocation, they may easily get some from the reserved memory
> left on the free list. Without that reserved memory, other allocation
> may cause the system free memory below the WMARK[MIN], and kswapd
> would start to do swapping. This is actually just a small optimization
> to reduce the probability of causing swapping (nice to have, but not
> mandatory because we will allocate free page blocks one by one).

I really have a hard time following you here. Nothing outside of the core
MM proper should play with watermarks.
 
>  > But let me note that I am not really convinced how this (or previous)
> > approach will really work in most workloads. We tend to cache heavily so
> > there is rarely any memory free.
> 
> With less free memory, the improvement becomes less, but should be
> nicer than no optimization. For example, the Linux build workload
> would cause 4~5 GB (out of 8GB) memory to be used as page cache at the
> final stage, there is still ~44% live migration time reduction.

But most systems will stay somewhere around the high watermark if there
is any page cache activity. Especially after a longer uptime.
Linus Torvalds July 11, 2018, 4:23 p.m. UTC | #12
On Wed, Jul 11, 2018 at 2:21 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> We already have an interface for that. alloc_pages(GFP_NOWAIT, MAX_ORDER -1).
> So why do we need any array based interface?

That was actually my original argument in the original thread - that
the only new interface people might want is one that just tells how
many of those MAX_ORDER-1 pages there are.

See the thread in v33 with the subject

  "[PATCH v33 1/4] mm: add a function to get free page blocks"

and look for me suggesting just using

    #define GFP_MINFLAGS (__GFP_NORETRY | __GFP_NOWARN |
__GFP_THISNODE | __GFP_NOMEMALLOC)

    struct page *page =  alloc_pages(GFP_MINFLAGS, MAX_ORDER-1);

for this all.

But I could also see an argument for "allocate N pages of size
MAX_ORDER-1", with some small N, simply because I can see the
advantage of not taking and releasing the lock and looking up the
zone individually N times.

If you want to get gigabytes of memory (or terabytes), doing it in
bigger chunks than one single maximum-sized page sounds fairly
reasonable.

I just don't think that "thousands of pages" is reasonable. But "tens
of max-sized pages" sounds fair enough to me, and it would certainly
not be a pain for the VM.

So I'm open to new interfaces. I just want those new interfaces to
make sense, and be low latency and simple for the VM to do. I'm
objecting to the incredibly baroque and heavy-weight one that can
return near-infinite amounts of memory.

The real advantage of just the existing "alloc_pages()" model is
that I think the ballooning people can use that to *test* things out.
If it turns out that taking and releasing the VM locks is a big cost,
we can see if a batch interface that allows you to get tens of pages
at the same time is worth it.

So yes, I'd suggest starting with just the existing alloc_pages. Maybe
it's not enough, but it should be good enough for testing.

                    Linus
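
As a very rough sketch of the "just test with the existing alloc_pages()" suggestion, using the flags given above (the helper name and the reporting step are hypothetical; error handling, rate limiting and the actual virtio transport are omitted):

    #define GFP_MINFLAGS (__GFP_NORETRY | __GFP_NOWARN | \
                          __GFP_THISNODE | __GFP_NOMEMALLOC)

    static void report_free_pages(void)
    {
        LIST_HEAD(taken);
        struct page *page, *next;

        /* Grab MAX_ORDER-1 blocks one at a time until allocation fails. */
        while ((page = alloc_pages(GFP_MINFLAGS, MAX_ORDER - 1))) {
            list_add(&page->lru, &taken);
            /* report page_to_pfn(page) << PAGE_SHIFT to the host here */
        }

        /* Give everything back once the hints have been sent. */
        list_for_each_entry_safe(page, next, &taken, lru) {
            list_del(&page->lru);
            __free_pages(page, MAX_ORDER - 1);
        }
    }

Because the blocks are actually taken off the free list, each call naturally makes incremental progress, and the guest gets the memory back as soon as the blocks are freed again.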
Michael S. Tsirkin July 11, 2018, 7:36 p.m. UTC | #13
On Wed, Jul 11, 2018 at 01:09:49PM +0200, Michal Hocko wrote:
> But let me note that I am not really convinced how this (or previous)
> approach will really work in most workloads. We tend to cache heavily so
> there is rarely any memory free.

It might be that it's worth flushing the cache when VM is
migrating. Or maybe we should implement virtio-tmem or add
transcendent memory support to the balloon.
Wang, Wei W July 12, 2018, 2:21 a.m. UTC | #14
On 07/12/2018 12:23 AM, Linus Torvalds wrote:
> On Wed, Jul 11, 2018 at 2:21 AM Michal Hocko <mhocko@kernel.org> wrote:
>> We already have an interface for that. alloc_pages(GFP_NOWAIT, MAX_ORDER -1).
>> So why do we need any array based interface?
> That was actually my original argument in the original thread - that
> the only new interface people might want is one that just tells how
> many of those MAX_ORDER-1 pages there are.
>
> See the thread in v33 with the subject
>
>    "[PATCH v33 1/4] mm: add a function to get free page blocks"
>
> and look for me suggesting just using
>
>      #define GFP_MINFLAGS (__GFP_NORETRY | __GFP_NOWARN |
> __GFP_THISNODE | __GFP_NOMEMALLOC)

Would it be better to remove __GFP_THISNODE? We actually want to get all 
the guest free pages (from all the nodes).

Best,
Wei
Linus Torvalds July 12, 2018, 2:30 a.m. UTC | #15
On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>
> Would it be better to remove __GFP_THISNODE? We actually want to get all
> the guest free pages (from all the nodes).

Maybe. Or maybe it would be better to have the memory balloon logic be
per-node? Maybe you don't want to remove too much memory from one
node? I think it's one of those "play with it" things.

I don't think that's the big issue, actually. I think the real issue
is how to react quickly and gracefully to "oops, I'm trying to give
memory away, but now the guest wants it back" while you're in the
middle of trying to create that 2TB list of pages.

IOW, I think the real work is in whatever tuning is needed for the right
behavior. But I'm just guessing.

             Linus
Wang, Wei W July 12, 2018, 2:52 a.m. UTC | #16
On 07/12/2018 10:30 AM, Linus Torvalds wrote:
> On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>> Would it be better to remove __GFP_THISNODE? We actually want to get all
>> the guest free pages (from all the nodes).
> Maybe. Or maybe it would be better to have the memory balloon logic be
> per-node? Maybe you don't want to remove too much memory from one
> node? I think it's one of those "play with it" things.
>
> I don't think that's the big issue, actually. I think the real issue
> is how to react quickly and gracefully to "oops, I'm trying to give
> memory away, but now the guest wants it back" while you're in the
> middle of trying to create that 2TB list of pages.

OK. virtio-balloon has already registered an oom notifier 
(virtballoon_oom_notify). I plan to add some control there. If oom happens,
- stop the page allocation;
- immediately give back the allocated pages to mm.

Best,
Wei
Michal Hocko July 12, 2018, 8:13 a.m. UTC | #17
On Thu 12-07-18 10:52:08, Wei Wang wrote:
> On 07/12/2018 10:30 AM, Linus Torvalds wrote:
> > On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
> > > Would it be better to remove __GFP_THISNODE? We actually want to get all
> > > the guest free pages (from all the nodes).
> > Maybe. Or maybe it would be better to have the memory balloon logic be
> > per-node? Maybe you don't want to remove too much memory from one
> > node? I think it's one of those "play with it" things.
> > 
> > I don't think that's the big issue, actually. I think the real issue
> > is how to react quickly and gracefully to "oops, I'm trying to give
> > memory away, but now the guest wants it back" while you're in the
> > middle of trying to create that 2TB list of pages.
> 
> OK. virtio-balloon has already registered an oom notifier
> (virtballoon_oom_notify). I plan to add some control there. If oom happens,
> - stop the page allocation;
> - immediately give back the allocated pages to mm.

Please don't. The OOM notifier is an absolutely hideous interface which
should go away sooner or later (I would much prefer the former), so
do not build new logic on top of it. I would appreciate it much more if
you actually removed the notifier.

You can give memory back from the standard shrinker interface. If we are
reaching low reclaim priorities then we are struggling to reclaim memory
and then you can start returning pages back.
Wang, Wei W July 12, 2018, 11:34 a.m. UTC | #18
On 07/12/2018 04:13 PM, Michal Hocko wrote:
> On Thu 12-07-18 10:52:08, Wei Wang wrote:
>> On 07/12/2018 10:30 AM, Linus Torvalds wrote:
>>> On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>>>> Would it be better to remove __GFP_THISNODE? We actually want to get all
>>>> the guest free pages (from all the nodes).
>>> Maybe. Or maybe it would be better to have the memory balloon logic be
>>> per-node? Maybe you don't want to remove too much memory from one
>>> node? I think it's one of those "play with it" things.
>>>
>>> I don't think that's the big issue, actually. I think the real issue
>>> is how to react quickly and gracefully to "oops, I'm trying to give
>>> memory away, but now the guest wants it back" while you're in the
>>> middle of trying to create that 2TB list of pages.
>> OK. virtio-balloon has already registered an oom notifier
>> (virtballoon_oom_notify). I plan to add some control there. If oom happens,
>> - stop the page allocation;
>> - immediately give back the allocated pages to mm.
> Please don't. Oom notifier is an absolutely hideous interface which
> should go away sooner or later (I would much rather like the former) so
> do not build a new logic on top of it. I would appreciate if you
> actually remove the notifier much more.
>
> You can give memory back from the standard shrinker interface. If we are
> reaching low reclaim priorities then we are struggling to reclaim memory
> and then you can start returning pages back.

OK. Just curious why the oom notifier is thought to be hideous, and whether
that is a consensus?

Best,
Wei
Michal Hocko July 12, 2018, 11:49 a.m. UTC | #19
On Thu 12-07-18 19:34:16, Wei Wang wrote:
> On 07/12/2018 04:13 PM, Michal Hocko wrote:
> > On Thu 12-07-18 10:52:08, Wei Wang wrote:
> > > On 07/12/2018 10:30 AM, Linus Torvalds wrote:
> > > > On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
> > > > > Would it be better to remove __GFP_THISNODE? We actually want to get all
> > > > > the guest free pages (from all the nodes).
> > > > Maybe. Or maybe it would be better to have the memory balloon logic be
> > > > per-node? Maybe you don't want to remove too much memory from one
> > > > node? I think it's one of those "play with it" things.
> > > > 
> > > > I don't think that's the big issue, actually. I think the real issue
> > > > is how to react quickly and gracefully to "oops, I'm trying to give
> > > > memory away, but now the guest wants it back" while you're in the
> > > > middle of trying to create that 2TB list of pages.
> > > OK. virtio-balloon has already registered an oom notifier
> > > (virtballoon_oom_notify). I plan to add some control there. If oom happens,
> > > - stop the page allocation;
> > > - immediately give back the allocated pages to mm.
> > Please don't. Oom notifier is an absolutely hideous interface which
> > should go away sooner or later (I would much rather like the former) so
> > do not build a new logic on top of it. I would appreciate if you
> > actually remove the notifier much more.
> > 
> > You can give memory back from the standard shrinker interface. If we are
> > reaching low reclaim priorities then we are struggling to reclaim memory
> > and then you can start returning pages back.
> 
> OK. Just curious why oom notifier is thought to be hideous, and has it been
> a consensus?

Because it is a completely non-transparent callout from the OOM context,
which is really subtle on its own. It is just too easy to end up in
weird corner cases. We really have to be careful and be as swift as
possible. Any potential sleep would make the OOM situation much worse,
because nobody would be able to make forward progress, or an (in)direct
dependency on the MM subsystem could easily deadlock. Those are really hard
to track down, and defining the notifier as blockable by design just asks
for bad implementations, because most people simply do not realize how
subtle the oom context is.

Another thing is that it happens way too late, when we have basically
reclaimed the world and still haven't gotten out of the memory pressure, so
you can expect any workload is suffering already. Anybody sitting on a large
amount of reclaimable memory should have released that memory by that
time, ideally in proportion to the reclaim pressure.

The notifier API is completely unaware of oom constraints. Just imagine
you are OOM in a subset of numa nodes. The callback doesn't have any idea
about that.

Moreover we do have a proper reclaim mechanism that has a feedback
loop, and that should always be preferable to an abrupt reclaim.
Michal Hocko July 12, 2018, 1:12 p.m. UTC | #20
[Hmm this one somehow got stuck in my outgoing emails]

On Wed 11-07-18 09:23:54, Linus Torvalds wrote:
[...]
> So I'm open to new interfaces. I just want those new interfaces to
> make sense, and be low latency and simple for the VM to do. I'm
> objecting to the incredibly baroque and heavy-weight one that can
> return near-infinite amounts of memory.

Mel was suggesting a bulk page allocator a year ago [1]. I can see only
the slab bulk API, so I am not sure what happened with that work. Anyway,
I think that starting with what we have right now is much more
appropriate than over-designing this thing from the very beginning.

[1] http://lkml.kernel.org/r/20170109163518.6001-5-mgorman@techsingularity.net
Wang, Wei W July 13, 2018, 12:33 a.m. UTC | #21
On 07/12/2018 07:49 PM, Michal Hocko wrote:
> On Thu 12-07-18 19:34:16, Wei Wang wrote:
>> On 07/12/2018 04:13 PM, Michal Hocko wrote:
>>> On Thu 12-07-18 10:52:08, Wei Wang wrote:
>>>> On 07/12/2018 10:30 AM, Linus Torvalds wrote:
>>>>> On Wed, Jul 11, 2018 at 7:17 PM Wei Wang <wei.w.wang@intel.com> wrote:
>>>>>> Would it be better to remove __GFP_THISNODE? We actually want to get all
>>>>>> the guest free pages (from all the nodes).
>>>>> Maybe. Or maybe it would be better to have the memory balloon logic be
>>>>> per-node? Maybe you don't want to remove too much memory from one
>>>>> node? I think it's one of those "play with it" things.
>>>>>
>>>>> I don't think that's the big issue, actually. I think the real issue
>>>>> is how to react quickly and gracefully to "oops, I'm trying to give
>>>>> memory away, but now the guest wants it back" while you're in the
>>>>> middle of trying to create that 2TB list of pages.
>>>> OK. virtio-balloon has already registered an oom notifier
>>>> (virtballoon_oom_notify). I plan to add some control there. If oom happens,
>>>> - stop the page allocation;
>>>> - immediately give back the allocated pages to mm.
>>> Please don't. Oom notifier is an absolutely hideous interface which
>>> should go away sooner or later (I would much rather like the former) so
>>> do not build a new logic on top of it. I would appreciate if you
>>> actually remove the notifier much more.
>>>
>>> You can give memory back from the standard shrinker interface. If we are
>>> reaching low reclaim priorities then we are struggling to reclaim memory
>>> and then you can start returning pages back.
>> OK. Just curious why oom notifier is thought to be hideous, and has it been
>> a consensus?
> Because it is a completely non-transparent callout from the OOM context
> which is really subtle on its own. It is just too easy to end up in
> weird corner cases. We really have to be careful and be as swift as
> possible. Any potential sleep would make the OOM situation much worse
> because nobody would be able to make a forward progress or (in)direct
> dependency on MM subsystem can easily deadlock. Those are really hard
> to track down and defining the notifier as blockable by design which
> just asks for bad implementations because most people simply do not
> realize how subtle the oom context is.
>
> Another thing is that it happens way too late when we have basically
> reclaimed the world and didn't get out of the memory pressure so you can
> expect any workload is suffering already. Anybody sitting on a large
> amount of reclaimable memory should have released that memory by that
> time. Proportionally to the reclaim pressure ideally.
>
> The notifier API is completely unaware of oom constraints. Just imagine
> you are OOM in a subset of numa nodes. Callback doesn't have any idea
> about that.
>
> Moreover we do have proper reclaim mechanism that has a feedback
> loop and that should be always preferable to an abrupt reclaim.

Sounds very reasonable, thanks for the elaboration. I'll try with the shrinker.

Best,
Wei
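
For reference, a bare-bones sketch of what the shrinker route could look like (all names and the held_list bookkeeping are hypothetical, and the locking a real driver would need around held_list is omitted):

    /* Blocks taken from the free list for hinting, waiting to be returned. */
    static LIST_HEAD(held_list);
    static unsigned long held_blocks;

    static unsigned long hint_shrink_count(struct shrinker *s,
                                           struct shrink_control *sc)
    {
        /* One "object" is one MAX_ORDER-1 block we are currently holding. */
        return held_blocks;
    }

    static unsigned long hint_shrink_scan(struct shrinker *s,
                                          struct shrink_control *sc)
    {
        struct page *page, *next;
        unsigned long freed = 0;

        /* A real driver would hold its own lock around held_list here. */
        list_for_each_entry_safe(page, next, &held_list, lru) {
            if (freed >= sc->nr_to_scan)
                break;
            list_del(&page->lru);
            __free_pages(page, MAX_ORDER - 1);
            held_blocks--;
            freed++;
        }
        return freed;
    }

    static struct shrinker hint_shrinker = {
        .count_objects = hint_shrink_count,
        .scan_objects  = hint_shrink_scan,
        .seeks         = DEFAULT_SEEKS,
    };

register_shrinker(&hint_shrinker) would be called at device init and unregister_shrinker(&hint_shrinker) on teardown, so that memory pressure in the guest releases the held blocks through the normal reclaim feedback loop.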

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0fbb9f..5ce654f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2007,6 +2007,9 @@  extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
+unsigned long max_free_page_blocks(int order);
+int get_from_free_page_list(int order, struct list_head *pages,
+			    unsigned int size, unsigned long *loaded_num);
 
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100..b67839b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5043,6 +5043,104 @@  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 	show_swap_cache_info();
 }
 
+/**
+ * max_free_page_blocks - estimate the max number of free page blocks
+ * @order: the order of the free page blocks to estimate
+ *
+ * This function gives a rough estimation of the possible maximum number of
+ * free page blocks a free list may have. The estimation works on an assumption
+ * that all the system pages are on that list.
+ *
+ * Context: Any context.
+ *
+ * Return: The largest number of free page blocks that the free list can have.
+ */
+unsigned long max_free_page_blocks(int order)
+{
+	return totalram_pages / (1 << order);
+}
+EXPORT_SYMBOL_GPL(max_free_page_blocks);
+
+/**
+ * get_from_free_page_list - get hints of free pages from a free page list
+ * @order: the order of the free page list to check
+ * @pages: the list of page blocks used as buffers to load the addresses
+ * @size: the size of each buffer in bytes
+ * @loaded_num: the number of addresses loaded to the buffers
+ *
+ * This function offers hints about free pages. The addresses of free page
+ * blocks are stored to the list of buffers passed from the caller. There is
+ * no guarantee that the obtained free pages are still on the free page list
+ * after the function returns. pfn_to_page on the obtained free pages is
+ * strongly discouraged and if there is an absolute need for that, make sure
+ * to contact MM people to discuss potential problems.
+ *
+ * The addresses are currently stored to a buffer in little endian. This
+ * avoids the overhead of converting endianness by the caller who needs data
+ * in the little endian format. Big endian support can be added on demand in
+ * the future.
+ *
+ * Context: Process context.
+ *
+ * Return: 0 if all the free page block addresses are stored to the buffers;
+ *         -ENOSPC if the buffers are not sufficient to store all the
+ *         addresses; or -EINVAL if an unexpected argument is received (e.g.
+ *         incorrect @order, empty buffer list).
+ */
+int get_from_free_page_list(int order, struct list_head *pages,
+			    unsigned int size, unsigned long *loaded_num)
+{
+	struct zone *zone;
+	enum migratetype mt;
+	struct list_head *free_list;
+	struct page *free_page, *buf_page;
+	unsigned long addr;
+	__le64 *buf;
+	unsigned int used_buf_num = 0, entry_index = 0,
+		     entries = size / sizeof(__le64);
+	*loaded_num = 0;
+
+	/* Validity check */
+	if (order < 0 || order >= MAX_ORDER)
+		return -EINVAL;
+
+	buf_page = list_first_entry_or_null(pages, struct page, lru);
+	if (!buf_page)
+		return -EINVAL;
+	buf = (__le64 *)page_address(buf_page);
+
+	for_each_populated_zone(zone) {
+		spin_lock_irq(&zone->lock);
+		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+			free_list = &zone->free_area[order].free_list[mt];
+			list_for_each_entry(free_page, free_list, lru) {
+				addr = page_to_pfn(free_page) << PAGE_SHIFT;
+				/* This buffer is full, so use the next one */
+				if (entry_index == entries) {
+					buf_page = list_next_entry(buf_page,
+								   lru);
+					/* All the buffers are consumed */
+					if (!buf_page) {
+						spin_unlock_irq(&zone->lock);
+						*loaded_num = used_buf_num *
+							      entries;
+						return -ENOSPC;
+					}
+					buf = (__le64 *)page_address(buf_page);
+					entry_index = 0;
+					used_buf_num++;
+				}
+				buf[entry_index++] = cpu_to_le64(addr);
+			}
+		}
+		spin_unlock_irq(&zone->lock);
+	}
+
+	*loaded_num = used_buf_num * entries + entry_index;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(get_from_free_page_list);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
 	zoneref->zone = zone;
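
For completeness, a rough sketch of how a caller might drive the interface as posted in this version (the interface in this form was superseded in the follow-up discussion above; buffer sizing, error handling and the step that actually reports the addresses to the host are simplified placeholders):

    static int report_free_page_hints(void)
    {
        LIST_HEAD(bufs);
        struct page *page, *next;
        unsigned long max_blocks, loaded = 0;
        unsigned int i, nbufs;
        int ret;

        max_blocks = max_free_page_blocks(MAX_ORDER - 1);
        /* Each PAGE_SIZE buffer holds PAGE_SIZE / sizeof(__le64) addresses. */
        nbufs = DIV_ROUND_UP(max_blocks, PAGE_SIZE / sizeof(__le64));

        for (i = 0; i < nbufs; i++) {
            page = alloc_page(GFP_KERNEL | __GFP_NOWARN);
            if (!page)
                break;
            list_add_tail(&page->lru, &bufs);
        }

        ret = get_from_free_page_list(MAX_ORDER - 1, &bufs,
                                      PAGE_SIZE, &loaded);
        /* ... hand the 'loaded' little-endian addresses to the host ... */

        list_for_each_entry_safe(page, next, &bufs, lru) {
            list_del(&page->lru);
            __free_page(page);
        }
        return ret;
    }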