diff mbox series

[RFC,3/4] kvm: Add guest side support for free memory hints

Message ID 20190204181552.12095.46287.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show
Series kvm: Report unused guest pages to host | expand

Commit Message

Alexander Duyck Feb. 4, 2019, 6:15 p.m. UTC
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>

Add guest support for providing free memory hints to the KVM hypervisor for
freed pages of huge TLB size or larger. I am restricting the size to
huge TLB order and larger because the hypercalls are too expensive to be
performing one per 4K page. Using the huge TLB order became the obvious
choice for the order to use as it allows us to avoid fragmentation of higher
order memory on the host.

I have limited the functionality so that it doesn't work when page
poisoning is enabled. I did this because a write to the page after doing an
MADV_DONTNEED would effectively negate the hint, so it would be wasting
cycles to do so.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---
 arch/x86/include/asm/page.h |   13 +++++++++++++
 arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
 2 files changed, 36 insertions(+)

Comments

Dave Hansen Feb. 4, 2019, 7:44 p.m. UTC | #1
On 2/4/19 10:15 AM, Alexander Duyck wrote:
> +#ifdef CONFIG_KVM_GUEST
> +#include <linux/jump_label.h>
> +extern struct static_key_false pv_free_page_hint_enabled;
> +
> +#define HAVE_ARCH_FREE_PAGE
> +void __arch_free_page(struct page *page, unsigned int order);
> +static inline void arch_free_page(struct page *page, unsigned int order)
> +{
> +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> +		__arch_free_page(page, order);
> +}
> +#endif

So, this ends up with at least a call, a branch and a ret added to the
order-0 paths, including freeing pages to the per-cpu-pageset lists.
That seems worrisome.

What performance testing has been performed to look into the overhead
added to those paths?
Alexander Duyck Feb. 4, 2019, 8:42 p.m. UTC | #2
On Mon, 2019-02-04 at 11:44 -0800, Dave Hansen wrote:
> On 2/4/19 10:15 AM, Alexander Duyck wrote:
> > +#ifdef CONFIG_KVM_GUEST
> > +#include <linux/jump_label.h>
> > +extern struct static_key_false pv_free_page_hint_enabled;
> > +
> > +#define HAVE_ARCH_FREE_PAGE
> > +void __arch_free_page(struct page *page, unsigned int order);
> > +static inline void arch_free_page(struct page *page, unsigned int order)
> > +{
> > +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> > +		__arch_free_page(page, order);
> > +}
> > +#endif
> 
> So, this ends up with at least a call, a branch and a ret added to the
> order-0 paths, including freeing pages to the per-cpu-pageset lists.
> That seems worrisome.
> 
> What performance testing has been performed to look into the overhead
> added to those paths?

So far I haven't done much in the way of actual performance testing.
Most of my tests have been focused on "is this doing what I think it is
supposed to be doing".

I have been debating if I want to just move the order checks to include
them in the inline functions. In that case we would end up essentially
just jumping over the call code.
Nadav Amit Feb. 4, 2019, 11 p.m. UTC | #3
> On Feb 4, 2019, at 10:15 AM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> 
> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> 
> Add guest support for providing free memory hints to the KVM hypervisor for
> freed pages huge TLB size or larger. I am restricting the size to
> huge TLB order and larger because the hypercalls are too expensive to be
> performing one per 4K page. Using the huge TLB order became the obvious
> choice for the order to use as it allows us to avoid fragmentation of higher
> order memory on the host.
> 
> I have limited the functionality so that it doesn't work when page
> poisoning is enabled. I did this because a write to the page after doing an
> MADV_DONTNEED would effectively negate the hint, so it would be wasting
> cycles to do so.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
> arch/x86/include/asm/page.h |   13 +++++++++++++
> arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
> 2 files changed, 36 insertions(+)
> 
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 7555b48803a8..4487ad7a3385 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -18,6 +18,19 @@
> 
> struct page;
> 
> +#ifdef CONFIG_KVM_GUEST
> +#include <linux/jump_label.h>
> +extern struct static_key_false pv_free_page_hint_enabled;
> +
> +#define HAVE_ARCH_FREE_PAGE
> +void __arch_free_page(struct page *page, unsigned int order);
> +static inline void arch_free_page(struct page *page, unsigned int order)
> +{
> +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> +		__arch_free_page(page, order);
> +}
> +#endif

This patch and the following one assume that only KVM should be able to hook
to these events. I do not think it is appropriate for __arch_free_page() to
effectively mean “kvm_guest_free_page()”.

Is it possible to use the paravirt infrastructure for this feature,
similarly to other PV features? It is not the best infrastructure, but at least
it is hypervisor-neutral.
Alexander Duyck Feb. 4, 2019, 11:37 p.m. UTC | #4
On Mon, 2019-02-04 at 15:00 -0800, Nadav Amit wrote:
> > On Feb 4, 2019, at 10:15 AM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > 
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > 
> > Add guest support for providing free memory hints to the KVM hypervisor for
> > freed pages huge TLB size or larger. I am restricting the size to
> > huge TLB order and larger because the hypercalls are too expensive to be
> > performing one per 4K page. Using the huge TLB order became the obvious
> > choice for the order to use as it allows us to avoid fragmentation of higher
> > order memory on the host.
> > 
> > I have limited the functionality so that it doesn't work when page
> > poisoning is enabled. I did this because a write to the page after doing an
> > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > cycles to do so.
> > 
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > ---
> > arch/x86/include/asm/page.h |   13 +++++++++++++
> > arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
> > 2 files changed, 36 insertions(+)
> > 
> > diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> > index 7555b48803a8..4487ad7a3385 100644
> > --- a/arch/x86/include/asm/page.h
> > +++ b/arch/x86/include/asm/page.h
> > @@ -18,6 +18,19 @@
> > 
> > struct page;
> > 
> > +#ifdef CONFIG_KVM_GUEST
> > +#include <linux/jump_label.h>
> > +extern struct static_key_false pv_free_page_hint_enabled;
> > +
> > +#define HAVE_ARCH_FREE_PAGE
> > +void __arch_free_page(struct page *page, unsigned int order);
> > +static inline void arch_free_page(struct page *page, unsigned int order)
> > +{
> > +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> > +		__arch_free_page(page, order);
> > +}
> > +#endif
> 
> This patch and the following one assume that only KVM should be able to hook
> to these events. I do not think it is appropriate for __arch_free_page() to
> effectively mean “kvm_guest_free_page()”.
> 
> Is it possible to use the paravirt infrastructure for this feature,
> similarly to other PV features? It is not the best infrastructure, but at least
> it is hypervisor-neutral.

I could probably tie this into the paravirt infrastructure, but if I
did so I would probably want to pull the checks for the page order out
of the KVM specific bits and make it something we handle in the inline.
Doing that I would probably make it a paravirtual hint that only
operates at the PMD level. That way we wouldn't incur the cost of the
paravirt infrastructure at the per 4K page level.
Nadav Amit Feb. 5, 2019, 12:03 a.m. UTC | #5
> On Feb 4, 2019, at 3:37 PM, Alexander Duyck <alexander.h.duyck@linux.intel.com> wrote:
> 
> On Mon, 2019-02-04 at 15:00 -0800, Nadav Amit wrote:
>>> On Feb 4, 2019, at 10:15 AM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
>>> 
>>> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>> 
>>> Add guest support for providing free memory hints to the KVM hypervisor for
>>> freed pages huge TLB size or larger. I am restricting the size to
>>> huge TLB order and larger because the hypercalls are too expensive to be
>>> performing one per 4K page. Using the huge TLB order became the obvious
>>> choice for the order to use as it allows us to avoid fragmentation of higher
>>> order memory on the host.
>>> 
>>> I have limited the functionality so that it doesn't work when page
>>> poisoning is enabled. I did this because a write to the page after doing an
>>> MADV_DONTNEED would effectively negate the hint, so it would be wasting
>>> cycles to do so.
>>> 
>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>> ---
>>> arch/x86/include/asm/page.h |   13 +++++++++++++
>>> arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
>>> 2 files changed, 36 insertions(+)
>>> 
>>> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
>>> index 7555b48803a8..4487ad7a3385 100644
>>> --- a/arch/x86/include/asm/page.h
>>> +++ b/arch/x86/include/asm/page.h
>>> @@ -18,6 +18,19 @@
>>> 
>>> struct page;
>>> 
>>> +#ifdef CONFIG_KVM_GUEST
>>> +#include <linux/jump_label.h>
>>> +extern struct static_key_false pv_free_page_hint_enabled;
>>> +
>>> +#define HAVE_ARCH_FREE_PAGE
>>> +void __arch_free_page(struct page *page, unsigned int order);
>>> +static inline void arch_free_page(struct page *page, unsigned int order)
>>> +{
>>> +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
>>> +		__arch_free_page(page, order);
>>> +}
>>> +#endif
>> 
>> This patch and the following one assume that only KVM should be able to hook
>> to these events. I do not think it is appropriate for __arch_free_page() to
>> effectively mean “kvm_guest_free_page()”.
>> 
>> Is it possible to use the paravirt infrastructure for this feature,
>> similarly to other PV features? It is not the best infrastructure, but at least
>> it is hypervisor-neutral.
> 
> I could probably tie this into the paravirt infrastructure, but if I
> did so I would probably want to pull the checks for the page order out
> of the KVM specific bits and make it something we handle in the inline.
> Doing that I would probably make it a paravirtual hint that only
> operates at the PMD level. That way we wouldn't incur the cost of the
> paravirt infrastructure at the per 4K page level.

If I understand you correctly, you “complain” that this would affect
performance.

While it might be, you may want to check whether the already available
tools can solve the problem:

1. You can use a combination of static-key and pv-ops - see for example
steal_account_process_time()

2. You can use callee-saved pv-ops.

The latter might anyhow be necessary since, IIUC, you change a very hot
path. So you may want to have a look at the assembly code of free_pcp_prepare()
(or at least its code-size) before and after your changes. If they are too
big, a callee-saved function might be necessary.
Alexander Duyck Feb. 5, 2019, 12:16 a.m. UTC | #6
On Mon, Feb 4, 2019 at 4:03 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Feb 4, 2019, at 3:37 PM, Alexander Duyck <alexander.h.duyck@linux.intel.com> wrote:
> >
> > On Mon, 2019-02-04 at 15:00 -0800, Nadav Amit wrote:
> >>> On Feb 4, 2019, at 10:15 AM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> >>>
> >>> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >>>
> >>> Add guest support for providing free memory hints to the KVM hypervisor for
> >>> freed pages huge TLB size or larger. I am restricting the size to
> >>> huge TLB order and larger because the hypercalls are too expensive to be
> >>> performing one per 4K page. Using the huge TLB order became the obvious
> >>> choice for the order to use as it allows us to avoid fragmentation of higher
> >>> order memory on the host.
> >>>
> >>> I have limited the functionality so that it doesn't work when page
> >>> poisoning is enabled. I did this because a write to the page after doing an
> >>> MADV_DONTNEED would effectively negate the hint, so it would be wasting
> >>> cycles to do so.
> >>>
> >>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >>> ---
> >>> arch/x86/include/asm/page.h |   13 +++++++++++++
> >>> arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
> >>> 2 files changed, 36 insertions(+)
> >>>
> >>> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> >>> index 7555b48803a8..4487ad7a3385 100644
> >>> --- a/arch/x86/include/asm/page.h
> >>> +++ b/arch/x86/include/asm/page.h
> >>> @@ -18,6 +18,19 @@
> >>>
> >>> struct page;
> >>>
> >>> +#ifdef CONFIG_KVM_GUEST
> >>> +#include <linux/jump_label.h>
> >>> +extern struct static_key_false pv_free_page_hint_enabled;
> >>> +
> >>> +#define HAVE_ARCH_FREE_PAGE
> >>> +void __arch_free_page(struct page *page, unsigned int order);
> >>> +static inline void arch_free_page(struct page *page, unsigned int order)
> >>> +{
> >>> +   if (static_branch_unlikely(&pv_free_page_hint_enabled))
> >>> +           __arch_free_page(page, order);
> >>> +}
> >>> +#endif
> >>
> >> This patch and the following one assume that only KVM should be able to hook
> >> to these events. I do not think it is appropriate for __arch_free_page() to
> >> effectively mean “kvm_guest_free_page()”.
> >>
> >> Is it possible to use the paravirt infrastructure for this feature,
> >> similarly to other PV features? It is not the best infrastructure, but at least
> >> it is hypervisor-neutral.
> >
> > I could probably tie this into the paravirt infrastructure, but if I
> > did so I would probably want to pull the checks for the page order out
> > of the KVM specific bits and make it something we handle in the inline.
> > Doing that I would probably make it a paravirtual hint that only
> > operates at the PMD level. That way we wouldn't incur the cost of the
> > paravirt infrastructure at the per 4K page level.
>
> If I understand you correctly, you “complain” that this would affect
> performance.

It wasn't so much a "complaint" as an "observation". What I was
getting at is that if I am going to make it a PV operation I might set
a hard limit on it so that it will specifically only apply to huge
pages and larger. By doing that I can justify performing the screening
based on page order in the inline path and avoid any PV infrastructure
overhead unless I have to incur it.

> While it might be, you may want to check whether the already available
> tools can solve the problem:
>
> 1. You can use a combination of static-key and pv-ops - see for example
> steal_account_process_time()

Okay, I was kind of already heading in this direction. The static key
I am using now would probably stay put.

> 2. You can use callee-saved pv-ops.
>
> The latter might anyhow be necessary since, IIUC, you change a very hot
> path. So you may want have a look on the assembly code of free_pcp_prepare()
> (or at least its code-size) before and after your changes. If they are too
> big, a callee-saved function might be necessary.

I'll have to take a look. I will spend the next couple days
familiarizing myself with the pv-ops infrastructure.
Nadav Amit Feb. 5, 2019, 1:46 a.m. UTC | #7
> On Feb 4, 2019, at 4:16 PM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> 
> On Mon, Feb 4, 2019 at 4:03 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>>> On Feb 4, 2019, at 3:37 PM, Alexander Duyck <alexander.h.duyck@linux.intel.com> wrote:
>>> 
>>> On Mon, 2019-02-04 at 15:00 -0800, Nadav Amit wrote:
>>>>> On Feb 4, 2019, at 10:15 AM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
>>>>> 
>>>>> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>>>> 
>>>>> Add guest support for providing free memory hints to the KVM hypervisor for
>>>>> freed pages huge TLB size or larger. I am restricting the size to
>>>>> huge TLB order and larger because the hypercalls are too expensive to be
>>>>> performing one per 4K page. Using the huge TLB order became the obvious
>>>>> choice for the order to use as it allows us to avoid fragmentation of higher
>>>>> order memory on the host.
>>>>> 
>>>>> I have limited the functionality so that it doesn't work when page
>>>>> poisoning is enabled. I did this because a write to the page after doing an
>>>>> MADV_DONTNEED would effectively negate the hint, so it would be wasting
>>>>> cycles to do so.
>>>>> 
>>>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>>>> ---
>>>>> arch/x86/include/asm/page.h |   13 +++++++++++++
>>>>> arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
>>>>> 2 files changed, 36 insertions(+)
>>>>> 
>>>>> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
>>>>> index 7555b48803a8..4487ad7a3385 100644
>>>>> --- a/arch/x86/include/asm/page.h
>>>>> +++ b/arch/x86/include/asm/page.h
>>>>> @@ -18,6 +18,19 @@
>>>>> 
>>>>> struct page;
>>>>> 
>>>>> +#ifdef CONFIG_KVM_GUEST
>>>>> +#include <linux/jump_label.h>
>>>>> +extern struct static_key_false pv_free_page_hint_enabled;
>>>>> +
>>>>> +#define HAVE_ARCH_FREE_PAGE
>>>>> +void __arch_free_page(struct page *page, unsigned int order);
>>>>> +static inline void arch_free_page(struct page *page, unsigned int order)
>>>>> +{
>>>>> +   if (static_branch_unlikely(&pv_free_page_hint_enabled))
>>>>> +           __arch_free_page(page, order);
>>>>> +}
>>>>> +#endif
>>>> 
>>>> This patch and the following one assume that only KVM should be able to hook
>>>> to these events. I do not think it is appropriate for __arch_free_page() to
>>>> effectively mean “kvm_guest_free_page()”.
>>>> 
>>>> Is it possible to use the paravirt infrastructure for this feature,
>>>> similarly to other PV features? It is not the best infrastructure, but at least
>>>> it is hypervisor-neutral.
>>> 
>>> I could probably tie this into the paravirt infrastructure, but if I
>>> did so I would probably want to pull the checks for the page order out
>>> of the KVM specific bits and make it something we handle in the inline.
>>> Doing that I would probably make it a paravirtual hint that only
>>> operates at the PMD level. That way we wouldn't incur the cost of the
>>> paravirt infrastructure at the per 4K page level.
>> 
>> If I understand you correctly, you “complain” that this would affect
>> performance.
> 
> It wasn't so much a "complaint" as an "observation". What I was
> getting at is that if I am going to make it a PV operation I might set
> a hard limit on it so that it will specifically only apply to huge
> pages and larger. By doing that I can justify performing the screening
> based on page order in the inline path and avoid any PV infrastructure
> overhead unless I have to incur it.

I understood. I guess my use of “double quotes” was lost in translation. ;-)

One more point regarding [2/4] - you may want to consider using madvise_free
instead of madvise_dontneed to avoid unnecessary EPT violations.
Alexander Duyck Feb. 5, 2019, 6:09 p.m. UTC | #8
On Mon, 2019-02-04 at 17:46 -0800, Nadav Amit wrote:
> > On Feb 4, 2019, at 4:16 PM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > 
> > On Mon, Feb 4, 2019 at 4:03 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> > > > On Feb 4, 2019, at 3:37 PM, Alexander Duyck <alexander.h.duyck@linux.intel.com> wrote:
> > > > 
> > > > On Mon, 2019-02-04 at 15:00 -0800, Nadav Amit wrote:
> > > > > > On Feb 4, 2019, at 10:15 AM, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > > > > > 
> > > > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > > 
> > > > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > > > performing one per 4K page. Using the huge TLB order became the obvious
> > > > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > > > order memory on the host.
> > > > > > 
> > > > > > I have limited the functionality so that it doesn't work when page
> > > > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > > > cycles to do so.
> > > > > > 
> > > > > > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > > ---
> > > > > > arch/x86/include/asm/page.h |   13 +++++++++++++
> > > > > > arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
> > > > > > 2 files changed, 36 insertions(+)
> > > > > > 
> > > > > > diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> > > > > > index 7555b48803a8..4487ad7a3385 100644
> > > > > > --- a/arch/x86/include/asm/page.h
> > > > > > +++ b/arch/x86/include/asm/page.h
> > > > > > @@ -18,6 +18,19 @@
> > > > > > 
> > > > > > struct page;
> > > > > > 
> > > > > > +#ifdef CONFIG_KVM_GUEST
> > > > > > +#include <linux/jump_label.h>
> > > > > > +extern struct static_key_false pv_free_page_hint_enabled;
> > > > > > +
> > > > > > +#define HAVE_ARCH_FREE_PAGE
> > > > > > +void __arch_free_page(struct page *page, unsigned int order);
> > > > > > +static inline void arch_free_page(struct page *page, unsigned int order)
> > > > > > +{
> > > > > > +   if (static_branch_unlikely(&pv_free_page_hint_enabled))
> > > > > > +           __arch_free_page(page, order);
> > > > > > +}
> > > > > > +#endif
> > > > > 
> > > > > This patch and the following one assume that only KVM should be able to hook
> > > > > to these events. I do not think it is appropriate for __arch_free_page() to
> > > > > effectively mean “kvm_guest_free_page()”.
> > > > > 
> > > > > Is it possible to use the paravirt infrastructure for this feature,
> > > > > similarly to other PV features? It is not the best infrastructure, but at least
> > > > > it is hypervisor-neutral.
> > > > 
> > > > I could probably tie this into the paravirt infrastructure, but if I
> > > > did so I would probably want to pull the checks for the page order out
> > > > of the KVM specific bits and make it something we handle in the inline.
> > > > Doing that I would probably make it a paravirtual hint that only
> > > > operates at the PMD level. That way we wouldn't incur the cost of the
> > > > paravirt infrastructure at the per 4K page level.
> > > 
> > > If I understand you correctly, you “complain” that this would affect
> > > performance.
> > 
> > It wasn't so much a "complaint" as an "observation". What I was
> > getting at is that if I am going to make it a PV operation I might set
> > a hard limit on it so that it will specifically only apply to huge
> > pages and larger. By doing that I can justify performing the screening
> > based on page order in the inline path and avoid any PV infrastructure
> > overhead unless I have to incur it.
> 
> I understood. I guess my use of “double quotes” was lost in translation. ;-)

Yeah, I just figured I would restate it to make sure we were "on the
same page". ;-)

> One more point regarding [2/4] - you may want to consider using madvise_free
> instead of madvise_dontneed to avoid unnecessary EPT violations.

For now I am using MADV_DONTNEED because it reduces the complexity.
I have been working on a proof of concept with MADV_FREE, however we
then have to add some additional checks as MADV_FREE only works with
anonymous memory if I am not mistaken.
Luiz Capitulino Feb. 7, 2019, 6:21 p.m. UTC | #9
On Mon, 04 Feb 2019 10:15:52 -0800
Alexander Duyck <alexander.duyck@gmail.com> wrote:

> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> 
> Add guest support for providing free memory hints to the KVM hypervisor for
> freed pages huge TLB size or larger. I am restricting the size to
> huge TLB order and larger because the hypercalls are too expensive to be
> performing one per 4K page. Using the huge TLB order became the obvious
> choice for the order to use as it allows us to avoid fragmentation of higher
> order memory on the host.
> 
> I have limited the functionality so that it doesn't work when page
> poisoning is enabled. I did this because a write to the page after doing an
> MADV_DONTNEED would effectively negate the hint, so it would be wasting
> cycles to do so.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
>  arch/x86/include/asm/page.h |   13 +++++++++++++
>  arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
>  2 files changed, 36 insertions(+)
> 
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 7555b48803a8..4487ad7a3385 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -18,6 +18,19 @@
>  
>  struct page;
>  
> +#ifdef CONFIG_KVM_GUEST
> +#include <linux/jump_label.h>
> +extern struct static_key_false pv_free_page_hint_enabled;
> +
> +#define HAVE_ARCH_FREE_PAGE
> +void __arch_free_page(struct page *page, unsigned int order);
> +static inline void arch_free_page(struct page *page, unsigned int order)
> +{
> +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> +		__arch_free_page(page, order);
> +}
> +#endif
> +
>  #include <linux/range.h>
>  extern struct range pfn_mapped[];
>  extern int nr_pfn_mapped;
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 5c93a65ee1e5..09c91641c36c 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -48,6 +48,7 @@
>  #include <asm/tlb.h>
>  
>  static int kvmapf = 1;
> +DEFINE_STATIC_KEY_FALSE(pv_free_page_hint_enabled);
>  
>  static int __init parse_no_kvmapf(char *arg)
>  {
> @@ -648,6 +649,15 @@ static void __init kvm_guest_init(void)
>  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
>  		apic_set_eoi_write(kvm_guest_apic_eoi_write);
>  
> +	/*
> +	 * The free page hinting doesn't add much value if page poisoning
> +	 * is enabled. So we only enable the feature if page poisoning is
> +	 * no present.
> +	 */
> +	if (!page_poisoning_enabled() &&
> +	    kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
> +		static_branch_enable(&pv_free_page_hint_enabled);
> +
>  #ifdef CONFIG_SMP
>  	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
>  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
> @@ -762,6 +772,19 @@ static __init int kvm_setup_pv_tlb_flush(void)
>  }
>  arch_initcall(kvm_setup_pv_tlb_flush);
>  
> +void __arch_free_page(struct page *page, unsigned int order)
> +{
> +	/*
> +	 * Limit hints to blocks no smaller than pageblock in
> +	 * size to limit the cost for the hypercalls.
> +	 */
> +	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
> +		return;
> +
> +	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
> +		       PAGE_SIZE << order);

Does this mean that the vCPU executing this will get stuck
here for the duration of the hypercall? Isn't that too long,
considering that the zone lock is taken and madvise in the
host blocks on semaphores?

> +}
> +
>  #ifdef CONFIG_PARAVIRT_SPINLOCKS
>  
>  /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
>
Alexander Duyck Feb. 7, 2019, 6:44 p.m. UTC | #10
On Thu, 2019-02-07 at 13:21 -0500, Luiz Capitulino wrote:
> On Mon, 04 Feb 2019 10:15:52 -0800
> Alexander Duyck <alexander.duyck@gmail.com> wrote:
> 
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > 
> > Add guest support for providing free memory hints to the KVM hypervisor for
> > freed pages huge TLB size or larger. I am restricting the size to
> > huge TLB order and larger because the hypercalls are too expensive to be
> > performing one per 4K page. Using the huge TLB order became the obvious
> > choice for the order to use as it allows us to avoid fragmentation of higher
> > order memory on the host.
> > 
> > I have limited the functionality so that it doesn't work when page
> > poisoning is enabled. I did this because a write to the page after doing an
> > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > cycles to do so.
> > 
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > ---
> >  arch/x86/include/asm/page.h |   13 +++++++++++++
> >  arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
> >  2 files changed, 36 insertions(+)
> > 
> > diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> > index 7555b48803a8..4487ad7a3385 100644
> > --- a/arch/x86/include/asm/page.h
> > +++ b/arch/x86/include/asm/page.h
> > @@ -18,6 +18,19 @@
> >  
> >  struct page;
> >  
> > +#ifdef CONFIG_KVM_GUEST
> > +#include <linux/jump_label.h>
> > +extern struct static_key_false pv_free_page_hint_enabled;
> > +
> > +#define HAVE_ARCH_FREE_PAGE
> > +void __arch_free_page(struct page *page, unsigned int order);
> > +static inline void arch_free_page(struct page *page, unsigned int order)
> > +{
> > +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> > +		__arch_free_page(page, order);
> > +}
> > +#endif
> > +
> >  #include <linux/range.h>
> >  extern struct range pfn_mapped[];
> >  extern int nr_pfn_mapped;
> > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > index 5c93a65ee1e5..09c91641c36c 100644
> > --- a/arch/x86/kernel/kvm.c
> > +++ b/arch/x86/kernel/kvm.c
> > @@ -48,6 +48,7 @@
> >  #include <asm/tlb.h>
> >  
> >  static int kvmapf = 1;
> > +DEFINE_STATIC_KEY_FALSE(pv_free_page_hint_enabled);
> >  
> >  static int __init parse_no_kvmapf(char *arg)
> >  {
> > @@ -648,6 +649,15 @@ static void __init kvm_guest_init(void)
> >  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
> >  		apic_set_eoi_write(kvm_guest_apic_eoi_write);
> >  
> > +	/*
> > +	 * The free page hinting doesn't add much value if page poisoning
> > +	 * is enabled. So we only enable the feature if page poisoning is
> > +	 * no present.
> > +	 */
> > +	if (!page_poisoning_enabled() &&
> > +	    kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
> > +		static_branch_enable(&pv_free_page_hint_enabled);
> > +
> >  #ifdef CONFIG_SMP
> >  	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
> >  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
> > @@ -762,6 +772,19 @@ static __init int kvm_setup_pv_tlb_flush(void)
> >  }
> >  arch_initcall(kvm_setup_pv_tlb_flush);
> >  
> > +void __arch_free_page(struct page *page, unsigned int order)
> > +{
> > +	/*
> > +	 * Limit hints to blocks no smaller than pageblock in
> > +	 * size to limit the cost for the hypercalls.
> > +	 */
> > +	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
> > +		return;
> > +
> > +	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
> > +		       PAGE_SIZE << order);
> 
> Does this mean that the vCPU executing this will get stuck
> here for the duration of the hypercall? Isn't that too long,
> considering that the zone lock is taken and madvise in the
> host block on semaphores?

I'm pretty sure the zone lock isn't held when this is called. The lock
isn't acquired until later in the path. This gets executed just before
the page poisoning call which would take time as well since it would
have to memset an entire page. This function is called as a part of
free_pages_prepare; the zone lock isn't acquired until we call into
free_one_page, or at a few spots just before calling
__free_one_page.

My other function in patch 4 which does this from inside of
__free_one_page does have to release the zone lock since it is taken
there.
Luiz Capitulino Feb. 7, 2019, 8:02 p.m. UTC | #11
On Thu, 07 Feb 2019 10:44:11 -0800
Alexander Duyck <alexander.h.duyck@linux.intel.com> wrote:

> On Thu, 2019-02-07 at 13:21 -0500, Luiz Capitulino wrote:
> > On Mon, 04 Feb 2019 10:15:52 -0800
> > Alexander Duyck <alexander.duyck@gmail.com> wrote:
> >   
> > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > 
> > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > freed pages huge TLB size or larger. I am restricting the size to
> > > huge TLB order and larger because the hypercalls are too expensive to be
> > > performing one per 4K page. Using the huge TLB order became the obvious
> > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > order memory on the host.
> > > 
> > > I have limited the functionality so that it doesn't work when page
> > > poisoning is enabled. I did this because a write to the page after doing an
> > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > cycles to do so.
> > > 
> > > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > ---
> > >  arch/x86/include/asm/page.h |   13 +++++++++++++
> > >  arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
> > >  2 files changed, 36 insertions(+)
> > > 
> > > diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> > > index 7555b48803a8..4487ad7a3385 100644
> > > --- a/arch/x86/include/asm/page.h
> > > +++ b/arch/x86/include/asm/page.h
> > > @@ -18,6 +18,19 @@
> > >  
> > >  struct page;
> > >  
> > > +#ifdef CONFIG_KVM_GUEST
> > > +#include <linux/jump_label.h>
> > > +extern struct static_key_false pv_free_page_hint_enabled;
> > > +
> > > +#define HAVE_ARCH_FREE_PAGE
> > > +void __arch_free_page(struct page *page, unsigned int order);
> > > +static inline void arch_free_page(struct page *page, unsigned int order)
> > > +{
> > > +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> > > +		__arch_free_page(page, order);
> > > +}
> > > +#endif
> > > +
> > >  #include <linux/range.h>
> > >  extern struct range pfn_mapped[];
> > >  extern int nr_pfn_mapped;
> > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > > index 5c93a65ee1e5..09c91641c36c 100644
> > > --- a/arch/x86/kernel/kvm.c
> > > +++ b/arch/x86/kernel/kvm.c
> > > @@ -48,6 +48,7 @@
> > >  #include <asm/tlb.h>
> > >  
> > >  static int kvmapf = 1;
> > > +DEFINE_STATIC_KEY_FALSE(pv_free_page_hint_enabled);
> > >  
> > >  static int __init parse_no_kvmapf(char *arg)
> > >  {
> > > @@ -648,6 +649,15 @@ static void __init kvm_guest_init(void)
> > >  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
> > >  		apic_set_eoi_write(kvm_guest_apic_eoi_write);
> > >  
> > > +	/*
> > > +	 * The free page hinting doesn't add much value if page poisoning
> > > +	 * is enabled. So we only enable the feature if page poisoning is
> > > +	 * no present.
> > > +	 */
> > > +	if (!page_poisoning_enabled() &&
> > > +	    kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
> > > +		static_branch_enable(&pv_free_page_hint_enabled);
> > > +
> > >  #ifdef CONFIG_SMP
> > >  	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
> > >  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
> > > @@ -762,6 +772,19 @@ static __init int kvm_setup_pv_tlb_flush(void)
> > >  }
> > >  arch_initcall(kvm_setup_pv_tlb_flush);
> > >  
> > > +void __arch_free_page(struct page *page, unsigned int order)
> > > +{
> > > +	/*
> > > +	 * Limit hints to blocks no smaller than pageblock in
> > > +	 * size to limit the cost for the hypercalls.
> > > +	 */
> > > +	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
> > > +		return;
> > > +
> > > +	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
> > > +		       PAGE_SIZE << order);  
> > 
> > Does this mean that the vCPU executing this will get stuck
> > here for the duration of the hypercall? Isn't that too long,
> > considering that the zone lock is taken and madvise in the
> > host block on semaphores?  
> 
> I'm pretty sure the zone lock isn't held when this is called. The lock
> isn't acquired until later in the path. This gets executed just before
> the page poisoning call which would take time as well since it would
> have to memset an entire page. This function is called as a part of
> free_pages_prepare, the zone locks aren't acquired until we are calling
> into either free_one_page and a few spots before calling
> __free_one_page.

Yeah, you're right of course! I think I mixed up __arch_free_page()
and __free_one_page()... free_pages() code path won't take any
locks up to calling __arch_free_page(). Sorry for the noise.

> My other function in patch 4 which does this from inside of
> __free_one_page does have to release the zone lock since it is taken
> there.

I haven't checked that one yet, I'll let you know if I have comments.
Nitesh Narayan Lal Feb. 8, 2019, 9:05 p.m. UTC | #12
On 2/7/19 1:44 PM, Alexander Duyck wrote:
> On Thu, 2019-02-07 at 13:21 -0500, Luiz Capitulino wrote:
>> On Mon, 04 Feb 2019 10:15:52 -0800
>> Alexander Duyck <alexander.duyck@gmail.com> wrote:
>>
>>> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>>
>>> Add guest support for providing free memory hints to the KVM hypervisor for
>>> freed pages huge TLB size or larger. I am restricting the size to
>>> huge TLB order and larger because the hypercalls are too expensive to be
>>> performing one per 4K page. Using the huge TLB order became the obvious
>>> choice for the order to use as it allows us to avoid fragmentation of higher
>>> order memory on the host.
>>>
>>> I have limited the functionality so that it doesn't work when page
>>> poisoning is enabled. I did this because a write to the page after doing an
>>> MADV_DONTNEED would effectively negate the hint, so it would be wasting
>>> cycles to do so.
>>>
>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>> ---
>>>  arch/x86/include/asm/page.h |   13 +++++++++++++
>>>  arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
>>>  2 files changed, 36 insertions(+)
>>>
>>> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
>>> index 7555b48803a8..4487ad7a3385 100644
>>> --- a/arch/x86/include/asm/page.h
>>> +++ b/arch/x86/include/asm/page.h
>>> @@ -18,6 +18,19 @@
>>>  
>>>  struct page;
>>>  
>>> +#ifdef CONFIG_KVM_GUEST
>>> +#include <linux/jump_label.h>
>>> +extern struct static_key_false pv_free_page_hint_enabled;
>>> +
>>> +#define HAVE_ARCH_FREE_PAGE
>>> +void __arch_free_page(struct page *page, unsigned int order);
>>> +static inline void arch_free_page(struct page *page, unsigned int order)
>>> +{
>>> +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
>>> +		__arch_free_page(page, order);
>>> +}
>>> +#endif
>>> +
>>>  #include <linux/range.h>
>>>  extern struct range pfn_mapped[];
>>>  extern int nr_pfn_mapped;
>>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>>> index 5c93a65ee1e5..09c91641c36c 100644
>>> --- a/arch/x86/kernel/kvm.c
>>> +++ b/arch/x86/kernel/kvm.c
>>> @@ -48,6 +48,7 @@
>>>  #include <asm/tlb.h>
>>>  
>>>  static int kvmapf = 1;
>>> +DEFINE_STATIC_KEY_FALSE(pv_free_page_hint_enabled);
>>>  
>>>  static int __init parse_no_kvmapf(char *arg)
>>>  {
>>> @@ -648,6 +649,15 @@ static void __init kvm_guest_init(void)
>>>  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
>>>  		apic_set_eoi_write(kvm_guest_apic_eoi_write);
>>>  
>>> +	/*
>>> +	 * The free page hinting doesn't add much value if page poisoning
>>> +	 * is enabled. So we only enable the feature if page poisoning is
>>> +	 * no present.
>>> +	 */
>>> +	if (!page_poisoning_enabled() &&
>>> +	    kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
>>> +		static_branch_enable(&pv_free_page_hint_enabled);
>>> +
>>>  #ifdef CONFIG_SMP
>>>  	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
>>>  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
>>> @@ -762,6 +772,19 @@ static __init int kvm_setup_pv_tlb_flush(void)
>>>  }
>>>  arch_initcall(kvm_setup_pv_tlb_flush);
>>>  
>>> +void __arch_free_page(struct page *page, unsigned int order)
>>> +{
>>> +	/*
>>> +	 * Limit hints to blocks no smaller than pageblock in
>>> +	 * size to limit the cost for the hypercalls.
>>> +	 */
>>> +	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
>>> +		return;
>>> +
>>> +	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
>>> +		       PAGE_SIZE << order);
>> Does this mean that the vCPU executing this will get stuck
>> here for the duration of the hypercall? Isn't that too long,
>> considering that the zone lock is taken and madvise in the
>> host block on semaphores?
> I'm pretty sure the zone lock isn't held when this is called. The lock
> isn't acquired until later in the path. This gets executed just before
> the page poisoning call which would take time as well since it would
> have to memset an entire page. This function is called as a part of
> free_pages_prepare, the zone locks aren't acquired until we are calling
> into either free_one_page and a few spots before calling
> __free_one_page.
>
> My other function in patch 4 which does this from inside of
> __free_one_page does have to release the zone lock since it is taken
> there.
>
Considering hypercalls are costly, will it not make sense to coalesce
the pages you are reporting and make a single hypercall for a bunch of
pages?
Alexander Duyck Feb. 8, 2019, 9:31 p.m. UTC | #13
On Fri, 2019-02-08 at 16:05 -0500, Nitesh Narayan Lal wrote:
> On 2/7/19 1:44 PM, Alexander Duyck wrote:
> > On Thu, 2019-02-07 at 13:21 -0500, Luiz Capitulino wrote:
> > > On Mon, 04 Feb 2019 10:15:52 -0800
> > > Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > > 
> > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > 
> > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > performing one per 4K page. Using the huge TLB order became the obvious
> > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > order memory on the host.
> > > > 
> > > > I have limited the functionality so that it doesn't work when page
> > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > cycles to do so.
> > > > 
> > > > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > ---
> > > >  arch/x86/include/asm/page.h |   13 +++++++++++++
> > > >  arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
> > > >  2 files changed, 36 insertions(+)
> > > > 
> > > > diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> > > > index 7555b48803a8..4487ad7a3385 100644
> > > > --- a/arch/x86/include/asm/page.h
> > > > +++ b/arch/x86/include/asm/page.h
> > > > @@ -18,6 +18,19 @@
> > > >  
> > > >  struct page;
> > > >  
> > > > +#ifdef CONFIG_KVM_GUEST
> > > > +#include <linux/jump_label.h>
> > > > +extern struct static_key_false pv_free_page_hint_enabled;
> > > > +
> > > > +#define HAVE_ARCH_FREE_PAGE
> > > > +void __arch_free_page(struct page *page, unsigned int order);
> > > > +static inline void arch_free_page(struct page *page, unsigned int order)
> > > > +{
> > > > +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> > > > +		__arch_free_page(page, order);
> > > > +}
> > > > +#endif
> > > > +
> > > >  #include <linux/range.h>
> > > >  extern struct range pfn_mapped[];
> > > >  extern int nr_pfn_mapped;
> > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > > > index 5c93a65ee1e5..09c91641c36c 100644
> > > > --- a/arch/x86/kernel/kvm.c
> > > > +++ b/arch/x86/kernel/kvm.c
> > > > @@ -48,6 +48,7 @@
> > > >  #include <asm/tlb.h>
> > > >  
> > > >  static int kvmapf = 1;
> > > > +DEFINE_STATIC_KEY_FALSE(pv_free_page_hint_enabled);
> > > >  
> > > >  static int __init parse_no_kvmapf(char *arg)
> > > >  {
> > > > @@ -648,6 +649,15 @@ static void __init kvm_guest_init(void)
> > > >  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
> > > >  		apic_set_eoi_write(kvm_guest_apic_eoi_write);
> > > >  
> > > > +	/*
> > > > +	 * The free page hinting doesn't add much value if page poisoning
> > > > +	 * is enabled. So we only enable the feature if page poisoning is
> > > > +	 * no present.
> > > > +	 */
> > > > +	if (!page_poisoning_enabled() &&
> > > > +	    kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
> > > > +		static_branch_enable(&pv_free_page_hint_enabled);
> > > > +
> > > >  #ifdef CONFIG_SMP
> > > >  	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
> > > >  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
> > > > @@ -762,6 +772,19 @@ static __init int kvm_setup_pv_tlb_flush(void)
> > > >  }
> > > >  arch_initcall(kvm_setup_pv_tlb_flush);
> > > >  
> > > > +void __arch_free_page(struct page *page, unsigned int order)
> > > > +{
> > > > +	/*
> > > > +	 * Limit hints to blocks no smaller than pageblock in
> > > > +	 * size to limit the cost for the hypercalls.
> > > > +	 */
> > > > +	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
> > > > +		return;
> > > > +
> > > > +	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
> > > > +		       PAGE_SIZE << order);
> > > 
> > > Does this mean that the vCPU executing this will get stuck
> > > here for the duration of the hypercall? Isn't that too long,
> > > considering that the zone lock is taken and madvise in the
> > > host block on semaphores?
> > 
> > I'm pretty sure the zone lock isn't held when this is called. The lock
> > isn't acquired until later in the path. This gets executed just before
> > the page poisoning call which would take time as well since it would
> > have to memset an entire page. This function is called as a part of
> > free_pages_prepare, the zone locks aren't acquired until we are calling
> > into either free_one_page and a few spots before calling
> > __free_one_page.
> > 
> > My other function in patch 4 which does this from inside of
> > __free_one_page does have to release the zone lock since it is taken
> > there.
> > 
> 
> Considering hypercall's are costly, will it not make sense to coalesce
> the pages you are reporting and make a single hypercall for a bunch of
> pages?

That is what I am doing with this code and patch 4. I am only making
the call when I have been given a page that is 2M or larger. As such I
am only making one hypercall for every 512 4K pages.

So for example on my test VMs with 8G of RAM I see only about 3K calls
when it ends up freeing all of the application memory which is about 6G
after my test has ended.
Michael S. Tsirkin Feb. 10, 2019, 12:49 a.m. UTC | #14
On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> 
> Add guest support for providing free memory hints to the KVM hypervisor for
> freed pages huge TLB size or larger. I am restricting the size to
> huge TLB order and larger because the hypercalls are too expensive to be
> performing one per 4K page.

Even 2M pages start to get expensive with a TB guest.

Really it seems we want a virtio ring so we can pass a batch of these.
E.g. 256 entries, 2M each - that's more like it.

> Using the huge TLB order became the obvious
> choice for the order to use as it allows us to avoid fragmentation of higher
> order memory on the host.
> 
> I have limited the functionality so that it doesn't work when page
> poisoning is enabled. I did this because a write to the page after doing an
> MADV_DONTNEED would effectively negate the hint, so it would be wasting
> cycles to do so.

Again that's leaking host implementation detail into guest interface.

We are giving guest page hints to the host; that makes sense.
Weird interactions with other features due to host
implementation details should be handled by the host.




> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
>  arch/x86/include/asm/page.h |   13 +++++++++++++
>  arch/x86/kernel/kvm.c       |   23 +++++++++++++++++++++++
>  2 files changed, 36 insertions(+)
> 
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 7555b48803a8..4487ad7a3385 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -18,6 +18,19 @@
>  
>  struct page;
>  
> +#ifdef CONFIG_KVM_GUEST
> +#include <linux/jump_label.h>
> +extern struct static_key_false pv_free_page_hint_enabled;
> +
> +#define HAVE_ARCH_FREE_PAGE
> +void __arch_free_page(struct page *page, unsigned int order);
> +static inline void arch_free_page(struct page *page, unsigned int order)
> +{
> +	if (static_branch_unlikely(&pv_free_page_hint_enabled))
> +		__arch_free_page(page, order);
> +}
> +#endif
> +
>  #include <linux/range.h>
>  extern struct range pfn_mapped[];
>  extern int nr_pfn_mapped;
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 5c93a65ee1e5..09c91641c36c 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -48,6 +48,7 @@
>  #include <asm/tlb.h>
>  
>  static int kvmapf = 1;
> +DEFINE_STATIC_KEY_FALSE(pv_free_page_hint_enabled);
>  
>  static int __init parse_no_kvmapf(char *arg)
>  {
> @@ -648,6 +649,15 @@ static void __init kvm_guest_init(void)
>  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
>  		apic_set_eoi_write(kvm_guest_apic_eoi_write);
>  
> +	/*
> +	 * The free page hinting doesn't add much value if page poisoning
> +	 * is enabled. So we only enable the feature if page poisoning is
> +	 * no present.
> +	 */
> +	if (!page_poisoning_enabled() &&
> +	    kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
> +		static_branch_enable(&pv_free_page_hint_enabled);
> +
>  #ifdef CONFIG_SMP
>  	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
>  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
> @@ -762,6 +772,19 @@ static __init int kvm_setup_pv_tlb_flush(void)
>  }
>  arch_initcall(kvm_setup_pv_tlb_flush);
>  
> +void __arch_free_page(struct page *page, unsigned int order)
> +{
> +	/*
> +	 * Limit hints to blocks no smaller than pageblock in
> +	 * size to limit the cost for the hypercalls.
> +	 */
> +	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
> +		return;
> +
> +	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
> +		       PAGE_SIZE << order);
> +}
> +
>  #ifdef CONFIG_PARAVIRT_SPINLOCKS
>  
>  /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
Alexander Duyck Feb. 11, 2019, 4:31 p.m. UTC | #15
On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > 
> > Add guest support for providing free memory hints to the KVM hypervisor for
> > freed pages huge TLB size or larger. I am restricting the size to
> > huge TLB order and larger because the hypercalls are too expensive to be
> > performing one per 4K page.
> 
> Even 2M pages start to get expensive with a TB guest.

Agreed.

> Really it seems we want a virtio ring so we can pass a batch of these.
> E.g. 256 entries, 2M each - that's more like it.

The only issue I see with doing that is that we then have to defer the
freeing. Doing that is going to introduce issues in the guest as we are
going to have pages going unused for some period of time while we wait
for the hint to complete, and we cannot just pull said pages back. I'm
not really a fan of the asynchronous nature of Nitesh's patches for
this reason.

> > Using the huge TLB order became the obvious
> > choice for the order to use as it allows us to avoid fragmentation of higher
> > order memory on the host.
> > 
> > I have limited the functionality so that it doesn't work when page
> > poisoning is enabled. I did this because a write to the page after doing an
> > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > cycles to do so.
> 
> Again that's leaking host implementation detail into guest interface.
> 
> We are giving guest page hints to host that makes sense,
> weird interactions with other features due to host
> implementation details should be handled by host.

I don't view this as a host implementation detail; this is a guest
feature making use of all pages for debugging. If we are placing poison
values in the page then I wouldn't consider them an unused page, it is
being actively used to store the poison value. If we can achieve this
and free the page back to the host then even better, but until the
features can coexist we should not use the page hinting while page
poisoning is enabled.

This is one of the reasons why I was opposed to just disabling page
poisoning when this feature was enabled in Nitesh's patches. If the
guest has page poisoning enabled it is doing something with the page.
It shouldn't be prevented from doing that because the host wants to
have the option to free the pages.
Michael S. Tsirkin Feb. 11, 2019, 5:36 p.m. UTC | #16
On Mon, Feb 11, 2019 at 08:31:34AM -0800, Alexander Duyck wrote:
> On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > 
> > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > freed pages huge TLB size or larger. I am restricting the size to
> > > huge TLB order and larger because the hypercalls are too expensive to be
> > > performing one per 4K page.
> > 
> > Even 2M pages start to get expensive with a TB guest.
> 
> Agreed.
> 
> > Really it seems we want a virtio ring so we can pass a batch of these.
> > E.g. 256 entries, 2M each - that's more like it.
> 
> The only issue I see with doing that is that we then have to defer the
> freeing. Doing that is going to introduce issues in the guest as we are
> going to have pages going unused for some period of time while we wait
> for the hint to complete, and we cannot just pull said pages back. I'm
> not really a fan of the asynchronous nature of Nitesh's patches for
> this reason.

Well nothing prevents us from doing an extra exit to the hypervisor if
we want. The asynchronous nature is there as an optimization
to allow hypervisor to do its thing on a separate CPU.
Why not proceed doing other things meanwhile?
And if the reason is that we are short on memory, then
maybe we should be less aggressive in hinting?

E.g. if we just have 2 pages:

hint page 1
page 1 hint processed?
	yes - proceed to page 2
	no - wait for interrupt

get interrupt that page 1 hint is processed
hint page 2


If hypervisor happens to be running on same CPU it
can process things synchronously and we never enter
the no branch.





> > > Using the huge TLB order became the obvious
> > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > order memory on the host.
> > > 
> > > I have limited the functionality so that it doesn't work when page
> > > poisoning is enabled. I did this because a write to the page after doing an
> > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > cycles to do so.
> > 
> > Again that's leaking host implementation detail into guest interface.
> > 
> > We are giving guest page hints to host that makes sense,
> > weird interactions with other features due to host
> > implementation details should be handled by host.
> 
> I don't view this as a host implementation detail, this is guest
> feature making use of all pages for debugging. If we are placing poison
> values in the page then I wouldn't consider them an unused page, it is
> being actively used to store the poison value.

Well I guess it's a valid point of view for a kernel hacker, but they are
unused from the application's point of view.
However poisoning is transparent to users and most distro users
are not aware of it going on. They just know that debug kernels
are slower.
A user loading a debug kernel and immediately breaking overcommit
is an unpleasant experience.

> If we can achieve this
> and free the page back to the host then even better, but until the
> features can coexist we should not use the page hinting while page
> poisoning is enabled.

Existing hinting in balloon allows them to coexist so I think we
need to set the bar just as high for any new variant.

> This is one of the reasons why I was opposed to just disabling page
> poisoning when this feature was enabled in Nitesh's patches. If the
> guest has page poisoning enabled it is doing something with the page.
> It shouldn't be prevented from doing that because the host wants to
> have the option to free the pages.

I agree, but I think the decision belongs to the host. I.e.
hint the page but tell the host it needs to be careful
about the poison value. It might also mean we
need to make sure poisoning happens after the hinting, not before.
Dave Hansen Feb. 11, 2019, 5:48 p.m. UTC | #17
On 2/9/19 4:49 PM, Michael S. Tsirkin wrote:
> On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
>> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>
>> Add guest support for providing free memory hints to the KVM hypervisor for
>> freed pages huge TLB size or larger. I am restricting the size to
>> huge TLB order and larger because the hypercalls are too expensive to be
>> performing one per 4K page.
> Even 2M pages start to get expensive with a TB guest.

Yeah, but we don't allocate and free TB's of memory at a high frequency.

> Really it seems we want a virtio ring so we can pass a batch of these.
> E.g. 256 entries, 2M each - that's more like it.

That only makes sense for a system that's doing high-frequency,
discontiguous frees of 2M pages.  Right now, a 2M free/realloc cycle
(THP or hugetlb) is *not* super-high frequency just because of the
latency for zeroing the page.

A virtio ring seems like an overblown solution to a non-existent problem.
Michael S. Tsirkin Feb. 11, 2019, 5:58 p.m. UTC | #18
On Mon, Feb 11, 2019 at 09:48:11AM -0800, Dave Hansen wrote:
> On 2/9/19 4:49 PM, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> >> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >>
> >> Add guest support for providing free memory hints to the KVM hypervisor for
> >> freed pages huge TLB size or larger. I am restricting the size to
> >> huge TLB order and larger because the hypercalls are too expensive to be
> >> performing one per 4K page.
> > Even 2M pages start to get expensive with a TB guest.
> 
> Yeah, but we don't allocate and free TB's of memory at a high frequency.
> 
> > Really it seems we want a virtio ring so we can pass a batch of these.
> > E.g. 256 entries, 2M each - that's more like it.
> 
> That only makes sense for a system that's doing high-frequency,
> discontiguous frees of 2M pages.  Right now, a 2M free/realloc cycle
> (THP or hugetlb) is *not* super-high frequency just because of the
> latency for zeroing the page.

Heh but with a ton of free memory, and a thread zeroing some of
it out in the background, will this still be the case?
It could be that we'll be able to find clean pages
at all times.


> A virtio ring seems like an overblown solution to a non-existent problem.

It would be nice to see some traces to help us decide one way or the other.
Alexander Duyck Feb. 11, 2019, 6:10 p.m. UTC | #19
On Mon, 2019-02-11 at 12:36 -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 11, 2019 at 08:31:34AM -0800, Alexander Duyck wrote:
> > On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> > > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > 
> > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > performing one per 4K page.
> > > 
> > > Even 2M pages start to get expensive with a TB guest.
> > 
> > Agreed.
> > 
> > > Really it seems we want a virtio ring so we can pass a batch of these.
> > > E.g. 256 entries, 2M each - that's more like it.
> > 
> > The only issue I see with doing that is that we then have to defer the
> > freeing. Doing that is going to introduce issues in the guest as we are
> > going to have pages going unused for some period of time while we wait
> > for the hint to complete, and we cannot just pull said pages back. I'm
> > not really a fan of the asynchronous nature of Nitesh's patches for
> > this reason.
> 
> Well nothing prevents us from doing an extra exit to the hypervisor if
> we want. The asynchronous nature is there as an optimization
> to allow hypervisor to do its thing on a separate CPU.
> Why not proceed doing other things meanwhile?
> And if the reason is that we are short on memory, then
> maybe we should be less aggressive in hinting?
> 
> E.g. if we just have 2 pages:
> 
> hint page 1
> page 1 hint processed?
> 	yes - proceed to page 2
> 	no - wait for interrupt
> 
> get interrupt that page 1 hint is processed
> hint page 2
> 
> 
> If hypervisor happens to be running on same CPU it
> can process things synchronously and we never enter
> the no branch.
> 

Another concern I would have about processing this asynchronously is
that we have the potential for multiple guest CPUs to become
bottlenecked by a single host CPU. I am not sure if that is something
that would be desirable.

> > > > Using the huge TLB order became the obvious
> > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > order memory on the host.
> > > > 
> > > > I have limited the functionality so that it doesn't work when page
> > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > cycles to do so.
> > > 
> > > Again that's leaking host implementation detail into guest interface.
> > > 
> > > We are giving guest page hints to host that makes sense,
> > > weird interactions with other features due to host
> > > implementation details should be handled by host.
> > 
> > I don't view this as a host implementation detail, this is guest
> > feature making use of all pages for debugging. If we are placing poison
> > values in the page then I wouldn't consider them an unused page, it is
> > being actively used to store the poison value.
> 
> Well I guess it's a valid point of view for a kernel hacker, but they are
> unused from application's point of view.
> However poisoning is transparent to users and most distro users
> are not aware of it going on. They just know that debug kernels
> are slower.
> User loading a debug kernel and immediately breaking overcommit
> is an unpleasant experience.

How would that be any different than a user loading an older kernel
that doesn't have this feature and breaking overcommit as a result?

I still think it would be better if we left the poisoning enabled in
such a case and just displayed a warning message if nothing else that
hinting is disabled because of page poisoning.

One other thought I had on this is that one side effect of page
poisoning is probably that KSM would be able to merge all of the poison
pages together into a single page since they are all set to the same
values. So even with the poisoned pages it would be possible to reduce
total memory overhead.

> > If we can achieve this
> > and free the page back to the host then even better, but until the
> > features can coexist we should not use the page hinting while page
> > poisoning is enabled.
> 
> Existing hinting in balloon allows them to coexist so I think we
> need to set the bar just as high for any new variant.

That is what I heard. I will have to look into this.

> > This is one of the reasons why I was opposed to just disabling page
> > poisoning when this feature was enabled in Nitesh's patches. If the
> > guest has page poisoning enabled it is doing something with the page.
> > It shouldn't be prevented from doing that because the host wants to
> > have the option to free the pages.
> 
> I agree but I think the decision belongs on the host. I.e.
> hint the page but tell the host it needs to be careful
> about the poison value. It might also mean we
> need to make sure poisoning happens after the hinting, not before.

The only issue with poisoning after instead of before is that the hint
is ignored and we end up triggering a page fault and zero as a result.
It might make more sense to have an architecture specific call that can
be paravirtualized to handle the case of poisoning the page for us if
we have the unused page hint enabled. Otherwise the write to the page
is a given to invalidate the hint.
Dave Hansen Feb. 11, 2019, 6:19 p.m. UTC | #20
On 2/11/19 9:58 AM, Michael S. Tsirkin wrote:
>>> Really it seems we want a virtio ring so we can pass a batch of these.
>>> E.g. 256 entries, 2M each - that's more like it.
>> That only makes sense for a system that's doing high-frequency,
>> discontiguous frees of 2M pages.  Right now, a 2M free/realloc cycle
>> (THP or hugetlb) is *not* super-high frequency just because of the
>> latency for zeroing the page.
> Heh but with a ton of free memory, and a thread zeroing some of
> it out in the background, will this still be the case?
> It could be that we'll be able to find clean pages
> at all times.

In a system where we have some asynchronous zeroing of memory where
freed, non-zeroed memory is sequestered out of the allocator, yeah, that
could make sense.

But, that's not what we have today.

>> A virtio ring seems like an overblown solution to a non-existent problem.
> It would be nice to see some traces to help us decide one way or the other.

Yeah, agreed.  Sounds like we need some more testing to see if these
approaches hit bottlenecks anywhere.
Michael S. Tsirkin Feb. 11, 2019, 7:54 p.m. UTC | #21
On Mon, Feb 11, 2019 at 10:10:06AM -0800, Alexander Duyck wrote:
> On Mon, 2019-02-11 at 12:36 -0500, Michael S. Tsirkin wrote:
> > On Mon, Feb 11, 2019 at 08:31:34AM -0800, Alexander Duyck wrote:
> > > On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> > > > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > 
> > > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > > performing one per 4K page.
> > > > 
> > > > Even 2M pages start to get expensive with a TB guest.
> > > 
> > > Agreed.
> > > 
> > > > Really it seems we want a virtio ring so we can pass a batch of these.
> > > > E.g. 256 entries, 2M each - that's more like it.
> > > 
> > > The only issue I see with doing that is that we then have to defer the
> > > freeing. Doing that is going to introduce issues in the guest as we are
> > > going to have pages going unused for some period of time while we wait
> > > for the hint to complete, and we cannot just pull said pages back. I'm
> > > not really a fan of the asynchronous nature of Nitesh's patches for
> > > this reason.
> > 
> > Well nothing prevents us from doing an extra exit to the hypervisor if
> > we want. The asynchronous nature is there as an optimization
> > to allow hypervisor to do its thing on a separate CPU.
> > Why not proceed doing other things meanwhile?
> > And if the reason is that we are short on memory, then
> > maybe we should be less aggressive in hinting?
> > 
> > E.g. if we just have 2 pages:
> > 
> > hint page 1
> > page 1 hint processed?
> > 	yes - proceed to page 2
> > 	no - wait for interrupt
> > 
> > get interrupt that page 1 hint is processed
> > hint page 2
> > 
> > 
> > If hypervisor happens to be running on same CPU it
> > can process things synchronously and we never enter
> > the no branch.
> > 
> 
> Another concern I would have about processing this asynchronously is
> that we have the potential for multiple guest CPUs to become
> bottlenecked by a single host CPU. I am not sure if that is something
> that would be desirable.

Well with a hypercall per page the fix is to block VCPU
completely which is also not for everyone.

If you can't push a free page hint to host, then
ideally you just won't. That's a nice property of
hinting we have upstream right now.
Host too busy - hinting is just skipped.


> > > > > Using the huge TLB order became the obvious
> > > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > > order memory on the host.
> > > > > 
> > > > > I have limited the functionality so that it doesn't work when page
> > > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > > cycles to do so.
> > > > 
> > > > Again that's leaking host implementation detail into guest interface.
> > > > 
> > > > We are giving guest page hints to host that makes sense,
> > > > weird interactions with other features due to host
> > > > implementation details should be handled by host.
> > > 
> > > I don't view this as a host implementation detail, this is guest
> > > feature making use of all pages for debugging. If we are placing poison
> > > values in the page then I wouldn't consider them an unused page, it is
> > > being actively used to store the poison value.
> > 
> > Well I guess it's a valid point of view for a kernel hacker, but they are
> > unused from application's point of view.
> > However poisoning is transparent to users and most distro users
> > are not aware of it going on. They just know that debug kernels
> > are slower.
> > User loading a debug kernel and immediately breaking overcommit
> > is an unpleasant experience.
> 
> How would that be any different then a user loading an older kernel
> that doesn't have this feature and breaking overcommit as a result?

Well old kernel does not have the feature so nothing to debug.
When we have a new feature that goes away in the debug kernel,
that's a big support problem since this leads to heisenbugs.

> I still think it would be better if we left the poisoning enabled in
> such a case and just displayed a warning message if nothing else that
> hinting is disabled because of page poisoning.
> 
> One other thought I had on this is that one side effect of page
> poisoning is probably that KSM would be able to merge all of the poison
> pages together into a single page since they are all set to the same
> values. So even with the poisoned pages it would be possible to reduce
> total memory overhead.

Right. And BTW one thing that host can do is pass
the hinted area to KSM for merging.
That requires an alloc hook to free it though.

Or we could add a per-VMA byte with the poison
value and use that on host to populate pages on fault.


> > > If we can achieve this
> > > and free the page back to the host then even better, but until the
> > > features can coexist we should not use the page hinting while page
> > > poisoning is enabled.
> > 
> > Existing hinting in balloon allows them to coexist so I think we
> > need to set the bar just as high for any new variant.
> 
> That is what I heard. I will have to look into this.

It's not doing anything smart right now, just checks
that poison == 0 and skips freeing if not.
But it can be enhanced transparently to guests.

> > > This is one of the reasons why I was opposed to just disabling page
> > > poisoning when this feature was enabled in Nitesh's patches. If the
> > > guest has page poisoning enabled it is doing something with the page.
> > > It shouldn't be prevented from doing that because the host wants to
> > > have the option to free the pages.
> > 
> > I agree but I think the decision belongs on the host. I.e.
> > hint the page but tell the host it needs to be careful
> > about the poison value. It might also mean we
> > need to make sure poisoning happens after the hinting, not before.
> 
> The only issue with poisoning after instead of before is that the hint
> is ignored and we end up triggering a page fault and zero as a result.
> It might make more sense to have an architecture specific call that can
> be paravirtualized to handle the case of poisoning the page for us if
> we have the unused page hint enabled. Otherwise the write to the page
> is a given to invalidate the hint.

Sounds interesting. So the arch hook will first poison and
then pass the page to the host?

Or we can also ask the host to poison for us, problem is this forces
host to either always write into page, or call MADV_DONTNEED,
without it could do MADV_FREE. Maybe that is not a big issue.
Michael S. Tsirkin Feb. 11, 2019, 7:56 p.m. UTC | #22
On Mon, Feb 11, 2019 at 10:19:17AM -0800, Dave Hansen wrote:
> On 2/11/19 9:58 AM, Michael S. Tsirkin wrote:
> >>> Really it seems we want a virtio ring so we can pass a batch of these.
> >>> E.g. 256 entries, 2M each - that's more like it.
> >> That only makes sense for a system that's doing high-frequency,
> >> discontiguous frees of 2M pages.  Right now, a 2M free/realloc cycle
> >> (THP or hugetlb) is *not* super-high frequency just because of the
> >> latency for zeroing the page.
> > Heh but with a ton of free memory, and a thread zeroing some of
> > it out in the background, will this still be the case?
> > It could be that we'll be able to find clean pages
> > at all times.
> 
> In a systems where we have some asynchrounous zeroing of memory where
> freed, non-zeroed memory is sequestered out of the allocator, yeah, that
> could make sense.
> 
> But, that's not what we have today.

Right. I wonder whether it's smart to build this assumption
into a host/guest interface though.

> >> A virtio ring seems like an overblown solution to a non-existent problem.
> > It would be nice to see some traces to help us decide one way or the other.
> 
> Yeah, agreed.  Sounds like we need some more testing to see if these
> approaches hit bottlenecks anywhere.
Alexander Duyck Feb. 11, 2019, 9 p.m. UTC | #23
On Mon, 2019-02-11 at 14:54 -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 11, 2019 at 10:10:06AM -0800, Alexander Duyck wrote:
> > On Mon, 2019-02-11 at 12:36 -0500, Michael S. Tsirkin wrote:
> > > On Mon, Feb 11, 2019 at 08:31:34AM -0800, Alexander Duyck wrote:
> > > > On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> > > > > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > > > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > > 
> > > > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > > > performing one per 4K page.
> > > > > 
> > > > > Even 2M pages start to get expensive with a TB guest.
> > > > 
> > > > Agreed.
> > > > 
> > > > > Really it seems we want a virtio ring so we can pass a batch of these.
> > > > > E.g. 256 entries, 2M each - that's more like it.
> > > > 
> > > > The only issue I see with doing that is that we then have to defer the
> > > > freeing. Doing that is going to introduce issues in the guest as we are
> > > > going to have pages going unused for some period of time while we wait
> > > > for the hint to complete, and we cannot just pull said pages back. I'm
> > > > not really a fan of the asynchronous nature of Nitesh's patches for
> > > > this reason.
> > > 
> > > Well nothing prevents us from doing an extra exit to the hypervisor if
> > > we want. The asynchronous nature is there as an optimization
> > > to allow hypervisor to do its thing on a separate CPU.
> > > Why not proceed doing other things meanwhile?
> > > And if the reason is that we are short on memory, then
> > > maybe we should be less aggressive in hinting?
> > > 
> > > E.g. if we just have 2 pages:
> > > 
> > > hint page 1
> > > page 1 hint processed?
> > > 	yes - proceed to page 2
> > > 	no - wait for interrupt
> > > 
> > > get interrupt that page 1 hint is processed
> > > hint page 2
> > > 
> > > 
> > > If hypervisor happens to be running on same CPU it
> > > can process things synchronously and we never enter
> > > the no branch.
> > > 
> > 
> > Another concern I would have about processing this asynchronously is
> > that we have the potential for multiple guest CPUs to become
> > bottlenecked by a single host CPU. I am not sure if that is something
> > that would be desirable.
> 
> Well with a hypercall per page the fix is to block VCPU
> completely which is also not for everyone.
> 
> If you can't push a free page hint to host, then
> ideally you just won't. That's a nice property of
> hinting we have upstream right now.
> Host too busy - hinting is just skipped.

Right, but if you do that then there is a potential to end up missing
hints for a large portion of memory. It seems like you would end up
with even bigger issues since then at that point you have essentially
leaked memory.

I would think you would need a way to resync the host and the guest
after something like that. Otherwise you can have memory that will just
go unused for an extended period if a guest just goes idle.

> > > > > > Using the huge TLB order became the obvious
> > > > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > > > order memory on the host.
> > > > > > 
> > > > > > I have limited the functionality so that it doesn't work when page
> > > > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > > > cycles to do so.
> > > > > 
> > > > > Again that's leaking host implementation detail into guest interface.
> > > > > 
> > > > > We are giving guest page hints to host that makes sense,
> > > > > weird interactions with other features due to host
> > > > > implementation details should be handled by host.
> > > > 
> > > > I don't view this as a host implementation detail, this is guest
> > > > feature making use of all pages for debugging. If we are placing poison
> > > > values in the page then I wouldn't consider them an unused page, it is
> > > > being actively used to store the poison value.
> > > 
> > > Well I guess it's a valid point of view for a kernel hacker, but they are
> > > unused from application's point of view.
> > > However poisoning is transparent to users and most distro users
> > > are not aware of it going on. They just know that debug kernels
> > > are slower.
> > > User loading a debug kernel and immediately breaking overcommit
> > > is an unpleasant experience.
> > 
> > How would that be any different then a user loading an older kernel
> > that doesn't have this feature and breaking overcommit as a result?
> 
> Well old kernel does not have the feature so nothing to debug.
> When we have a new feature that goes away in the debug kernel,
> that's a big support problem since this leads to heisenbugs.

Trying to debug host features from the guest would be a pain anyway as
a guest shouldn't even really know what the underlying setup of the
guest is supposed to be.

> > I still think it would be better if we left the poisoning enabled in
> > such a case and just displayed a warning message if nothing else that
> > hinting is disabled because of page poisoning.
> > 
> > One other thought I had on this is that one side effect of page
> > poisoning is probably that KSM would be able to merge all of the poison
> > pages together into a single page since they are all set to the same
> > values. So even with the poisoned pages it would be possible to reduce
> > total memory overhead.
> 
> Right. And BTW one thing that host can do is pass
> the hinted area to KSM for merging.
> That requires an alloc hook to free it though.
> 
> Or we could add a per-VMA byte with the poison
> value and use that on host to populate pages on fault.
> 
> 
> > > > If we can achieve this
> > > > and free the page back to the host then even better, but until the
> > > > features can coexist we should not use the page hinting while page
> > > > poisoning is enabled.
> > > 
> > > Existing hinting in balloon allows them to coexist so I think we
> > > need to set the bar just as high for any new variant.
> > 
> > That is what I heard. I will have to look into this.
> 
> It's not doing anything smart right now, just checks
> that poison == 0 and skips freeing if not.
> But it can be enhanced transparently to guests.

Okay, so it probably should be extended to add something like poison
page that could replace the zero page for reads to a page that has been
unmapped.

> > > > This is one of the reasons why I was opposed to just disabling page
> > > > poisoning when this feature was enabled in Nitesh's patches. If the
> > > > guest has page poisoning enabled it is doing something with the page.
> > > > It shouldn't be prevented from doing that because the host wants to
> > > > have the option to free the pages.
> > > 
> > > I agree but I think the decision belongs on the host. I.e.
> > > hint the page but tell the host it needs to be careful
> > > about the poison value. It might also mean we
> > > need to make sure poisoning happens after the hinting, not before.
> > 
> > The only issue with poisoning after instead of before is that the hint
> > is ignored and we end up triggering a page fault and zero as a result.
> > It might make more sense to have an architecture specific call that can
> > be paravirtualized to handle the case of poisoning the page for us if
> > we have the unused page hint enabled. Otherwise the write to the page
> > is a given to invalidate the hint.
> 
> Sounds interesting. So the arch hook will first poison and
> then pass the page to the host?
> 
> Or we can also ask the host to poison for us, problem is this forces
> host to either always write into page, or call MADV_DONTNEED,
> without it could do MADV_FREE. Maybe that is not a big issue.

I would think we would ask the host to poison for us. If I am not
mistaken both solutions right now are using MADV_DONTNEED. I would tend
to lean that way if we are doing page poisoning since the cost for
zeroing/poisoning the page on the host could be canceled out by
dropping the page poisoning on the guest.

Then again since we are doing higher order pages only, and the
poisoning is supposed to happen before we get into __free_one_page we
would probably have to do both the poisoning, and the poison on fault.
Michael S. Tsirkin Feb. 11, 2019, 10:52 p.m. UTC | #24
On Mon, Feb 11, 2019 at 01:00:53PM -0800, Alexander Duyck wrote:
> On Mon, 2019-02-11 at 14:54 -0500, Michael S. Tsirkin wrote:
> > On Mon, Feb 11, 2019 at 10:10:06AM -0800, Alexander Duyck wrote:
> > > On Mon, 2019-02-11 at 12:36 -0500, Michael S. Tsirkin wrote:
> > > > On Mon, Feb 11, 2019 at 08:31:34AM -0800, Alexander Duyck wrote:
> > > > > On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> > > > > > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > > > > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > > > 
> > > > > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > > > > performing one per 4K page.
> > > > > > 
> > > > > > Even 2M pages start to get expensive with a TB guest.
> > > > > 
> > > > > Agreed.
> > > > > 
> > > > > > Really it seems we want a virtio ring so we can pass a batch of these.
> > > > > > E.g. 256 entries, 2M each - that's more like it.
> > > > > 
> > > > > The only issue I see with doing that is that we then have to defer the
> > > > > freeing. Doing that is going to introduce issues in the guest as we are
> > > > > going to have pages going unused for some period of time while we wait
> > > > > for the hint to complete, and we cannot just pull said pages back. I'm
> > > > > not really a fan of the asynchronous nature of Nitesh's patches for
> > > > > this reason.
> > > > 
> > > > Well nothing prevents us from doing an extra exit to the hypervisor if
> > > > we want. The asynchronous nature is there as an optimization
> > > > to allow hypervisor to do its thing on a separate CPU.
> > > > Why not proceed doing other things meanwhile?
> > > > And if the reason is that we are short on memory, then
> > > > maybe we should be less aggressive in hinting?
> > > > 
> > > > E.g. if we just have 2 pages:
> > > > 
> > > > hint page 1
> > > > page 1 hint processed?
> > > > 	yes - proceed to page 2
> > > > 	no - wait for interrupt
> > > > 
> > > > get interrupt that page 1 hint is processed
> > > > hint page 2
> > > > 
> > > > 
> > > > If hypervisor happens to be running on same CPU it
> > > > can process things synchronously and we never enter
> > > > the no branch.
> > > > 
> > > 
> > > Another concern I would have about processing this asynchronously is
> > > that we have the potential for multiple guest CPUs to become
> > > bottlenecked by a single host CPU. I am not sure if that is something
> > > that would be desirable.
> > 
> > Well with a hypercall per page the fix is to block VCPU
> > completely which is also not for everyone.
> > 
> > If you can't push a free page hint to host, then
> > ideally you just won't. That's a nice property of
> > hinting we have upstream right now.
> > Host too busy - hinting is just skipped.
> 
> Right, but if you do that then there is a potential to end up missing
> hints for a large portion of memory. It seems like you would end up
> with even bigger issues since then at that point you have essentially
> leaked memory.
> I would think you would need a way to resync the host and the guest
> after something like that. Otherwise you can have memory that will just
> go unused for an extended period if a guest just goes idle.

Yes and that is my point.  Existing hints code will just take a page off
the free list in that case so it resyncs using the free list.

Something like this could work then: mark up
hinted pages with a flag (it's easy to find unused
flags for free pages) then when you get an interrupt
because outstanding hints have been consumed,
get unflagged/unhinted pages from buddy and pass
them to host.


> 
> > > > > > > Using the huge TLB order became the obvious
> > > > > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > > > > order memory on the host.
> > > > > > > 
> > > > > > > I have limited the functionality so that it doesn't work when page
> > > > > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > > > > cycles to do so.
> > > > > > 
> > > > > > Again that's leaking host implementation detail into guest interface.
> > > > > > 
> > > > > > We are giving guest page hints to host that makes sense,
> > > > > > weird interactions with other features due to host
> > > > > > implementation details should be handled by host.
> > > > > 
> > > > > I don't view this as a host implementation detail, this is guest
> > > > > feature making use of all pages for debugging. If we are placing poison
> > > > > values in the page then I wouldn't consider them an unused page, it is
> > > > > being actively used to store the poison value.
> > > > 
> > > > Well I guess it's a valid point of view for a kernel hacker, but they are
> > > > unused from application's point of view.
> > > > However poisoning is transparent to users and most distro users
> > > > are not aware of it going on. They just know that debug kernels
> > > > are slower.
> > > > User loading a debug kernel and immediately breaking overcommit
> > > > is an unpleasant experience.
> > > 
> > > How would that be any different then a user loading an older kernel
> > > that doesn't have this feature and breaking overcommit as a result?
> > 
> > Well old kernel does not have the feature so nothing to debug.
> > When we have a new feature that goes away in the debug kernel,
> > that's a big support problem since this leads to heisenbugs.
> 
> Trying to debug host features from the guest would be a pain anyway as
> a guest shouldn't even really know what the underlying setup of the
> guest is supposed to be.

I'm talking about debugging the guest though.

> > > I still think it would be better if we left the poisoning enabled in
> > > such a case and just displayed a warning message if nothing else that
> > > hinting is disabled because of page poisoning.
> > > 
> > > One other thought I had on this is that one side effect of page
> > > poisoning is probably that KSM would be able to merge all of the poison
> > > pages together into a single page since they are all set to the same
> > > values. So even with the poisoned pages it would be possible to reduce
> > > total memory overhead.
> > 
> > Right. And BTW one thing that host can do is pass
> > the hinted area to KSM for merging.
> > That requires an alloc hook to free it though.
> > 
> > Or we could add a per-VMA byte with the poison
> > value and use that on host to populate pages on fault.
> > 
> > 
> > > > > If we can achieve this
> > > > > and free the page back to the host then even better, but until the
> > > > > features can coexist we should not use the page hinting while page
> > > > > poisoning is enabled.
> > > > 
> > > > Existing hinting in balloon allows them to coexist so I think we
> > > > need to set the bar just as high for any new variant.
> > > 
> > > That is what I heard. I will have to look into this.
> > 
> > It's not doing anything smart right now, just checks
> > that poison == 0 and skips freeing if not.
> > But it can be enhanced transparently to guests.
> 
> Okay, so it probably should be extended to add something like poison
> page that could replace the zero page for reads to a page that has been
> unmapped.
> 
> > > > > This is one of the reasons why I was opposed to just disabling page
> > > > > poisoning when this feature was enabled in Nitesh's patches. If the
> > > > > guest has page poisoning enabled it is doing something with the page.
> > > > > It shouldn't be prevented from doing that because the host wants to
> > > > > have the option to free the pages.
> > > > 
> > > > I agree but I think the decision belongs on the host. I.e.
> > > > hint the page but tell the host it needs to be careful
> > > > about the poison value. It might also mean we
> > > > need to make sure poisoning happens after the hinting, not before.
> > > 
> > > The only issue with poisoning after instead of before is that the hint
> > > is ignored and we end up triggering a page fault and zero as a result.
> > > It might make more sense to have an architecture specific call that can
> > > be paravirtualized to handle the case of poisoning the page for us if
> > > we have the unused page hint enabled. Otherwise the write to the page
> > > is a given to invalidate the hint.
> > 
> > Sounds interesting. So the arch hook will first poison and
> > then pass the page to the host?
> > 
> > Or we can also ask the host to poison for us, problem is this forces
> > host to either always write into page, or call MADV_DONTNEED,
> > without it could do MADV_FREE. Maybe that is not a big issue.
> 
> I would think we would ask the host to poison for us. If I am not
> mistaken both solutions right now are using MADV_DONTNEED. I would tend
> to lean that way if we are doing page poisoning since the cost for
> zeroing/poisoning the page on the host could be canceled out by
> dropping the page poisoning on the guest.
> 
> Then again since we are doing higher order pages only, and the
> poisoning is supposed to happen before we get into __free_one_page we
> would probably have to do both the poisoning, and the poison on fault.


Oh that's a nice trick. So in fact if we just make sure
we never report PAGE_SIZE pages then poisoning will
automatically happen before reporting?
So we just need to teach host to poison on fault.
Sounds cool and we can always optimize further later.
Alexander Duyck Feb. 12, 2019, 12:09 a.m. UTC | #25
On Mon, 2019-02-11 at 17:52 -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 11, 2019 at 01:00:53PM -0800, Alexander Duyck wrote:
> > On Mon, 2019-02-11 at 14:54 -0500, Michael S. Tsirkin wrote:
> > > On Mon, Feb 11, 2019 at 10:10:06AM -0800, Alexander Duyck wrote:
> > > > On Mon, 2019-02-11 at 12:36 -0500, Michael S. Tsirkin wrote:
> > > > > On Mon, Feb 11, 2019 at 08:31:34AM -0800, Alexander Duyck wrote:
> > > > > > On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> > > > > > > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > > > > > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > > > > 
> > > > > > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > > > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > > > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > > > > > performing one per 4K page.
> > > > > > > 
> > > > > > > Even 2M pages start to get expensive with a TB guest.
> > > > > > 
> > > > > > Agreed.
> > > > > > 
> > > > > > > Really it seems we want a virtio ring so we can pass a batch of these.
> > > > > > > E.g. 256 entries, 2M each - that's more like it.
> > > > > > 
> > > > > > The only issue I see with doing that is that we then have to defer the
> > > > > > freeing. Doing that is going to introduce issues in the guest as we are
> > > > > > going to have pages going unused for some period of time while we wait
> > > > > > for the hint to complete, and we cannot just pull said pages back. I'm
> > > > > > not really a fan of the asynchronous nature of Nitesh's patches for
> > > > > > this reason.
> > > > > 
> > > > > Well nothing prevents us from doing an extra exit to the hypervisor if
> > > > > we want. The asynchronous nature is there as an optimization
> > > > > to allow hypervisor to do its thing on a separate CPU.
> > > > > Why not proceed doing other things meanwhile?
> > > > > And if the reason is that we are short on memory, then
> > > > > maybe we should be less aggressive in hinting?
> > > > > 
> > > > > E.g. if we just have 2 pages:
> > > > > 
> > > > > hint page 1
> > > > > page 1 hint processed?
> > > > > 	yes - proceed to page 2
> > > > > 	no - wait for interrupt
> > > > > 
> > > > > get interrupt that page 1 hint is processed
> > > > > hint page 2
> > > > > 
> > > > > 
> > > > > If hypervisor happens to be running on same CPU it
> > > > > can process things synchronously and we never enter
> > > > > the no branch.
> > > > > 
> > > > 
> > > > Another concern I would have about processing this asynchronously is
> > > > that we have the potential for multiple guest CPUs to become
> > > > bottlenecked by a single host CPU. I am not sure if that is something
> > > > that would be desirable.
> > > 
> > > Well with a hypercall per page the fix is to block VCPU
> > > completely which is also not for everyone.
> > > 
> > > If you can't push a free page hint to host, then
> > > ideally you just won't. That's a nice property of
> > > hinting we have upstream right now.
> > > Host too busy - hinting is just skipped.
> > 
> > Right, but if you do that then there is a potential to end up missing
> > hints for a large portion of memory. It seems like you would end up
> > with even bigger issues since then at that point you have essentially
> > leaked memory.
> > I would think you would need a way to resync the host and the guest
> > after something like that. Otherwise you can have memory that will just
> > go unused for an extended period if a guest just goes idle.
> 
> Yes and that is my point.  Existing hints code will just take a page off
> the free list in that case so it resyncs using the free list.
> 
> Something like this could work then: mark up
> hinted pages with a flag (its easy to find unused
> flags for free pages) then when you get an interrupt
> because outstanding hints have been consumed,
> get unflagged/unhinted pages from buddy and pass
> them to host.

Ugh. This is beginning to sound like yet another daemon that will have
to be running to handle missed sync events.

I really think that taking an async approach for this will be nothing
but trouble. You are going to have a difficult time maintaining any
sort of coherency on the freelist without the daemon having to take the
zone lock and then notify the host of what is free and what isn't.

> > 
> > > > > > > > Using the huge TLB order became the obvious
> > > > > > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > > > > > order memory on the host.
> > > > > > > > 
> > > > > > > > I have limited the functionality so that it doesn't work when page
> > > > > > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > > > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > > > > > cycles to do so.
> > > > > > > 
> > > > > > > Again that's leaking host implementation detail into guest interface.
> > > > > > > 
> > > > > > > We are giving guest page hints to host that makes sense,
> > > > > > > weird interactions with other features due to host
> > > > > > > implementation details should be handled by host.
> > > > > > 
> > > > > > I don't view this as a host implementation detail, this is guest
> > > > > > feature making use of all pages for debugging. If we are placing poison
> > > > > > values in the page then I wouldn't consider them an unused page, it is
> > > > > > being actively used to store the poison value.
> > > > > 
> > > > > Well I guess it's a valid point of view for a kernel hacker, but they are
> > > > > unused from application's point of view.
> > > > > However poisoning is transparent to users and most distro users
> > > > > are not aware of it going on. They just know that debug kernels
> > > > > are slower.
> > > > > User loading a debug kernel and immediately breaking overcommit
> > > > > is an unpleasant experience.
> > > > 
> > > > How would that be any different than a user loading an older kernel
> > > > that doesn't have this feature and breaking overcommit as a result?
> > > 
> > > Well old kernel does not have the feature so nothing to debug.
> > > When we have a new feature that goes away in the debug kernel,
> > > that's a big support problem since this leads to heisenbugs.
> > 
> > Trying to debug host features from the guest would be a pain anyway as
> > a guest shouldn't even really know what the underlying setup of the
> > guest is supposed to be.
> 
> I'm talking about debugging the guest though.

Right. But my point is if it is a guest feature related to memory that
you are debugging, then disabling the page hinting would probably be an
advisable step anyway since it would have the potential for memory
corruptions itself due to its nature.

> > > > I still think it would be better if we left the poisoning enabled in
> > > > such a case and just displayed a warning message if nothing else that
> > > > hinting is disabled because of page poisoning.
> > > > 
> > > > One other thought I had on this is that one side effect of page
> > > > poisoning is probably that KSM would be able to merge all of the poison
> > > > pages together into a single page since they are all set to the same
> > > > values. So even with the poisoned pages it would be possible to reduce
> > > > total memory overhead.
> > > 
> > > Right. And BTW one thing that host can do is pass
> > > the hinted area to KSM for merging.
> > > That requires an alloc hook to free it though.
> > > 
> > > Or we could add a per-VMA byte with the poison
> > > value and use that on host to populate pages on fault.
> > > 
> > > 
> > > > > > If we can achieve this
> > > > > > and free the page back to the host then even better, but until the
> > > > > > features can coexist we should not use the page hinting while page
> > > > > > poisoning is enabled.
> > > > > 
> > > > > Existing hinting in balloon allows them to coexist so I think we
> > > > > need to set the bar just as high for any new variant.
> > > > 
> > > > That is what I heard. I will have to look into this.
> > > 
> > > It's not doing anything smart right now, just checks
> > > that poison == 0 and skips freeing if not.
> > > But it can be enhanced transparently to guests.
> > 
> > Okay, so it probably should be extended to add something like poison
> > page that could replace the zero page for reads to a page that has been
> > unmapped.
> > 
> > > > > > This is one of the reasons why I was opposed to just disabling page
> > > > > > poisoning when this feature was enabled in Nitesh's patches. If the
> > > > > > guest has page poisoning enabled it is doing something with the page.
> > > > > > It shouldn't be prevented from doing that because the host wants to
> > > > > > have the option to free the pages.
> > > > > 
> > > > > I agree but I think the decision belongs on the host. I.e.
> > > > > hint the page but tell the host it needs to be careful
> > > > > about the poison value. It might also mean we
> > > > > need to make sure poisoning happens after the hinting, not before.
> > > > 
> > > > The only issue with poisoning after instead of before is that the hint
> > > > is ignored and we end up triggering a page fault and zero as a result.
> > > > It might make more sense to have an architecture specific call that can
> > > > be paravirtualized to handle the case of poisoning the page for us if
> > > > we have the unused page hint enabled. Otherwise the write to the page
> > > > is a given to invalidate the hint.
> > > 
> > > Sounds interesting. So the arch hook will first poison and
> > > then pass the page to the host?
> > > 
> > > Or we can also ask the host to poison for us, problem is this forces
> > > host to either always write into page, or call MADV_DONTNEED,
> > > without it could do MADV_FREE. Maybe that is not a big issue.
> > 
> > I would think we would ask the host to poison for us. If I am not
> > mistaken both solutions right now are using MADV_DONTNEED. I would tend
> > to lean that way if we are doing page poisoning since the cost for
> > zeroing/poisoning the page on the host could be canceled out by
> > dropping the page poisoning on the guest.
> > 
> > Then again since we are doing higher order pages only, and the
> > poisoning is supposed to happen before we get into __free_one_page we
> > would probably have to do both the poisoning, and the poison on fault.
> 
> 
> Oh that's a nice trick. So in fact if we just make sure
> we never report PAGE_SIZE pages then poisoning will
> automatically happen before reporting?
> So we just need to teach host to poison on fault.
> Sounds cool and we can always optimize further later.

That is kind of what I was thinking. In the grand scheme of things I
figure most of the expense is in the fault and page zeroing bits of the
code path. I have done a bit of testing today with the patch that just
drops the messages if a device is assigned, and just the hypercall bits
are only causing about a 2.5% regression in performance on a will-it-
scale/page-fault1 test. However if I commit to the full setup with the
madvise, page fault, and zeroing then I am seeing an 11.5% drop in
performance.

I think in order to really make this pay off we may need to look into
avoiding zeroing or poisoning the page in both the host and the guest.
I will have to look into some things, as it looks like somebody from
Intel may have been working on addressing that, based on the
presentation I found at the link below:

https://www.lfasiallc.com/wp-content/uploads/2017/11/Use-Hyper-V-Enlightenments-to-Increase-KVM-VM-Performance_Density_Chao-Peng.pdf
Michael S. Tsirkin Feb. 12, 2019, 12:34 a.m. UTC | #26
On Mon, Feb 11, 2019 at 04:09:53PM -0800, Alexander Duyck wrote:
> On Mon, 2019-02-11 at 17:52 -0500, Michael S. Tsirkin wrote:
> > On Mon, Feb 11, 2019 at 01:00:53PM -0800, Alexander Duyck wrote:
> > > On Mon, 2019-02-11 at 14:54 -0500, Michael S. Tsirkin wrote:
> > > > On Mon, Feb 11, 2019 at 10:10:06AM -0800, Alexander Duyck wrote:
> > > > > On Mon, 2019-02-11 at 12:36 -0500, Michael S. Tsirkin wrote:
> > > > > > On Mon, Feb 11, 2019 at 08:31:34AM -0800, Alexander Duyck wrote:
> > > > > > > On Sat, 2019-02-09 at 19:49 -0500, Michael S. Tsirkin wrote:
> > > > > > > > On Mon, Feb 04, 2019 at 10:15:52AM -0800, Alexander Duyck wrote:
> > > > > > > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > > > > > 
> > > > > > > > > Add guest support for providing free memory hints to the KVM hypervisor for
> > > > > > > > > freed pages huge TLB size or larger. I am restricting the size to
> > > > > > > > > huge TLB order and larger because the hypercalls are too expensive to be
> > > > > > > > > performing one per 4K page.
> > > > > > > > 
> > > > > > > > Even 2M pages start to get expensive with a TB guest.
> > > > > > > 
> > > > > > > Agreed.
> > > > > > > 
> > > > > > > > Really it seems we want a virtio ring so we can pass a batch of these.
> > > > > > > > E.g. 256 entries, 2M each - that's more like it.
> > > > > > > 
> > > > > > > The only issue I see with doing that is that we then have to defer the
> > > > > > > freeing. Doing that is going to introduce issues in the guest as we are
> > > > > > > going to have pages going unused for some period of time while we wait
> > > > > > > for the hint to complete, and we cannot just pull said pages back. I'm
> > > > > > > not really a fan of the asynchronous nature of Nitesh's patches for
> > > > > > > this reason.
> > > > > > 
> > > > > > Well nothing prevents us from doing an extra exit to the hypervisor if
> > > > > > we want. The asynchronous nature is there as an optimization
> > > > > > to allow hypervisor to do its thing on a separate CPU.
> > > > > > Why not proceed doing other things meanwhile?
> > > > > > And if the reason is that we are short on memory, then
> > > > > > maybe we should be less aggressive in hinting?
> > > > > > 
> > > > > > E.g. if we just have 2 pages:
> > > > > > 
> > > > > > hint page 1
> > > > > > page 1 hint processed?
> > > > > > 	yes - proceed to page 2
> > > > > > 	no - wait for interrupt
> > > > > > 
> > > > > > get interrupt that page 1 hint is processed
> > > > > > hint page 2
> > > > > > 
> > > > > > 
> > > > > > If hypervisor happens to be running on same CPU it
> > > > > > can process things synchronously and we never enter
> > > > > > the no branch.
> > > > > > 
> > > > > 
> > > > > Another concern I would have about processing this asynchronously is
> > > > > that we have the potential for multiple guest CPUs to become
> > > > > bottlenecked by a single host CPU. I am not sure if that is something
> > > > > that would be desirable.
> > > > 
> > > > Well with a hypercall per page the fix is to block VCPU
> > > > completely which is also not for everyone.
> > > > 
> > > > If you can't push a free page hint to host, then
> > > > ideally you just won't. That's a nice property of
> > > > hinting we have upstream right now.
> > > > Host too busy - hinting is just skipped.
> > > 
> > > Right, but if you do that then there is a potential to end up missing
> > > hints for a large portion of memory. It seems like you would end up
> > > with even bigger issues since then at that point you have essentially
> > > leaked memory.
> > > I would think you would need a way to resync the host and the guest
> > > after something like that. Otherwise you can have memory that will just
> > > go unused for an extended period if a guest just goes idle.
> > 
> > Yes and that is my point.  Existing hints code will just take a page off
> > the free list in that case so it resyncs using the free list.
> > 
> > Something like this could work then: mark up
> > hinted pages with a flag (it's easy to find unused
> > flags for free pages) then when you get an interrupt
> > because outstanding hints have been consumed,
> > get unflagged/unhinted pages from buddy and pass
> > them to host.
> 
> Ugh. This is beginning to sound like yet another daemon that will have
> to be running to handle missed sync events.

Why a daemon? Not at all. You get an interrupt, you schedule
a wq immediately or just do it from the interrupt handler.

> I really think that taking an async approach for this will be nothing
> but trouble. You are going to have a difficult time maintaining any
> sort of coherency on the freelist without the daemon having to take the
> zone lock and then notify the host of what is free and what isn't.

We seem to be doing fine without zone lock for now.
Just plain alloc_pages.

> > > 
> > > > > > > > > Using the huge TLB order became the obvious
> > > > > > > > > choice for the order to use as it allows us to avoid fragmentation of higher
> > > > > > > > > order memory on the host.
> > > > > > > > > 
> > > > > > > > > I have limited the functionality so that it doesn't work when page
> > > > > > > > > poisoning is enabled. I did this because a write to the page after doing an
> > > > > > > > > MADV_DONTNEED would effectively negate the hint, so it would be wasting
> > > > > > > > > cycles to do so.
> > > > > > > > 
> > > > > > > > Again that's leaking host implementation detail into guest interface.
> > > > > > > > 
> > > > > > > > We are giving guest page hints to host that makes sense,
> > > > > > > > weird interactions with other features due to host
> > > > > > > > implementation details should be handled by host.
> > > > > > > 
> > > > > > > I don't view this as a host implementation detail, this is guest
> > > > > > > feature making use of all pages for debugging. If we are placing poison
> > > > > > > values in the page then I wouldn't consider them an unused page, it is
> > > > > > > being actively used to store the poison value.
> > > > > > 
> > > > > > Well I guess it's a valid point of view for a kernel hacker, but they are
> > > > > > unused from application's point of view.
> > > > > > However poisoning is transparent to users and most distro users
> > > > > > are not aware of it going on. They just know that debug kernels
> > > > > > are slower.
> > > > > > User loading a debug kernel and immediately breaking overcommit
> > > > > > is an unpleasant experience.
> > > > > 
> > > > > How would that be any different than a user loading an older kernel
> > > > > that doesn't have this feature and breaking overcommit as a result?
> > > > 
> > > > Well old kernel does not have the feature so nothing to debug.
> > > > When we have a new feature that goes away in the debug kernel,
> > > > that's a big support problem since this leads to heisenbugs.
> > > 
> > > Trying to debug host features from the guest would be a pain anyway as
> > > a guest shouldn't even really know what the underlying setup of the
> > > guest is supposed to be.
> > 
> > I'm talking about debugging the guest though.
> 
> Right. But my point is if it is a guest feature related to memory that
> you are debugging, then disabling the page hinting would probably be an
> advisable step anyway since it would have the potential for memory
> corruptions itself due to its nature.

Oh absolutely. So that's why I wanted debug kernel to be
as close as possible to non-debug one in that respect.
If one gets a corruption we want it reproducible on debug too.

> > > > > I still think it would be better if we left the poisoning enabled in
> > > > > such a case and just displayed a warning message if nothing else that
> > > > > hinting is disabled because of page poisoning.
> > > > > 
> > > > > One other thought I had on this is that one side effect of page
> > > > > poisoning is probably that KSM would be able to merge all of the poison
> > > > > pages together into a single page since they are all set to the same
> > > > > values. So even with the poisoned pages it would be possible to reduce
> > > > > total memory overhead.
> > > > 
> > > > Right. And BTW one thing that host can do is pass
> > > > the hinted area to KSM for merging.
> > > > That requires an alloc hook to free it though.
> > > > 
> > > > Or we could add a per-VMA byte with the poison
> > > > value and use that on host to populate pages on fault.
> > > > 
> > > > 
> > > > > > > If we can achieve this
> > > > > > > and free the page back to the host then even better, but until the
> > > > > > > features can coexist we should not use the page hinting while page
> > > > > > > poisoning is enabled.
> > > > > > 
> > > > > > Existing hinting in balloon allows them to coexist so I think we
> > > > > > need to set the bar just as high for any new variant.
> > > > > 
> > > > > That is what I heard. I will have to look into this.
> > > > 
> > > > It's not doing anything smart right now, just checks
> > > > that poison == 0 and skips freeing if not.
> > > > But it can be enhanced transparently to guests.
> > > 
> > > Okay, so it probably should be extended to add something like poison
> > > page that could replace the zero page for reads to a page that has been
> > > unmapped.
> > > 
> > > > > > > This is one of the reasons why I was opposed to just disabling page
> > > > > > > poisoning when this feature was enabled in Nitesh's patches. If the
> > > > > > > guest has page poisoning enabled it is doing something with the page.
> > > > > > > It shouldn't be prevented from doing that because the host wants to
> > > > > > > have the option to free the pages.
> > > > > > 
> > > > > > I agree but I think the decision belongs on the host. I.e.
> > > > > > hint the page but tell the host it needs to be careful
> > > > > > about the poison value. It might also mean we
> > > > > > need to make sure poisoning happens after the hinting, not before.
> > > > > 
> > > > > The only issue with poisoning after instead of before is that the hint
> > > > > is ignored and we end up triggering a page fault and zero as a result.
> > > > > It might make more sense to have an architecture specific call that can
> > > > > be paravirtualized to handle the case of poisoning the page for us if
> > > > > we have the unused page hint enabled. Otherwise the write to the page
> > > > > is a given to invalidate the hint.
> > > > 
> > > > Sounds interesting. So the arch hook will first poison and
> > > > then pass the page to the host?
> > > > 
> > > > Or we can also ask the host to poison for us, problem is this forces
> > > > host to either always write into page, or call MADV_DONTNEED,
> > > > without it could do MADV_FREE. Maybe that is not a big issue.
> > > 
> > > I would think we would ask the host to poison for us. If I am not
> > > mistaken both solutions right now are using MADV_DONTNEED. I would tend
> > > to lean that way if we are doing page poisoning since the cost for
> > > zeroing/poisoning the page on the host could be canceled out by
> > > dropping the page poisoning on the guest.
> > > 
> > > Then again since we are doing higher order pages only, and the
> > > poisoning is supposed to happen before we get into __free_one_page we
> > > would probably have to do both the poisoning, and the poison on fault.
> > 
> > 
> > Oh that's a nice trick. So in fact if we just make sure
> > we never report PAGE_SIZE pages then poisoning will
> > automatically happen before reporting?
> > So we just need to teach host to poison on fault.
> > Sounds cool and we can always optimize further later.
> 
> That is kind of what I was thinking. In the grand scheme of things I
> figure most of the expense is in the fault and page zeroing bits of the
> code path. I have done a bit of testing today with the patch that just
> drops the messages if a device is assigned, and just the hypercall bits
> are only causing about a 2.5% regression in performance on a will-it-
> scale/page-fault1 test. However if I commit to the full setup with the
> madvise, page fault, and zeroing then I am seeing an 11.5% drop in
> performance.
> 
> I think in order to really make this pay off we may need to look into
> avoiding zeroing or poisoning the page in both the host and the guest.
> I will have to look into some things, as it looks like somebody from
> Intel may have been working on addressing that, based on the
> presentation I found at the link below:
> 
> https://www.lfasiallc.com/wp-content/uploads/2017/11/Use-Hyper-V-Enlightenments-to-Increase-KVM-VM-Performance_Density_Chao-Peng.pdf
>
diff mbox series

Patch

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 7555b48803a8..4487ad7a3385 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -18,6 +18,19 @@ 
 
 struct page;
 
+#ifdef CONFIG_KVM_GUEST
+#include <linux/jump_label.h>
+extern struct static_key_false pv_free_page_hint_enabled;
+
+#define HAVE_ARCH_FREE_PAGE
+void __arch_free_page(struct page *page, unsigned int order);
+static inline void arch_free_page(struct page *page, unsigned int order)
+{
+	if (static_branch_unlikely(&pv_free_page_hint_enabled))
+		__arch_free_page(page, order);
+}
+#endif
+
 #include <linux/range.h>
 extern struct range pfn_mapped[];
 extern int nr_pfn_mapped;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5c93a65ee1e5..09c91641c36c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -48,6 +48,7 @@ 
 #include <asm/tlb.h>
 
 static int kvmapf = 1;
+DEFINE_STATIC_KEY_FALSE(pv_free_page_hint_enabled);
 
 static int __init parse_no_kvmapf(char *arg)
 {
@@ -648,6 +649,15 @@  static void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+	/*
+	 * The free page hinting doesn't add much value if page poisoning
+	 * is enabled. So we only enable the feature if page poisoning is
+	 * not present.
+	 */
+	if (!page_poisoning_enabled() &&
+	    kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
+		static_branch_enable(&pv_free_page_hint_enabled);
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
@@ -762,6 +772,19 @@  static __init int kvm_setup_pv_tlb_flush(void)
 }
 arch_initcall(kvm_setup_pv_tlb_flush);
 
+void __arch_free_page(struct page *page, unsigned int order)
+{
+	/*
+	 * Limit hints to blocks no smaller than pageblock in
+	 * size to limit the cost for the hypercalls.
+	 */
+	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
+		return;
+
+	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
+		       PAGE_SIZE << order);
+}
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */