[v3,03/70] RAMBlock/guest_memfd: Enable KVM_GUEST_MEMFD_ALLOW_HUGEPAGE

Message ID: 20231115071519.2864957-4-xiaoyao.li@intel.com (mailing list archive)
State: New, archived
Series: QEMU Guest memfd + QEMU TDX support

Commit Message

Xiaoyao Li Nov. 15, 2023, 7:14 a.m. UTC
KVM allows KVM_GUEST_MEMFD_ALLOW_HUGEPAGE for guest memfd. When the
flag is set, KVM first tries to allocate memory with a transparent
hugepage and falls back to non-hugepage allocation on failure.

However, KVM imposes one restriction: the size must be hugepage-size
aligned when KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is set.

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
---
v3:
 - New one in v3.
---
 system/physmem.c | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

Comments

David Hildenbrand Nov. 15, 2023, 6:10 p.m. UTC | #1
On 15.11.23 08:14, Xiaoyao Li wrote:
> KVM allows KVM_GUEST_MEMFD_ALLOW_HUGEPAGE for guest memfd. When the
> flag is set, KVM first tries to allocate memory with a transparent
> hugepage and falls back to non-hugepage allocation on failure.
> 
> However, KVM imposes one restriction: the size must be hugepage-size
> aligned when KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is set.
> 
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
> v3:
>   - New one in v3.
> ---
>   system/physmem.c | 38 ++++++++++++++++++++++++++++++++++++--
>   1 file changed, 36 insertions(+), 2 deletions(-)
> 
> diff --git a/system/physmem.c b/system/physmem.c
> index 0af2213cbd9c..c56b17e44df6 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -1803,6 +1803,40 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
>       }
>   }
>   
> +#ifdef CONFIG_KVM
> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> +#define DEFAULT_PMD_SIZE (1ul << 21)
> +
> +static uint32_t get_thp_size(void)
> +{
> +    gchar *content = NULL;
> +    const char *endptr;
> +    static uint64_t thp_size = 0;
> +    uint64_t tmp;
> +
> +    if (thp_size != 0) {
> +        return thp_size;
> +    }
> +
> +    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
> +        !qemu_strtou64(content, &endptr, 0, &tmp) &&
> +        (!endptr || *endptr == '\n')) {
> +        /* Sanity-check the value and fallback to something reasonable. */
> +        if (!tmp || !is_power_of_2(tmp)) {
> +            warn_report("Read unsupported THP size: %" PRIx64, tmp);
> +        } else {
> +            thp_size = tmp;
> +        }
> +    }
> +
> +    if (!thp_size) {
> +        thp_size = DEFAULT_PMD_SIZE;
> +    }

... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)

This should be factored out into a common helper.
Xiaoyao Li Nov. 16, 2023, 2:47 a.m. UTC | #2
On 11/16/2023 2:10 AM, David Hildenbrand wrote:
> On 15.11.23 08:14, Xiaoyao Li wrote:
>> KVM allows KVM_GUEST_MEMFD_ALLOW_HUGEPAGE for guest memfd. When the
>> flag is set, KVM first tries to allocate memory with a transparent
>> hugepage and falls back to non-hugepage allocation on failure.
>>
>> However, KVM imposes one restriction: the size must be hugepage-size
>> aligned when KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is set.
>>
>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> ---
>> v3:
>>   - New one in v3.
>> ---
>>   system/physmem.c | 38 ++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 36 insertions(+), 2 deletions(-)
>>
>> diff --git a/system/physmem.c b/system/physmem.c
>> index 0af2213cbd9c..c56b17e44df6 100644
>> --- a/system/physmem.c
>> +++ b/system/physmem.c
>> @@ -1803,6 +1803,40 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
>>       }
>>   }
>> +#ifdef CONFIG_KVM
>> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>> +#define DEFAULT_PMD_SIZE (1ul << 21)
>> +
>> +static uint32_t get_thp_size(void)
>> +{
>> +    gchar *content = NULL;
>> +    const char *endptr;
>> +    static uint64_t thp_size = 0;
>> +    uint64_t tmp;
>> +
>> +    if (thp_size != 0) {
>> +        return thp_size;
>> +    }
>> +
>> +    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
>> +        !qemu_strtou64(content, &endptr, 0, &tmp) &&
>> +        (!endptr || *endptr == '\n')) {
>> +        /* Sanity-check the value and fallback to something reasonable. */
>> +        if (!tmp || !is_power_of_2(tmp)) {
>> +            warn_report("Read unsupported THP size: %" PRIx64, tmp);
>> +        } else {
>> +            thp_size = tmp;
>> +        }
>> +    }
>> +
>> +    if (!thp_size) {
>> +        thp_size = DEFAULT_PMD_SIZE;
>> +    }
> 
> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)

Got caught.

> This should be factored out into a common helper.

Sure, will do it in next version.
David Hildenbrand Nov. 20, 2023, 9:26 a.m. UTC | #3
>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
> 
> Got caught.
> 
>> This should be factored out into a common helper.
> 
> Sure, will do it in next version.

Factor it out in a separate patch. Then, this patch gets small enough
that you can just squash it into #2.

And my comment regarding "flags = 0" to patch #2 no longer applies :)
Xiaoyao Li Nov. 30, 2023, 7:32 a.m. UTC | #4
On 11/20/2023 5:26 PM, David Hildenbrand wrote:
> 
>>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
>>
>> Got caught.
>>
>>> This should be factored out into a common helper.
>>
>> Sure, will do it in next version.
> 
> Factor it out in a separate patch. Then, this patch gets small enough
> that you can just squash it into #2.
> 
> And my comment regarding "flags = 0" to patch #2 no longer applies :)
> 

I see.

But it depends on whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE will appear
together with the initial guest memfd in Linux (hopefully 6.8):
https://lore.kernel.org/all/CABgObfa=DH7FySBviF63OS9sVog_wt-AqYgtUAGKqnY5Bizivw@mail.gmail.com/

If, as Paolo committed, there is no KVM_GUEST_MEMFD_ALLOW_HUGEPAGE in the
initial merge, I will simplify Patch #2. Otherwise, I will factor out a
common function to get the hugepage size as you suggested.

Thanks!
Xiaoyao Li Nov. 30, 2023, 8 a.m. UTC | #5
On 11/20/2023 5:26 PM, David Hildenbrand wrote:
> 
>>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
>>
>> Got caught.
>>
>>> This should be factored out into a common helper.
>>
>> Sure, will do it in next version.
> 
> Factor it out in a separate patch. Then, this patch gets small enough
> that you can just squash it into #2.

A silly question. What file should the factored function be put in?

> And my comment regarding "flags = 0" to patch #2 no longer applies :)
>
David Hildenbrand Nov. 30, 2023, 10:59 a.m. UTC | #6
On 30.11.23 08:32, Xiaoyao Li wrote:
> On 11/20/2023 5:26 PM, David Hildenbrand wrote:
>>
>>>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
>>>
>>> Got caught.
>>>
>>>> This should be factored out into a common helper.
>>>
>>> Sure, will do it in next version.
>>
>> Factor it out in a separate patch. Then, this patch gets small enough
>> that you can just squash it into #2.
>>
>> And my comment regarding "flags = 0" to patch #2 no longer applies :)
>>
> 
> I see.
> 
> But it depends on whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE will appear
> together with the initial guest memfd in Linux (hopefully 6.8):
> https://lore.kernel.org/all/CABgObfa=DH7FySBviF63OS9sVog_wt-AqYgtUAGKqnY5Bizivw@mail.gmail.com/
> 

Doesn't seem to be in -next if I am looking at the right tree:

https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next
Sean Christopherson Nov. 30, 2023, 4:01 p.m. UTC | #7
On Thu, Nov 30, 2023, David Hildenbrand wrote:
> On 30.11.23 08:32, Xiaoyao Li wrote:
> > On 11/20/2023 5:26 PM, David Hildenbrand wrote:
> > > 
> > > > > ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
> > > > 
> > > > Got caught.
> > > > 
> > > > > This should be factored out into a common helper.
> > > > 
> > > > Sure, will do it in next version.
> > > 
> > > Factor it out in a separate patch. Then, this patch gets small enough
> > > that you can just squash it into #2.
> > > 
> > > And my comment regarding "flags = 0" to patch #2 no longer applies :)
> > > 
> > 
> > I see.
> > 
> > But it depends on whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE will appear
> > together with the initial guest memfd in Linux (hopefully 6.8):
> > https://lore.kernel.org/all/CABgObfa=DH7FySBviF63OS9sVog_wt-AqYgtUAGKqnY5Bizivw@mail.gmail.com/
> > 
> 
> Doesn't seem to be in -next if I am looking at the right tree:
> 
> https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next

Yeah, we punted on adding hugepage support for the initial guest_memfd merge so
as not to rush in kludgy uABI.  The internal KVM code isn't problematic, we just
haven't figured out exactly what the ABI should look like, e.g. should hugepages
be dependent on THP being enabled, and if not, how does userspace discover the
supported hugepage sizes?
David Hildenbrand Nov. 30, 2023, 4:54 p.m. UTC | #8
On 30.11.23 17:01, Sean Christopherson wrote:
> On Thu, Nov 30, 2023, David Hildenbrand wrote:
>> On 30.11.23 08:32, Xiaoyao Li wrote:
>>> On 11/20/2023 5:26 PM, David Hildenbrand wrote:
>>>>
>>>>>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
>>>>>
>>>>> Got caught.
>>>>>
>>>>>> This should be factored out into a common helper.
>>>>>
>>>>> Sure, will do it in next version.
>>>>
>>>> Factor it out in a separate patch. Then, this patch gets small enough
>>>> that you can just squash it into #2.
>>>>
>>>> And my comment regarding "flags = 0" to patch #2 no longer applies :)
>>>>
>>>
>>> I see.
>>>
>>> But it depends on whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE will appear
>>> together with the initial guest memfd in Linux (hopefully 6.8):
>>> https://lore.kernel.org/all/CABgObfa=DH7FySBviF63OS9sVog_wt-AqYgtUAGKqnY5Bizivw@mail.gmail.com/
>>>
>>
>> Doesn't seem to be in -next if I am looking at the right tree:
>>
>> https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next
> 
> Yeah, we punted on adding hugepage support for the initial guest_memfd merge so
> as not to rush in kludgy uABI.  The internal KVM code isn't problematic, we just
> haven't figured out exactly what the ABI should look like, e.g. should hugepages
> be dependent on THP being enabled, and if not, how does userspace discover the
> supported hugepage sizes?

Are we talking about THP or hugetlb? They are two different things, and 
"KVM_GUEST_MEMFD_ALLOW_HUGEPAGE" doesn't make it clearer what we are 
talking about.

This patch here "get_thp_size()" indicates that we care about THP, not 
hugetlb.


THP lives in:
	/sys/kernel/mm/transparent_hugepage/
and hugetlb in:
	/sys/kernel/mm/hugepages/

THP for shmem+anon currently really only supports PMD-sized THP, that 
size can be observed via:
	/sys/kernel/mm/transparent_hugepage/hpage_pmd_size

hugetlb sizes can be detected simply by looking at the folders inside
/sys/kernel/mm/hugepages/. "tools/testing/selftests/mm/vm_util.c" in the 
kernel has a function "detect_hugetlb_page_sizes()" that uses that 
interface to detect the sizes.
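
(A side note: a minimal sketch of that directory scan, in the spirit of the
kernel's detect_hugetlb_page_sizes(); the glib-based helper name and shape
here are made up for illustration, not existing QEMU code:)

    #include <glib.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical helper: enumerate hugetlb page sizes by scanning the
     * sysfs folders, e.g. "hugepages-2048kB" -> 2 MiB. */
    static int detect_hugetlb_sizes(uint64_t *sizes, int max)
    {
        GDir *dir = g_dir_open("/sys/kernel/mm/hugepages", 0, NULL);
        const char *name;
        int n = 0;

        if (!dir) {
            return 0; /* hugetlb not configured on this kernel */
        }
        while (n < max && (name = g_dir_read_name(dir))) {
            unsigned long long kb;

            if (sscanf(name, "hugepages-%llukB", &kb) == 1) {
                sizes[n++] = (uint64_t)kb * 1024;
            }
        }
        g_dir_close(dir);
        return n;
    }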


But likely we want THP support here. Because for hugetlb, one would 
actually have to instruct the kernel which size to use, like we do for 
memfd with hugetlb.
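
(For contrast, a sketch of how that explicit instruction looks with memfd
today; error handling elided, and MFD_HUGE_2MB may need <linux/memfd.h> on
older toolchains:)

    #define _GNU_SOURCE
    #include <sys/mman.h>   /* memfd_create(), MFD_HUGETLB, MFD_HUGE_2MB */

    static int create_hugetlb_memfd(void)
    {
        /* With hugetlb the caller picks the page size explicitly
         * (2 MiB here); THP needs no such instruction. */
        return memfd_create("guest-ram", MFD_HUGETLB | MFD_HUGE_2MB);
    }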


Anon support for smaller sizes than PMDs is in the works, and once 
upstream, it can then be detected via 
/sys/kernel/mm/transparent_hugepage/ as well.

shmem support for smaller sizes is partially in the works: only on the 
write() path. Likely, we'll make it configurable/observable in 
/sys/kernel/mm/transparent_hugepage/ as well.


So if we are talking about THP for shmem, there really only is 
/sys/kernel/mm/transparent_hugepage/hpage_pmd_size.
Peter Xu Nov. 30, 2023, 5:46 p.m. UTC | #9
On Thu, Nov 30, 2023 at 05:54:26PM +0100, David Hildenbrand wrote:
> But likely we want THP support here. Because for hugetlb, one would actually
> have to instruct the kernel which size to use, like we do for memfd with
> hugetlb.

I doubt it, as the VM can still leverage larger sizes if possible?

IIUC, one of the major challenges of gmem hugepages is how to support
security features while reusing existing mm infrastructure as much as
possible.

Thanks,
Daniel P. Berrangé Nov. 30, 2023, 5:51 p.m. UTC | #10
On Thu, Nov 30, 2023 at 05:54:26PM +0100, David Hildenbrand wrote:
> On 30.11.23 17:01, Sean Christopherson wrote:
> > On Thu, Nov 30, 2023, David Hildenbrand wrote:
> > > On 30.11.23 08:32, Xiaoyao Li wrote:
> > > > On 11/20/2023 5:26 PM, David Hildenbrand wrote:
> > > > > 
> > > > > > > ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
> > > > > > 
> > > > > > Got caught.
> > > > > > 
> > > > > > > This should be factored out into a common helper.
> > > > > > 
> > > > > > Sure, will do it in next version.
> > > > > 
> > > > > Factor it out in a separate patch. Then, this patch gets small enough
> > > > > that you can just squash it into #2.
> > > > > 
> > > > > And my comment regarding "flags = 0" to patch #2 no longer applies :)
> > > > > 
> > > > 
> > > > I see.
> > > > 
> > > > But it depends on whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE will appear
> > > > together with the initial guest memfd in Linux (hopefully 6.8):
> > > > https://lore.kernel.org/all/CABgObfa=DH7FySBviF63OS9sVog_wt-AqYgtUAGKqnY5Bizivw@mail.gmail.com/
> > > > 
> > > 
> > > Doesn't seem to be in -next if I am looking at the right tree:
> > > 
> > > https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next
> > 
> > Yeah, we punted on adding hugepage support for the initial guest_memfd merge so
> > as not to rush in kludgy uABI.  The internal KVM code isn't problematic, we just
> > haven't figured out exactly what the ABI should look like, e.g. should hugepages
> > be dependent on THP being enabled, and if not, how does userspace discover the
> > supported hugepage sizes?
> 
> Are we talking about THP or hugetlb? They are two different things, and
> "KVM_GUEST_MEMFD_ALLOW_HUGEPAGE" doesn't make it clearer what we are talking
> about.
> 
> This patch here "get_thp_size()" indicates that we care about THP, not
> hugetlb.
> 
> 
> THP lives in:
> 	/sys/kernel/mm/transparent_hugepage/
> and hugetlb in:
> 	/sys/kernel/mm/hugepages/
> 
> THP for shmem+anon currently really only supports PMD-sized THP, that size
> can be observed via:
> 	/sys/kernel/mm/transparent_hugepage/hpage_pmd_size
> 
> hugetlb sizes can be detected simply by looking at the folders inside
> /sys/kernel/mm/hugepages/. "tools/testing/selftests/mm/vm_util.c" in the
> kernel has a function "detect_hugetlb_page_sizes()" that uses that interface
> to detect the sizes.
> 
> 
> But likely we want THP support here. Because for hugetlb, one would actually
> have to instruct the kernel which size to use, like we do for memfd with
> hugetlb.

Would we not want both ultimately ?

THP is good because it increases performance vs non-HP out of the box
without the user or mgmt app having to make any decisions.

It does not give you deterministic performance though, because it has
to opportunistically assign huge pages basd on what is available and
that may differ each time a VM is launched.  Explicit admin/mgmt app
controlled huge page usage gives determinism, at the cost of increased
mgmt overhead.

Both are valid use cases depending on the tradeoff a deployment and/or
mgmt app wants to make.


With regards,
Daniel
David Hildenbrand Nov. 30, 2023, 5:57 p.m. UTC | #11
On 30.11.23 18:46, Peter Xu wrote:
> On Thu, Nov 30, 2023 at 05:54:26PM +0100, David Hildenbrand wrote:
>> But likely we want THP support here. Because for hugetlb, one would actually
>> have to instruct the kernel which size to use, like we do for memfd with
>> hugetlb.
> 
> I doubt it, as the VM can still leverage larger sizes if possible?

What do you doubt? I am talking about the current implementation and 
expected semantics of KVM_GUEST_MEMFD_ALLOW_HUGEPAGE.
David Hildenbrand Nov. 30, 2023, 6:09 p.m. UTC | #12
On 30.11.23 18:57, David Hildenbrand wrote:
> On 30.11.23 18:46, Peter Xu wrote:
>> On Thu, Nov 30, 2023 at 05:54:26PM +0100, David Hildenbrand wrote:
>>> But likely we want THP support here. Because for hugetlb, one would actually
>>> have to instruct the kernel which size to use, like we do for memfd with
>>> hugetlb.
>>
>> I doubt it, as the VM can still leverage larger sizes if possible?
> 
> What do you doubt? I am talking about the current implementation and
> expected semantics of KVM_GUEST_MEMFD_ALLOW_HUGEPAGE.
> 

I looked at the kernel implementation, and it simply allocates a 
PMD-sized folio and puts it into the pagecache. So hugetlb is not involved.

That raises various questions:

1) What are the semantics if we ever allow migrating/compacting such
    folios? Would we allow splitting them into smaller pages when required
    (or compacting into larger ones)? What happens when we partially zap
    them (fallocate?) right now? IOW, do they behave like THP, and do we
    want them to behave like THP?

2) If they behave like THP, how would we be able to compact them into
    bigger pages? khugepaged only works on VMAs IIRC.

3) How would you allocate gigantic pages if not with the help of hugetlb
    and reserved pools? At least as of today, runtime allocation of
    gigantic pages is extremely unreliable and compaction into gigantic
    pages does not work. So gigantic pages would be something for the
    far distant future.

4) cont-pte-sizes folios?

Maybe it's all clarified already, in that case I'd appreciate a pointer.

Looking at the current code, it looks like it behaves like shmem thp, 
just without any way to collapse afterwards (unless I am missing something).
David Hildenbrand Nov. 30, 2023, 6:22 p.m. UTC | #13
On 30.11.23 18:51, Daniel P. Berrangé wrote:
> On Thu, Nov 30, 2023 at 05:54:26PM +0100, David Hildenbrand wrote:
>> On 30.11.23 17:01, Sean Christopherson wrote:
>>> On Thu, Nov 30, 2023, David Hildenbrand wrote:
>>>> On 30.11.23 08:32, Xiaoyao Li wrote:
>>>>> On 11/20/2023 5:26 PM, David Hildenbrand wrote:
>>>>>>
>>>>>>>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
>>>>>>>
>>>>>>> Got caught.
>>>>>>>
>>>>>>>> This should be factored out into a common helper.
>>>>>>>
>>>>>>> Sure, will do it in next version.
>>>>>>
>>>>>> Factor it out in a separate patch. Then, this patch gets small enough
>>>>>> that you can just squash it into #2.
>>>>>>
>>>>>> And my comment regarding "flags = 0" to patch #2 no longer applies :)
>>>>>>
>>>>>
>>>>> I see.
>>>>>
>>>>> But it depends on whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE will appear
>>>>> together with the initial guest memfd in Linux (hopefully 6.8):
>>>>> https://lore.kernel.org/all/CABgObfa=DH7FySBviF63OS9sVog_wt-AqYgtUAGKqnY5Bizivw@mail.gmail.com/
>>>>>
>>>>
>>>> Doesn't seem to be in -next if I am looking at the right tree:
>>>>
>>>> https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next
>>>
>>> Yeah, we punted on adding hugepage support for the initial guest_memfd merge so
>>> as not to rush in kludgy uABI.  The internal KVM code isn't problematic, we just
>>> haven't figured out exactly what the ABI should look like, e.g. should hugepages
>>> be dependent on THP being enabled, and if not, how does userspace discover the
>>> supported hugepage sizes?
>>
>> Are we talking about THP or hugetlb? They are two different things, and
>> "KVM_GUEST_MEMFD_ALLOW_HUGEPAGE" doesn't make it clearer what we are talking
>> about.
>>
>> This patch here "get_thp_size()" indicates that we care about THP, not
>> hugetlb.
>>
>>
>> THP lives in:
>> 	/sys/kernel/mm/transparent_hugepage/
>> and hugetlb in:
>> 	/sys/kernel/mm/hugepages/
>>
>> THP for shmem+anon currently really only supports PMD-sized THP, that size
>> can be observed via:
>> 	/sys/kernel/mm/transparent_hugepage/hpage_pmd_size
>>
>> hugetlb sizes can be detected simply by looking at the folders inside
>> /sys/kernel/mm/hugepages/. "tools/testing/selftests/mm/vm_util.c" in the
>> kernel has a function "detect_hugetlb_page_sizes()" that uses that interface
>> to detect the sizes.
>>
>>
>> But likely we want THP support here. Because for hugetlb, one would actually
>> have to instruct the kernel which size to use, like we do for memfd with
>> hugetlb.
> 
> Would we not want both ultimately ?

Likely we want both somehow, although I am not sure how to obtain either 
cleanly and fully.

My question is targeted at what the current interface/implementation 
promises, and how it relates to both, THP and hugetlb.
David Hildenbrand Dec. 1, 2023, 11 a.m. UTC | #14
On 30.11.23 09:00, Xiaoyao Li wrote:
> On 11/20/2023 5:26 PM, David Hildenbrand wrote:
>>
>>>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
>>>
>>> Got caught.
>>>
>>>> This should be factored out into a common helper.
>>>
>>> Sure, will do it in next version.
>>
>> Factor it out in a separate patch. Then, this patch gets small enough
>> that you can just squash it into #2.
> 
> A silly question. What file should the factored function be put in?

Good question :) It's highly Linux-specific, probably util/oslib-posix.c?
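
(A rough sketch of what the factored-out helper could look like there; the
name qemu_get_thp_size() and the exact includes are assumptions, the body is
the one from this patch, with g_autofree added so "content" is not leaked:)

    /* util/oslib-posix.c (hypothetical placement) */
    #include "qemu/osdep.h"
    #include "qemu/cutils.h"       /* qemu_strtou64() */
    #include "qemu/error-report.h" /* warn_report() */
    #include "qemu/host-utils.h"   /* is_power_of_2() */

    #define HPAGE_PMD_SIZE_PATH \
        "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
    #define DEFAULT_PMD_SIZE (1ul << 21)

    uint64_t qemu_get_thp_size(void)
    {
        static uint64_t thp_size;
        g_autofree gchar *content = NULL;
        const char *endptr;
        uint64_t tmp;

        if (thp_size) {
            return thp_size;
        }

        if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
            !qemu_strtou64(content, &endptr, 0, &tmp) &&
            (!endptr || *endptr == '\n')) {
            /* Sanity-check the value and fall back to something reasonable. */
            if (tmp && is_power_of_2(tmp)) {
                thp_size = tmp;
            } else {
                warn_report("Read unsupported THP size: %" PRIx64, tmp);
            }
        }

        if (!thp_size) {
            thp_size = DEFAULT_PMD_SIZE;
        }
        return thp_size;
    }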
Claudio Fontana Dec. 1, 2023, 11:22 a.m. UTC | #15
On 11/30/23 18:51, Daniel P. Berrangé wrote:
> On Thu, Nov 30, 2023 at 05:54:26PM +0100, David Hildenbrand wrote:
>> On 30.11.23 17:01, Sean Christopherson wrote:
>>> On Thu, Nov 30, 2023, David Hildenbrand wrote:
>>>> On 30.11.23 08:32, Xiaoyao Li wrote:
>>>>> On 11/20/2023 5:26 PM, David Hildenbrand wrote:
>>>>>>
>>>>>>>> ... did you shamelessly copy that from hw/virtio/virtio-mem.c ? ;)
>>>>>>>
>>>>>>> Got caught.
>>>>>>>
>>>>>>>> This should be factored out into a common helper.
>>>>>>>
>>>>>>> Sure, will do it in next version.
>>>>>>
>>>>>> Factor it out in a separate patch. Then, this patch gets small enough
>>>>>> that you can just squash it into #2.
>>>>>>
>>>>>> And my comment regarding "flags = 0" to patch #2 no longer applies :)
>>>>>>
>>>>>
>>>>> I see.
>>>>>
>>>>> But it depends on whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE will appear
>>>>> together with the initial guest memfd in Linux (hopefully 6.8):
>>>>> https://lore.kernel.org/all/CABgObfa=DH7FySBviF63OS9sVog_wt-AqYgtUAGKqnY5Bizivw@mail.gmail.com/
>>>>>
>>>>
>>>> Doesn't seem to be in -next if I am looking at the right tree:
>>>>
>>>> https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next
>>>
>>> Yeah, we punted on adding hugepage support for the initial guest_memfd merge so
>>> as not to rush in kludgy uABI.  The internal KVM code isn't problematic, we just
>>> haven't figured out exactly what the ABI should look like, e.g. should hugepages
>>> be dependent on THP being enabled, and if not, how does userspace discover the
>>> supported hugepage sizes?
>>
>> Are we talking about THP or hugetlb? They are two different things, and
>> "KVM_GUEST_MEMFD_ALLOW_HUGEPAGE" doesn't make it clearer what we are talking
>> about.
>>
>> This patch here "get_thp_size()" indicates that we care about THP, not
>> hugetlb.
>>
>>
>> THP lives in:
>> 	/sys/kernel/mm/transparent_hugepage/
>> and hugetlb in:
>> 	/sys/kernel/mm/hugepages/
>>
>> THP for shmem+anon currently really only supports PMD-sized THP, that size
>> can be observed via:
>> 	/sys/kernel/mm/transparent_hugepage/hpage_pmd_size
>>
>> hugetlb sizes can be detected simply by looking at the folders inside
>> /sys/kernel/mm/hugepages/. "tools/testing/selftests/mm/vm_util.c" in the
>> kernel has a function "detect_hugetlb_page_sizes()" that uses that interface
>> to detect the sizes.
>>
>>
>> But likely we want THP support here. Because for hugetlb, one would actually
>> have to instruct the kernel which size to use, like we do for memfd with
>> hugetlb.
> 
> Would we not want both ultimately ?
> 
> THP is good because it increases performance vs non-HP out of the box
> without the user or mgmt app having to make any decisions.
> 
> It does not give you deterministic performance though, because it has
> to opportunistically assign huge pages based on what is available and
> that may differ each time a VM is launched.  Explicit admin/mgmt app
> controlled huge page usage gives determinism, at the cost of increased
> mgmt overhead.
> 
> Both are valid use cases depending on the tradeoff a deployment and/or
> mgmt app wants to make.

Absolutely, it really depends on the definition of "performance" for the specific goal the user is trying to achieve.
There are very prominent use cases where THP is a big no-no due to the latency introduced.

C

Patch

diff --git a/system/physmem.c b/system/physmem.c
index 0af2213cbd9c..c56b17e44df6 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1803,6 +1803,40 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
     }
 }
 
+#ifdef CONFIG_KVM
+#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define DEFAULT_PMD_SIZE (1ul << 21)
+
+static uint32_t get_thp_size(void)
+{
+    gchar *content = NULL;
+    const char *endptr;
+    static uint64_t thp_size = 0;
+    uint64_t tmp;
+
+    if (thp_size != 0) {
+        return thp_size;
+    }
+
+    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
+        !qemu_strtou64(content, &endptr, 0, &tmp) &&
+        (!endptr || *endptr == '\n')) {
+        /* Sanity-check the value and fallback to something reasonable. */
+        if (!tmp || !is_power_of_2(tmp)) {
+            warn_report("Read unsupported THP size: %" PRIx64, tmp);
+        } else {
+            thp_size = tmp;
+        }
+    }
+
+    if (!thp_size) {
+        thp_size = DEFAULT_PMD_SIZE;
+    }
+
+    return thp_size;
+}
+#endif
+
 static void ram_block_add(RAMBlock *new_block, Error **errp)
 {
     const bool noreserve = qemu_ram_is_noreserve(new_block);
@@ -1844,8 +1878,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
 #ifdef CONFIG_KVM
     if (kvm_enabled() && new_block->flags & RAM_GUEST_MEMFD &&
         new_block->guest_memfd < 0) {
-        /* TODO: to decide if KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is supported */
-        uint64_t flags = 0;
+        uint64_t flags = QEMU_IS_ALIGNED(new_block->max_length, get_thp_size()) ?
+                         KVM_GUEST_MEMFD_ALLOW_HUGEPAGE : 0;
         new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length,
                                                         flags, errp);
         if (new_block->guest_memfd < 0) {