[v5,2/2] kvm: Use huge pages for DAX-backed files

Message ID 20191212182238.46535-3-brho@google.com (mailing list archive)
State New, archived
Series kvm: Use huge pages for DAX-backed files

Commit Message

Barret Rhoden Dec. 12, 2019, 6:22 p.m. UTC
This change allows KVM to map DAX-backed files made of huge pages with
huge mappings in the EPT/TDP.

DAX pages are not PageTransCompound.  The existing check is trying to
determine if the mapping for the pfn is a huge mapping or not.  For
non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
For DAX, we can check the page table itself.

Note that KVM already faulted in the page (or huge page) in the host's
page table, and we hold the KVM mmu spinlock.  We grabbed that lock in
kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.

Signed-off-by: Barret Rhoden <brho@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

Comments

Liran Alon Dec. 12, 2019, 6:47 p.m. UTC | #1
> On 12 Dec 2019, at 20:22, Barret Rhoden <brho@google.com> wrote:
> 
> This change allows KVM to map DAX-backed files made of huge pages with
> huge mappings in the EPT/TDP.

This change isn’t only relevant for TDP. It also applies when KVM uses shadow-paging.
See how FNAME(page_fault)() calls transparent_hugepage_adjust().

> 
> DAX pages are not PageTransCompound.  The existing check is trying to
> determine if the mapping for the pfn is a huge mapping or not.

I would rephrase “The existing check is trying to determine if the pfn
is mapped as part of a transparent huge-page”.

> For
> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.

This is not related to hugetlbfs but rather THP.

> For DAX, we can check the page table itself.
> 
> Note that KVM already faulted in the page (or huge page) in the host's
> page table, and we hold the KVM mmu spinlock.  We grabbed that lock in
> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
> 
> Signed-off-by: Barret Rhoden <brho@google.com>

I don’t think the right place to change for this functionality is transparent_hugepage_adjust()
which is meant to handle PFNs that are mapped as part of a transparent huge-page.

For example, this would prevent mapping DAX-backed file page as 1GB.
As transparent_hugepage_adjust() only handles the case (level == PT_PAGE_TABLE_LEVEL).

As you are parsing the page-tables to discover the page-size the PFN is mapped in,
I think you should instead modify kvm_host_page_size() to parse page-tables instead
of rely on vma_kernel_pagesize() (Which relies on vma->vm_ops->pagesize()) in case
of is_zone_device_page().
The main complication though of doing this is that at this point you don’t yet have the PFN
that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
in tdp_page_fault() & FNAME(page_fault)().
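
A rough sketch of that direction, purely for illustration (the surrounding structure is paraphrased from the existing kvm_host_page_size(); the vma_is_dax() branch and its reuse of the dev_pagemap_mapping_shift() variant from patch 1/2 are assumptions, and the try_async_pf() ordering problem noted above is not solved here):

static unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	if (vma_is_dax(vma)) {
		/* Ask the host page tables how the hva is actually mapped. */
		unsigned int shift = dev_pagemap_mapping_shift(addr, current->mm);

		if (shift > PAGE_SHIFT)
			size = 1UL << shift;
	} else {
		size = vma_kernel_pagesize(vma);
	}
out:
	up_read(&current->mm->mmap_sem);
	return size;
}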

-Liran

> ---
> arch/x86/kvm/mmu/mmu.c | 31 +++++++++++++++++++++++++++----
> 1 file changed, 27 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 7269130ea5e2..ea8f6951398b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3328,6 +3328,30 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> 	__direct_pte_prefetch(vcpu, sp, sptep);
> }
> 
> +static bool pfn_is_huge_mapped(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn)
> +{
> +	struct page *page = pfn_to_page(pfn);
> +	unsigned long hva;
> +
> +	if (!is_zone_device_page(page))
> +		return PageTransCompoundMap(page);
> +
> +	/*
> +	 * DAX pages do not use compound pages.  The page should have already
> +	 * been mapped into the host-side page table during try_async_pf(), so
> +	 * we can check the page tables directly.
> +	 */
> +	hva = gfn_to_hva(kvm, gfn);
> +	if (kvm_is_error_hva(hva))
> +		return false;
> +
> +	/*
> +	 * Our caller grabbed the KVM mmu_lock with a successful
> +	 * mmu_notifier_retry, so we're safe to walk the page table.
> +	 */
> +	return dev_pagemap_mapping_shift(hva, current->mm) > PAGE_SHIFT;
> +}
> +
> static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
> 					gfn_t gfn, kvm_pfn_t *pfnp,
> 					int *levelp)
> @@ -3342,8 +3366,8 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
> 	 * here.
> 	 */
> 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
> -	    !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
> -	    PageTransCompoundMap(pfn_to_page(pfn))) {
> +	    level == PT_PAGE_TABLE_LEVEL &&
> +	    pfn_is_huge_mapped(vcpu->kvm, gfn, pfn)) {
> 		unsigned long mask;
> 
> 		/*
> @@ -5957,8 +5981,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
> 		 * mapping if the indirect sp has level = 1.
> 		 */
> 		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
> -		    !kvm_is_zone_device_pfn(pfn) &&
> -		    PageTransCompoundMap(pfn_to_page(pfn))) {
> +		    pfn_is_huge_mapped(kvm, sp->gfn, pfn)) {
> 			pte_list_remove(rmap_head, sptep);
> 
> 			if (kvm_available_flush_tlb_with_range())
> -- 
> 2.24.0.525.g8f36a354ae-goog
>
Liran Alon Dec. 12, 2019, 6:49 p.m. UTC | #2
> On 12 Dec 2019, at 20:47, Liran Alon <liran.alon@oracle.com> wrote:
> 
> 
> 
>> On 12 Dec 2019, at 20:22, Barret Rhoden <brho@google.com> wrote:
>> 
>> This change allows KVM to map DAX-backed files made of huge pages with
>> huge mappings in the EPT/TDP.
> 
> This change isn’t only relevant for TDP. It also affects when KVM use shadow-paging.
> See how FNAME(page_fault)() calls transparent_hugepage_adjust().
> 
>> 
>> DAX pages are not PageTransCompound.  The existing check is trying to
>> determine if the mapping for the pfn is a huge mapping or not.
> 
> I would rephrase “The existing check is trying to determine if the pfn
> is mapped as part of a transparent huge-page”.
> 
>> For
>> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
> 
> This is not related to hugetlbfs but rather THP.
> 
>> For DAX, we can check the page table itself.
>> 
>> Note that KVM already faulted in the page (or huge page) in the host's
>> page table, and we hold the KVM mmu spinlock.  We grabbed that lock in
>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>> 
>> Signed-off-by: Barret Rhoden <brho@google.com>
> 
> I don’t think the right place to change for this functionality is transparent_hugepage_adjust()
> which is meant to handle PFNs that are mapped as part of a transparent huge-page.
> 
> For example, this would prevent mapping DAX-backed file page as 1GB.
> As transparent_hugepage_adjust() only handles the case (level == PT_PAGE_TABLE_LEVEL).
> 
> As you are parsing the page-tables to discover the page-size the PFN is mapped in,
> I think you should instead modify kvm_host_page_size() to parse page-tables instead
> of rely on vma_kernel_pagesize() (Which relies on vma->vm_ops->pagesize()) in case
> of is_zone_device_page().
> The main complication though of doing this is that at this point you don’t yet have the PFN
> that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
> in tdp_page_fault() & FNAME(page_fault)().
> 
> -Liran

Or alternatively, thinking about it more, maybe just rename transparent_hugepage_adjust()
to not be specific to THP and have it handle parsing page-tables and raising the mapping-level to 1GB as well.
That is probably easier and more elegant.

-Liran
Barret Rhoden Dec. 12, 2019, 7:55 p.m. UTC | #3
Hi -

On 12/12/19 1:49 PM, Liran Alon wrote:
> 
> 
>> On 12 Dec 2019, at 20:47, Liran Alon <liran.alon@oracle.com> wrote:
>>
>>
>>
>>> On 12 Dec 2019, at 20:22, Barret Rhoden <brho@google.com> wrote:
>>>
>>> This change allows KVM to map DAX-backed files made of huge pages with
>>> huge mappings in the EPT/TDP.
>>
>> This change isn’t only relevant for TDP. It also affects when KVM use shadow-paging.
>> See how FNAME(page_fault)() calls transparent_hugepage_adjust().

Cool, I'll drop references to the EPT/TDP from the commit message.

>>> DAX pages are not PageTransCompound.  The existing check is trying to
>>> determine if the mapping for the pfn is a huge mapping or not.
>>
>> I would rephrase “The existing check is trying to determine if the pfn
>> is mapped as part of a transparent huge-page”.

Can do.

>>
>>> For
>>> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
>>
>> This is not related to hugetlbfs but rather THP.

I thought that PageTransCompound also returned true for hugetlbfs (based 
off of comments in page-flags.h).  Though I do see the comment about the 
'level == PT_PAGE_TABLE_LEVEL' check excluding hugetlbfs pages.

Anyway, I'll remove the "e.g. hugetlbfs" from the commit message.

>>
>>> For DAX, we can check the page table itself.
>>>
>>> Note that KVM already faulted in the page (or huge page) in the host's
>>> page table, and we hold the KVM mmu spinlock.  We grabbed that lock in
>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>>>
>>> Signed-off-by: Barret Rhoden <brho@google.com>
>>
>> I don’t think the right place to change for this functionality is transparent_hugepage_adjust()
>> which is meant to handle PFNs that are mapped as part of a transparent huge-page.
>>
>> For example, this would prevent mapping DAX-backed file page as 1GB.
>> As transparent_hugepage_adjust() only handles the case (level == PT_PAGE_TABLE_LEVEL).
>>
>> As you are parsing the page-tables to discover the page-size the PFN is mapped in,
>> I think you should instead modify kvm_host_page_size() to parse page-tables instead
>> of rely on vma_kernel_pagesize() (Which relies on vma->vm_ops->pagesize()) in case
>> of is_zone_device_page().
>> The main complication though of doing this is that at this point you don’t yet have the PFN
>> that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
>> in tdp_page_fault() & FNAME(page_fault)().
>>
>> -Liran
> 
> Or alternatively when thinking about it more, maybe just rename transparent_hugepage_adjust()
> to not be specific to THP and better handle the case of parsing page-tables changing mapping-level to 1GB.
> That is probably easier and more elegant.

I can rename it to hugepage_adjust(), since it's not just THP anymore.

I was a little hesitant to change this to handle 1 GB pages with this
patchset at first.  I didn't want to break the non-DAX case by doing so.

Specifically, can a THP page be 1 GB, and if so, how can you tell?  If 
you can't tell easily, I could walk the page table for all cases, 
instead of just zone_device().

I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think, 
which would open this up to hugetlbfs pages (based on the comments).  Is 
there any reason why that would be a bad idea?

Thanks,

Barret
Liran Alon Dec. 13, 2019, 1:07 a.m. UTC | #4
> On 12 Dec 2019, at 21:55, Barret Rhoden <brho@google.com> wrote:
> 
> Hi -
> 
> On 12/12/19 1:49 PM, Liran Alon wrote:
>>> On 12 Dec 2019, at 20:47, Liran Alon <liran.alon@oracle.com> wrote:
>>> 
>>> 
>>> 
>>>> On 12 Dec 2019, at 20:22, Barret Rhoden <brho@google.com> wrote:
>>>> 
>>>> This change allows KVM to map DAX-backed files made of huge pages with
>>>> huge mappings in the EPT/TDP.
>>> 
>>> This change isn’t only relevant for TDP. It also affects when KVM use shadow-paging.
>>> See how FNAME(page_fault)() calls transparent_hugepage_adjust().
> 
> Cool, I'll drop references to the EPT/TDP from the commit message.
> 
>>>> DAX pages are not PageTransCompound.  The existing check is trying to
>>>> determine if the mapping for the pfn is a huge mapping or not.
>>> 
>>> I would rephrase “The existing check is trying to determine if the pfn
>>> is mapped as part of a transparent huge-page”.
> 
> Can do.
> 
>>> 
>>>> For
>>>> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
>>> 
>>> This is not related to hugetlbfs but rather THP.
> 
> I thought that PageTransCompound also returned true for hugetlbfs (based off of comments in page-flags.h).  Though I do see the comment about the 'level == PT_PAGE_TABLE_LEVEL' check excluding hugetlbfs pages.
> 
> Anyway, I'll remove the "e.g. hugetlbfs" from the commit message.
> 
>>> 
>>>> For DAX, we can check the page table itself.
>>>> 
>>>> Note that KVM already faulted in the page (or huge page) in the host's
>>>> page table, and we hold the KVM mmu spinlock.  We grabbed that lock in
>>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>>>> 
>>>> Signed-off-by: Barret Rhoden <brho@google.com>
>>> 
>>> I don’t think the right place to change for this functionality is transparent_hugepage_adjust()
>>> which is meant to handle PFNs that are mapped as part of a transparent huge-page.
>>> 
>>> For example, this would prevent mapping DAX-backed file page as 1GB.
>>> As transparent_hugepage_adjust() only handles the case (level == PT_PAGE_TABLE_LEVEL).
>>> 
>>> As you are parsing the page-tables to discover the page-size the PFN is mapped in,
>>> I think you should instead modify kvm_host_page_size() to parse page-tables instead
>>> of rely on vma_kernel_pagesize() (Which relies on vma->vm_ops->pagesize()) in case
>>> of is_zone_device_page().
>>> The main complication though of doing this is that at this point you don’t yet have the PFN
>>> that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
>>> in tdp_page_fault() & FNAME(page_fault)().
>>> 
>>> -Liran
>> Or alternatively when thinking about it more, maybe just rename transparent_hugepage_adjust()
>> to not be specific to THP and better handle the case of parsing page-tables changing mapping-level to 1GB.
>> That is probably easier and more elegant.
> 
> I can rename it to hugepage_adjust(), since it's not just THP anymore.

Sounds good.

> 
> I was a little hesitant to change the this to handle 1 GB pages with this patchset at first.  I didn't want to break the non-DAX case stuff by doing so.

Why would it affect the non-DAX case?
Your patch should just make hugepage_adjust() parse page-tables only in the is_zone_device_page() case. Otherwise, page tables shouldn’t be parsed.
i.e. THP-merged pages should still be detected by PageTransCompoundMap().

> 
> Specifically, can a THP page be 1 GB, and if so, how can you tell?  If you can't tell easily, I could walk the page table for all cases, instead of just zone_device().

I prefer to walk page-tables only for is_zone_device_page().

> 
> I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think, which would open this up to hugetlbfs pages (based on the comments).  Is there any reason why that would be a bad idea?

KVM already supports mapping 1GB hugetlbfs pages: level is set to PUD-level by tdp_page_fault()->mapping_level()->host_mapping_level()->kvm_host_page_size()->vma_kernel_pagesize(), because a VMA that is an mmap of hugetlbfs sets vma->vm_ops to hugetlb_vm_ops, whose hugetlb_vm_op_pagesize() returns the appropriate page-size.
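
For reference, vma_kernel_pagesize() just defers to the VMA's ->pagesize() op and falls back to PAGE_SIZE (roughly, as in mm/hugetlb.c), which is why hugetlbfs mmaps already report 2MB/1GB here while an fs-DAX file mmap falls back to PAGE_SIZE:

unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);

	return PAGE_SIZE;
}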

Specifically, I don’t think THP ever merges small pages into 1GB pages. I think this is why transparent_hugepage_adjust() checks PageTransCompoundMap() only when level == PT_PAGE_TABLE_LEVEL. I think you should keep this check in the !is_zone_device_page() case.

-Liran

> 
> Thanks,
> 
> Barret
>
Barret Rhoden Dec. 13, 2019, 2:13 p.m. UTC | #5
On 12/12/19 8:07 PM, Liran Alon wrote:
>> I was a little hesitant to change the this to handle 1 GB pages with this patchset at first.  I didn't want to break the non-DAX case stuff by doing so.
> 
> Why would it affect non-DAX case?
> Your patch should just make hugepage_adjust() to parse page-tables only in case is_zone_device_page(). Otherwise, page tables shouldn’t be parsed.
> i.e. THP merged pages should still be detected by PageTransCompoundMap().

That's what I already do.  But if I wanted to make the hugepage_adjust() 
function also handle the change to 1 GB, then that code would apply to 
THP too.  I didn't want to do that without knowing the implications for THP.

>> Specifically, can a THP page be 1 GB, and if so, how can you tell?  If you can't tell easily, I could walk the page table for all cases, instead of just zone_device().
> 
> I prefer to walk page-tables only for is_zone_device_page().

Is there another way to tell if a THP page is 1 GB?  Anyway, this is the 
sort of stuff I didn't want to mess around with.

hugepage_adjust() seemed like a reasonable place to get a huge (2MB) 
page table entry out of a DAX mapping.  I didn't want to proliferate 
another special case for upgrading to a larger PTE size (i.e. how 
hugetlbfs and THP have separate mechanisms), so I hopped on to the "can 
we do a 2MB mapping even though host_mapping_level() didn't say so" case 
- which is my interpretation of what hugepage_adjust() is for.

Barret
Sean Christopherson Dec. 13, 2019, 5:19 p.m. UTC | #6
On Fri, Dec 13, 2019 at 03:07:31AM +0200, Liran Alon wrote:
> 
> > On 12 Dec 2019, at 21:55, Barret Rhoden <brho@google.com> wrote:
> > 
> >>>> Note that KVM already faulted in the page (or huge page) in the host's
> >>>> page table, and we hold the KVM mmu spinlock.  We grabbed that lock in
> >>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
> >>>> 
> >>>> Signed-off-by: Barret Rhoden <brho@google.com>
> >>> 
> >>> I don’t think the right place to change for this functionality is
> >>> transparent_hugepage_adjust() which is meant to handle PFNs that are
> >>> mapped as part of a transparent huge-page.
> >>> 
> >>> For example, this would prevent mapping DAX-backed file page as 1GB.  As
> >>> transparent_hugepage_adjust() only handles the case (level ==
> >>> PT_PAGE_TABLE_LEVEL).

Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing.  It's
unlikely THP itself will support 1GB pages any time soon, but having the
logic there wouldn't hurt anything.

> >>> As you are parsing the page-tables to discover the page-size the PFN is
> >>> mapped in, I think you should instead modify kvm_host_page_size() to
> >>> parse page-tables instead of rely on vma_kernel_pagesize() (Which relies
> >>> on vma->vm_ops->pagesize()) in case of is_zone_device_page().
> >>>
> >>> The main complication though of doing this is that at this point you
> >>> don’t yet have the PFN that is retrieved by try_async_pf(). So maybe you
> >>> should consider modifying the order of calls in tdp_page_fault() &
> >>> FNAME(page_fault)().
> >>> 
> >>> -Liran
> >> Or alternatively when thinking about it more, maybe just rename
> >> transparent_hugepage_adjust() to not be specific to THP and better handle
> >> the case of parsing page-tables changing mapping-level to 1GB.
> >> That is probably easier and more elegant.

Agreed.

> > I can rename it to hugepage_adjust(), since it's not just THP anymore.

Or maybe allowed_hugepage_adjust()?  To pair with disallowed_hugepage_adjust(),
which adjusts KVM's page size in the opposite direction to avoid the iTLB
multi-hit issue.

> 
> Sounds good.
> 
> > 
> > I was a little hesitant to change the this to handle 1 GB pages with this
> > patchset at first.  I didn't want to break the non-DAX case stuff by doing
> > so.
> 
> Why would it affect non-DAX case?
> Your patch should just make hugepage_adjust() to parse page-tables only in case is_zone_device_page(). Otherwise, page tables shouldn’t be parsed.
> i.e. THP merged pages should still be detected by PageTransCompoundMap().

I think what Barret is saying is that teaching thp_adjust() how to do 1gb
mappings would naturally affect the code path for THP pages.  But I agree
that it would be superficial.
 
> > Specifically, can a THP page be 1 GB, and if so, how can you tell?  If you
> > can't tell easily, I could walk the page table for all cases, instead of
> > just zone_device().

No, THP doesn't currently support 1gb pages.  Explicitly returning
PMD_SIZE on PageTransCompoundMap() would be a good thing from a readability
perspective.

> I prefer to walk page-tables only for is_zone_device_page().
> 
> > 
> > I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think,
> > which would open this up to hugetlbfs pages (based on the comments).  Is
> > there any reason why that would be a bad idea?

No, the "level == PT_PAGE_TABLE_LEVEL" check is to filter out the case
where KVM is already planning on using a large page, e.g. when the memory
is backed by hugetlbs.

> KVM already supports mapping 1GB hugetlbfs pages. As level is set to
> PUD-level by
> tdp_page_fault()->mapping_level()->host_mapping_level()->kvm_host_page_size()->vma_kernel_pagesize().
> As VMA which is mmap of hugetlbfs sets vma->vm_ops to hugetlb_vm_ops() where
> hugetlb_vm_op_pagesize() will return appropriate page-size.
> 
> Specifically, I don’t think THP ever merges small pages to 1GB pages. I think
> this is why transparent_hugepage_adjust() checks PageTransCompoundMap() only
> in case level == PT_PAGE_TABLE_LEVEL. I think you should keep this check in
> the case of !is_zone_device_page().

I would add 1gb support for DAX as a third patch in this series.  To pave
the way in patch 2/2, change it to replace "bool pfn_is_huge_mapped()" with
"int host_pfn_mapping_level()", and maybe also renaming host_mapping_level()
to host_vma_mapping_level() to avoid confusion.

Then allowed_hugepage_adjust() would look something like:

static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
				    kvm_pfn_t *pfnp, int *levelp, int max_level)
{
	kvm_pfn_t pfn = *pfnp;
	int level = *levelp;	
	unsigned long mask;

	if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) ||
	    level == PT_PAGE_TABLE_LEVEL)
		return;

	/*
	 * mmu_notifier_retry() was successful and mmu_lock is held, so
	 * the pmd/pud can't be split from under us.
	 */
	level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);

	*levelp = level = min(level, max_level);
	mask = KVM_PAGES_PER_HPAGE(level) - 1;
	VM_BUG_ON((gfn & mask) != (pfn & mask));
	*pfnp = pfn & ~mask;
}
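
For completeness, a minimal sketch of what host_pfn_mapping_level() could look like here, assuming it keeps using the dev_pagemap_mapping_shift() helper from patch 1/2 for zone-device pages and simply reports PMD level for THP (illustrative only; the signature and the PT_*_LEVEL mapping are assumptions, not what the real series will necessarily do):

static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);
	unsigned long hva;

	if (!is_zone_device_page(page))
		/* THP never merges beyond PMD size today. */
		return PageTransCompoundMap(page) ? PT_DIRECTORY_LEVEL :
						    PT_PAGE_TABLE_LEVEL;

	/* DAX pages have no compound metadata; ask the host page tables. */
	hva = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(hva))
		return PT_PAGE_TABLE_LEVEL;

	switch (dev_pagemap_mapping_shift(hva, current->mm)) {
	case PUD_SHIFT:
		return PT_PDPE_LEVEL;
	case PMD_SHIFT:
		return PT_DIRECTORY_LEVEL;
	default:
		return PT_PAGE_TABLE_LEVEL;
	}
}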
Liran Alon Dec. 13, 2019, 5:31 p.m. UTC | #7
> On 13 Dec 2019, at 19:19, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> 
> On Fri, Dec 13, 2019 at 03:07:31AM +0200, Liran Alon wrote:
>> 
>>> On 12 Dec 2019, at 21:55, Barret Rhoden <brho@google.com> wrote:
>>> 
>>>>>> Note that KVM already faulted in the page (or huge page) in the host's
>>>>>> page table, and we hold the KVM mmu spinlock.  We grabbed that lock in
>>>>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>>>>>> 
>>>>>> Signed-off-by: Barret Rhoden <brho@google.com>
>>>>> 
>>>>> I don’t think the right place to change for this functionality is
>>>>> transparent_hugepage_adjust() which is meant to handle PFNs that are
>>>>> mapped as part of a transparent huge-page.
>>>>> 
>>>>> For example, this would prevent mapping DAX-backed file page as 1GB.  As
>>>>> transparent_hugepage_adjust() only handles the case (level ==
>>>>> PT_PAGE_TABLE_LEVEL).
> 
> Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing.  It's
> unlikely THP itself will support 1GB pages any time soon, but having the
> logic there wouldn't hurt anything.

I agree.

> 
>>>>> As you are parsing the page-tables to discover the page-size the PFN is
>>>>> mapped in, I think you should instead modify kvm_host_page_size() to
>>>>> parse page-tables instead of rely on vma_kernel_pagesize() (Which relies
>>>>> on vma->vm_ops->pagesize()) in case of is_zone_device_page().
>>>>> 
>>>>> The main complication though of doing this is that at this point you
>>>>> don’t yet have the PFN that is retrieved by try_async_pf(). So maybe you
>>>>> should consider modifying the order of calls in tdp_page_fault() &
>>>>> FNAME(page_fault)().
>>>>> 
>>>>> -Liran
>>>> Or alternatively when thinking about it more, maybe just rename
>>>> transparent_hugepage_adjust() to not be specific to THP and better handle
>>>> the case of parsing page-tables changing mapping-level to 1GB.
>>>> That is probably easier and more elegant.
> 
> Agreed.
> 
>>> I can rename it to hugepage_adjust(), since it's not just THP anymore.
> 
> Or maybe allowed_hugepage_adjust()?  To pair with disallowed_hugepage_adjust(),
> which adjusts KVM's page size in the opposite direction to avoid the iTLB
> multi-hit issue.
> 
>> 
>> Sounds good.
>> 
>>> 
>>> I was a little hesitant to change the this to handle 1 GB pages with this
>>> patchset at first.  I didn't want to break the non-DAX case stuff by doing
>>> so.
>> 
>> Why would it affect non-DAX case?
>> Your patch should just make hugepage_adjust() to parse page-tables only in case is_zone_device_page(). Otherwise, page tables shouldn’t be parsed.
>> i.e. THP merged pages should still be detected by PageTransCompoundMap().
> 
> I think what Barret is saying is that teaching thp_adjust() how to do 1gb
> mappings would naturally affect the code path for THP pages.  But I agree
> that it would be superficial.
> 
>>> Specifically, can a THP page be 1 GB, and if so, how can you tell?  If you
>>> can't tell easily, I could walk the page table for all cases, instead of
>>> just zone_device().
> 
> No, THP doesn't currently support 1gb pages.  Expliciting returning
> PMD_SIZE on PageTransCompoundMap() would be a good thing from a readability
> perspective.

Right.

> 
>> I prefer to walk page-tables only for is_zone_device_page().
>> 
>>> 
>>> I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think,
>>> which would open this up to hugetlbfs pages (based on the comments).  Is
>>> there any reason why that would be a bad idea?
> 
> No, the "level == PT_PAGE_TABLE_LEVEL" check is to filter out the case
> where KVM is already planning on using a large page, e.g. when the memory
> is backed by hugetlbs.

Right.

> 
>> KVM already supports mapping 1GB hugetlbfs pages. As level is set to
>> PUD-level by
>> tdp_page_fault()->mapping_level()->host_mapping_level()->kvm_host_page_size()->vma_kernel_pagesize().
>> As VMA which is mmap of hugetlbfs sets vma->vm_ops to hugetlb_vm_ops() where
>> hugetlb_vm_op_pagesize() will return appropriate page-size.
>> 
>> Specifically, I don’t think THP ever merges small pages to 1GB pages. I think
>> this is why transparent_hugepage_adjust() checks PageTransCompoundMap() only
>> in case level == PT_PAGE_TABLE_LEVEL. I think you should keep this check in
>> the case of !is_zone_device_page().
> 
> I would add 1gb support for DAX as a third patch in this series.  To pave
> the way in patch 2/2, change it to replace "bool pfn_is_huge_mapped()" with
> "int host_pfn_mapping_level()", and maybe also renaming host_mapping_level()
> to host_vma_mapping_level() to avoid confusion.

I agree.
So also rename kvm_host_page_size() to kvm_host_vma_page_size() :)

> 
> Then allowed_hugepage_adjust() would look something like:
> 
> static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
> 				    kvm_pfn_t *pfnp, int *levelp, int max_level)
> {
> 	kvm_pfn_t pfn = *pfnp;
> 	int level = *levelp;	
> 	unsigned long mask;
> 
> 	if (is_error_noslot_pfn(pfn) || !kvm_is_reserved_pfn(pfn) ||
> 	    level == PT_PAGE_TABLE_LEVEL)
> 		return;
> 
> 	/*
> 	 * mmu_notifier_retry() was successful and mmu_lock is held, so
> 	 * the pmd/pud can't be split from under us.
> 	 */
> 	level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);
> 
> 	*levelp = level = min(level, max_level);
> 	mask = KVM_PAGES_PER_HPAGE(level) - 1;
> 	VM_BUG_ON((gfn & mask) != (pfn & mask));
> 	*pfnp = pfn & ~mask;

Why don’t you still need kvm_release_pfn_clean() for the original pfn and kvm_get_pfn() for the new huge-page start pfn?

> }

Yep. This is similar to what I had in mind.

Then just put the logic of parsing page-tables when it’s is_zone_device_page(), or returning PMD_SIZE when it’s PageTransCompoundMap(), inside host_pfn_mapping_level(). This makes the code very straightforward.

-Liran
Sean Christopherson Dec. 13, 2019, 5:50 p.m. UTC | #8
On Fri, Dec 13, 2019 at 07:31:55PM +0200, Liran Alon wrote:
> 
> > On 13 Dec 2019, at 19:19, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> > 
> > Then allowed_hugepage_adjust() would look something like:
> > 
> > static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
> > 				    kvm_pfn_t *pfnp, int *levelp, int max_level)
> > {
> > 	kvm_pfn_t pfn = *pfnp;
> > 	int level = *levelp;	
> > 	unsigned long mask;
> > 
> > 	if (is_error_noslot_pfn(pfn) || !kvm_is_reserved_pfn(pfn) ||
> > 	    level == PT_PAGE_TABLE_LEVEL)
> > 		return;
> > 
> > 	/*
> > 	 * mmu_notifier_retry() was successful and mmu_lock is held, so
> > 	 * the pmd/pud can't be split from under us.
> > 	 */
> > 	level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);
> > 
> > 	*levelp = level = min(level, max_level);
> > 	mask = KVM_PAGES_PER_HPAGE(level) - 1;
> > 	VM_BUG_ON((gfn & mask) != (pfn & mask));
> > 	*pfnp = pfn & ~mask;
> 
> Why don’t you still need to kvm_release_pfn_clean() for original pfn and
> kvm_get_pfn() for new huge-page start pfn?

That code is gone in kvm/queue.  thp_adjust() is now called from
__direct_map() and FNAME(fetch), and so its pfn adjustment doesn't bleed
back to the page fault handlers.  The only reason the put/get pfn code
existed was because the page fault handlers called kvm_release_pfn_clean()
on the pfn, i.e. they would have put the wrong pfn.
Liran Alon Dec. 13, 2019, 6:08 p.m. UTC | #9
> On 13 Dec 2019, at 19:50, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> 
> On Fri, Dec 13, 2019 at 07:31:55PM +0200, Liran Alon wrote:
>> 
>>> On 13 Dec 2019, at 19:19, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
>>> 
>>> Then allowed_hugepage_adjust() would look something like:
>>> 
>>> static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
>>> 				    kvm_pfn_t *pfnp, int *levelp, int max_level)
>>> {
>>> 	kvm_pfn_t pfn = *pfnp;
>>> 	int level = *levelp;	
>>> 	unsigned long mask;
>>> 
>>> 	if (is_error_noslot_pfn(pfn) || !kvm_is_reserved_pfn(pfn) ||
>>> 	    level == PT_PAGE_TABLE_LEVEL)
>>> 		return;
>>> 
>>> 	/*
>>> 	 * mmu_notifier_retry() was successful and mmu_lock is held, so
>>> 	 * the pmd/pud can't be split from under us.
>>> 	 */
>>> 	level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);
>>> 
>>> 	*levelp = level = min(level, max_level);
>>> 	mask = KVM_PAGES_PER_HPAGE(level) - 1;
>>> 	VM_BUG_ON((gfn & mask) != (pfn & mask));
>>> 	*pfnp = pfn & ~mask;
>> 
>> Why don’t you still need to kvm_release_pfn_clean() for original pfn and
>> kvm_get_pfn() for new huge-page start pfn?
> 
> That code is gone in kvm/queue.  thp_adjust() is now called from
> __direct_map() and FNAME(fetch), and so its pfn adjustment doesn't bleed
> back to the page fault handlers.  The only reason the put/get pfn code
> existed was because the page fault handlers called kvm_release_pfn_clean()
> on the pfn, i.e. they would have put the wrong pfn.

Ack. Thanks for explaining this.
Barret Rhoden Dec. 16, 2019, 4:05 p.m. UTC | #10
On 12/13/19 12:19 PM, Sean Christopherson wrote:
> Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing.  It's
> unlikely THP itself will support 1GB pages any time soon, but having the
> logic there wouldn't hurt anything.
> 

Cool.  This was my main concern - I didn't want to break THP.

I'll rework the series based on all of your comments.

Thanks,

Barret
Sean Christopherson Jan. 7, 2020, 7:05 p.m. UTC | #11
On Mon, Dec 16, 2019 at 11:05:26AM -0500, Barret Rhoden wrote:
> On 12/13/19 12:19 PM, Sean Christopherson wrote:
> >Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing.  It's
> >unlikely THP itself will support 1GB pages any time soon, but having the
> >logic there wouldn't hurt anything.
> >
> 
> Cool.  This was my main concern - I didn't want to break THP.
> 
> I'll rework the series based on all of your comments.

Hopefully you haven't put too much effort into the rework, because I want
to commandeer the proposed changes and use them as the basis for a more
aggressive overhaul of KVM's hugepage handling.  Ironically, there's a bug
in KVM's THP handling that I _think_ can be avoided by using the DAX
approach of walking the host PTEs.

I'm in the process of testing, hopefully I'll get a series sent out later
today.  If not, I should at least be able to provide an update.
Barret Rhoden Jan. 7, 2020, 7:19 p.m. UTC | #12
Hi -

On 1/7/20 2:05 PM, Sean Christopherson wrote:
> On Mon, Dec 16, 2019 at 11:05:26AM -0500, Barret Rhoden wrote:
>> On 12/13/19 12:19 PM, Sean Christopherson wrote:
>>> Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing.  It's
>>> unlikely THP itself will support 1GB pages any time soon, but having the
>>> logic there wouldn't hurt anything.
>>>
>>
>> Cool.  This was my main concern - I didn't want to break THP.
>>
>> I'll rework the series based on all of your comments.
> 
> Hopefully you haven't put too much effort into the rework, because I want
> to commandeer the proposed changes and use them as the basis for a more
> aggressive overhaul of KVM's hugepage handling.  Ironically, there's a bug
> in KVM's THP handling that I _think_ can be avoided by using the DAX
> approach of walking the host PTEs.
> 
> I'm in the process of testing, hopefully I'll get a series sent out later
> today.  If not, I should at least be able to provide an update.

Nice timing.  I was just about to get back to this, so I haven't put any 
time in yet.  =)

Please CC me, and I'll try your patches out on my end.

Thanks,

Barret
Sean Christopherson Jan. 8, 2020, 1:20 a.m. UTC | #13
On Tue, Jan 07, 2020 at 02:19:06PM -0500, Barret Rhoden wrote:
> On 1/7/20 2:05 PM, Sean Christopherson wrote:
> >Hopefully you haven't put too much effort into the rework, because I want
> >to commandeer the proposed changes and use them as the basis for a more
> >aggressive overhaul of KVM's hugepage handling.  Ironically, there's a bug
> >in KVM's THP handling that I _think_ can be avoided by using the DAX
> >approach of walking the host PTEs.
> >
> >I'm in the process of testing, hopefully I'll get a series sent out later
> >today.  If not, I should at least be able to provide an update.
> 
> Nice timing.  I was just about to get back to this, so I haven't put any
> time in yet.  =)
> 
> Please CC me, and I'll try your patches out on my end.

Will do.  Barring last minute hiccups, the code is ready, just need to
finish off a few changelogs.  Should get it out early tomorrow.

One question that may help avoid some churn: are huge DAX pages not
tracked as compound pages?  The comment from your/this patch is pretty
unequivocal, but I wanted to double check that they will really return
false for PageCompound(), as opposed to only returning false for
PageTransCompoundMap().

	/*
	 * DAX pages do not use compound pages.  ...
	 */

Thanks!
Dan Williams Jan. 8, 2020, 1:39 a.m. UTC | #14
On Tue, Jan 7, 2020 at 5:20 PM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Tue, Jan 07, 2020 at 02:19:06PM -0500, Barret Rhoden wrote:
> > On 1/7/20 2:05 PM, Sean Christopherson wrote:
> > >Hopefully you haven't put too much effort into the rework, because I want
> > >to commandeer the proposed changes and use them as the basis for a more
> > >aggressive overhaul of KVM's hugepage handling.  Ironically, there's a bug
> > >in KVM's THP handling that I _think_ can be avoided by using the DAX
> > >approach of walking the host PTEs.
> > >
> > >I'm in the process of testing, hopefully I'll get a series sent out later
> > >today.  If not, I should at least be able to provide an update.
> >
> > Nice timing.  I was just about to get back to this, so I haven't put any
> > time in yet.  =)
> >
> > Please CC me, and I'll try your patches out on my end.
>
> Will do.  Barring last minute hiccups, the code is ready, just need to
> finish off a few changelogs.  Should get it out early tomorrow.
>
> One question that may help avoid some churn: are huge DAX pages not
> tracked as compound pages?  The comment from your/this patch is pretty
> unequivocal, but I wanted to double check that they will really return
> false for PageCompound(), as opposed to only returning false for
> PageTransCompoundMap().

PageCompound() returns false.

>
>         /*
>          * DAX pages do not use compound pages.  ...
>          */
>

None of the head / tail page infrastructure is set up for dax pages.
They are just independent 'struct page' objects that are
opportunistically mapped by different pte sizes in the dax core.

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7269130ea5e2..ea8f6951398b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3328,6 +3328,30 @@  static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static bool pfn_is_huge_mapped(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+	unsigned long hva;
+
+	if (!is_zone_device_page(page))
+		return PageTransCompoundMap(page);
+
+	/*
+	 * DAX pages do not use compound pages.  The page should have already
+	 * been mapped into the host-side page table during try_async_pf(), so
+	 * we can check the page tables directly.
+	 */
+	hva = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(hva))
+		return false;
+
+	/*
+	 * Our caller grabbed the KVM mmu_lock with a successful
+	 * mmu_notifier_retry, so we're safe to walk the page table.
+	 */
+	return dev_pagemap_mapping_shift(hva, current->mm) > PAGE_SHIFT;
+}
+
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 					gfn_t gfn, kvm_pfn_t *pfnp,
 					int *levelp)
@@ -3342,8 +3366,8 @@  static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * here.
 	 */
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
-	    !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
-	    PageTransCompoundMap(pfn_to_page(pfn))) {
+	    level == PT_PAGE_TABLE_LEVEL &&
+	    pfn_is_huge_mapped(vcpu->kvm, gfn, pfn)) {
 		unsigned long mask;
 
 		/*
@@ -5957,8 +5981,7 @@  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		 * mapping if the indirect sp has level = 1.
 		 */
 		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
-		    !kvm_is_zone_device_pfn(pfn) &&
-		    PageTransCompoundMap(pfn_to_page(pfn))) {
+		    pfn_is_huge_mapped(kvm, sp->gfn, pfn)) {
 			pte_list_remove(rmap_head, sptep);
 
 			if (kvm_available_flush_tlb_with_range())