
[1/5] KVM: arm64: Walk userspace page tables to compute the THP mapping size

Message ID 20210717095541.1486210-2-maz@kernel.org (mailing list archive)
State New
Series KVM: Remove kvm_is_transparent_hugepage() and friends

Commit Message

Marc Zyngier July 17, 2021, 9:55 a.m. UTC
We currently rely on the kvm_is_transparent_hugepage() helper to
discover whether a given page has the potential to be mapped as
a block mapping.

However, this API doesn't really give us everything we want:
- we don't get the size: this is not crucial today as we only
  support PMD-sized THPs, but we'd like to have larger sizes
  in the future
- we're the only user left of the API, and there is a will
  to remove it altogether

To address the above, implement a simple walker using the existing
page table infrastructure, and plumb it into transparent_hugepage_adjust().
No new page sizes are supported in the process.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/mmu.c | 46 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

Comments

Paolo Bonzini July 19, 2021, 6:31 a.m. UTC | #1
On 17/07/21 11:55, Marc Zyngier wrote:
> We currently rely on the kvm_is_transparent_hugepage() helper to
> discover whether a given page has the potential to be mapped as
> a block mapping.
> 
> However, this API doesn't really give us everything we want:
> - we don't get the size: this is not crucial today as we only
>    support PMD-sized THPs, but we'd like to have larger sizes
>    in the future
> - we're the only user left of the API, and there is a will
>    to remove it altogether
> 
> To address the above, implement a simple walker using the existing
> page table infrastructure, and plumb it into transparent_hugepage_adjust().
> No new page sizes are supported in the process.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>

If it's okay for you to reuse the KVM page walker that's fine of course, 
but the arch/x86/mm functions lookup_address_in_{mm,pgd} are mostly 
machine-independent and it may make sense to move them to mm/.

That would also allow reusing the x86 function host_pfn_mapping_level.
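
For illustration, here is roughly what the arm64 side could then look
like. This is only a sketch: it assumes lookup_address_in_mm() has been
moved to mm/ and that a generic equivalent of the x86 page_level_size()
helper exists, neither of which is true today.

  static unsigned long get_user_mapping_size(struct kvm *kvm, u64 addr)
  {
	unsigned int level;
	pte_t *pte;

	/* Hypothetical generic helper; it currently lives in arch/x86/mm. */
	pte = lookup_address_in_mm(kvm->mm, addr, &level);
	if (!pte)
		return PAGE_SIZE;

	/* page_level_size() is x86-only; a generic version would be needed. */
	return page_level_size(level);
  }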

Paolo

> ---
>   arch/arm64/kvm/mmu.c | 46 ++++++++++++++++++++++++++++++++++++++++----
>   1 file changed, 42 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 3155c9e778f0..db6314b93e99 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -433,6 +433,44 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
>   	return 0;
>   }
>   
> +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
> +	/* We shouldn't need any other callback to walk the PT */
> +	.phys_to_virt		= kvm_host_va,
> +};
> +
> +struct user_walk_data {
> +	u32	level;
> +};
> +
> +static int user_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> +		       enum kvm_pgtable_walk_flags flag, void * const arg)
> +{
> +	struct user_walk_data *data = arg;
> +
> +	data->level = level;
> +	return 0;
> +}
> +
> +static int get_user_mapping_size(struct kvm *kvm, u64 addr)
> +{
> +	struct user_walk_data data;
> +	struct kvm_pgtable pgt = {
> +		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
> +		.ia_bits	= VA_BITS,
> +		.start_level	= 4 - CONFIG_PGTABLE_LEVELS,
> +		.mm_ops		= &kvm_user_mm_ops,
> +	};
> +	struct kvm_pgtable_walker walker = {
> +		.cb		= user_walker,
> +		.flags		= KVM_PGTABLE_WALK_LEAF,
> +		.arg		= &data,
> +	};
> +
> +	kvm_pgtable_walk(&pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE, &walker);
> +
> +	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(data.level));
> +}
> +
>   static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
>   	.zalloc_page		= stage2_memcache_zalloc_page,
>   	.zalloc_pages_exact	= kvm_host_zalloc_pages_exact,
> @@ -780,7 +818,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
>    * Returns the size of the mapping.
>    */
>   static unsigned long
> -transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
> +transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
>   			    unsigned long hva, kvm_pfn_t *pfnp,
>   			    phys_addr_t *ipap)
>   {
> @@ -791,8 +829,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>   	 * sure that the HVA and IPA are sufficiently aligned and that the
>   	 * block map is contained within the memslot.
>   	 */
> -	if (kvm_is_transparent_hugepage(pfn) &&
> -	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
> +	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
> +	    get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
>   		/*
>   		 * The address we faulted on is backed by a transparent huge
>   		 * page.  However, because we map the compound huge page and
> @@ -1051,7 +1089,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>   	 * backed by a THP and thus use block mapping if possible.
>   	 */
>   	if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
> -		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
> +		vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva,
>   							   &pfn, &fault_ipa);
>   
>   	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
>
Marc Zyngier July 19, 2021, 9:31 a.m. UTC | #2
On Mon, 19 Jul 2021 07:31:30 +0100,
Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 17/07/21 11:55, Marc Zyngier wrote:
> > We currently rely on the kvm_is_transparent_hugepage() helper to
> > discover whether a given page has the potential to be mapped as
> > a block mapping.
> > 
> > However, this API doesn't really give us everything we want:
> > - we don't get the size: this is not crucial today as we only
> >    support PMD-sized THPs, but we'd like to have larger sizes
> >    in the future
> > - we're the only user left of the API, and there is a will
> >    to remove it altogether
> > 
> > To address the above, implement a simple walker using the existing
> > page table infrastructure, and plumb it into transparent_hugepage_adjust().
> > No new page sizes are supported in the process.
> > 
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> 
> If it's okay for you to reuse the KVM page walker that's fine of
> course, but the arch/x86/mm functions lookup_address_in_{mm,pgd} are
> mostly machine-independent and it may make sense to move them to mm/.
>
> That would also allow reusing the x86 function host_pfn_mapping_level.

That could work to some extent, but the way the x86 code equates level
to mapping size is a bit at odds with the multiple page sizes that
arm64 deals with.

We're also trying to move away from the whole P*D abstraction, because
this isn't something we want to deal with in the self-contained EL2
object.
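
To make the page-size point concrete, here is a standalone sketch
(plain userspace C, not kernel code) of how the mapping size implied by
a leaf level changes with the granule; the macro mirrors what
ARM64_HW_PGTABLE_LEVEL_SHIFT() is assumed to expand to in
arch/arm64/include/asm/pgtable-hwdef.h:

  #include <stdio.h>

  /* Assumed expansion of ARM64_HW_PGTABLE_LEVEL_SHIFT(level). */
  #define HW_PGTABLE_LEVEL_SHIFT(page_shift, level) \
	(((page_shift) - 3) * (4 - (level)) + 3)

  int main(void)
  {
	const int page_shifts[] = { 12, 14, 16 };	/* 4K, 16K, 64K granules */

	for (int i = 0; i < 3; i++)
		for (int level = 2; level <= 3; level++)
			printf("%2dK pages, level %d leaf -> %6lu KiB mapping\n",
			       1 << (page_shifts[i] - 10), level,
			       (1UL << HW_PGTABLE_LEVEL_SHIFT(page_shifts[i], level)) >> 10);
	return 0;
  }

A level 2 leaf is a 2MiB block with 4K pages, but 32MiB with 16K and
512MiB with 64K, so a level on its own doesn't name a size the way the
x86 PG_LEVEL_{4K,2M,1G} constants do.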

Thanks,

	M.
Alexandru Elisei July 20, 2021, 5:23 p.m. UTC | #3
Hi Marc,

I just can't figure out why having the mmap lock is not needed to walk the
userspace page tables. Any hints? Or am I not seeing where it's taken?

On 7/17/21 10:55 AM, Marc Zyngier wrote:
> We currently rely on the kvm_is_transparent_hugepage() helper to
> discover whether a given page has the potential to be mapped as
> a block mapping.
>
> However, this API doesn't really give us everything we want:
> - we don't get the size: this is not crucial today as we only
>   support PMD-sized THPs, but we'd like to have larger sizes
>   in the future
> - we're the only user left of the API, and there is a will
>   to remove it altogether
>
> To address the above, implement a simple walker using the existing
> page table infrastructure, and plumb it into transparent_hugepage_adjust().
> No new page sizes are supported in the process.
>
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/mmu.c | 46 ++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 42 insertions(+), 4 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 3155c9e778f0..db6314b93e99 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -433,6 +433,44 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
>  	return 0;
>  }
>  
> +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
> +	/* We shouldn't need any other callback to walk the PT */
> +	.phys_to_virt		= kvm_host_va,
> +};
> +
> +struct user_walk_data {
> +	u32	level;
> +};
> +
> +static int user_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> +		       enum kvm_pgtable_walk_flags flag, void * const arg)
> +{
> +	struct user_walk_data *data = arg;
> +
> +	data->level = level;
> +	return 0;
> +}
> +
> +static int get_user_mapping_size(struct kvm *kvm, u64 addr)
> +{
> +	struct user_walk_data data;
> +	struct kvm_pgtable pgt = {
> +		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
> +		.ia_bits	= VA_BITS,
> +		.start_level	= 4 - CONFIG_PGTABLE_LEVELS,
> +		.mm_ops		= &kvm_user_mm_ops,
> +	};
> +	struct kvm_pgtable_walker walker = {
> +		.cb		= user_walker,
> +		.flags		= KVM_PGTABLE_WALK_LEAF,
> +		.arg		= &data,
> +	};
> +
> +	kvm_pgtable_walk(&pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE, &walker);

I take it that it is guaranteed that kvm_pgtable_walk() will never fail? For
example, I can see it failing if someone messes with KVM_PGTABLE_MAX_LEVELS. To be
honest, I would rather have a check here instead of potentially feeding a bogus
value to ARM64_HW_PGTABLE_LEVEL_SHIFT. It could be a VM_WARN_ON, so there's no
runtime overhead unless CONFIG_DEBUG_VM.
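
A minimal sketch of what such a check could look like on top of this
patch (the VM_WARN_ON() plus PAGE_SIZE fallback is just one possible
shape, not a tested change):

	int ret;

	ret = kvm_pgtable_walk(&pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE,
			       &walker);

	/* A failed walk must not feed a bogus level to the SHIFT macro. */
	VM_WARN_ON(ret);
	if (ret)
		return PAGE_SIZE;

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(data.level));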

The patch looks good to me so far, but I want to give it another look (or two)
after I figure out why the mmap semaphore is not needed.

Thanks,

Alex

> +
> +	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(data.level));
> +}
> +
>  static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
>  	.zalloc_page		= stage2_memcache_zalloc_page,
>  	.zalloc_pages_exact	= kvm_host_zalloc_pages_exact,
> @@ -780,7 +818,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
>   * Returns the size of the mapping.
>   */
>  static unsigned long
> -transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
> +transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
>  			    unsigned long hva, kvm_pfn_t *pfnp,
>  			    phys_addr_t *ipap)
>  {
> @@ -791,8 +829,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>  	 * sure that the HVA and IPA are sufficiently aligned and that the
>  	 * block map is contained within the memslot.
>  	 */
> -	if (kvm_is_transparent_hugepage(pfn) &&
> -	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
> +	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
> +	    get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
>  		/*
>  		 * The address we faulted on is backed by a transparent huge
>  		 * page.  However, because we map the compound huge page and
> @@ -1051,7 +1089,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	 * backed by a THP and thus use block mapping if possible.
>  	 */
>  	if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
> -		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
> +		vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva,
>  							   &pfn, &fault_ipa);
>  
>  	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
Sean Christopherson July 20, 2021, 8:33 p.m. UTC | #4
On Tue, Jul 20, 2021, Alexandru Elisei wrote:
> Hi Marc,
> 
> I just can't figure out why having the mmap lock is not needed to walk the
> userspace page tables. Any hints? Or am I not seeing where it's taken?

Disclaimer: I'm not super familiar with arm64's page tables, but the relevant KVM
functionality is common across x86 and arm64.

KVM arm64 (and x86) unconditionally registers a mmu_notifier for the mm_struct
associated with the VM, and disallows calling ioctls from a different process,
i.e. walking the page tables during KVM_RUN is guaranteed to use the mm for which
KVM registered the mmu_notifier.  As part of registration, the mmu_notifier
does mmgrab() and doesn't do mmdrop() until it's unregistered.  That ensures the
mm_struct itself is live.

For the page tables' liveness, KVM implements mmu_notifier_ops.release, which is
invoked at the beginning of exit_mmap(), before the page tables are freed.  In
its implementation, KVM takes mmu_lock and zaps all its shadow page tables, a.k.a.
the stage2 tables in KVM arm64.  The flow in question, get_user_mapping_size(),
also runs under mmu_lock, and so effectively blocks exit_mmap() and thus is
guaranteed to run with live userspace tables.

Lastly, KVM also implements mmu_notifier_ops.invalidate_range_{start,end}.  KVM's
invalidate_range implementations also take mmu_lock, and also update a sequence
counter and a flag stating that there's an invalidation in progress.  When
installing a stage2 entry, KVM snapshots the sequence counter before taking
mmu_lock, and then checks it again after acquiring mmu_lock.  If the counter
mismatches, or an invalidation is in-progress, then KVM bails and resumes the
guest without fixing the fault.
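
Condensed, that pattern looks roughly like this in user_mem_abort()
(simplified sketch, error handling and most of the fault path omitted):

	mmu_seq = vcpu->kvm->mmu_notifier_seq;	/* snapshot before the walk */
	smp_rmb();

	/* ... resolve hva to pfn, walk userspace page tables, etc. ... */

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))	/* invalidation raced with us */
		goto out_unlock;		/* resume the guest, let it refault */

	/* ... safe to install the stage2 mapping under mmu_lock ... */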

E.g. if the host zaps userspace page tables and KVM "wins" the race, the subsequent
kvm_mmu_notifier_invalidate_range_start() will zap the recently installed stage2
entries.  And if the host zap "wins" the race, KVM will resume the guest, which
in normal operation will hit the exception again and go back through the entire
process of installing stage2 entries.

Looking at the arm64 code, one thing I'm not clear on is whether arm64 correctly
handles the case where exit_mmap() wins the race.  The invalidate_range hooks will
still be called, so userspace page tables aren't a problem, but
kvm_arch_flush_shadow_all() -> kvm_free_stage2_pgd() nullifies mmu->pgt without
any additional notifications that I see.  x86 deals with this by ensuring its
top-level TDP entry (stage2 equivalent) is valid while the page fault handler is
running.

  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
  {
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	spin_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
	spin_unlock(&kvm->mmu_lock);

	...
  }

AFAICT, nothing in user_mem_abort() would prevent consuming that null mmu->pgt
if exit_mmap() collided with user_mem_abort().

  static int user_mem_abort(...)
  {

	...

	spin_lock(&kvm->mmu_lock);
	pgt = vcpu->arch.hw_mmu->pgt;         <-- hw_mmu->pgt may be NULL (hw_mmu points at vcpu->kvm->arch.mmu)
	if (mmu_notifier_retry(kvm, mmu_seq)) <-- mmu_seq not guaranteed to change
		goto out_unlock;

	...

	if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
	} else {
		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
					     __pfn_to_phys(pfn), prot,
					     memcache);
	}
  }
Will Deacon July 21, 2021, 2:58 p.m. UTC | #5
Hey Sean,

On Tue, Jul 20, 2021 at 08:33:46PM +0000, Sean Christopherson wrote:
> On Tue, Jul 20, 2021, Alexandru Elisei wrote:
> > I just can't figure out why having the mmap lock is not needed to walk the
> > userspace page tables. Any hints? Or am I not seeing where it's taken?
> 
> Disclaimer: I'm not super familiar with arm64's page tables, but the relevant KVM
> functionality is common across x86 and arm64.

No need for the disclaimer, there are so many moving parts here that I don't
think it's possible to be familiar with them all! Thanks for taking the time
to write it up so clearly.

> KVM arm64 (and x86) unconditionally registers a mmu_notifier for the mm_struct
> associated with the VM, and disallows calling ioctls from a different process,
> i.e. walking the page tables during KVM_RUN is guaranteed to use the mm for which
> KVM registered the mmu_notifier.  As part of registration, the mmu_notifier
> does mmgrab() and doesn't do mmdrop() until it's unregistered.  That ensures the
> mm_struct itself is live.
> 
> For the page tables' liveness, KVM implements mmu_notifier_ops.release, which is
> invoked at the beginning of exit_mmap(), before the page tables are freed.  In
> its implementation, KVM takes mmu_lock and zaps all its shadow page tables, a.k.a.
> the stage2 tables in KVM arm64.  The flow in question, get_user_mapping_size(),
> also runs under mmu_lock, and so effectively blocks exit_mmap() and thus is
> guaranteed to run with live userspace tables.

Unless I missed a case, exit_mmap() only runs when mm_struct::mm_users drops
to zero, right? The vCPU tasks should hold references to that afaict, so I
don't think it should be possible for exit_mmap() to run while there are
vCPUs running with the corresponding page-table.

> Looking at the arm64 code, one thing I'm not clear on is whether arm64 correctly
> handles the case where exit_mmap() wins the race.  The invalidate_range hooks will
> still be called, so userspace page tables aren't a problem, but
> kvm_arch_flush_shadow_all() -> kvm_free_stage2_pgd() nullifies mmu->pgt without
> any additional notifications that I see.  x86 deals with this by ensuring its
> top-level TDP entry (stage2 equivalent) is valid while the page fault handler is
> running.

But the fact that x86 handles this race has me worried. What am I missing?

I agree that, if the race can occur, we don't appear to handle it in the
arm64 backend.

Cheers,

Will
Sean Christopherson July 21, 2021, 3:56 p.m. UTC | #6
On Wed, Jul 21, 2021, Will Deacon wrote:
> > For the page tables' liveness, KVM implements mmu_notifier_ops.release, which is
> > invoked at the beginning of exit_mmap(), before the page tables are freed.  In
> > its implementation, KVM takes mmu_lock and zaps all its shadow page tables, a.k.a.
> > the stage2 tables in KVM arm64.  The flow in question, get_user_mapping_size(),
> > also runs under mmu_lock, and so effectively blocks exit_mmap() and thus is
> > guaranteed to run with live userspace tables.
> 
> Unless I missed a case, exit_mmap() only runs when mm_struct::mm_users drops
> to zero, right?

Yep.

> The vCPU tasks should hold references to that afaict, so I don't think it
> should be possible for exit_mmap() to run while there are vCPUs running with
> the corresponding page-table.

Ah, right, I was thinking of non-KVM code that operated on the page tables without
holding a reference to mm_users.

> > Looking at the arm64 code, one thing I'm not clear on is whether arm64 correctly
> > handles the case where exit_mmap() wins the race.  The invalidate_range hooks will
> > still be called, so userspace page tables aren't a problem, but
> > kvm_arch_flush_shadow_all() -> kvm_free_stage2_pgd() nullifies mmu->pgt without
> > any additional notifications that I see.  x86 deals with this by ensuring its
> > top-level TDP entry (stage2 equivalent) is valid while the page fault handler is
> > running.
> 
> But the fact that x86 handles this race has me worried. What am I missing?

I don't think you're missing anything.  I forgot that KVM_RUN would require an
elevated mm_users.  x86 does handle the impossible race, but that's coincidental.
The extra protections in x86 are to deal with other cases where a vCPU's top-level
SPTE can be invalidated while the vCPU is running.
Alexandru Elisei July 21, 2021, 4:37 p.m. UTC | #7
Hi Sean,

Thank you for writing this, it explains exactly what I wanted to know.

On 7/20/21 9:33 PM, Sean Christopherson wrote:
> On Tue, Jul 20, 2021, Alexandru Elisei wrote:
>> Hi Marc,
>>
>> I just can't figure out why having the mmap lock is not needed to walk the
>> userspace page tables. Any hints? Or am I not seeing where it's taken?
> Disclaimer: I'm not super familiar with arm64's page tables, but the relevant KVM
> functionality is common across x86 and arm64.
>
> KVM arm64 (and x86) unconditionally registers a mmu_notifier for the mm_struct
> associated with the VM, and disallows calling ioctls from a different process,
> i.e. walking the page tables during KVM_RUN is guaranteed to use the mm for which
> KVM registered the mmu_notifier.  As part of registration, the mmu_notifier
> does mmgrab() and doesn't do mmdrop() until it's unregistered.  That ensures the
> mm_struct itself is live.
>
> For the page tables' liveness, KVM implements mmu_notifier_ops.release, which is
> invoked at the beginning of exit_mmap(), before the page tables are freed.  In
> its implementation, KVM takes mmu_lock and zaps all its shadow page tables, a.k.a.
> the stage2 tables in KVM arm64.  The flow in question, get_user_mapping_size(),
> also runs under mmu_lock, and so effectively blocks exit_mmap() and thus is
> guaranteed to run with live userspace tables.
>
> Lastly, KVM also implements mmu_notifier_ops.invalidate_range_{start,end}.  KVM's
> invalidate_range implementations also take mmu_lock, and also update a sequence
> counter and a flag stating that there's an invalidation in progress.  When
> installing a stage2 entry, KVM snapshots the sequence counter before taking
> mmu_lock, and then checks it again after acquiring mmu_lock.  If the counter
> mismatches, or an invalidation is in-progress, then KVM bails and resumes the
> guest without fixing the fault.
>
> E.g. if the host zaps userspace page tables and KVM "wins" the race, the subsequent
> kvm_mmu_notifier_invalidate_range_start() will zap the recently installed stage2
> entries.  And if the host zap "wins" the race, KVM will resume the guest, which
> in normal operation will hit the exception again and go back through the entire
> process of installing stage2 entries.
>
> Looking at the arm64 code, one thing I'm not clear on is whether arm64 correctly
> handles the case where exit_mmap() wins the race.  The invalidate_range hooks will
> still be called, so userspace page tables aren't a problem, but
> kvm_arch_flush_shadow_all() -> kvm_free_stage2_pgd() nullifies mmu->pgt without
> any additional notifications that I see.  x86 deals with this by ensuring its
> top-level TDP entry (stage2 equivalent) is valid while the page fault handler is
> running.
>
>   void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>   {
> 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
> 	struct kvm_pgtable *pgt = NULL;
>
> 	spin_lock(&kvm->mmu_lock);
> 	pgt = mmu->pgt;
> 	if (pgt) {
> 		mmu->pgd_phys = 0;
> 		mmu->pgt = NULL;
> 		free_percpu(mmu->last_vcpu_ran);
> 	}
> 	spin_unlock(&kvm->mmu_lock);
>
> 	...
>   }
>
> AFAICT, nothing in user_mem_abort() would prevent consuming that null mmu->pgt
> if exit_mmap() collided with user_mem_abort().
>
>   static int user_mem_abort(...)
>   {
>
> 	...
>
> 	spin_lock(&kvm->mmu_lock);
> 	pgt = vcpu->arch.hw_mmu->pgt;         <-- hw_mmu->pgt may be NULL (hw_mmu points at vcpu->kvm->arch.mmu)
> 	if (mmu_notifier_retry(kvm, mmu_seq)) <-- mmu_seq not guaranteed to change
> 		goto out_unlock;
>
> 	...
>
> 	if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
> 		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
> 	} else {
> 		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
> 					     __pfn_to_phys(pfn), prot,
> 					     memcache);
> 	}
>   }
Marc Zyngier July 23, 2021, 8:48 a.m. UTC | #8
On Tue, 20 Jul 2021 18:23:02 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> I just can't figure out why having the mmap lock is not needed to walk the
> userspace page tables. Any hints? Or am I not seeing where it's taken?

I trust Sean's explanation was complete enough!

> On 7/17/21 10:55 AM, Marc Zyngier wrote:
> > We currently rely on the kvm_is_transparent_hugepage() helper to
> > discover whether a given page has the potential to be mapped as
> > a block mapping.
> >
> > However, this API doesn't really give us everything we want:
> > - we don't get the size: this is not crucial today as we only
> >   support PMD-sized THPs, but we'd like to have larger sizes
> >   in the future
> > - we're the only user left of the API, and there is a will
> >   to remove it altogether
> >
> > To address the above, implement a simple walker using the existing
> > page table infrastructure, and plumb it into transparent_hugepage_adjust().
> > No new page sizes are supported in the process.
> >
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > ---
> >  arch/arm64/kvm/mmu.c | 46 ++++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 42 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 3155c9e778f0..db6314b93e99 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -433,6 +433,44 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
> >  	return 0;
> >  }
> >  
> > +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
> > +	/* We shouldn't need any other callback to walk the PT */
> > +	.phys_to_virt		= kvm_host_va,
> > +};
> > +
> > +struct user_walk_data {
> > +	u32	level;
> > +};
> > +
> > +static int user_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> > +		       enum kvm_pgtable_walk_flags flag, void * const arg)
> > +{
> > +	struct user_walk_data *data = arg;
> > +
> > +	data->level = level;
> > +	return 0;
> > +}
> > +
> > +static int get_user_mapping_size(struct kvm *kvm, u64 addr)
> > +{
> > +	struct user_walk_data data;
> > +	struct kvm_pgtable pgt = {
> > +		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
> > +		.ia_bits	= VA_BITS,
> > +		.start_level	= 4 - CONFIG_PGTABLE_LEVELS,
> > +		.mm_ops		= &kvm_user_mm_ops,
> > +	};
> > +	struct kvm_pgtable_walker walker = {
> > +		.cb		= user_walker,
> > +		.flags		= KVM_PGTABLE_WALK_LEAF,
> > +		.arg		= &data,
> > +	};
> > +
> > +	kvm_pgtable_walk(&pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE, &walker);
> 
> I take it that it is guaranteed that kvm_pgtable_walk() will never
> fail? For example, I can see it failing if someone messes with
> KVM_PGTABLE_MAX_LEVELS.

But that's an architectural constant. How could it be messed with?
When we introduce 5 levels of page tables, we'll have to check all
this anyway.

> To be honest, I would rather have a check here instead of
> potentially feeding a bogus value to ARM64_HW_PGTABLE_LEVEL_SHIFT.
> It could be a VM_WARN_ON, so there's no runtime overhead unless
> CONFIG_DEBUG_VM.

Fair enough. That's easy enough to check.

> The patch looks good to me so far, but I want to give it another
> look (or two) after I figure out why the mmap semaphone is not
> needed.

Thanks,

	M.

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3155c9e778f0..db6314b93e99 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -433,6 +433,44 @@  int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
 	return 0;
 }
 
+static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
+	/* We shouldn't need any other callback to walk the PT */
+	.phys_to_virt		= kvm_host_va,
+};
+
+struct user_walk_data {
+	u32	level;
+};
+
+static int user_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+		       enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+	struct user_walk_data *data = arg;
+
+	data->level = level;
+	return 0;
+}
+
+static int get_user_mapping_size(struct kvm *kvm, u64 addr)
+{
+	struct user_walk_data data;
+	struct kvm_pgtable pgt = {
+		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
+		.ia_bits	= VA_BITS,
+		.start_level	= 4 - CONFIG_PGTABLE_LEVELS,
+		.mm_ops		= &kvm_user_mm_ops,
+	};
+	struct kvm_pgtable_walker walker = {
+		.cb		= user_walker,
+		.flags		= KVM_PGTABLE_WALK_LEAF,
+		.arg		= &data,
+	};
+
+	kvm_pgtable_walk(&pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE, &walker);
+
+	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(data.level));
+}
+
 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
 	.zalloc_page		= stage2_memcache_zalloc_page,
 	.zalloc_pages_exact	= kvm_host_zalloc_pages_exact,
@@ -780,7 +818,7 @@  static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
  * Returns the size of the mapping.
  */
 static unsigned long
-transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
+transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			    unsigned long hva, kvm_pfn_t *pfnp,
 			    phys_addr_t *ipap)
 {
@@ -791,8 +829,8 @@  transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
 	 * sure that the HVA and IPA are sufficiently aligned and that the
 	 * block map is contained within the memslot.
 	 */
-	if (kvm_is_transparent_hugepage(pfn) &&
-	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+	    get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
 		/*
 		 * The address we faulted on is backed by a transparent huge
 		 * page.  However, because we map the compound huge page and
@@ -1051,7 +1089,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * backed by a THP and thus use block mapping if possible.
 	 */
 	if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
-		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
+		vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva,
 							   &pfn, &fault_ipa);
 
 	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {