[12/17] kvm-arm: Add explicit hyp page table modifiers

Message ID	1459787177-12767-13-git-send-email-suzuki.poulose@arm.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@kernel.org> From: Suzuki K Poulose <suzuki.poulose@arm.com> To: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org, kvmarm@lists.cs.columbia.edu, kvm@vger.kernel.org, marc.zyngier@arm.com, christoffer.dall@linaro.org, mark.rutland@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, Suzuki K Poulose <suzuki.poulose@arm.com> Subject: [PATCH 12/17] kvm-arm: Add explicit hyp page table modifiers Date: Mon, 4 Apr 2016 17:26:12 +0100 Message-Id: <1459787177-12767-13-git-send-email-suzuki.poulose@arm.com> In-Reply-To: <1459787177-12767-1-git-send-email-suzuki.poulose@arm.com> References: <1459787177-12767-1-git-send-email-suzuki.poulose@arm.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk

Suzuki K Poulose April 4, 2016, 4:26 p.m. UTC

We have common routines to modify hyp and stage2 page tables
based on the 'kvm' parameter. For a smoother transition to
using separate routines for each, duplicate the routines
and modify the copy to work on hyp.

Marks the forked routines with _hyp_ and gets rid of the
kvm parameter which is no longer needed and is NULL for hyp.
Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls
from the hyp versions. Uses explicit host page table accessors
instead of the kvm_* page table helpers.

Suggested-by: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
---
 arch/arm/kvm/mmu.c |  123 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 118 insertions(+), 5 deletions(-)

Christoffer Dall April 8, 2016, 1:15 p.m. UTC | #1

On Mon, Apr 04, 2016 at 05:26:12PM +0100, Suzuki K Poulose wrote:
> We have common routines to modify hyp and stage2 page tables
> based on the 'kvm' parameter. For a smoother transition to
> using separate routines for each, duplicate the routines
> and modify the copy to work on hyp.
> 
> Marks the forked routines with _hyp_ and gets rid of the
> kvm parameter which is no longer needed and is NULL for hyp.
> Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls
> from the hyp versions. Uses explicit host page table accessors
> instead of the kvm_* page table helpers.
> 
> Suggested-by: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> ---
>  arch/arm/kvm/mmu.c |  123 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 118 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index b46a337..2b491e5 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -388,6 +388,119 @@ static void stage2_flush_vm(struct kvm *kvm)
>  	srcu_read_unlock(&kvm->srcu, idx);
>  }
>  
> +static void clear_hyp_pgd_entry(pgd_t *pgd)
> +{
> +	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
> +	pgd_clear(pgd);
> +	pud_free(NULL, pud_table);
> +	put_page(virt_to_page(pgd));
> +}
> +
> +static void clear_hyp_pud_entry(pud_t *pud)
> +{
> +	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
> +	VM_BUG_ON(pud_huge(*pud));
> +	pud_clear(pud);
> +	pmd_free(NULL, pmd_table);
> +	put_page(virt_to_page(pud));
> +}
> +
> +static void clear_hyp_pmd_entry(pmd_t *pmd)
> +{
> +	pte_t *pte_table = pte_offset_kernel(pmd, 0);
> +	VM_BUG_ON(pmd_thp_or_huge(*pmd));
> +	pmd_clear(pmd);
> +	pte_free_kernel(NULL, pte_table);
> +	put_page(virt_to_page(pmd));
> +}
> +
> +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
> +{
> +	pte_t *pte, *start_pte;
> +
> +	start_pte = pte = pte_offset_kernel(pmd, addr);
> +	do {
> +		if (!pte_none(*pte)) {
> +			pte_t old_pte = *pte;
> +
> +			kvm_set_pte(pte, __pte(0));
> +
> +			/* XXX: Do we need to invalidate the cache for device mappings ? */

no, we will not be swapping out any pages mapped in Hyp mode so you can
get rid of both of the following two lines.

> +			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
> +				kvm_flush_dcache_pte(old_pte);
> +
> +			put_page(virt_to_page(pte));
> +		}
> +	} while (pte++, addr += PAGE_SIZE, addr != end);
> +
> +	if (hyp_pte_table_empty(start_pte))
> +		clear_hyp_pmd_entry(pmd);
> +}
> +
> +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
> +{
> +	phys_addr_t next;
> +	pmd_t *pmd, *start_pmd;
> +
> +	start_pmd = pmd = pmd_offset(pud, addr);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (!pmd_none(*pmd)) {
> +			if (pmd_thp_or_huge(*pmd)) {

do we ever actually map anything with section mappings in the Hyp
mappings?

> +				pmd_t old_pmd = *pmd;
> +
> +				pmd_clear(pmd);
> +				kvm_flush_dcache_pmd(old_pmd);
> +				put_page(virt_to_page(pmd));
> +			} else {
> +				unmap_hyp_ptes(pmd, addr, next);
> +			}
> +		}
> +	} while (pmd++, addr = next, addr != end);
> +
> +	if (hyp_pmd_table_empty(start_pmd))
> +		clear_hyp_pud_entry(pud);
> +}
> +
> +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
> +{
> +	phys_addr_t next;
> +	pud_t *pud, *start_pud;
> +
> +	start_pud = pud = pud_offset(pgd, addr);
> +	do {
> +		next = pud_addr_end(addr, end);
> +		if (!pud_none(*pud)) {
> +			if (pud_huge(*pud)) {

do we ever actually map anything with huge pud
mappings for the Hyp space?

> +				pud_t old_pud = *pud;
> +
> +				pud_clear(pud);
> +				kvm_flush_dcache_pud(old_pud);
> +				put_page(virt_to_page(pud));
> +			} else {
> +				unmap_hyp_pmds(pud, addr, next);
> +			}
> +		}
> +	} while (pud++, addr = next, addr != end);
> +
> +	if (hyp_pud_table_empty(start_pud))
> +		clear_hyp_pgd_entry(pgd);
> +}
> +
> +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
> +{
> +	pgd_t *pgd;
> +	phys_addr_t addr = start, end = start + size;
> +	phys_addr_t next;
> +
> +	pgd = pgdp + pgd_index(addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (!pgd_none(*pgd))
> +			unmap_hyp_puds(pgd, addr, next);
> +	} while (pgd++, addr = next, addr != end);

shouldn't we flush the EL2 (hyp) TLB here, strictly speaking?

Or do we rely on all mappings ever created/torn down here to always have
the same VA/PA relationship?  Since we didn't flush the EL2 TLB in the
existing code, that indeed does seem to be the case.

That, in turn, raises the question why we don't simply map all pages
that could be referenced by a kmalloc() in Hyp mode during the init
phase and be done with all this hyp mapping/unmapping stuff?

In any case, that behavior doesn't have to change now, but if we don't
add a TLB flush here, I'd like a comment to explain why that's not
needed.

> +}
> +
>  /**
>   * free_boot_hyp_pgd - free HYP boot page tables
>   *
> @@ -398,14 +511,14 @@ void free_boot_hyp_pgd(void)
>  	mutex_lock(&kvm_hyp_pgd_mutex);
>  
>  	if (boot_hyp_pgd) {
> -		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
> -		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
> +		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
> +		unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
>  		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
>  		boot_hyp_pgd = NULL;
>  	}
>  
>  	if (hyp_pgd)
> -		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
> +		unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
>  
>  	mutex_unlock(&kvm_hyp_pgd_mutex);
>  }
> @@ -430,9 +543,9 @@ void free_hyp_pgds(void)
>  
>  	if (hyp_pgd) {
>  		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
> -			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
> +			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
>  		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
> -			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
> +			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
>  
>  		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
>  		hyp_pgd = NULL;
> -- 
> 1.7.9.5
> 

Otherwise looks good!

I'm quite happy to see the hyp/stage2 stuff decoupled.

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Marc Zyngier April 8, 2016, 3:09 p.m. UTC | #2

On 08/04/16 14:15, Christoffer Dall wrote:
> On Mon, Apr 04, 2016 at 05:26:12PM +0100, Suzuki K Poulose wrote:
>> We have common routines to modify hyp and stage2 page tables
>> based on the 'kvm' parameter. For a smoother transition to
>> using separate routines for each, duplicate the routines
>> and modify the copy to work on hyp.
>>
>> Marks the forked routines with _hyp_ and gets rid of the
>> kvm parameter which is no longer needed and is NULL for hyp.
>> Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls
>> from the hyp versions. Uses explicit host page table accessors
>> instead of the kvm_* page table helpers.
>>
>> Suggested-by: Christoffer Dall <christoffer.dall@linaro.org>
>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>> ---
>>  arch/arm/kvm/mmu.c |  123 +++++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 118 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index b46a337..2b491e5 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -388,6 +388,119 @@ static void stage2_flush_vm(struct kvm *kvm)
>>  	srcu_read_unlock(&kvm->srcu, idx);
>>  }
>>  
>> +static void clear_hyp_pgd_entry(pgd_t *pgd)
>> +{
>> +	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
>> +	pgd_clear(pgd);
>> +	pud_free(NULL, pud_table);
>> +	put_page(virt_to_page(pgd));
>> +}
>> +
>> +static void clear_hyp_pud_entry(pud_t *pud)
>> +{
>> +	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
>> +	VM_BUG_ON(pud_huge(*pud));
>> +	pud_clear(pud);
>> +	pmd_free(NULL, pmd_table);
>> +	put_page(virt_to_page(pud));
>> +}
>> +
>> +static void clear_hyp_pmd_entry(pmd_t *pmd)
>> +{
>> +	pte_t *pte_table = pte_offset_kernel(pmd, 0);
>> +	VM_BUG_ON(pmd_thp_or_huge(*pmd));
>> +	pmd_clear(pmd);
>> +	pte_free_kernel(NULL, pte_table);
>> +	put_page(virt_to_page(pmd));
>> +}
>> +
>> +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	pte_t *pte, *start_pte;
>> +
>> +	start_pte = pte = pte_offset_kernel(pmd, addr);
>> +	do {
>> +		if (!pte_none(*pte)) {
>> +			pte_t old_pte = *pte;
>> +
>> +			kvm_set_pte(pte, __pte(0));
>> +
>> +			/* XXX: Do we need to invalidate the cache for device mappings ? */
> 
> no, we will not be swapping out any pages mapped in Hyp mode so you can
> get rid of both of the following two lines.
> 
>> +			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
>> +				kvm_flush_dcache_pte(old_pte);
>> +
>> +			put_page(virt_to_page(pte));
>> +		}
>> +	} while (pte++, addr += PAGE_SIZE, addr != end);
>> +
>> +	if (hyp_pte_table_empty(start_pte))
>> +		clear_hyp_pmd_entry(pmd);
>> +}
>> +
>> +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	phys_addr_t next;
>> +	pmd_t *pmd, *start_pmd;
>> +
>> +	start_pmd = pmd = pmd_offset(pud, addr);
>> +	do {
>> +		next = pmd_addr_end(addr, end);
>> +		if (!pmd_none(*pmd)) {
>> +			if (pmd_thp_or_huge(*pmd)) {
> 
> do we ever actually map anything with section mappings in the Hyp
> mappings?

No, this is purely a page mapping so far. On my system, the HYP text is
just over 4 pages big (4k pages), so the incentive is pretty low, unless
we can demonstrate some big gains due to the reduced TLB impact.

>> +				pmd_t old_pmd = *pmd;
>> +
>> +				pmd_clear(pmd);
>> +				kvm_flush_dcache_pmd(old_pmd);
>> +				put_page(virt_to_page(pmd));
>> +			} else {
>> +				unmap_hyp_ptes(pmd, addr, next);
>> +			}
>> +		}
>> +	} while (pmd++, addr = next, addr != end);
>> +
>> +	if (hyp_pmd_table_empty(start_pmd))
>> +		clear_hyp_pud_entry(pud);
>> +}
>> +
>> +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
>> +{
>> +	phys_addr_t next;
>> +	pud_t *pud, *start_pud;
>> +
>> +	start_pud = pud = pud_offset(pgd, addr);
>> +	do {
>> +		next = pud_addr_end(addr, end);
>> +		if (!pud_none(*pud)) {
>> +			if (pud_huge(*pud)) {
> 
> do we ever actually map anything with huge pud
> mappings for the Hyp space?

Same thing. Looks like there is some potential simplification here.

> 
>> +				pud_t old_pud = *pud;
>> +
>> +				pud_clear(pud);
>> +				kvm_flush_dcache_pud(old_pud);
>> +				put_page(virt_to_page(pud));
>> +			} else {
>> +				unmap_hyp_pmds(pud, addr, next);
>> +			}
>> +		}
>> +	} while (pud++, addr = next, addr != end);
>> +
>> +	if (hyp_pud_table_empty(start_pud))
>> +		clear_hyp_pgd_entry(pgd);
>> +}
>> +
>> +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
>> +{
>> +	pgd_t *pgd;
>> +	phys_addr_t addr = start, end = start + size;
>> +	phys_addr_t next;
>> +
>> +	pgd = pgdp + pgd_index(addr);
>> +	do {
>> +		next = pgd_addr_end(addr, end);
>> +		if (!pgd_none(*pgd))
>> +			unmap_hyp_puds(pgd, addr, next);
>> +	} while (pgd++, addr = next, addr != end);
> 
> shouldn't we flush the EL2 (hyp) TLB here, strictly speaking?
> 
> Or do we rely on all mappings ever created/torn down here to always have
> the same VA/PA relationship?  Since we didn't flush the EL2 TLB in the
> existing code, that indeed does seem to be the case.

Actually, we never unmap anything from HYP. Once a structure (kvm, vcpu)
is mapped there, it stays forever, whatever happens to the VM (that's
because we'd otherwise have to refcount the number of objects in a page,
and I'm lazy...).

> That, in turn, raises the question why we don't simply map all pages
> that could be referenced by a kmalloc() in Hyp mode during the init
> phase and be done with all this hyp mapping/unmapping stuff?
> 
> In any case, that behavior doesn't have to change now, but if we don't
> add a TLB flush here, I'd like a comment to explain why that's not
> needed.

Hope you have your answer above... ;-)

Thanks,

	M.

Christoffer Dall April 8, 2016, 3:16 p.m. UTC | #3

On Fri, Apr 08, 2016 at 04:09:11PM +0100, Marc Zyngier wrote:
> On 08/04/16 14:15, Christoffer Dall wrote:
> > On Mon, Apr 04, 2016 at 05:26:12PM +0100, Suzuki K Poulose wrote:
> >> We have common routines to modify hyp and stage2 page tables
> >> based on the 'kvm' parameter. For a smoother transition to
> >> using separate routines for each, duplicate the routines
> >> and modify the copy to work on hyp.
> >>
> >> Marks the forked routines with _hyp_ and gets rid of the
> >> kvm parameter which is no longer needed and is NULL for hyp.
> >> Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls
> >> from the hyp versions. Uses explicit host page table accessors
> >> instead of the kvm_* page table helpers.
> >>
> >> Suggested-by: Christoffer Dall <christoffer.dall@linaro.org>
> >> Cc: Marc Zyngier <marc.zyngier@arm.com>
> >> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> >> ---
> >>  arch/arm/kvm/mmu.c |  123 +++++++++++++++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 118 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >> index b46a337..2b491e5 100644
> >> --- a/arch/arm/kvm/mmu.c
> >> +++ b/arch/arm/kvm/mmu.c
> >> @@ -388,6 +388,119 @@ static void stage2_flush_vm(struct kvm *kvm)
> >>  	srcu_read_unlock(&kvm->srcu, idx);
> >>  }
> >>  
> >> +static void clear_hyp_pgd_entry(pgd_t *pgd)
> >> +{
> >> +	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
> >> +	pgd_clear(pgd);
> >> +	pud_free(NULL, pud_table);
> >> +	put_page(virt_to_page(pgd));
> >> +}
> >> +
> >> +static void clear_hyp_pud_entry(pud_t *pud)
> >> +{
> >> +	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
> >> +	VM_BUG_ON(pud_huge(*pud));
> >> +	pud_clear(pud);
> >> +	pmd_free(NULL, pmd_table);
> >> +	put_page(virt_to_page(pud));
> >> +}
> >> +
> >> +static void clear_hyp_pmd_entry(pmd_t *pmd)
> >> +{
> >> +	pte_t *pte_table = pte_offset_kernel(pmd, 0);
> >> +	VM_BUG_ON(pmd_thp_or_huge(*pmd));
> >> +	pmd_clear(pmd);
> >> +	pte_free_kernel(NULL, pte_table);
> >> +	put_page(virt_to_page(pmd));
> >> +}
> >> +
> >> +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
> >> +{
> >> +	pte_t *pte, *start_pte;
> >> +
> >> +	start_pte = pte = pte_offset_kernel(pmd, addr);
> >> +	do {
> >> +		if (!pte_none(*pte)) {
> >> +			pte_t old_pte = *pte;
> >> +
> >> +			kvm_set_pte(pte, __pte(0));
> >> +
> >> +			/* XXX: Do we need to invalidate the cache for device mappings ? */
> > 
> > no, we will not be swapping out any pages mapped in Hyp mode so you can
> > get rid of both of the following two lines.
> > 
> >> +			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
> >> +				kvm_flush_dcache_pte(old_pte);
> >> +
> >> +			put_page(virt_to_page(pte));
> >> +		}
> >> +	} while (pte++, addr += PAGE_SIZE, addr != end);
> >> +
> >> +	if (hyp_pte_table_empty(start_pte))
> >> +		clear_hyp_pmd_entry(pmd);
> >> +}
> >> +
> >> +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
> >> +{
> >> +	phys_addr_t next;
> >> +	pmd_t *pmd, *start_pmd;
> >> +
> >> +	start_pmd = pmd = pmd_offset(pud, addr);
> >> +	do {
> >> +		next = pmd_addr_end(addr, end);
> >> +		if (!pmd_none(*pmd)) {
> >> +			if (pmd_thp_or_huge(*pmd)) {
> > 
> > do we ever actually map anything with section mappings in the Hyp
> > mappings?
> 
> No, this is purely a page mapping so far. On my system, the HYP text is
> just over 4 pages big (4k pages), so the incentive is pretty low, unless
> we can demonstrate some big gains due to the reduced TLB impact.
> 
> >> +				pmd_t old_pmd = *pmd;
> >> +
> >> +				pmd_clear(pmd);
> >> +				kvm_flush_dcache_pmd(old_pmd);
> >> +				put_page(virt_to_page(pmd));
> >> +			} else {
> >> +				unmap_hyp_ptes(pmd, addr, next);
> >> +			}
> >> +		}
> >> +	} while (pmd++, addr = next, addr != end);
> >> +
> >> +	if (hyp_pmd_table_empty(start_pmd))
> >> +		clear_hyp_pud_entry(pud);
> >> +}
> >> +
> >> +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
> >> +{
> >> +	phys_addr_t next;
> >> +	pud_t *pud, *start_pud;
> >> +
> >> +	start_pud = pud = pud_offset(pgd, addr);
> >> +	do {
> >> +		next = pud_addr_end(addr, end);
> >> +		if (!pud_none(*pud)) {
> >> +			if (pud_huge(*pud)) {
> > 
> > do we ever actually map anything with huge pud
> > mappings for the Hyp space?
> 
> Same thing. Looks like there is some potential simplification here.
> 
> > 
> >> +				pud_t old_pud = *pud;
> >> +
> >> +				pud_clear(pud);
> >> +				kvm_flush_dcache_pud(old_pud);
> >> +				put_page(virt_to_page(pud));
> >> +			} else {
> >> +				unmap_hyp_pmds(pud, addr, next);
> >> +			}
> >> +		}
> >> +	} while (pud++, addr = next, addr != end);
> >> +
> >> +	if (hyp_pud_table_empty(start_pud))
> >> +		clear_hyp_pgd_entry(pgd);
> >> +}
> >> +
> >> +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
> >> +{
> >> +	pgd_t *pgd;
> >> +	phys_addr_t addr = start, end = start + size;
> >> +	phys_addr_t next;
> >> +
> >> +	pgd = pgdp + pgd_index(addr);
> >> +	do {
> >> +		next = pgd_addr_end(addr, end);
> >> +		if (!pgd_none(*pgd))
> >> +			unmap_hyp_puds(pgd, addr, next);
> >> +	} while (pgd++, addr = next, addr != end);
> > 
> > shouldn't we flush the EL2 (hyp) TLB here, strictly speaking?
> > 
> > Or do we rely on all mappings ever created/torn down here to always have
> > the same VA/PA relationship?  Since we didn't flush the EL2 TLB in the
> > existing code, that indeed does seem to be the case.
> 
> Actually, we never unmap anything from HYP. Once a structure (kvm, vcpu)
> is mapped there, it stays forever, whatever happens to the VM (that's
> because we'd otherwise have to refcount the number of objects in a page,
> and I'm lazy...).
> 
> > That, in turn, raises the question why we don't simply map all pages
> > that could be referenced by a kmalloc() in Hyp mode during the init
> > phase and be done with all this hyp mapping/unmapping stuff?
> > 
> > In any case, that behavior doesn't have to change now, but if we don't
> > add a TLB flush here, I'd like a comment to explain why that's not
> > needed.
> 
> Hope you have your answer above... ;-)
> 
Not quite: Could we just map the linearly mapped region in Hyp mode from
the beginning and be done with all this?

Otherwise yes, I have the answer, and we should add a comment too.

Thanks,
-Christoffer


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Marc Zyngier April 8, 2016, 3:22 p.m. UTC | #4

On 08/04/16 16:16, Christoffer Dall wrote:
> On Fri, Apr 08, 2016 at 04:09:11PM +0100, Marc Zyngier wrote:
>> On 08/04/16 14:15, Christoffer Dall wrote:
>>> On Mon, Apr 04, 2016 at 05:26:12PM +0100, Suzuki K Poulose wrote:
>>>> We have common routines to modify hyp and stage2 page tables
>>>> based on the 'kvm' parameter. For a smoother transition to
>>>> using separate routines for each, duplicate the routines
>>>> and modify the copy to work on hyp.
>>>>
>>>> Marks the forked routines with _hyp_ and gets rid of the
>>>> kvm parameter which is no longer needed and is NULL for hyp.
>>>> Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls
>>>> from the hyp versions. Uses explicit host page table accessors
>>>> instead of the kvm_* page table helpers.
>>>>
>>>> Suggested-by: Christoffer Dall <christoffer.dall@linaro.org>
>>>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>>>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>>>> ---
>>>>  arch/arm/kvm/mmu.c |  123 +++++++++++++++++++++++++++++++++++++++++++++++++---
>>>>  1 file changed, 118 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index b46a337..2b491e5 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -388,6 +388,119 @@ static void stage2_flush_vm(struct kvm *kvm)
>>>>  	srcu_read_unlock(&kvm->srcu, idx);
>>>>  }
>>>>  
>>>> +static void clear_hyp_pgd_entry(pgd_t *pgd)
>>>> +{
>>>> +	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
>>>> +	pgd_clear(pgd);
>>>> +	pud_free(NULL, pud_table);
>>>> +	put_page(virt_to_page(pgd));
>>>> +}
>>>> +
>>>> +static void clear_hyp_pud_entry(pud_t *pud)
>>>> +{
>>>> +	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
>>>> +	VM_BUG_ON(pud_huge(*pud));
>>>> +	pud_clear(pud);
>>>> +	pmd_free(NULL, pmd_table);
>>>> +	put_page(virt_to_page(pud));
>>>> +}
>>>> +
>>>> +static void clear_hyp_pmd_entry(pmd_t *pmd)
>>>> +{
>>>> +	pte_t *pte_table = pte_offset_kernel(pmd, 0);
>>>> +	VM_BUG_ON(pmd_thp_or_huge(*pmd));
>>>> +	pmd_clear(pmd);
>>>> +	pte_free_kernel(NULL, pte_table);
>>>> +	put_page(virt_to_page(pmd));
>>>> +}
>>>> +
>>>> +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
>>>> +{
>>>> +	pte_t *pte, *start_pte;
>>>> +
>>>> +	start_pte = pte = pte_offset_kernel(pmd, addr);
>>>> +	do {
>>>> +		if (!pte_none(*pte)) {
>>>> +			pte_t old_pte = *pte;
>>>> +
>>>> +			kvm_set_pte(pte, __pte(0));
>>>> +
>>>> +			/* XXX: Do we need to invalidate the cache for device mappings ? */
>>>
>>> no, we will not be swapping out any pages mapped in Hyp mode so you can
>>> get rid of both of the following two lines.
>>>
>>>> +			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
>>>> +				kvm_flush_dcache_pte(old_pte);
>>>> +
>>>> +			put_page(virt_to_page(pte));
>>>> +		}
>>>> +	} while (pte++, addr += PAGE_SIZE, addr != end);
>>>> +
>>>> +	if (hyp_pte_table_empty(start_pte))
>>>> +		clear_hyp_pmd_entry(pmd);
>>>> +}
>>>> +
>>>> +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
>>>> +{
>>>> +	phys_addr_t next;
>>>> +	pmd_t *pmd, *start_pmd;
>>>> +
>>>> +	start_pmd = pmd = pmd_offset(pud, addr);
>>>> +	do {
>>>> +		next = pmd_addr_end(addr, end);
>>>> +		if (!pmd_none(*pmd)) {
>>>> +			if (pmd_thp_or_huge(*pmd)) {
>>>
>>> do we ever actually map anything with section mappings in the Hyp
>>> mappings?
>>
>> No, this is purely a page mapping so far. On my system, the HYP text is
>> just over 4 pages big (4k pages), so the incentive is pretty low, unless
>> we can demonstrate some big gains due to the reduced TLB impact.
>>
>>>> +				pmd_t old_pmd = *pmd;
>>>> +
>>>> +				pmd_clear(pmd);
>>>> +				kvm_flush_dcache_pmd(old_pmd);
>>>> +				put_page(virt_to_page(pmd));
>>>> +			} else {
>>>> +				unmap_hyp_ptes(pmd, addr, next);
>>>> +			}
>>>> +		}
>>>> +	} while (pmd++, addr = next, addr != end);
>>>> +
>>>> +	if (hyp_pmd_table_empty(start_pmd))
>>>> +		clear_hyp_pud_entry(pud);
>>>> +}
>>>> +
>>>> +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
>>>> +{
>>>> +	phys_addr_t next;
>>>> +	pud_t *pud, *start_pud;
>>>> +
>>>> +	start_pud = pud = pud_offset(pgd, addr);
>>>> +	do {
>>>> +		next = pud_addr_end(addr, end);
>>>> +		if (!pud_none(*pud)) {
>>>> +			if (pud_huge(*pud)) {
>>>
>>> do we ever actually map anything with huge pud
>>> mappings for the Hyp space?
>>
>> Same thing. Looks like there is some potential simplification here.
>>
>>>
>>>> +				pud_t old_pud = *pud;
>>>> +
>>>> +				pud_clear(pud);
>>>> +				kvm_flush_dcache_pud(old_pud);
>>>> +				put_page(virt_to_page(pud));
>>>> +			} else {
>>>> +				unmap_hyp_pmds(pud, addr, next);
>>>> +			}
>>>> +		}
>>>> +	} while (pud++, addr = next, addr != end);
>>>> +
>>>> +	if (hyp_pud_table_empty(start_pud))
>>>> +		clear_hyp_pgd_entry(pgd);
>>>> +}
>>>> +
>>>> +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
>>>> +{
>>>> +	pgd_t *pgd;
>>>> +	phys_addr_t addr = start, end = start + size;
>>>> +	phys_addr_t next;
>>>> +
>>>> +	pgd = pgdp + pgd_index(addr);
>>>> +	do {
>>>> +		next = pgd_addr_end(addr, end);
>>>> +		if (!pgd_none(*pgd))
>>>> +			unmap_hyp_puds(pgd, addr, next);
>>>> +	} while (pgd++, addr = next, addr != end);
>>>
>>> shouldn't we flush the EL2 (hyp) TLB here, strictly speaking?
>>>
>>> Or do we rely on all mappings ever created/torn down here to always have
>>> the same VA/PA relationship?  Since we didn't flush the EL2 TLB in the
>>> existing code, that indeed does seem to be the case.
>>
>> Actually, we never unmap anything from HYP. Once a structure (kvm, vcpu)
>> is mapped there, it stays forever, whatever happens to the VM (that's
>> because we'd otherwise have to refcount the number of objects in a page,
>> and I'm lazy...).
>>
>>> That, in turn, raises the question why we don't simply map all pages
>>> that could be referenced by a kmalloc() in Hyp mode during the init
>>> phase and be done with all this hyp mapping/unmapping stuff?
>>>
>>> In any case, that behavior doesn't have to change now, but if we don't
>>> add a TLB flush here, I'd like a comment to explain why that's not
>>> needed.
>>
>> Hope you have your answer above... ;-)
>>
> Not quite: Could we just map the linearly mapped region in Hyp mode from
> the beginning and be done with all this?

We could. Not sure what the implications may be, apart from using more
memory for page tables. In that case, section mappings would be in order
though.

> Otherwise yes, I have the answer, and we should add a comment too.

Agreed.

	M.

Suzuki K Poulose April 8, 2016, 3:22 p.m. UTC | #5

On 08/04/16 16:09, Marc Zyngier wrote:
> On 08/04/16 14:15, Christoffer Dall wrote:
>> On Mon, Apr 04, 2016 at 05:26:12PM +0100, Suzuki K Poulose wrote:
>>> We have common routines to modify hyp and stage2 page tables
>>> based on the 'kvm' parameter. For a smoother transition to
>>> using separate routines for each, duplicate the routines
>>> and modify the copy to work on hyp.
>>>
>>> Marks the forked routines with _hyp_ and gets rid of the
>>> kvm parameter which is no longer needed and is NULL for hyp.
>>> Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls
>>> from the hyp versions. Uses explicit host page table accessors
>>> instead of the kvm_* page table helpers.
>>>
>>> Suggested-by: Christoffer Dall <christoffer.dall@linaro.org>
>>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>

>>> +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
>>> +{
>>> +	pte_t *pte, *start_pte;
>>> +
>>> +	start_pte = pte = pte_offset_kernel(pmd, addr);
>>> +	do {
>>> +		if (!pte_none(*pte)) {
>>> +			pte_t old_pte = *pte;
>>> +
>>> +			kvm_set_pte(pte, __pte(0));
>>> +
>>> +			/* XXX: Do we need to invalidate the cache for device mappings ? */
>>
>> no, we will not be swapping out any pages mapped in Hyp mode so you can
>> get rid of both of the following two lines.

OK, will remove this hunk.

>>
>>> +			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
>>> +				kvm_flush_dcache_pte(old_pte);
>>> +
>>> +			put_page(virt_to_page(pte));
>>> +		}
>>> +	} while (pte++, addr += PAGE_SIZE, addr != end);
>>> +
>>> +	if (hyp_pte_table_empty(start_pte))
>>> +		clear_hyp_pmd_entry(pmd);
>>> +}
>>> +
>>> +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
>>> +{
>>> +	phys_addr_t next;
>>> +	pmd_t *pmd, *start_pmd;
>>> +
>>> +	start_pmd = pmd = pmd_offset(pud, addr);
>>> +	do {
>>> +		next = pmd_addr_end(addr, end);
>>> +		if (!pmd_none(*pmd)) {
>>> +			if (pmd_thp_or_huge(*pmd)) {
>>
>> do we ever actually map anything with section mappings in the Hyp
>> mappings?
>
> No, this is purely a page mapping so far. On my system, the HYP text is
> just over 4 pages big (4k pages), so the incentive is pretty low, unless
> we can demonstrate some big gains due to the reduced TLB impact.


>>> +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
>>> +{
>>> +	phys_addr_t next;
>>> +	pud_t *pud, *start_pud;
>>> +
>>> +	start_pud = pud = pud_offset(pgd, addr);
>>> +	do {
>>> +		next = pud_addr_end(addr, end);
>>> +		if (!pud_none(*pud)) {
>>> +			if (pud_huge(*pud)) {
>>
>> do we ever actually map anything with huge pud
>> mappings for the Hyp space?
>
> Same thing. Looks like there is some potential simplification here.

Right, we don't map anything with section mapping. I can clean these up.

>>> +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
>>> +{
>>> +	pgd_t *pgd;
>>> +	phys_addr_t addr = start, end = start + size;
>>> +	phys_addr_t next;
>>> +
>>> +	pgd = pgdp + pgd_index(addr);
>>> +	do {
>>> +		next = pgd_addr_end(addr, end);
>>> +		if (!pgd_none(*pgd))
>>> +			unmap_hyp_puds(pgd, addr, next);
>>> +	} while (pgd++, addr = next, addr != end);
>>
>> shouldn't we flush the EL2 (hyp) TLB here, strictly speaking?
>>
>> Or do we rely on all mappings ever created/torn down here to always have
>> the same VA/PA relationship?  Since we didn't flush the EL2 TLB in the
>> existing code, that indeed does seem to be the case.
>
> Actually, we never unmap anything from HYP.

Except for the kvm tearing down where we clean up all the hyp table.

> Once a structure (kvm, vcpu)is mapped there, it stays forever, whatever happens
> to the VM (that's because we'd otherwise have to refcount the number of objects in a page,
> and I'm lazy...).

Thats one of my TODO list if there is sufficient interest in getting that done.

Thanks
Suzuki
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Christoffer Dall April 8, 2016, 3:25 p.m. UTC | #6

On Fri, Apr 8, 2016 at 5:22 PM, Suzuki K Poulose <Suzuki.Poulose@arm.com> wrote:
> On 08/04/16 16:09, Marc Zyngier wrote:
>>
>> On 08/04/16 14:15, Christoffer Dall wrote:
>>>
>>> On Mon, Apr 04, 2016 at 05:26:12PM +0100, Suzuki K Poulose wrote:
>>>>
>>>> We have common routines to modify hyp and stage2 page tables
>>>> based on the 'kvm' parameter. For a smoother transition to
>>>> using separate routines for each, duplicate the routines
>>>> and modify the copy to work on hyp.
>>>>
>>>> Marks the forked routines with _hyp_ and gets rid of the
>>>> kvm parameter which is no longer needed and is NULL for hyp.
>>>> Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls
>>>> from the hyp versions. Uses explicit host page table accessors
>>>> instead of the kvm_* page table helpers.
>>>>
>>>> Suggested-by: Christoffer Dall <christoffer.dall@linaro.org>
>>>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>>>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>
>
>>>> +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t
>>>> end)
>>>> +{
>>>> +       pte_t *pte, *start_pte;
>>>> +
>>>> +       start_pte = pte = pte_offset_kernel(pmd, addr);
>>>> +       do {
>>>> +               if (!pte_none(*pte)) {
>>>> +                       pte_t old_pte = *pte;
>>>> +
>>>> +                       kvm_set_pte(pte, __pte(0));
>>>> +
>>>> +                       /* XXX: Do we need to invalidate the cache for
>>>> device mappings ? */
>>>
>>>
>>> no, we will not be swapping out any pages mapped in Hyp mode so you can
>>> get rid of both of the following two lines.
>
>
> OK, will remove this hunk.
>
>
>>>
>>>> +                       if (!kvm_is_device_pfn(pte_pfn(old_pte)))
>>>> +                               kvm_flush_dcache_pte(old_pte);
>>>> +
>>>> +                       put_page(virt_to_page(pte));
>>>> +               }
>>>> +       } while (pte++, addr += PAGE_SIZE, addr != end);
>>>> +
>>>> +       if (hyp_pte_table_empty(start_pte))
>>>> +               clear_hyp_pmd_entry(pmd);
>>>> +}
>>>> +
>>>> +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t
>>>> end)
>>>> +{
>>>> +       phys_addr_t next;
>>>> +       pmd_t *pmd, *start_pmd;
>>>> +
>>>> +       start_pmd = pmd = pmd_offset(pud, addr);
>>>> +       do {
>>>> +               next = pmd_addr_end(addr, end);
>>>> +               if (!pmd_none(*pmd)) {
>>>> +                       if (pmd_thp_or_huge(*pmd)) {
>>>
>>>
>>> do we ever actually map anything with section mappings in the Hyp
>>> mappings?
>>
>>
>> No, this is purely a page mapping so far. On my system, the HYP text is
>> just over 4 pages big (4k pages), so the incentive is pretty low, unless
>> we can demonstrate some big gains due to the reduced TLB impact.
>
>
>
>>>> +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t
>>>> end)
>>>> +{
>>>> +       phys_addr_t next;
>>>> +       pud_t *pud, *start_pud;
>>>> +
>>>> +       start_pud = pud = pud_offset(pgd, addr);
>>>> +       do {
>>>> +               next = pud_addr_end(addr, end);
>>>> +               if (!pud_none(*pud)) {
>>>> +                       if (pud_huge(*pud)) {
>>>
>>>
>>> do we ever actually map anything with huge pud
>>> mappings for the Hyp space?
>>
>>
>> Same thing. Looks like there is some potential simplification here.
>
>
> Right, we don't map anything with section mapping. I can clean these up.
>
>>>> +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
>>>> +{
>>>> +       pgd_t *pgd;
>>>> +       phys_addr_t addr = start, end = start + size;
>>>> +       phys_addr_t next;
>>>> +
>>>> +       pgd = pgdp + pgd_index(addr);
>>>> +       do {
>>>> +               next = pgd_addr_end(addr, end);
>>>> +               if (!pgd_none(*pgd))
>>>> +                       unmap_hyp_puds(pgd, addr, next);
>>>> +       } while (pgd++, addr = next, addr != end);
>>>
>>>
>>> shouldn't we flush the EL2 (hyp) TLB here, strictly speaking?
>>>
>>> Or do we rely on all mappings ever created/torn down here to always have
>>> the same VA/PA relationship?  Since we didn't flush the EL2 TLB in the
>>> existing code, that indeed does seem to be the case.
>>
>>
>> Actually, we never unmap anything from HYP.
>
>
> Except for the kvm tearing down where we clean up all the hyp table.
>
>> Once a structure (kvm, vcpu)is mapped there, it stays forever, whatever
>> happens
>> to the VM (that's because we'd otherwise have to refcount the number of
>> objects in a page,
>> and I'm lazy...).
>
>
> Thats one of my TODO list if there is sufficient interest in getting that
> done.
>
I think you can ignore it for now... I'm sure we have bigger fish to fry.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[12/17] kvm-arm: Add explicit hyp page table modifiers

Commit Message

Comments

Patch