diff mbox series

[RFC] kvm: arm64: Try stage2 block mapping for host device MMIO

Message ID 20210122083650.21812-1-zhukeqian1@huawei.com (mailing list archive)
State New, archived
Headers show
Series [RFC] kvm: arm64: Try stage2 block mapping for host device MMIO | expand

Commit Message

zhukeqian Jan. 22, 2021, 8:36 a.m. UTC
The MMIO region of a device maybe huge (GB level), try to use block
mapping in stage2 to speedup both map and unmap.

Especially for unmap, it performs TLBI right after each invalidation
of PTE. If all mapping is of PAGE_SIZE, it takes much time to handle
GB level range.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
---
 arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++++++++++
 arch/arm64/kvm/mmu.c                 | 12 ++++++++----
 3 files changed, 34 insertions(+), 4 deletions(-)

Comments

Marc Zyngier Jan. 22, 2021, 9:45 a.m. UTC | #1
On 2021-01-22 08:36, Keqian Zhu wrote:
> The MMIO region of a device maybe huge (GB level), try to use block
> mapping in stage2 to speedup both map and unmap.
> 
> Especially for unmap, it performs TLBI right after each invalidation
> of PTE. If all mapping is of PAGE_SIZE, it takes much time to handle
> GB level range.

This is only on VM teardown, right? Or do you unmap the device more 
ofet?
Can you please quantify the speedup and the conditions this occurs in?

I have the feeling that we are just circling around another problem,
which is that we could rely on a VM-wide TLBI when tearing down the
guest. I worked on something like that[1] a long while ago, and parked
it for some reason. Maybe it is worth reviving.

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/elide-cmo-tlbi

> 
> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++
>  arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++++++++++
>  arch/arm64/kvm/mmu.c                 | 12 ++++++++----
>  3 files changed, 34 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h
> b/arch/arm64/include/asm/kvm_pgtable.h
> index 52ab38db04c7..2266ac45f10c 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -82,6 +82,17 @@ struct kvm_pgtable_walker {
>  	const enum kvm_pgtable_walk_flags	flags;
>  };
> 
> +/**
> + * kvm_supported_pgsize() - Get the max supported page size of a 
> mapping.
> + * @pgt:	Initialised page-table structure.
> + * @addr:	Virtual address at which to place the mapping.
> + * @end:	End virtual address of the mapping.
> + * @phys:	Physical address of the memory to map.
> + *
> + * The smallest return value is PAGE_SIZE.
> + */
> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, 
> u64 phys);
> +
>  /**
>   * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 
> page-table.
>   * @pgt:	Uninitialised page-table structure to initialise.
> diff --git a/arch/arm64/kvm/hyp/pgtable.c 
> b/arch/arm64/kvm/hyp/pgtable.c
> index bdf8e55ed308..ab11609b9b13 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -81,6 +81,21 @@ static bool kvm_block_mapping_supported(u64 addr,
> u64 end, u64 phys, u32 level)
>  	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
>  }
> 
> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, 
> u64 phys)
> +{
> +	u32 lvl;
> +	u64 pgsize = PAGE_SIZE;
> +
> +	for (lvl = pgt->start_level; lvl < KVM_PGTABLE_MAX_LEVELS; lvl++) {
> +		if (kvm_block_mapping_supported(addr, end, phys, lvl)) {
> +			pgsize = kvm_granule_size(lvl);
> +			break;
> +		}
> +	}
> +
> +	return pgsize;
> +}
> +
>  static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 
> level)
>  {
>  	u64 shift = kvm_granule_shift(level);
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 7d2257cc5438..80b403fc8e64 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -499,7 +499,8 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  			  phys_addr_t pa, unsigned long size, bool writable)
>  {
> -	phys_addr_t addr;
> +	phys_addr_t addr, end;
> +	unsigned long pgsize;
>  	int ret = 0;
>  	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
>  	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
> @@ -509,21 +510,24 @@ int kvm_phys_addr_ioremap(struct kvm *kvm,
> phys_addr_t guest_ipa,
> 
>  	size += offset_in_page(guest_ipa);
>  	guest_ipa &= PAGE_MASK;
> +	end = guest_ipa + size;
> 
> -	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
> +	for (addr = guest_ipa; addr < end; addr += pgsize) {
>  		ret = kvm_mmu_topup_memory_cache(&cache,
>  						 kvm_mmu_cache_min_pages(kvm));
>  		if (ret)
>  			break;
> 
> +		pgsize = kvm_supported_pgsize(pgt, addr, end, pa);
> +
>  		spin_lock(&kvm->mmu_lock);
> -		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
> +		ret = kvm_pgtable_stage2_map(pgt, addr, pgsize, pa, prot,
>  					     &cache);
>  		spin_unlock(&kvm->mmu_lock);
>  		if (ret)
>  			break;
> 
> -		pa += PAGE_SIZE;
> +		pa += pgsize;
>  	}
> 
>  	kvm_mmu_free_memory_cache(&cache);

This otherwise looks neat enough.

Thanks,

         M.
zhukeqian Jan. 25, 2021, 11:25 a.m. UTC | #2
Hi Marc,

On 2021/1/22 17:45, Marc Zyngier wrote:
> On 2021-01-22 08:36, Keqian Zhu wrote:
>> The MMIO region of a device maybe huge (GB level), try to use block
>> mapping in stage2 to speedup both map and unmap.
>>
>> Especially for unmap, it performs TLBI right after each invalidation
>> of PTE. If all mapping is of PAGE_SIZE, it takes much time to handle
>> GB level range.
> 
> This is only on VM teardown, right? Or do you unmap the device more ofet?
> Can you please quantify the speedup and the conditions this occurs in?

Yes, and there are some other paths (includes what your patch series handles) will do the unmap action:

1、guest reboot without S2FWB: stage2_unmap_vm()which only unmaps guest regular RAM.
2、userspace deletes memslot: kvm_arch_flush_shadow_memslot().
3、rollback of device MMIO mapping: kvm_arch_prepare_memory_region().
4、rollback of dirty log tracking: If we enable hugepage for guest RAM, after dirty log is stopped,
                                   the newly created block mappings will unmap all page mappings.
5、mmu notifier: kvm_unmap_hva_range(). AFAICS, we will use this path when VM teardown or guest resets pass-through devices.
                                        The bugfix[1] gives the reason for unmapping MMIO region when guest resets pass-through devices.

unmap related to MMIO region, as this patch solves:
point 1 is not applied.
point 2 occurs when userspace unplug pass-through devices.
point 3 can occurs, but rarely.
point 4 is not applied.
point 5 occurs when VM teardown or guest resets pass-through devices.

And I had a look at your patch series, it can solve:
For VM teardown, elide CMO and perform VMALL instead of individually (But current kernel do not go through this path when VM teardown).
For rollback of dirty log tracking, elide CMO.
For kvm_unmap_hva_range, if event is MMU_NOTIFY_UNMAP. elide CMO.

(But I doubt the CMOs in unmap. As we perform CMOs in user_mem_abort when install new stage2 mapping for VM,
 maybe the CMO in unmap is unnecessary under all conditions :-) ?)

So it shows that we are solving different parts of unmap, so they are not conflicting. At least this patch can
still speedup map of device MMIO region, and speedup unmap of device MMIO region even if we do not need to perform
CMO and TLBI ;-).

speedup: unmap 8GB MMIO on FPGA.

           before            after opt
cost    30+ minutes            949ms

Thanks,
Keqian

> 
> I have the feeling that we are just circling around another problem,
> which is that we could rely on a VM-wide TLBI when tearing down the
> guest. I worked on something like that[1] a long while ago, and parked
> it for some reason. Maybe it is worth reviving.
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/elide-cmo-tlbi
> 
>>
>> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
>> ---
>>  arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++
>>  arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++++++++++
>>  arch/arm64/kvm/mmu.c                 | 12 ++++++++----
>>  3 files changed, 34 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_pgtable.h
>> b/arch/arm64/include/asm/kvm_pgtable.h
>> index 52ab38db04c7..2266ac45f10c 100644
>> --- a/arch/arm64/include/asm/kvm_pgtable.h
>> +++ b/arch/arm64/include/asm/kvm_pgtable.h
>> @@ -82,6 +82,17 @@ struct kvm_pgtable_walker {
>>      const enum kvm_pgtable_walk_flags    flags;
>>  };
>>
>> +/**
>> + * kvm_supported_pgsize() - Get the max supported page size of a mapping.
>> + * @pgt:    Initialised page-table structure.
>> + * @addr:    Virtual address at which to place the mapping.
>> + * @end:    End virtual address of the mapping.
>> + * @phys:    Physical address of the memory to map.
>> + *
>> + * The smallest return value is PAGE_SIZE.
>> + */
>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys);
>> +
>>  /**
>>   * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
>>   * @pgt:    Uninitialised page-table structure to initialise.
>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>> index bdf8e55ed308..ab11609b9b13 100644
>> --- a/arch/arm64/kvm/hyp/pgtable.c
>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>> @@ -81,6 +81,21 @@ static bool kvm_block_mapping_supported(u64 addr,
>> u64 end, u64 phys, u32 level)
>>      return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
>>  }
>>
>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys)
>> +{
>> +    u32 lvl;
>> +    u64 pgsize = PAGE_SIZE;
>> +
>> +    for (lvl = pgt->start_level; lvl < KVM_PGTABLE_MAX_LEVELS; lvl++) {
>> +        if (kvm_block_mapping_supported(addr, end, phys, lvl)) {
>> +            pgsize = kvm_granule_size(lvl);
>> +            break;
>> +        }
>> +    }
>> +
>> +    return pgsize;
>> +}
>> +
>>  static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
>>  {
>>      u64 shift = kvm_granule_shift(level);
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index 7d2257cc5438..80b403fc8e64 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -499,7 +499,8 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>                phys_addr_t pa, unsigned long size, bool writable)
>>  {
>> -    phys_addr_t addr;
>> +    phys_addr_t addr, end;
>> +    unsigned long pgsize;
>>      int ret = 0;
>>      struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
>>      struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
>> @@ -509,21 +510,24 @@ int kvm_phys_addr_ioremap(struct kvm *kvm,
>> phys_addr_t guest_ipa,
>>
>>      size += offset_in_page(guest_ipa);
>>      guest_ipa &= PAGE_MASK;
>> +    end = guest_ipa + size;
>>
>> -    for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
>> +    for (addr = guest_ipa; addr < end; addr += pgsize) {
>>          ret = kvm_mmu_topup_memory_cache(&cache,
>>                           kvm_mmu_cache_min_pages(kvm));
>>          if (ret)
>>              break;
>>
>> +        pgsize = kvm_supported_pgsize(pgt, addr, end, pa);
>> +
>>          spin_lock(&kvm->mmu_lock);
>> -        ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
>> +        ret = kvm_pgtable_stage2_map(pgt, addr, pgsize, pa, prot,
>>                           &cache);
>>          spin_unlock(&kvm->mmu_lock);
>>          if (ret)
>>              break;
>>
>> -        pa += PAGE_SIZE;
>> +        pa += pgsize;
>>      }
>>
>>      kvm_mmu_free_memory_cache(&cache);
> 
> This otherwise looks neat enough.
> 
> Thanks,
> 
>         M.
zhukeqian Jan. 25, 2021, 11:31 a.m. UTC | #3
I forget to give the link of the bugfix I mentioned below :-).

[1] https://lkml.org/lkml/2020/5/1/1294

On 2021/1/25 19:25, Keqian Zhu wrote:
> Hi Marc,
> 
> On 2021/1/22 17:45, Marc Zyngier wrote:
>> On 2021-01-22 08:36, Keqian Zhu wrote:
>>> The MMIO region of a device maybe huge (GB level), try to use block
>>> mapping in stage2 to speedup both map and unmap.
>>>
>>> Especially for unmap, it performs TLBI right after each invalidation
>>> of PTE. If all mapping is of PAGE_SIZE, it takes much time to handle
>>> GB level range.
>>
>> This is only on VM teardown, right? Or do you unmap the device more ofet?
>> Can you please quantify the speedup and the conditions this occurs in?
> 
> Yes, and there are some other paths (includes what your patch series handles) will do the unmap action:
> 
> 1、guest reboot without S2FWB: stage2_unmap_vm()which only unmaps guest regular RAM.
> 2、userspace deletes memslot: kvm_arch_flush_shadow_memslot().
> 3、rollback of device MMIO mapping: kvm_arch_prepare_memory_region().
> 4、rollback of dirty log tracking: If we enable hugepage for guest RAM, after dirty log is stopped,
>                                    the newly created block mappings will unmap all page mappings.
> 5、mmu notifier: kvm_unmap_hva_range(). AFAICS, we will use this path when VM teardown or guest resets pass-through devices.
>                                         The bugfix[1] gives the reason for unmapping MMIO region when guest resets pass-through devices.
> 
> unmap related to MMIO region, as this patch solves:
> point 1 is not applied.
> point 2 occurs when userspace unplug pass-through devices.
> point 3 can occurs, but rarely.
> point 4 is not applied.
> point 5 occurs when VM teardown or guest resets pass-through devices.
> 
> And I had a look at your patch series, it can solve:
> For VM teardown, elide CMO and perform VMALL instead of individually (But current kernel do not go through this path when VM teardown).
> For rollback of dirty log tracking, elide CMO.
> For kvm_unmap_hva_range, if event is MMU_NOTIFY_UNMAP. elide CMO.
> 
> (But I doubt the CMOs in unmap. As we perform CMOs in user_mem_abort when install new stage2 mapping for VM,
>  maybe the CMO in unmap is unnecessary under all conditions :-) ?)
> 
> So it shows that we are solving different parts of unmap, so they are not conflicting. At least this patch can
> still speedup map of device MMIO region, and speedup unmap of device MMIO region even if we do not need to perform
> CMO and TLBI ;-).
> 
> speedup: unmap 8GB MMIO on FPGA.
> 
>            before            after opt
> cost    30+ minutes            949ms
> 
> Thanks,
> Keqian
> 
>>
>> I have the feeling that we are just circling around another problem,
>> which is that we could rely on a VM-wide TLBI when tearing down the
>> guest. I worked on something like that[1] a long while ago, and parked
>> it for some reason. Maybe it is worth reviving.
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/elide-cmo-tlbi
>>
>>>
>>> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
>>> ---
>>>  arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++
>>>  arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++++++++++
>>>  arch/arm64/kvm/mmu.c                 | 12 ++++++++----
>>>  3 files changed, 34 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_pgtable.h
>>> b/arch/arm64/include/asm/kvm_pgtable.h
>>> index 52ab38db04c7..2266ac45f10c 100644
>>> --- a/arch/arm64/include/asm/kvm_pgtable.h
>>> +++ b/arch/arm64/include/asm/kvm_pgtable.h
>>> @@ -82,6 +82,17 @@ struct kvm_pgtable_walker {
>>>      const enum kvm_pgtable_walk_flags    flags;
>>>  };
>>>
>>> +/**
>>> + * kvm_supported_pgsize() - Get the max supported page size of a mapping.
>>> + * @pgt:    Initialised page-table structure.
>>> + * @addr:    Virtual address at which to place the mapping.
>>> + * @end:    End virtual address of the mapping.
>>> + * @phys:    Physical address of the memory to map.
>>> + *
>>> + * The smallest return value is PAGE_SIZE.
>>> + */
>>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys);
>>> +
>>>  /**
>>>   * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
>>>   * @pgt:    Uninitialised page-table structure to initialise.
>>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>>> index bdf8e55ed308..ab11609b9b13 100644
>>> --- a/arch/arm64/kvm/hyp/pgtable.c
>>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>>> @@ -81,6 +81,21 @@ static bool kvm_block_mapping_supported(u64 addr,
>>> u64 end, u64 phys, u32 level)
>>>      return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
>>>  }
>>>
>>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys)
>>> +{
>>> +    u32 lvl;
>>> +    u64 pgsize = PAGE_SIZE;
>>> +
>>> +    for (lvl = pgt->start_level; lvl < KVM_PGTABLE_MAX_LEVELS; lvl++) {
>>> +        if (kvm_block_mapping_supported(addr, end, phys, lvl)) {
>>> +            pgsize = kvm_granule_size(lvl);
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    return pgsize;
>>> +}
>>> +
>>>  static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
>>>  {
>>>      u64 shift = kvm_granule_shift(level);
>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>> index 7d2257cc5438..80b403fc8e64 100644
>>> --- a/arch/arm64/kvm/mmu.c
>>> +++ b/arch/arm64/kvm/mmu.c
>>> @@ -499,7 +499,8 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>>  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>>                phys_addr_t pa, unsigned long size, bool writable)
>>>  {
>>> -    phys_addr_t addr;
>>> +    phys_addr_t addr, end;
>>> +    unsigned long pgsize;
>>>      int ret = 0;
>>>      struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
>>>      struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
>>> @@ -509,21 +510,24 @@ int kvm_phys_addr_ioremap(struct kvm *kvm,
>>> phys_addr_t guest_ipa,
>>>
>>>      size += offset_in_page(guest_ipa);
>>>      guest_ipa &= PAGE_MASK;
>>> +    end = guest_ipa + size;
>>>
>>> -    for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
>>> +    for (addr = guest_ipa; addr < end; addr += pgsize) {
>>>          ret = kvm_mmu_topup_memory_cache(&cache,
>>>                           kvm_mmu_cache_min_pages(kvm));
>>>          if (ret)
>>>              break;
>>>
>>> +        pgsize = kvm_supported_pgsize(pgt, addr, end, pa);
>>> +
>>>          spin_lock(&kvm->mmu_lock);
>>> -        ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
>>> +        ret = kvm_pgtable_stage2_map(pgt, addr, pgsize, pa, prot,
>>>                           &cache);
>>>          spin_unlock(&kvm->mmu_lock);
>>>          if (ret)
>>>              break;
>>>
>>> -        pa += PAGE_SIZE;
>>> +        pa += pgsize;
>>>      }
>>>
>>>      kvm_mmu_free_memory_cache(&cache);
>>
>> This otherwise looks neat enough.
>>
>> Thanks,
>>
>>         M.
zhukeqian March 2, 2021, 12:19 p.m. UTC | #4
Hi Marc,

Do you have further suggestion on this? Block mapping do bring obvious benefit.

Thanks,
Keqian

On 2021/1/25 19:25, Keqian Zhu wrote:
> Hi Marc,
> 
> On 2021/1/22 17:45, Marc Zyngier wrote:
>> On 2021-01-22 08:36, Keqian Zhu wrote:
>>> The MMIO region of a device maybe huge (GB level), try to use block
>>> mapping in stage2 to speedup both map and unmap.
>>>
>>> Especially for unmap, it performs TLBI right after each invalidation
>>> of PTE. If all mapping is of PAGE_SIZE, it takes much time to handle
>>> GB level range.
>>
>> This is only on VM teardown, right? Or do you unmap the device more ofet?
>> Can you please quantify the speedup and the conditions this occurs in?
> 
> Yes, and there are some other paths (includes what your patch series handles) will do the unmap action:
> 
> 1、guest reboot without S2FWB: stage2_unmap_vm()which only unmaps guest regular RAM.
> 2、userspace deletes memslot: kvm_arch_flush_shadow_memslot().
> 3、rollback of device MMIO mapping: kvm_arch_prepare_memory_region().
> 4、rollback of dirty log tracking: If we enable hugepage for guest RAM, after dirty log is stopped,
>                                    the newly created block mappings will unmap all page mappings.
> 5、mmu notifier: kvm_unmap_hva_range(). AFAICS, we will use this path when VM teardown or guest resets pass-through devices.
>                                         The bugfix[1] gives the reason for unmapping MMIO region when guest resets pass-through devices.
> 
> unmap related to MMIO region, as this patch solves:
> point 1 is not applied.
> point 2 occurs when userspace unplug pass-through devices.
> point 3 can occurs, but rarely.
> point 4 is not applied.
> point 5 occurs when VM teardown or guest resets pass-through devices.
> 
> And I had a look at your patch series, it can solve:
> For VM teardown, elide CMO and perform VMALL instead of individually (But current kernel do not go through this path when VM teardown).
> For rollback of dirty log tracking, elide CMO.
> For kvm_unmap_hva_range, if event is MMU_NOTIFY_UNMAP. elide CMO.
> 
> (But I doubt the CMOs in unmap. As we perform CMOs in user_mem_abort when install new stage2 mapping for VM,
>  maybe the CMO in unmap is unnecessary under all conditions :-) ?)
> 
> So it shows that we are solving different parts of unmap, so they are not conflicting. At least this patch can
> still speedup map of device MMIO region, and speedup unmap of device MMIO region even if we do not need to perform
> CMO and TLBI ;-).
> 
> speedup: unmap 8GB MMIO on FPGA.
> 
>            before            after opt
> cost    30+ minutes            949ms
> 
> Thanks,
> Keqian
> 
>>
>> I have the feeling that we are just circling around another problem,
>> which is that we could rely on a VM-wide TLBI when tearing down the
>> guest. I worked on something like that[1] a long while ago, and parked
>> it for some reason. Maybe it is worth reviving.
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/elide-cmo-tlbi
>>
>>>
>>> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
>>> ---
>>>  arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++
>>>  arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++++++++++
>>>  arch/arm64/kvm/mmu.c                 | 12 ++++++++----
>>>  3 files changed, 34 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_pgtable.h
>>> b/arch/arm64/include/asm/kvm_pgtable.h
>>> index 52ab38db04c7..2266ac45f10c 100644
>>> --- a/arch/arm64/include/asm/kvm_pgtable.h
>>> +++ b/arch/arm64/include/asm/kvm_pgtable.h
>>> @@ -82,6 +82,17 @@ struct kvm_pgtable_walker {
>>>      const enum kvm_pgtable_walk_flags    flags;
>>>  };
>>>
>>> +/**
>>> + * kvm_supported_pgsize() - Get the max supported page size of a mapping.
>>> + * @pgt:    Initialised page-table structure.
>>> + * @addr:    Virtual address at which to place the mapping.
>>> + * @end:    End virtual address of the mapping.
>>> + * @phys:    Physical address of the memory to map.
>>> + *
>>> + * The smallest return value is PAGE_SIZE.
>>> + */
>>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys);
>>> +
>>>  /**
>>>   * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
>>>   * @pgt:    Uninitialised page-table structure to initialise.
>>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>>> index bdf8e55ed308..ab11609b9b13 100644
>>> --- a/arch/arm64/kvm/hyp/pgtable.c
>>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>>> @@ -81,6 +81,21 @@ static bool kvm_block_mapping_supported(u64 addr,
>>> u64 end, u64 phys, u32 level)
>>>      return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
>>>  }
>>>
>>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys)
>>> +{
>>> +    u32 lvl;
>>> +    u64 pgsize = PAGE_SIZE;
>>> +
>>> +    for (lvl = pgt->start_level; lvl < KVM_PGTABLE_MAX_LEVELS; lvl++) {
>>> +        if (kvm_block_mapping_supported(addr, end, phys, lvl)) {
>>> +            pgsize = kvm_granule_size(lvl);
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    return pgsize;
>>> +}
>>> +
>>>  static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
>>>  {
>>>      u64 shift = kvm_granule_shift(level);
>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>> index 7d2257cc5438..80b403fc8e64 100644
>>> --- a/arch/arm64/kvm/mmu.c
>>> +++ b/arch/arm64/kvm/mmu.c
>>> @@ -499,7 +499,8 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>>  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>>                phys_addr_t pa, unsigned long size, bool writable)
>>>  {
>>> -    phys_addr_t addr;
>>> +    phys_addr_t addr, end;
>>> +    unsigned long pgsize;
>>>      int ret = 0;
>>>      struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
>>>      struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
>>> @@ -509,21 +510,24 @@ int kvm_phys_addr_ioremap(struct kvm *kvm,
>>> phys_addr_t guest_ipa,
>>>
>>>      size += offset_in_page(guest_ipa);
>>>      guest_ipa &= PAGE_MASK;
>>> +    end = guest_ipa + size;
>>>
>>> -    for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
>>> +    for (addr = guest_ipa; addr < end; addr += pgsize) {
>>>          ret = kvm_mmu_topup_memory_cache(&cache,
>>>                           kvm_mmu_cache_min_pages(kvm));
>>>          if (ret)
>>>              break;
>>>
>>> +        pgsize = kvm_supported_pgsize(pgt, addr, end, pa);
>>> +
>>>          spin_lock(&kvm->mmu_lock);
>>> -        ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
>>> +        ret = kvm_pgtable_stage2_map(pgt, addr, pgsize, pa, prot,
>>>                           &cache);
>>>          spin_unlock(&kvm->mmu_lock);
>>>          if (ret)
>>>              break;
>>>
>>> -        pa += PAGE_SIZE;
>>> +        pa += pgsize;
>>>      }
>>>
>>>      kvm_mmu_free_memory_cache(&cache);
>>
>> This otherwise looks neat enough.
>>
>> Thanks,
>>
>>         M.
> .
>
zhukeqian March 11, 2021, 2:28 p.m. UTC | #5
Hi Marc,

On 2021/3/11 16:43, Marc Zyngier wrote:
> Digging this patch back from my Inbox...
Yeah, thanks ;-)

> 
> On Fri, 22 Jan 2021 08:36:50 +0000,
> Keqian Zhu <zhukeqian1@huawei.com> wrote:
>>
>> The MMIO region of a device maybe huge (GB level), try to use block
>> mapping in stage2 to speedup both map and unmap.
>>
>> Especially for unmap, it performs TLBI right after each invalidation
>> of PTE. If all mapping is of PAGE_SIZE, it takes much time to handle
>> GB level range.
>>
>> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
>> ---
>>  arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++
>>  arch/arm64/kvm/hyp/pgtable.c         | 15 +++++++++++++++
>>  arch/arm64/kvm/mmu.c                 | 12 ++++++++----
>>  3 files changed, 34 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
>> index 52ab38db04c7..2266ac45f10c 100644
>> --- a/arch/arm64/include/asm/kvm_pgtable.h
>> +++ b/arch/arm64/include/asm/kvm_pgtable.h
>> @@ -82,6 +82,17 @@ struct kvm_pgtable_walker {
>>  	const enum kvm_pgtable_walk_flags	flags;
>>  };
>>  
>> +/**
>> + * kvm_supported_pgsize() - Get the max supported page size of a mapping.
>> + * @pgt:	Initialised page-table structure.
>> + * @addr:	Virtual address at which to place the mapping.
>> + * @end:	End virtual address of the mapping.
>> + * @phys:	Physical address of the memory to map.
>> + *
>> + * The smallest return value is PAGE_SIZE.
>> + */
>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys);
>> +
>>  /**
>>   * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
>>   * @pgt:	Uninitialised page-table structure to initialise.
>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>> index bdf8e55ed308..ab11609b9b13 100644
>> --- a/arch/arm64/kvm/hyp/pgtable.c
>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>> @@ -81,6 +81,21 @@ static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
>>  	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
>>  }
>>  
>> +u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys)
>> +{
>> +	u32 lvl;
>> +	u64 pgsize = PAGE_SIZE;
>> +
>> +	for (lvl = pgt->start_level; lvl < KVM_PGTABLE_MAX_LEVELS; lvl++) {
>> +		if (kvm_block_mapping_supported(addr, end, phys, lvl)) {
>> +			pgsize = kvm_granule_size(lvl);
>> +			break;
>> +		}
>> +	}
>> +
>> +	return pgsize;
>> +}
>> +
>>  static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
>>  {
>>  	u64 shift = kvm_granule_shift(level);
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index 7d2257cc5438..80b403fc8e64 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -499,7 +499,8 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>  			  phys_addr_t pa, unsigned long size, bool writable)
>>  {
>> -	phys_addr_t addr;
>> +	phys_addr_t addr, end;
>> +	unsigned long pgsize;
>>  	int ret = 0;
>>  	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
>>  	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
>> @@ -509,21 +510,24 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>  
>>  	size += offset_in_page(guest_ipa);
>>  	guest_ipa &= PAGE_MASK;
>> +	end = guest_ipa + size;
>>  
>> -	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
>> +	for (addr = guest_ipa; addr < end; addr += pgsize) {
>>  		ret = kvm_mmu_topup_memory_cache(&cache,
>>  						 kvm_mmu_cache_min_pages(kvm));
>>  		if (ret)
>>  			break;
>>  
>> +		pgsize = kvm_supported_pgsize(pgt, addr, end, pa);
>> +
>>  		spin_lock(&kvm->mmu_lock);
>> -		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
>> +		ret = kvm_pgtable_stage2_map(pgt, addr, pgsize, pa, prot,
>>  					     &cache);
>>  		spin_unlock(&kvm->mmu_lock);
>>  		if (ret)
>>  			break;
>>  
>> -		pa += PAGE_SIZE;
>> +		pa += pgsize;
>>  	}
>>  
>>  	kvm_mmu_free_memory_cache(&cache);
> 
> There is one issue with this patch, which is that it only does half
> the job. A VM_PFNMAP VMA can definitely be faulted in dynamically, and
> in that case we force this to be a page mapping. This conflicts with
> what you are doing here.
Oh yes, these two paths should keep a same mapping logic.

I try to search the "force_pte" and find out some discussion [1] between you and Christoffer.
And I failed to get a reason about forcing pte mapping for device MMIO region (expect that
we want to keep a same logic with the eager mapping path). So if you don't object to it, I
will try to implement block mapping for device MMIO in user_mem_abort().

> 
> There is also the fact that if we can map things on demand, why are we
> still mapping these MMIO regions ahead of time?
Indeed. Though this provides good *startup* performance for guest accessing MMIO, it's hard
to keep the two paths in sync. We can keep this minor optimization or delete it to avoid hard
maintenance, which one do you prefer?

BTW, could you please have a look at my another patch series[2] about HW/SW combined dirty log? ;)

Thanks,
Keqian

[1] https://patchwork.kernel.org/project/linux-arm-kernel/patch/20191211165651.7889-2-maz@kernel.org/
[2] https://lore.kernel.org/linux-arm-kernel/20210126124444.27136-1-zhukeqian1@huawei.com/


> 
> Thanks,
> 
> 	M.
>
zhukeqian March 12, 2021, 9:29 a.m. UTC | #6
Hi Marc,

On 2021/3/12 16:52, Marc Zyngier wrote:
> On Thu, 11 Mar 2021 14:28:17 +0000,
> Keqian Zhu <zhukeqian1@huawei.com> wrote:
>>
>> Hi Marc,
>>
>> On 2021/3/11 16:43, Marc Zyngier wrote:
>>> Digging this patch back from my Inbox...
>> Yeah, thanks ;-)
>>
>>>
>>> On Fri, 22 Jan 2021 08:36:50 +0000,
>>> Keqian Zhu <zhukeqian1@huawei.com> wrote:
>>>>
>>>> The MMIO region of a device maybe huge (GB level), try to use block
>>>> mapping in stage2 to speedup both map and unmap.
[...]

>>>>  			break;
>>>>  
>>>> -		pa += PAGE_SIZE;
>>>> +		pa += pgsize;
>>>>  	}
>>>>  
>>>>  	kvm_mmu_free_memory_cache(&cache);
>>>
>>> There is one issue with this patch, which is that it only does half
>>> the job. A VM_PFNMAP VMA can definitely be faulted in dynamically, and
>>> in that case we force this to be a page mapping. This conflicts with
>>> what you are doing here.
>> Oh yes, these two paths should keep a same mapping logic.
>>
>> I try to search the "force_pte" and find out some discussion [1]
>> between you and Christoffer.  And I failed to get a reason about
>> forcing pte mapping for device MMIO region (expect that we want to
>> keep a same logic with the eager mapping path). So if you don't
>> object to it, I will try to implement block mapping for device MMIO
>> in user_mem_abort().
>>
>>>
>>> There is also the fact that if we can map things on demand, why are we
>>> still mapping these MMIO regions ahead of time?
>>
>> Indeed. Though this provides good *startup* performance for guest
>> accessing MMIO, it's hard to keep the two paths in sync. We can keep
>> this minor optimization or delete it to avoid hard maintenance,
>> which one do you prefer?
> 
> I think we should be able to get rid of the startup path. If we can do
> it for memory, I see no reason not to do it for MMIO.
OK, I will do.

> 
>> BTW, could you please have a look at my another patch series[2]
>> about HW/SW combined dirty log? ;)
> 
> I will eventually, but while I really appreciate your contributions in
> terms of features and bug fixes, I would really *love* it if you were
> a bit more active on the list when it comes to reviewing other
> people's code.
> 
> There is no shortage of patches that really need reviewing, and just
> pointing me in the direction of your favourite series doesn't really
> help. I have something like 200+ patches that need careful reviewing
> in my inbox, and they all deserve the same level of attention.
> 
> To make it short, help me to help you!
My apologies, and I can't agree more.

I have noticed this, and have reviewed several patches of IOMMU community.
For that some patches are with much background knowledge, so it's hard to
review. I will dig into them in the future.

Thanks for your valuable advice. :)

Thanks,
Keqian


> 
> Thanks,
> 
> 	M.
>
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 52ab38db04c7..2266ac45f10c 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -82,6 +82,17 @@  struct kvm_pgtable_walker {
 	const enum kvm_pgtable_walk_flags	flags;
 };
 
+/**
+ * kvm_supported_pgsize() - Get the max supported page size of a mapping.
+ * @pgt:	Initialised page-table structure.
+ * @addr:	Virtual address at which to place the mapping.
+ * @end:	End virtual address of the mapping.
+ * @phys:	Physical address of the memory to map.
+ *
+ * The smallest return value is PAGE_SIZE.
+ */
+u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys);
+
 /**
  * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
  * @pgt:	Uninitialised page-table structure to initialise.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index bdf8e55ed308..ab11609b9b13 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -81,6 +81,21 @@  static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
 	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
 }
 
+u64 kvm_supported_pgsize(struct kvm_pgtable *pgt, u64 addr, u64 end, u64 phys)
+{
+	u32 lvl;
+	u64 pgsize = PAGE_SIZE;
+
+	for (lvl = pgt->start_level; lvl < KVM_PGTABLE_MAX_LEVELS; lvl++) {
+		if (kvm_block_mapping_supported(addr, end, phys, lvl)) {
+			pgsize = kvm_granule_size(lvl);
+			break;
+		}
+	}
+
+	return pgsize;
+}
+
 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
 {
 	u64 shift = kvm_granule_shift(level);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7d2257cc5438..80b403fc8e64 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -499,7 +499,8 @@  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 			  phys_addr_t pa, unsigned long size, bool writable)
 {
-	phys_addr_t addr;
+	phys_addr_t addr, end;
+	unsigned long pgsize;
 	int ret = 0;
 	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
 	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
@@ -509,21 +510,24 @@  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
 	size += offset_in_page(guest_ipa);
 	guest_ipa &= PAGE_MASK;
+	end = guest_ipa + size;
 
-	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
+	for (addr = guest_ipa; addr < end; addr += pgsize) {
 		ret = kvm_mmu_topup_memory_cache(&cache,
 						 kvm_mmu_cache_min_pages(kvm));
 		if (ret)
 			break;
 
+		pgsize = kvm_supported_pgsize(pgt, addr, end, pa);
+
 		spin_lock(&kvm->mmu_lock);
-		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
+		ret = kvm_pgtable_stage2_map(pgt, addr, pgsize, pa, prot,
 					     &cache);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			break;
 
-		pa += PAGE_SIZE;
+		pa += pgsize;
 	}
 
 	kvm_mmu_free_memory_cache(&cache);