
[v4,2/2] kvm/arm64: Try stage2 block mapping for host device MMIO

Message ID 20210415140328.24200-3-zhukeqian1@huawei.com (mailing list archive)
State New, archived
Series kvm/arm64: Try stage2 block mapping for host device MMIO

Commit Message

zhukeqian April 15, 2021, 2:03 p.m. UTC
The MMIO region of a device may be huge (GB level), so try to use
block mapping in stage2 to speed up both map and unmap.

Compared to normal memory mapping, we should consider two more
points when trying block mapping for an MMIO region:

1. For normal memory mapping, the PA (host physical address) and the
HVA have the same alignment within PUD_SIZE or PMD_SIZE when we use
the HVA to request a hugepage, so we don't need to consider PA
alignment when verifying block mapping. But for device memory
mapping, the PA and HVA may have different alignment.

2. For normal memory mapping, we are sure the hugepage size properly
fits into the vma, so we don't check whether the mapping size exceeds
the boundary of the vma. But for device memory mapping, we should pay
attention to this.

This adds get_vma_page_shift() to get the page shift for both normal
memory and device MMIO regions, and checks these two points when
selecting the block mapping size for an MMIO region.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
---
 arch/arm64/kvm/mmu.c | 61 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 10 deletions(-)
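
To make point 1 concrete, the small stand-alone program below (not part of
the patch; it assumes 4KiB pages, so PMD_SIZE is 2MiB, and uses made-up
addresses) mirrors the congruence-plus-boundary test that
get_vma_page_shift() applies to a VM_PFNMAP vma:

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE         (1UL << 21)    /* 2MiB block, assuming 4KiB pages */
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))

/*
 * Mirror of the patch's PMD-level test: HVA and PA must be congruent
 * modulo the block size, and the block must stay inside the vma.
 */
static int pmd_block_ok(uint64_t hva, uint64_t pa,
                        uint64_t vm_start, uint64_t vm_end)
{
        return (hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
               ALIGN_DOWN(hva, PMD_SIZE) >= vm_start &&
               ALIGN_UP(hva, PMD_SIZE) <= vm_end;
}

int main(void)
{
        uint64_t vm_start = 0x7f0040000000UL;           /* 2MiB-aligned vma */
        uint64_t vm_end   = vm_start + (64UL << 20);    /* 64MiB of MMIO */
        uint64_t hva      = vm_start + 0x300000;

        /* PA shares the HVA's offset within a 2MiB block: block mapping is OK. */
        printf("same alignment:      %s\n",
               pmd_block_ok(hva, 0xe0300000UL, vm_start, vm_end) ?
               "PMD block" : "4KiB pages");
        /* PA is shifted by 4KiB relative to the HVA: fall back to 4KiB pages. */
        printf("different alignment: %s\n",
               pmd_block_ok(hva, 0xe0301000UL, vm_start, vm_end) ?
               "PMD block" : "4KiB pages");
        return 0;
}

The second case is what can happen for device MMIO: a VM_PFNMAP vma is
typically set up via remap_pfn_range() against whatever physical address the
device decodes, so the HVA picked by userspace and the PA need not share
sub-2MiB alignment.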

Comments

zhukeqian April 15, 2021, 2:08 p.m. UTC | #1
Hi Marc,

On 2021/4/15 22:03, Keqian Zhu wrote:
[...]
> @@ -778,15 +810,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> -	if (is_vm_hugetlb_page(vma))
> -		vma_shift = huge_page_shift(hstate_vma(vma));
> -	else
> -		vma_shift = PAGE_SHIFT;
> -
> -	if (logging_active ||
> -	    (vma->vm_flags & VM_PFNMAP)) {
> +	/*
> +	 * logging_active is guaranteed to never be true for VM_PFNMAP
> +	 * memslots.
> +	 */
> +	if (logging_active) {
>  		force_pte = true;
>  		vma_shift = PAGE_SHIFT;
> +	} else {
> +		vma_shift = get_vma_page_shift(vma, hva);
>  	}
I used an if/else structure in v4, please check that. Thanks very much!


BRs,
Keqian

Marc Zyngier April 16, 2021, 2:44 p.m. UTC | #2
On Thu, 15 Apr 2021 15:08:09 +0100,
Keqian Zhu <zhukeqian1@huawei.com> wrote:
> 
> Hi Marc,
> 
> On 2021/4/15 22:03, Keqian Zhu wrote:
[...]
> > @@ -778,15 +810,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  		return -EFAULT;
> >  	}
> >  
> > -	if (is_vm_hugetlb_page(vma))
> > -		vma_shift = huge_page_shift(hstate_vma(vma));
> > -	else
> > -		vma_shift = PAGE_SHIFT;
> > -
> > -	if (logging_active ||
> > -	    (vma->vm_flags & VM_PFNMAP)) {
> > +	/*
> > +	 * logging_active is guaranteed to never be true for VM_PFNMAP
> > +	 * memslots.
> > +	 */
> > +	if (logging_active) {
> >  		force_pte = true;
> >  		vma_shift = PAGE_SHIFT;
> > +	} else {
> > +		vma_shift = get_vma_page_shift(vma, hva);
> >  	}
> I used an if/else structure in v4, please check that. Thanks very much!

That's fine. However, it is getting a bit late for 5.13, and we don't
have much time to let it simmer in -next. I'll probably wait until
after the merge window to pick it up.

Thanks,

	M.
zhukeqian April 17, 2021, 1:05 a.m. UTC | #3
Hi Marc,

On 2021/4/16 22:44, Marc Zyngier wrote:
> On Thu, 15 Apr 2021 15:08:09 +0100,
> Keqian Zhu <zhukeqian1@huawei.com> wrote:
>>
>> Hi Marc,
>>
>> On 2021/4/15 22:03, Keqian Zhu wrote:
[...]

>>> +	/*
>>> +	 * logging_active is guaranteed to never be true for VM_PFNMAP
>>> +	 * memslots.
>>> +	 */
>>> +	if (logging_active) {
>>>  		force_pte = true;
>>>  		vma_shift = PAGE_SHIFT;
>>> +	} else {
>>> +		vma_shift = get_vma_page_shift(vma, hva);
>>>  	}
>> I used an if/else structure in v4, please check that. Thanks very much!
> 
> That's fine. However, it is getting a bit late for 5.13, and we don't
> have much time to let it simmer in -next. I'll probably wait until
> after the merge window to pick it up.
OK, no problem. Thanks! :)

BRs,
Keqian
zhukeqian April 21, 2021, 6:36 a.m. UTC | #4
On 2021/4/21 15:52, Gavin Shan wrote:
> Hi Keqian,
> 
> On 4/16/21 12:03 AM, Keqian Zhu wrote:
[...]
>>   +static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
>> +{
>> +    unsigned long pa;
>> +
>> +    if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
>> +        return huge_page_shift(hstate_vma(vma));
>> +
>> +    if (!(vma->vm_flags & VM_PFNMAP))
>> +        return PAGE_SHIFT;
>> +
>> +    VM_BUG_ON(is_vm_hugetlb_page(vma));
>> +
> 
> I don't understand how VM_PFNMAP is set for hugetlbfs related vma.
> I think they are exclusive, meaning the flag is never set for
> hugetlbfs vma. If it's true, VM_PFNMAP needn't be checked on hugetlbfs
> vma and the VM_BUG_ON() becomes unnecessary.
Yes, but we're not sure all drivers follow this rule. Adding a BUG_ON()
is a way to catch such issues.
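
For anyone who wants to poke at this assumption on a running system, a quick
and purely illustrative user-space probe is to create a hugetlb mapping and
read its VmFlags line from /proc/self/smaps: a hugetlb vma reports the "ht"
flag but should never report "pf" (VM_PFNMAP). It needs pre-reserved
hugepages and of course only shows one data point, not a guarantee about
every driver:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2UL << 20;         /* one 2MiB hugepage */
        char needle[32], line[256];
        FILE *f;
        int in_vma = 0;

        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");    /* needs reserved hugepages */
                return 1;
        }

        /* Locate our vma in /proc/self/smaps and print its VmFlags line. */
        snprintf(needle, sizeof(needle), "%lx-", (unsigned long)p);
        f = fopen("/proc/self/smaps", "r");
        while (f && fgets(line, sizeof(line), f)) {
                if (!strncmp(line, needle, strlen(needle)))
                        in_vma = 1;
                else if (in_vma && !strncmp(line, "VmFlags:", 8)) {
                        fputs(line, stdout);    /* expect "ht", never "pf" */
                        break;
                }
        }
        if (f)
                fclose(f);
        munmap(p, len);
        return 0;
}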

> 
>> +    pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
>> +
>> +#ifndef __PAGETABLE_PMD_FOLDED
>> +    if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
>> +        ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
>> +        ALIGN(hva, PUD_SIZE) <= vma->vm_end)
>> +        return PUD_SHIFT;
>> +#endif
>> +
>> +    if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
>> +        ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
>> +        ALIGN(hva, PMD_SIZE) <= vma->vm_end)
>> +        return PMD_SHIFT;
>> +
>> +    return PAGE_SHIFT;
>> +}
>> +
> 
> There is "switch(...)" fallback mechanism in user_mem_abort(). PUD_SIZE/PMD_SIZE
> can be downgraded accordingly if the addresses fails in the alignment check
> by fault_supports_stage2_huge_mapping(). I think it would make user_mem_abort()
> simplified if the logic can be moved to get_vma_page_shift().
> 
> Another question if we need the check from fault_supports_stage2_huge_mapping()
> if VM_PFNMAP area is going to be covered by block mapping. If so, the "switch(...)"
> fallback mechanism needs to be part of get_vma_page_shift().
Yes, good suggestion. My idea is that we can keep this series simpler and do further
optimization in another patch series. Would you mind sending a patch?

Thanks,
Keqian
Gavin Shan April 21, 2021, 7:52 a.m. UTC | #5
Hi Keqian,

On 4/16/21 12:03 AM, Keqian Zhu wrote:
[...]
>   
> +static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
> +{
> +	unsigned long pa;
> +
> +	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
> +		return huge_page_shift(hstate_vma(vma));
> +
> +	if (!(vma->vm_flags & VM_PFNMAP))
> +		return PAGE_SHIFT;
> +
> +	VM_BUG_ON(is_vm_hugetlb_page(vma));
> +

I don't understand how VM_PFNMAP is set for hugetlbfs related vma.
I think they are exclusive, meaning the flag is never set for
hugetlbfs vma. If it's true, VM_PFNMAP needn't be checked on hugetlbfs
vma and the VM_BUG_ON() becomes unnecessary.

> +	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
> +
> +#ifndef __PAGETABLE_PMD_FOLDED
> +	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
> +	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
> +	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
> +		return PUD_SHIFT;
> +#endif
> +
> +	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
> +	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
> +	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
> +		return PMD_SHIFT;
> +
> +	return PAGE_SHIFT;
> +}
> +

There is "switch(...)" fallback mechanism in user_mem_abort(). PUD_SIZE/PMD_SIZE
can be downgraded accordingly if the addresses fails in the alignment check
by fault_supports_stage2_huge_mapping(). I think it would make user_mem_abort()
simplified if the logic can be moved to get_vma_page_shift().

Another question if we need the check from fault_supports_stage2_huge_mapping()
if VM_PFNMAP area is going to be covered by block mapping. If so, the "switch(...)"
fallback mechanism needs to be part of get_vma_page_shift().
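
Purely as a sketch of that consolidation (not the actual implementation: it
assumes the existing fault_supports_stage2_huge_mapping() helper keeps its
current (memslot, hva, map_size) form, and it skips the CONT_PMD/CONT_PTE
sizes that the real switch() also handles), get_vma_page_shift() could fold
the fallback in roughly like this:

static int get_vma_page_shift(struct kvm_memory_slot *memslot,
			      struct vm_area_struct *vma, unsigned long hva)
{
	unsigned long pa;
	int shift = PAGE_SHIFT;

	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) {
		shift = huge_page_shift(hstate_vma(vma));
	} else if (vma->vm_flags & VM_PFNMAP) {
		pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
		if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
		    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
		    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
			shift = PUD_SHIFT;
		else
#endif
		if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
		    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
		    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
			shift = PMD_SHIFT;
	}

	/* Absorb the switch() fallback: downgrade until the IPA side agrees. */
	while (shift > PAGE_SHIFT &&
	       !fault_supports_stage2_huge_mapping(memslot, hva, 1UL << shift))
		shift = (shift > PMD_SHIFT) ? PMD_SHIFT : PAGE_SHIFT;

	return shift;
}

user_mem_abort() would then only need force_pte handling for the
logging_active case, since the IPA/memslot checks already happened here.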

[...]

Thanks,
Gavin
Gavin Shan April 22, 2021, 2:25 a.m. UTC | #6
Hi Keqian,

On 4/21/21 4:36 PM, Keqian Zhu wrote:
> On 2021/4/21 15:52, Gavin Shan wrote:
>> On 4/16/21 12:03 AM, Keqian Zhu wrote:
[...]
>>>    +static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
>>> +{
>>> +    unsigned long pa;
>>> +
>>> +    if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
>>> +        return huge_page_shift(hstate_vma(vma));
>>> +
>>> +    if (!(vma->vm_flags & VM_PFNMAP))
>>> +        return PAGE_SHIFT;
>>> +
>>> +    VM_BUG_ON(is_vm_hugetlb_page(vma));
>>> +
>>
>> I don't understand how VM_PFNMAP is set for hugetlbfs related vma.
>> I think they are exclusive, meaning the flag is never set for
>> hugetlbfs vma. If it's true, VM_PFNMAP needn't be checked on hugetlbfs
>> vma and the VM_BUG_ON() becomes unnecessary.
> Yes, but we're not sure all drivers follow this rule. Adding a BUG_ON()
> is a way to catch such issues.
> 

I think I didn't make things clear. What I meant is VM_PFNMAP can't
be set for hugetlbfs VMAs. So the checks here can be simplified as
below if you agree:

     if (is_vm_hugetlb_page(vma))
         return huge_page_shift(hstate_vma(vma));

     if (!(vma->vm_flags & VM_PFNMAP))
         return PAGE_SHIFT;

     VM_BUG_ON(is_vm_hugetlb_page(vma));       /* Can be dropped */

>>
>>> +    pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
>>> +
>>> +#ifndef __PAGETABLE_PMD_FOLDED
>>> +    if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
>>> +        ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
>>> +        ALIGN(hva, PUD_SIZE) <= vma->vm_end)
>>> +        return PUD_SHIFT;
>>> +#endif
>>> +
>>> +    if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
>>> +        ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
>>> +        ALIGN(hva, PMD_SIZE) <= vma->vm_end)
>>> +        return PMD_SHIFT;
>>> +
>>> +    return PAGE_SHIFT;
>>> +}
>>> +
>>
>> There is "switch(...)" fallback mechanism in user_mem_abort(). PUD_SIZE/PMD_SIZE
>> can be downgraded accordingly if the addresses fails in the alignment check
>> by fault_supports_stage2_huge_mapping(). I think it would make user_mem_abort()
>> simplified if the logic can be moved to get_vma_page_shift().
>>
>> Another question if we need the check from fault_supports_stage2_huge_mapping()
>> if VM_PFNMAP area is going to be covered by block mapping. If so, the "switch(...)"
>> fallback mechanism needs to be part of get_vma_page_shift().
> Yes, good suggestion. My idea is that we can keep this series simpler and do further
> optimization in another patch series. Would you mind sending a patch?
> 

Yeah, it's fine to keep this series as it is. There are three checks applied
here for the MMIO region (hva/hpa/ipa), and they're distributed across two
functions, making the code a bit hard to follow. I can post a patch to
simplify it after your series gets merged :)

Thanks,
Gavin
Marc Zyngier April 22, 2021, 6:51 a.m. UTC | #7
On Thu, 22 Apr 2021 03:25:23 +0100,
Gavin Shan <gshan@redhat.com> wrote:
> 
> Hi Keqian,
> 
> On 4/21/21 4:36 PM, Keqian Zhu wrote:
> > On 2021/4/21 15:52, Gavin Shan wrote:
> >> On 4/16/21 12:03 AM, Keqian Zhu wrote:
[...]
> >>>    +static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
> >>> +{
> >>> +    unsigned long pa;
> >>> +
> >>> +    if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
> >>> +        return huge_page_shift(hstate_vma(vma));
> >>> +
> >>> +    if (!(vma->vm_flags & VM_PFNMAP))
> >>> +        return PAGE_SHIFT;
> >>> +
> >>> +    VM_BUG_ON(is_vm_hugetlb_page(vma));
> >>> +
> >> 
> >> I don't understand how VM_PFNMAP is set for hugetlbfs related vma.
> >> I think they are exclusive, meaning the flag is never set for
> >> hugetlbfs vma. If it's true, VM_PFNMAP needn't be checked on hugetlbfs
> >> vma and the VM_BUG_ON() becomes unnecessary.
> > Yes, but we're not sure all drivers follow this rule. Adding a BUG_ON()
> > is a way to catch such issues.
> > 
> 
> I think I didn't make things clear. What I meant is VM_PFNMAP can't
> be set for hugetlbfs VMAs. So the checks here can be simplified as
> below if you agree:
> 
>     if (is_vm_hugetlb_page(vma))
>         return huge_page_shift(hstate_vma(vma));
> 
>     if (!(vma->vm_flags & VM_PFNMAP))
>         return PAGE_SHIFT;
> 
>     VM_BUG_ON(is_vm_hugetlb_page(vma));       /* Can be dropped */

No. If this case happens, I want to see it. I have explicitly asked
for it, and this check stays.

	M.
Gavin Shan April 23, 2021, 12:37 a.m. UTC | #8
On 4/16/21 12:03 AM, Keqian Zhu wrote:
[...]

Reviewed-by: Gavin Shan <gshan@redhat.com>

[...]
Gavin Shan April 23, 2021, 12:42 a.m. UTC | #9
Hi Marc,

On 4/22/21 4:51 PM, Marc Zyngier wrote:
> On Thu, 22 Apr 2021 03:25:23 +0100,
> Gavin Shan <gshan@redhat.com> wrote:
>> On 4/21/21 4:36 PM, Keqian Zhu wrote:
>>> On 2021/4/21 15:52, Gavin Shan wrote:
>>>> On 4/16/21 12:03 AM, Keqian Zhu wrote:
[...]
>>>>>     +static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
>>>>> +{
>>>>> +    unsigned long pa;
>>>>> +
>>>>> +    if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
>>>>> +        return huge_page_shift(hstate_vma(vma));
>>>>> +
>>>>> +    if (!(vma->vm_flags & VM_PFNMAP))
>>>>> +        return PAGE_SHIFT;
>>>>> +
>>>>> +    VM_BUG_ON(is_vm_hugetlb_page(vma));
>>>>> +
>>>>
>>>> I don't understand how VM_PFNMAP is set for hugetlbfs related vma.
>>>> I think they are exclusive, meaning the flag is never set for
>>>> hugetlbfs vma. If it's true, VM_PFNMAP needn't be checked on hugetlbfs
>>>> vma and the VM_BUG_ON() becomes unnecessary.
>>> Yes, but we're not sure all drivers follow this rule. Adding a BUG_ON()
>>> is a way to catch such issues.
>>>
>>
>> I think I didn't make things clear. What I meant is VM_PFNMAP can't
>> be set for hugetlbfs VMAs. So the checks here can be simplified as
>> below if you agree:
>>
>>      if (is_vm_hugetlb_page(vma))
>>          return huge_page_shift(hstate_vma(vma));
>>
>>      if (!(vma->vm_flags & VM_PFNMAP))
>>          return PAGE_SHIFT;
>>
>>      VM_BUG_ON(is_vm_hugetlb_page(vma));       /* Can be dropped */
> 
> No. If this case happens, I want to see it. I have explicitly asked
> for it, and this check stays.
> 

Thanks for the explanation. Keeping the VM_BUG_ON() sounds good to me too :)

Thanks,
Gavin

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c59af5ca01b0..5a1cc7751e6d 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -738,6 +738,35 @@  transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
 	return PAGE_SIZE;
 }
 
+static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
+{
+	unsigned long pa;
+
+	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
+		return huge_page_shift(hstate_vma(vma));
+
+	if (!(vma->vm_flags & VM_PFNMAP))
+		return PAGE_SHIFT;
+
+	VM_BUG_ON(is_vm_hugetlb_page(vma));
+
+	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
+
+#ifndef __PAGETABLE_PMD_FOLDED
+	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
+	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
+	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
+		return PUD_SHIFT;
+#endif
+
+	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
+	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
+	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
+		return PMD_SHIFT;
+
+	return PAGE_SHIFT;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -769,7 +798,10 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	/* Let's check if we will get back a huge page backed by hugetlbfs */
+	/*
+	 * Let's check if we will get back a huge page backed by hugetlbfs, or
+	 * get block mapping for device MMIO region.
+	 */
 	mmap_read_lock(current->mm);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
 	if (unlikely(!vma)) {
@@ -778,15 +810,15 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (is_vm_hugetlb_page(vma))
-		vma_shift = huge_page_shift(hstate_vma(vma));
-	else
-		vma_shift = PAGE_SHIFT;
-
-	if (logging_active ||
-	    (vma->vm_flags & VM_PFNMAP)) {
+	/*
+	 * logging_active is guaranteed to never be true for VM_PFNMAP
+	 * memslots.
+	 */
+	if (logging_active) {
 		force_pte = true;
 		vma_shift = PAGE_SHIFT;
+	} else {
+		vma_shift = get_vma_page_shift(vma, hva);
 	}
 
 	switch (vma_shift) {
@@ -854,8 +886,17 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 
 	if (kvm_is_device_pfn(pfn)) {
+		/*
+		 * If the page was identified as device early by looking at
+		 * the VMA flags, vma_pagesize is already representing the
+		 * largest quantity we can map.  If instead it was mapped
+		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+		 * and must not be upgraded.
+		 *
+		 * In both cases, we don't let transparent_hugepage_adjust()
+		 * change things at the last minute.
+		 */
 		device = true;
-		force_pte = true;
 	} else if (logging_active && !write_fault) {
 		/*
 		 * Only actually map the page as writable if this was a write
@@ -876,7 +917,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * If we are not forced to use page mapping, check if we are
 	 * backed by a THP and thus use block mapping if possible.
 	 */
-	if (vma_pagesize == PAGE_SIZE && !force_pte)
+	if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
 		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
 							   &pfn, &fault_ipa);
 	if (writable)