diff mbox series

kvm: arm: Fix handling of stage2 huge mappings

Message ID 1553004668-23296-1-git-send-email-suzuki.poulose@arm.com (mailing list archive)
State New, archived
Headers show
Series kvm: arm: Fix handling of stage2 huge mappings | expand

Commit Message

Suzuki K Poulose March 19, 2019, 2:11 p.m. UTC
We rely on the mmu_notifier call backs to handle the split/merge
of huge pages and thus we are guaranteed that, while creating a
block mapping, either the entire block is unmapped at stage2 or it
is missing permission.

However, we miss a case where the block mapping is split for dirty
logging and could later be made a block mapping again, if we cancel the
dirty logging. This not only creates inconsistent TLB entries for
the pages in the block, but also leaks the table pages for
PMD level.

Handle this corner case for the huge mappings at stage2 by
unmapping the non-huge mapping for the block. This could potentially
release the upper level table. So we need to restart the table walk
once we unmap the range.

Fixes: ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
Cc: Zheng Xiang <zhengxiang9@huawei.com>
Cc: Zenghui Yu <yuzenghui@huawei.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
---
 virt/kvm/arm/mmu.c | 63 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 18 deletions(-)

Comments

Zenghui Yu March 19, 2019, 4:02 p.m. UTC | #1
Hi Suzuki,

On 2019/3/19 22:11, Suzuki K Poulose wrote:
> We rely on the mmu_notifier call backs to handle the split/merge
> of huge pages and thus we are guaranteed that, while creating a
> block mapping, either the entire block is unmapped at stage2 or it
> is missing permission.
> 
> However, we miss a case where the block mapping is split for dirty
> logging case and then could later be made block mapping, if we cancel the
> dirty logging. This not only creates inconsistent TLB entries for
> the pages in the the block, but also leakes the table pages for
> PMD level.
> 
> Handle this corner case for the huge mappings at stage2 by
> unmapping the non-huge mapping for the block. This could potentially
> release the upper level table. So we need to restart the table walk
> once we unmap the range.
> 
> Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
> Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
> Cc: Zheng Xiang <zhengxiang9@huawei.com>
> Cc: Zhengui Yu <yuzenghui@huawei.com>

Sorry to bother you, but this should be "Zenghui Yu", thanks!


zenghui

> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Cc: Christoffer Dall <christoffer.dall@arm.com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> ---
>   virt/kvm/arm/mmu.c | 63 ++++++++++++++++++++++++++++++++++++++----------------
>   1 file changed, 45 insertions(+), 18 deletions(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index fce0983..6ad6f19d 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1060,25 +1060,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>   {
>   	pmd_t *pmd, old_pmd;
>   
> +retry:
>   	pmd = stage2_get_pmd(kvm, cache, addr);
>   	VM_BUG_ON(!pmd);
>   
>   	old_pmd = *pmd;
> +	/*
> +	 * Multiple vcpus faulting on the same PMD entry, can
> +	 * lead to them sequentially updating the PMD with the
> +	 * same value. Following the break-before-make
> +	 * (pmd_clear() followed by tlb_flush()) process can
> +	 * hinder forward progress due to refaults generated
> +	 * on missing translations.
> +	 *
> +	 * Skip updating the page table if the entry is
> +	 * unchanged.
> +	 */
> +	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
> +		return 0;
> +
>   	if (pmd_present(old_pmd)) {
>   		/*
> -		 * Multiple vcpus faulting on the same PMD entry, can
> -		 * lead to them sequentially updating the PMD with the
> -		 * same value. Following the break-before-make
> -		 * (pmd_clear() followed by tlb_flush()) process can
> -		 * hinder forward progress due to refaults generated
> -		 * on missing translations.
> +		 * If we already have PTE level mapping for this block,
> +		 * we must unmap it to avoid inconsistent TLB state and
> +		 * leaking the table page. We could end up in this situation
> +		 * if the memory slot was marked for dirty logging and was
> +		 * reverted, leaving PTE level mappings for the pages accessed
> +		 * during the period. So, unmap the PTE level mapping for this
> +		 * block and retry, as we could have released the upper level
> +		 * table in the process.
>   		 *
> -		 * Skip updating the page table if the entry is
> -		 * unchanged.
> +		 * Normal THP split/merge follows mmu_notifier callbacks and do
> +		 * get handled accordingly.
>   		 */
> -		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
> -			return 0;
> -
> +		if (!pmd_thp_or_huge(old_pmd)) {
> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
> +			goto retry;
> +		}
>   		/*
>   		 * Mapping in huge pages should only happen through a
>   		 * fault.  If a page is merged into a transparent huge
> @@ -1090,8 +1108,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>   		 * should become splitting first, unmapped, merged,
>   		 * and mapped back in on-demand.
>   		 */
> -		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
> -
> +		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
>   		pmd_clear(pmd);
>   		kvm_tlb_flush_vmid_ipa(kvm, addr);
>   	} else {
> @@ -1107,6 +1124,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
>   {
>   	pud_t *pudp, old_pud;
>   
> +retry:
>   	pudp = stage2_get_pud(kvm, cache, addr);
>   	VM_BUG_ON(!pudp);
>   
> @@ -1114,16 +1132,25 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
>   
>   	/*
>   	 * A large number of vcpus faulting on the same stage 2 entry,
> -	 * can lead to a refault due to the
> -	 * stage2_pud_clear()/tlb_flush(). Skip updating the page
> -	 * tables if there is no change.
> +	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
> +	 * Skip updating the page tables if there is no change.
>   	 */
>   	if (pud_val(old_pud) == pud_val(*new_pudp))
>   		return 0;
>   
>   	if (stage2_pud_present(kvm, old_pud)) {
> -		stage2_pud_clear(kvm, pudp);
> -		kvm_tlb_flush_vmid_ipa(kvm, addr);
> +		/*
> +		 * If we already have table level mapping for this block, unmap
> +		 * the range for this block and retry.
> +		 */
> +		if (!stage2_pud_huge(kvm, old_pud)) {
> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
> +			goto retry;
> +		} else {
> +			WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp));
> +			stage2_pud_clear(kvm, pudp);
> +			kvm_tlb_flush_vmid_ipa(kvm, addr);
> +		}
>   	} else {
>   		get_page(virt_to_page(pudp));
>   	}
>
Marc Zyngier March 20, 2019, 8:15 a.m. UTC | #2
Hi Suzuki,

On Tue, 19 Mar 2019 14:11:08 +0000,
Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
> 
> We rely on the mmu_notifier call backs to handle the split/merge
> of huge pages and thus we are guaranteed that, while creating a
> block mapping, either the entire block is unmapped at stage2 or it
> is missing permission.
> 
> However, we miss a case where the block mapping is split for dirty
> logging case and then could later be made block mapping, if we cancel the
> dirty logging. This not only creates inconsistent TLB entries for
> the pages in the the block, but also leakes the table pages for
> PMD level.
> 
> Handle this corner case for the huge mappings at stage2 by
> unmapping the non-huge mapping for the block. This could potentially
> release the upper level table. So we need to restart the table walk
> once we unmap the range.
> 
> Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
> Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
> Cc: Zheng Xiang <zhengxiang9@huawei.com>
> Cc: Zhengui Yu <yuzenghui@huawei.com>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Cc: Christoffer Dall <christoffer.dall@arm.com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> ---
>  virt/kvm/arm/mmu.c | 63 ++++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 45 insertions(+), 18 deletions(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index fce0983..6ad6f19d 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1060,25 +1060,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  {
>  	pmd_t *pmd, old_pmd;
>  
> +retry:
>  	pmd = stage2_get_pmd(kvm, cache, addr);
>  	VM_BUG_ON(!pmd);
>  
>  	old_pmd = *pmd;
> +	/*
> +	 * Multiple vcpus faulting on the same PMD entry, can
> +	 * lead to them sequentially updating the PMD with the
> +	 * same value. Following the break-before-make
> +	 * (pmd_clear() followed by tlb_flush()) process can
> +	 * hinder forward progress due to refaults generated
> +	 * on missing translations.
> +	 *
> +	 * Skip updating the page table if the entry is
> +	 * unchanged.
> +	 */
> +	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
> +		return 0;
> +
>  	if (pmd_present(old_pmd)) {
>  		/*
> -		 * Multiple vcpus faulting on the same PMD entry, can
> -		 * lead to them sequentially updating the PMD with the
> -		 * same value. Following the break-before-make
> -		 * (pmd_clear() followed by tlb_flush()) process can
> -		 * hinder forward progress due to refaults generated
> -		 * on missing translations.
> +		 * If we already have PTE level mapping for this block,
> +		 * we must unmap it to avoid inconsistent TLB state and
> +		 * leaking the table page. We could end up in this situation
> +		 * if the memory slot was marked for dirty logging and was
> +		 * reverted, leaving PTE level mappings for the pages accessed
> +		 * during the period. So, unmap the PTE level mapping for this
> +		 * block and retry, as we could have released the upper level
> +		 * table in the process.
>  		 *
> -		 * Skip updating the page table if the entry is
> -		 * unchanged.
> +		 * Normal THP split/merge follows mmu_notifier callbacks and do
> +		 * get handled accordingly.
>  		 */
> -		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
> -			return 0;
> -
> +		if (!pmd_thp_or_huge(old_pmd)) {
> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
> +			goto retry;

This looks slightly dodgy. Doing this retry results in another call to
stage2_get_pmd(), which may or may not result in allocating a PUD. I
think this is safe as if we managed to get here, it means the whole
hierarchy was already present and nothing was allocated in the first
round.

Somehow, I would feel more comfortable with just not even trying.
Unmap, don't fix the fault, let the vcpu come again for additional
punishment. But this is probably more invasive, as none of the
stage2_set_p*() return value is ever evaluated. Oh well.

> +		}
>  		/*
>  		 * Mapping in huge pages should only happen through a
>  		 * fault.  If a page is merged into a transparent huge
> @@ -1090,8 +1108,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  		 * should become splitting first, unmapped, merged,
>  		 * and mapped back in on-demand.
>  		 */
> -		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
> -
> +		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
>  		pmd_clear(pmd);
>  		kvm_tlb_flush_vmid_ipa(kvm, addr);
>  	} else {
> @@ -1107,6 +1124,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
>  {
>  	pud_t *pudp, old_pud;
>  
> +retry:
>  	pudp = stage2_get_pud(kvm, cache, addr);
>  	VM_BUG_ON(!pudp);
>  
> @@ -1114,16 +1132,25 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
>  
>  	/*
>  	 * A large number of vcpus faulting on the same stage 2 entry,
> -	 * can lead to a refault due to the
> -	 * stage2_pud_clear()/tlb_flush(). Skip updating the page
> -	 * tables if there is no change.
> +	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
> +	 * Skip updating the page tables if there is no change.
>  	 */
>  	if (pud_val(old_pud) == pud_val(*new_pudp))
>  		return 0;
>  
>  	if (stage2_pud_present(kvm, old_pud)) {
> -		stage2_pud_clear(kvm, pudp);
> -		kvm_tlb_flush_vmid_ipa(kvm, addr);
> +		/*
> +		 * If we already have table level mapping for this block, unmap
> +		 * the range for this block and retry.
> +		 */
> +		if (!stage2_pud_huge(kvm, old_pud)) {
> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);

This broke 32bit. I've added the following hunk to fix it:

diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
index de2089501b8b..b8f21088a744 100644
--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
@@ -68,6 +68,9 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 #define stage2_pmd_table_empty(kvm, pmdp)	kvm_page_empty(pmdp)
 #define stage2_pud_table_empty(kvm, pudp)	false
 
+#define S2_PUD_MASK				PGDIR_MASK
+#define S2_PUD_SIZE				PGDIR_SIZE
+
 static inline bool kvm_stage2_has_pud(struct kvm *kvm)
 {
 	return false;

> +			goto retry;
> +		} else {
> +			WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp));
> +			stage2_pud_clear(kvm, pudp);
> +			kvm_tlb_flush_vmid_ipa(kvm, addr);
> +		}

The 'else' line could go, and would make the code similar to the PMD path.

>  	} else {
>  		get_page(virt_to_page(pudp));
>  	}
> -- 
> 2.7.4
> 

If you're OK with the above nits, I'll squash them into the patch.

Thanks,

	M.
Suzuki K Poulose March 20, 2019, 9:44 a.m. UTC | #3
Hi Marc,

On 20/03/2019 08:15, Marc Zyngier wrote:
> Hi Suzuki,
> 
> On Tue, 19 Mar 2019 14:11:08 +0000,
> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
>>
>> We rely on the mmu_notifier call backs to handle the split/merge
>> of huge pages and thus we are guaranteed that, while creating a
>> block mapping, either the entire block is unmapped at stage2 or it
>> is missing permission.
>>
>> However, we miss a case where the block mapping is split for dirty
>> logging case and then could later be made block mapping, if we cancel the
>> dirty logging. This not only creates inconsistent TLB entries for
>> the pages in the the block, but also leakes the table pages for
>> PMD level.
>>
>> Handle this corner case for the huge mappings at stage2 by
>> unmapping the non-huge mapping for the block. This could potentially
>> release the upper level table. So we need to restart the table walk
>> once we unmap the range.
>>
>> Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
>> Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
>> Cc: Zheng Xiang <zhengxiang9@huawei.com>
>> Cc: Zhengui Yu <yuzenghui@huawei.com>
>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>> Cc: Christoffer Dall <christoffer.dall@arm.com>
>> Signed-off-by: Suzuki K Poulose 

...

>> +retry:
>>   	pmd = stage2_get_pmd(kvm, cache, addr);
>>   	VM_BUG_ON(!pmd);
>>   

...

>>   	if (pmd_present(old_pmd)) {
>>   		/*
>> -		 * Multiple vcpus faulting on the same PMD entry, can
>> -		 * lead to them sequentially updating the PMD with the
>> -		 * same value. Following the break-before-make
>> -		 * (pmd_clear() followed by tlb_flush()) process can
>> -		 * hinder forward progress due to refaults generated
>> -		 * on missing translations.
>> +		 * If we already have PTE level mapping for this block,
>> +		 * we must unmap it to avoid inconsistent TLB state and
>> +		 * leaking the table page. We could end up in this situation
>> +		 * if the memory slot was marked for dirty logging and was
>> +		 * reverted, leaving PTE level mappings for the pages accessed
>> +		 * during the period. So, unmap the PTE level mapping for this
>> +		 * block and retry, as we could have released the upper level
>> +		 * table in the process.
>>   		 *
>> -		 * Skip updating the page table if the entry is
>> -		 * unchanged.
>> +		 * Normal THP split/merge follows mmu_notifier callbacks and do
>> +		 * get handled accordingly.
>>   		 */
>> -		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
>> -			return 0;
>> -
>> +		if (!pmd_thp_or_huge(old_pmd)) {
>> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
>> +			goto retry;
> 
> This looks slightly dodgy. Doing this retry results in another call to
> stage2_get_pmd(), which may or may not result in allocating a PUD. I
> think this is safe as if we managed to get here, it means the whole
> hierarchy was already present and nothing was allocated in the first
> round.
> 
> Somehow, I would feel more comfortable with just not even trying.
> Unmap, don't fix the fault, let the vcpu come again for additional
> punishment. But this is probably more invasive, as none of the
> stage2_set_p*() return value is ever evaluated. Oh well.
> 

Yes. The other option was to unmap_stage2_ptes() and get the page refcount
on the new pmd. But that kind of makes it a bit difficult to follow the
code.

>>   	if (stage2_pud_present(kvm, old_pud)) {
>> -		stage2_pud_clear(kvm, pudp);
>> -		kvm_tlb_flush_vmid_ipa(kvm, addr);
>> +		/*
>> +		 * If we already have table level mapping for this block, unmap
>> +		 * the range for this block and retry.
>> +		 */
>> +		if (!stage2_pud_huge(kvm, old_pud)) {
>> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
> 
> This broke 32bit. I've added the following hunk to fix it:

Grrr! Sorry about that.

> 
> diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
> index de2089501b8b..b8f21088a744 100644
> --- a/arch/arm/include/asm/stage2_pgtable.h
> +++ b/arch/arm/include/asm/stage2_pgtable.h
> @@ -68,6 +68,9 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
>   #define stage2_pmd_table_empty(kvm, pmdp)	kvm_page_empty(pmdp)
>   #define stage2_pud_table_empty(kvm, pudp)	false
>   
> +#define S2_PUD_MASK				PGDIR_MASK
> +#define S2_PUD_SIZE				PGDIR_SIZE
> +

We should really get rid of the S2_P{U/M}D_* definitions, as they are
always the same as the host. The only thing that changes is the PGD size
which varies according to the IPA and the concatenation.

>   static inline bool kvm_stage2_has_pud(struct kvm *kvm)
>   {
>   	return false;
> 
>> +			goto retry;
>> +		} else {
>> +			WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp));
>> +			stage2_pud_clear(kvm, pudp);
>> +			kvm_tlb_flush_vmid_ipa(kvm, addr);
>> +		}
> 
> The 'else' line could go, and would make the code similar to the PMD path.
> 

Yep. I think pud_pfn() may not be defined for some configs, if hugetlbfs
is not selected on arm32. So, we should move them to kvm_pud_pfn() instead.


>>   	} else {
>>   		get_page(virt_to_page(pudp));
>>   	}
>> -- 
>> 2.7.4
>>
> 
> If you're OK with the above nits, I'll squash them into the patch.

With the kvm_pud_pfn() changes, yes. Alternately, I could resend the updated
patch, fixing the typo in Zenghui's name. Let me know.

Cheers
Suzuki
Marc Zyngier March 20, 2019, 10:11 a.m. UTC | #4
On Wed, 20 Mar 2019 09:44:38 +0000
Suzuki K Poulose <suzuki.poulose@arm.com> wrote:

> Hi Marc,
> 
> On 20/03/2019 08:15, Marc Zyngier wrote:
> > Hi Suzuki,
> > 
> > On Tue, 19 Mar 2019 14:11:08 +0000,
> > Suzuki K Poulose <suzuki.poulose@arm.com> wrote:  
> >>
> >> We rely on the mmu_notifier call backs to handle the split/merge
> >> of huge pages and thus we are guaranteed that, while creating a
> >> block mapping, either the entire block is unmapped at stage2 or it
> >> is missing permission.
> >>
> >> However, we miss a case where the block mapping is split for dirty
> >> logging case and then could later be made block mapping, if we cancel the
> >> dirty logging. This not only creates inconsistent TLB entries for
> >> the pages in the the block, but also leakes the table pages for
> >> PMD level.
> >>
> >> Handle this corner case for the huge mappings at stage2 by
> >> unmapping the non-huge mapping for the block. This could potentially
> >> release the upper level table. So we need to restart the table walk
> >> once we unmap the range.
> >>
> >> Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
> >> Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
> >> Cc: Zheng Xiang <zhengxiang9@huawei.com>
> >> Cc: Zhengui Yu <yuzenghui@huawei.com>
> >> Cc: Marc Zyngier <marc.zyngier@arm.com>
> >> Cc: Christoffer Dall <christoffer.dall@arm.com>
> >> Signed-off-by: Suzuki K Poulose ...  
> 
> >> +retry:
> >>   	pmd = stage2_get_pmd(kvm, cache, addr);
> >>   	VM_BUG_ON(!pmd);
> >>   ...  
> 
> >>   	if (pmd_present(old_pmd)) {
> >>   		/*
> >> -		 * Multiple vcpus faulting on the same PMD entry, can
> >> -		 * lead to them sequentially updating the PMD with the
> >> -		 * same value. Following the break-before-make
> >> -		 * (pmd_clear() followed by tlb_flush()) process can
> >> -		 * hinder forward progress due to refaults generated
> >> -		 * on missing translations.
> >> +		 * If we already have PTE level mapping for this block,
> >> +		 * we must unmap it to avoid inconsistent TLB state and
> >> +		 * leaking the table page. We could end up in this situation
> >> +		 * if the memory slot was marked for dirty logging and was
> >> +		 * reverted, leaving PTE level mappings for the pages accessed
> >> +		 * during the period. So, unmap the PTE level mapping for this
> >> +		 * block and retry, as we could have released the upper level
> >> +		 * table in the process.
> >>   		 *
> >> -		 * Skip updating the page table if the entry is
> >> -		 * unchanged.
> >> +		 * Normal THP split/merge follows mmu_notifier callbacks and do
> >> +		 * get handled accordingly.
> >>   		 */
> >> -		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
> >> -			return 0;
> >> -
> >> +		if (!pmd_thp_or_huge(old_pmd)) {
> >> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
> >> +			goto retry;  
> > 
> > This looks slightly dodgy. Doing this retry results in another call to
> > stage2_get_pmd(), which may or may not result in allocating a PUD. I
> > think this is safe as if we managed to get here, it means the whole
> > hierarchy was already present and nothing was allocated in the first
> > round.
> > 
> > Somehow, I would feel more comfortable with just not even trying.
> > Unmap, don't fix the fault, let the vcpu come again for additional
> > punishment. But this is probably more invasive, as none of the
> > stage2_set_p*() return value is ever evaluated. Oh well.
> >   
> 
> Yes. The other option was to unmap_stage2_ptes() and get the page refcount
> on the new pmd. But that kind of makes it a bit difficult to follow the
> code.
> 
> >>   	if (stage2_pud_present(kvm, old_pud)) {
> >> -		stage2_pud_clear(kvm, pudp);
> >> -		kvm_tlb_flush_vmid_ipa(kvm, addr);
> >> +		/*
> >> +		 * If we already have table level mapping for this block, unmap
> >> +		 * the range for this block and retry.
> >> +		 */
> >> +		if (!stage2_pud_huge(kvm, old_pud)) {
> >> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);  
> > 
> > This broke 32bit. I've added the following hunk to fix it:  
> 
> Grrr! Sorry about that.
> 
> > 
> > diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
> > index de2089501b8b..b8f21088a744 100644
> > --- a/arch/arm/include/asm/stage2_pgtable.h
> > +++ b/arch/arm/include/asm/stage2_pgtable.h
> > @@ -68,6 +68,9 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
> >   #define stage2_pmd_table_empty(kvm, pmdp)	kvm_page_empty(pmdp)
> >   #define stage2_pud_table_empty(kvm, pudp)	false  
> > >   
> > > +#define S2_PUD_MASK				PGDIR_MASK
> > +#define S2_PUD_SIZE				PGDIR_SIZE
> > +  
> 
> We should really get rid of the S2_P{U/M}D_* definitions, as they are
> always the same as the host. The only thing that changes is the PGD size
> which varies according to the IPA and the concatenation.
> 
> >   static inline bool kvm_stage2_has_pud(struct kvm *kvm)
> >   {
> >   	return false;
> >   
> >> +			goto retry;
> >> +		} else {
> >> +			WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp));
> >> +			stage2_pud_clear(kvm, pudp);
> >> +			kvm_tlb_flush_vmid_ipa(kvm, addr);
> >> +		}  
> > 
> > The 'else' line could go, and would make the code similar to the PMD path.
> >   
> 
> Yep. I think the pud_pfn() may not be defined for some configs, if the hugetlbfs
> is not selected on arm32. So, we should move them to kvm_pud_pfn() instead.
> 
> 
> >>   	} else {
> >>   		get_page(virt_to_page(pudp));
> >>   	}  
> > >> -- 
> > >> 2.7.4
> >>  
> > 
> > If you're OK with the above nits, I'll squash them into the patch.  
> 
> With the kvm_pud_pfn() changes, yes. Alternately, I could resend the updated
> patch, fixing the typo in Zenghui's name. Let me know.

Sure, feel free to send a fixed version. I'll drop the currently queued
patch.

Thanks,

	M.
Suzuki K Poulose March 20, 2019, 10:23 a.m. UTC | #5
Marc,

On 20/03/2019 10:11, Marc Zyngier wrote:
> On Wed, 20 Mar 2019 09:44:38 +0000
> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
> 
>> Hi Marc,
>>
>> On 20/03/2019 08:15, Marc Zyngier wrote:
>>> Hi Suzuki,
>>>
>>> On Tue, 19 Mar 2019 14:11:08 +0000,
>>> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
>>>>
>>>> We rely on the mmu_notifier call backs to handle the split/merge
>>>> of huge pages and thus we are guaranteed that, while creating a
>>>> block mapping, either the entire block is unmapped at stage2 or it
>>>> is missing permission.
>>>>
>>>> However, we miss a case where the block mapping is split for dirty
>>>> logging case and then could later be made block mapping, if we cancel the
>>>> dirty logging. This not only creates inconsistent TLB entries for
>>>> the pages in the the block, but also leakes the table pages for
>>>> PMD level.
>>>>
>>>> Handle this corner case for the huge mappings at stage2 by
>>>> unmapping the non-huge mapping for the block. This could potentially
>>>> release the upper level table. So we need to restart the table walk
>>>> once we unmap the range.
>>>>
>>>> Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
>>>> Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
>>>> Cc: Zheng Xiang <zhengxiang9@huawei.com>
>>>> Cc: Zhengui Yu <yuzenghui@huawei.com>
>>>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>>>> Cc: Christoffer Dall <christoffer.dall@arm.com>
>>>> Signed-off-by: Suzuki K Poulose ...


>>>> +		if (!pmd_thp_or_huge(old_pmd)) {
>>>> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
>>>> +			goto retry;
>>>

>>>> +		if (!stage2_pud_huge(kvm, old_pud)) {
>>>> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
>>>

>> We should really get rid of the S2_P{U/M}D_* definitions, as they are
>> always the same as the host. The only thing that changes is the PGD size
>> which varies according to the IPA and the concatenation.
>>

Also what do you think about using  P{M,U}D_* instead of S2_P{M,U}D_*
above ? I could make that change with the respin.

> 
> Sure, feel free to send a fixed version. I'll drop the currently queued
> patch.
> 


Thanks. Sorry for the trouble.

Cheers
Suzuki
Marc Zyngier March 20, 2019, 10:35 a.m. UTC | #6
On Wed, 20 Mar 2019 10:23:39 +0000
Suzuki K Poulose <suzuki.poulose@arm.com> wrote:

Hi Suzuki,

> Marc,
> 
> On 20/03/2019 10:11, Marc Zyngier wrote:
> > On Wed, 20 Mar 2019 09:44:38 +0000
> > Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
> >   
> >> Hi Marc,
> >>
> >> On 20/03/2019 08:15, Marc Zyngier wrote:  
> >>> Hi Suzuki,
> >>>
> >>> On Tue, 19 Mar 2019 14:11:08 +0000,
> >>> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:  
> >>>>
> >>>> We rely on the mmu_notifier call backs to handle the split/merge
> >>>> of huge pages and thus we are guaranteed that, while creating a
> >>>> block mapping, either the entire block is unmapped at stage2 or it
> >>>> is missing permission.
> >>>>
> >>>> However, we miss a case where the block mapping is split for dirty
> >>>> logging case and then could later be made block mapping, if we cancel the
> >>>> dirty logging. This not only creates inconsistent TLB entries for
> >>>> the pages in the the block, but also leakes the table pages for
> >>>> PMD level.
> >>>>
> >>>> Handle this corner case for the huge mappings at stage2 by
> >>>> unmapping the non-huge mapping for the block. This could potentially
> >>>> release the upper level table. So we need to restart the table walk
> >>>> once we unmap the range.
> >>>>
> >>>> Fixes : ad361f093c1e31d ("KVM: ARM: Support hugetlbfs backed huge pages")
> >>>> Reported-by: Zheng Xiang <zhengxiang9@huawei.com>
> >>>> Cc: Zheng Xiang <zhengxiang9@huawei.com>
> >>>> Cc: Zhengui Yu <yuzenghui@huawei.com>
> >>>> Cc: Marc Zyngier <marc.zyngier@arm.com>
> >>>> Cc: Christoffer Dall <christoffer.dall@arm.com>
> >>>> Signed-off-by: Suzuki K Poulose ...  
> 
> 
> >>>> +		if (!pmd_thp_or_huge(old_pmd)) {
> >>>> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
> >>>> +			goto retry;  
> >>>  
> 
> >>>> +		if (!stage2_pud_huge(kvm, old_pud)) {
> >>>> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);  
> >>>  
> 
> >> We should really get rid of the S2_P{U/M}D_* definitions, as they are
> >> always the same as the host. The only thing that changes is the PGD size
> >> which varies according to the IPA and the concatenation.
> >>  
> 
> Also what do you think about using  P{M,U}D_* instead of S2_P{M,U}D_*
> above ? I could make that change with the respin.

Given that this is a fix, I'd like it to be as small and obvious as
possible, making it easier to backport.

I'm happy to take another patch for 5.2 that will drop the whole S2_P*
if we still think that this should be the case (though what I'd really
like is to have architectural levels instead of these arbitrary
definitions).

Thanks,

	M.
Suzuki K Poulose March 20, 2019, 11:12 a.m. UTC | #7
Marc,

On 20/03/2019 10:35, Marc Zyngier wrote:
> On Wed, 20 Mar 2019 10:23:39 +0000
> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
> 
> Hi Suzuki,
> 
>> Marc,
>>
>> On 20/03/2019 10:11, Marc Zyngier wrote:
>>> On Wed, 20 Mar 2019 09:44:38 +0000
>>> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
>>>    
>>>> Hi Marc,
>>>>
>>>> On 20/03/2019 08:15, Marc Zyngier wrote:
>>>>> Hi Suzuki,
>>>>>
>>>>> On Tue, 19 Mar 2019 14:11:08 +0000,
>>>>> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:

...

>>>>>> +		if (!pmd_thp_or_huge(old_pmd)) {
>>>>>> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
>>>>>> +			goto retry;
>>>>>   
>>
>>>>>> +		if (!stage2_pud_huge(kvm, old_pud)) {
>>>>>> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
>>>>>   
>>
>>>> We should really get rid of the S2_P{U/M}D_* definitions, as they are
>>>> always the same as the host. The only thing that changes is the PGD size
>>>> which varies according to the IPA and the concatenation.
>>>>   
>>
>> Also what do you think about using  P{M,U}D_* instead of S2_P{M,U}D_*
>> above ? I could make that change with the respin.
> 
> Given that this is a fix, I'd like it to be as small as obvious as
> possible, making it easier to backport.
> 
> I'm happy to take another patch for 5.2 that will drop the whole S2_P*
> if we still think that this should be the case (though what I'd really
> like is to have architectural levels instead of these arbitrary
> definitions).

I only meant the two new instances added above in the patch. Of course, I
could send something to fix the existing ones.

Cheers
Suzuki
Marc Zyngier March 20, 2019, 5:24 p.m. UTC | #8
On Wed, 20 Mar 2019 11:12:47 +0000
Suzuki K Poulose <suzuki.poulose@arm.com> wrote:

> Marc,
> 
> On 20/03/2019 10:35, Marc Zyngier wrote:
> > On Wed, 20 Mar 2019 10:23:39 +0000
> > Suzuki K Poulose <suzuki.poulose@arm.com> wrote:
> > 
> > Hi Suzuki,
> >   
> >> Marc,
> >>
> >> On 20/03/2019 10:11, Marc Zyngier wrote:  
> >>> On Wed, 20 Mar 2019 09:44:38 +0000
> >>> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:  
> >>>    
> >>>> Hi Marc,
> >>>>
> >>>> On 20/03/2019 08:15, Marc Zyngier wrote:  
> >>>>> Hi Suzuki,
> >>>>>
> >>>>> On Tue, 19 Mar 2019 14:11:08 +0000,
> >>>>> Suzuki K Poulose <suzuki.poulose@arm.com> wrote:  
> 
> ...
> 
> >>>>>> +		if (!pmd_thp_or_huge(old_pmd)) {
> >>>>>> +			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
> >>>>>> +			goto retry;  
> >>>>>   >>  
> >>>>>> +		if (!stage2_pud_huge(kvm, old_pud)) {
> >>>>>> +			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);  
> >>>>>   >>  
> >>>> We should really get rid of the S2_P{U/M}D_* definitions, as they are
> >>>> always the same as the host. The only thing that changes is the PGD size
> >>>> which varies according to the IPA and the concatenation.  
> >>>>   >>  
> >> Also what do you think about using  P{M,U}D_* instead of S2_P{M,U}D_*
> >> above ? I could make that change with the respin.  
> > 
> > Given that this is a fix, I'd like it to be as small as obvious as
> > possible, making it easier to backport.
> > 
> > I'm happy to take another patch for 5.2 that will drop the whole S2_P*
> > if we still think that this should be the case (though what I'd really
> > like is to have architectural levels instead of these arbitrary
> > definitions).  
> 
> I only meant the two new instances added above in the patch. Of course, I
> could send something to fix the existing ones.

I'd rather be consistent, and use the same names all over the code.
Once we decide to change, we do it all in one go.

Thanks,

	M.
diff mbox series

Patch

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fce0983..6ad6f19d 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1060,25 +1060,43 @@  static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 {
 	pmd_t *pmd, old_pmd;
 
+retry:
 	pmd = stage2_get_pmd(kvm, cache, addr);
 	VM_BUG_ON(!pmd);
 
 	old_pmd = *pmd;
+	/*
+	 * Multiple vcpus faulting on the same PMD entry, can
+	 * lead to them sequentially updating the PMD with the
+	 * same value. Following the break-before-make
+	 * (pmd_clear() followed by tlb_flush()) process can
+	 * hinder forward progress due to refaults generated
+	 * on missing translations.
+	 *
+	 * Skip updating the page table if the entry is
+	 * unchanged.
+	 */
+	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+		return 0;
+
 	if (pmd_present(old_pmd)) {
 		/*
-		 * Multiple vcpus faulting on the same PMD entry, can
-		 * lead to them sequentially updating the PMD with the
-		 * same value. Following the break-before-make
-		 * (pmd_clear() followed by tlb_flush()) process can
-		 * hinder forward progress due to refaults generated
-		 * on missing translations.
+		 * If we already have PTE level mapping for this block,
+		 * we must unmap it to avoid inconsistent TLB state and
+		 * leaking the table page. We could end up in this situation
+		 * if the memory slot was marked for dirty logging and was
+		 * reverted, leaving PTE level mappings for the pages accessed
+		 * during the period. So, unmap the PTE level mapping for this
+		 * block and retry, as we could have released the upper level
+		 * table in the process.
 		 *
-		 * Skip updating the page table if the entry is
-		 * unchanged.
+		 * Normal THP split/merge follows mmu_notifier callbacks and gets
+		 * handled accordingly.
 		 */
-		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-			return 0;
-
+		if (!pmd_thp_or_huge(old_pmd)) {
+			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+			goto retry;
+		}
 		/*
 		 * Mapping in huge pages should only happen through a
 		 * fault.  If a page is merged into a transparent huge
@@ -1090,8 +1108,7 @@  static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 		 * should become splitting first, unmapped, merged,
 		 * and mapped back in on-demand.
 		 */
-		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
 		pmd_clear(pmd);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
@@ -1107,6 +1124,7 @@  static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
 {
 	pud_t *pudp, old_pud;
 
+retry:
 	pudp = stage2_get_pud(kvm, cache, addr);
 	VM_BUG_ON(!pudp);
 
@@ -1114,16 +1132,25 @@  static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
 
 	/*
 	 * A large number of vcpus faulting on the same stage 2 entry,
-	 * can lead to a refault due to the
-	 * stage2_pud_clear()/tlb_flush(). Skip updating the page
-	 * tables if there is no change.
+	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+	 * Skip updating the page tables if there is no change.
 	 */
 	if (pud_val(old_pud) == pud_val(*new_pudp))
 		return 0;
 
 	if (stage2_pud_present(kvm, old_pud)) {
-		stage2_pud_clear(kvm, pudp);
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		/*
+		 * If we already have table level mapping for this block, unmap
+		 * the range for this block and retry.
+		 */
+		if (!stage2_pud_huge(kvm, old_pud)) {
+			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+			goto retry;
+		} else {
+			WARN_ON_ONCE(pud_pfn(old_pud) != pud_pfn(*new_pudp));
+			stage2_pud_clear(kvm, pudp);
+			kvm_tlb_flush_vmid_ipa(kvm, addr);
+		}
 	} else {
 		get_page(virt_to_page(pudp));
 	}