
[v9,4/4] arm: ARMv7 dirty page logging 2nd stage page fault handling support

Message ID 1406249768-25315-5-git-send-email-m.smarduch@samsung.com (mailing list archive)
State New, archived

Commit Message

Mario Smarduch July 25, 2014, 12:56 a.m. UTC
This patch adds support for handling 2nd stage page faults during migration:
it disables faulting in huge pages and dissolves existing huge pages into
page tables. In case migration is canceled, huge pages will be used again.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm/kvm/mmu.c |   31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

Comments

Christoffer Dall Aug. 11, 2014, 7:13 p.m. UTC | #1
On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
> This patch adds support for handling 2nd stage page faults during migration,
> it disables faulting in huge pages, and dissolves huge pages to page tables.
> In case migration is canceled huge pages will be used again.
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>  arch/arm/kvm/mmu.c |   31 +++++++++++++++++++++++++------
>  1 file changed, 25 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index ca84331..a17812a 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  }
>  
>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> +			  phys_addr_t addr, const pte_t *new_pte, bool iomap,
> +			  bool logging_active)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, old_pte;
> @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>  		return 0;
>  	}
>  
> +	/*
> +	 * While dirty memory logging, clear PMD entry for huge page and split
> +	 * into smaller pages, to track dirty memory at page granularity.
> +	 */
> +	if (logging_active && kvm_pmd_huge(*pmd)) {
> +		phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT;
> +		clear_pmd_entry(kvm, pmd, ipa);

clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is
definitely not the right thing to call.
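
(For context: on a tree containing the unmap_range refactor, one way to
dissolve the huge PMD without tripping that VM_BUG_ON is a small dedicated
helper along these lines; this is only a sketch, not part of the posted
patch, though a very similar stage2_dissolve_pmd() later went upstream:)

static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
	if (!kvm_pmd_huge(*pmd))
		return;

	/* Drop the block mapping and invalidate the stage-2 TLB entry. */
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	/* Release the reference the huge mapping held on the PMD page. */
	put_page(virt_to_page(pmd));
}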

> +	}
> +
>  	/* Create stage-2 page mappings - Level 2 */
>  	if (pmd_none(*pmd)) {
>  		if (!cache)
> @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  		if (ret)
>  			goto out;
>  		spin_lock(&kvm->mmu_lock);
> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> +		ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false);
>  		spin_unlock(&kvm->mmu_lock);
>  		if (ret)
>  			goto out;
> @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>  	struct vm_area_struct *vma;
>  	pfn_t pfn;
> +	/* Get logging status, if dirty_bitmap is not NULL then logging is on */
> +	#ifdef CONFIG_ARM
> +		bool logging_active = !!memslot->dirty_bitmap;
> +	#else
> +		bool logging_active = false;
> +	#endif

can you make this an inline in the header files for now please?
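
(A sketch of what such an inline could look like; the helper name and the
exact header placement below are assumptions, nothing in this thread fixes
them. In arch/arm/include/asm/kvm_mmu.h:)

/* Logging is considered active when the memslot has a dirty bitmap. */
static inline bool kvm_get_logging_state(struct kvm_memory_slot *memslot)
{
	return !!memslot->dirty_bitmap;
}

(with an arm64 counterpart that simply returns false, replacing the
#ifdef CONFIG_ARM block quoted above.)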

>  
>  	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>  	down_read(&current->mm->mmap_sem);
>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
> -	if (is_vm_hugetlb_page(vma)) {
> +	if (is_vm_hugetlb_page(vma) && !logging_active) {
>  		hugetlb = true;
>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>  	} else {
> @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> -	if (!hugetlb && !force_pte)
> +	if (!hugetlb && !force_pte && !logging_active)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
>  	if (hugetlb) {
> @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			kvm_set_pfn_dirty(pfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false,
> +					logging_active);
>  	}
>  
> +	if (write_fault)
> +		mark_page_dirty(kvm, gfn);
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>  {
>  	pte_t *pte = (pte_t *)data;
>  
> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);

why is logging never active if we are called from MMU notifiers?

>  }
>  
>  
> -- 
> 1.7.9.5
> 

Thanks,
-Christoffer
Mario Smarduch Aug. 12, 2014, 1:25 a.m. UTC | #2
On 08/11/2014 12:13 PM, Christoffer Dall wrote:
> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
>> This patch adds support for handling 2nd stage page faults during migration,
>> it disables faulting in huge pages, and dissolves huge pages to page tables.
>> In case migration is canceled huge pages will be used again.
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm/kvm/mmu.c |   31 +++++++++++++++++++++++++------
>>  1 file changed, 25 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index ca84331..a17812a 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>  }
>>  
>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>> +			  phys_addr_t addr, const pte_t *new_pte, bool iomap,
>> +			  bool logging_active)
>>  {
>>  	pmd_t *pmd;
>>  	pte_t *pte, old_pte;
>> @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>  		return 0;
>>  	}
>>  
>> +	/*
>> +	 * While dirty memory logging, clear PMD entry for huge page and split
>> +	 * into smaller pages, to track dirty memory at page granularity.
>> +	 */
>> +	if (logging_active && kvm_pmd_huge(*pmd)) {
>> +		phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT;
>> +		clear_pmd_entry(kvm, pmd, ipa);
> 
> clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is
> definitely not the right thing to call.

I don't see that in 3.15-rc1/rc4:

static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
        if (kvm_pmd_huge(*pmd)) {
                pmd_clear(pmd);
                kvm_tlb_flush_vmid_ipa(kvm, addr);
        } else {
                  [....]
}

I thought the purpose of this function was to clear the PMD entry. I also
ran hundreds of tests with no problems. Hmmm, confused.

> 
>> +	}
>> +
>>  	/* Create stage-2 page mappings - Level 2 */
>>  	if (pmd_none(*pmd)) {
>>  		if (!cache)
>> @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>  		if (ret)
>>  			goto out;
>>  		spin_lock(&kvm->mmu_lock);
>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false);
>>  		spin_unlock(&kvm->mmu_lock);
>>  		if (ret)
>>  			goto out;
>> @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>>  	struct vm_area_struct *vma;
>>  	pfn_t pfn;
>> +	/* Get logging status, if dirty_bitmap is not NULL then logging is on */
>> +	#ifdef CONFIG_ARM
>> +		bool logging_active = !!memslot->dirty_bitmap;
>> +	#else
>> +		bool logging_active = false;
>> +	#endif
> 
> can you make this an inline in the header files for now please?

Yes definitely.

> 
>>  
>>  	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
>>  	if (fault_status == FSC_PERM && !write_fault) {
>> @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>  	down_read(&current->mm->mmap_sem);
>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>> -	if (is_vm_hugetlb_page(vma)) {
>> +	if (is_vm_hugetlb_page(vma) && !logging_active) {
>>  		hugetlb = true;
>>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>>  	} else {
>> @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	spin_lock(&kvm->mmu_lock);
>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>  		goto out_unlock;
>> -	if (!hugetlb && !force_pte)
>> +	if (!hugetlb && !force_pte && !logging_active)
>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>  
>>  	if (hugetlb) {
>> @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			kvm_set_pfn_dirty(pfn);
>>  		}
>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
>> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false,
>> +					logging_active);
>>  	}
>>  
>> +	if (write_fault)
>> +		mark_page_dirty(kvm, gfn);
>>  
>>  out_unlock:
>>  	spin_unlock(&kvm->mmu_lock);
>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>  {
>>  	pte_t *pte = (pte_t *)data;
>>  
>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
> 
> why is logging never active if we are called from MMU notifiers?

MMU notifiers update sptes, but I don't see how these updates
can result in guest dirty pages. Also, guest pages are marked dirty
from the 2nd stage page fault handlers (based on searching through the code).

> 
>>  }
>>  
>>  
>> -- 
>> 1.7.9.5
>>
> 
> Thanks,
> -Christoffer
> 

Christoffer Dall Aug. 12, 2014, 9:50 a.m. UTC | #3
On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote:
> On 08/11/2014 12:13 PM, Christoffer Dall wrote:
> > On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
> >> This patch adds support for handling 2nd stage page faults during migration,
> >> it disables faulting in huge pages, and dissolves huge pages to page tables.
> >> In case migration is canceled huge pages will be used again.
> >>
> >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> >> ---
> >>  arch/arm/kvm/mmu.c |   31 +++++++++++++++++++++++++------
> >>  1 file changed, 25 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >> index ca84331..a17812a 100644
> >> --- a/arch/arm/kvm/mmu.c
> >> +++ b/arch/arm/kvm/mmu.c
> >> @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
> >>  }
> >>  
> >>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> >> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> >> +			  phys_addr_t addr, const pte_t *new_pte, bool iomap,
> >> +			  bool logging_active)
> >>  {
> >>  	pmd_t *pmd;
> >>  	pte_t *pte, old_pte;
> >> @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> >>  		return 0;
> >>  	}
> >>  
> >> +	/*
> >> +	 * While dirty memory logging, clear PMD entry for huge page and split
> >> +	 * into smaller pages, to track dirty memory at page granularity.
> >> +	 */
> >> +	if (logging_active && kvm_pmd_huge(*pmd)) {
> >> +		phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT;
> >> +		clear_pmd_entry(kvm, pmd, ipa);
> > 
> > clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is
> > definitely not the right thing to call.
> 
> I don't see that in 3.15rc1/rc4 -
> 
> static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
> {
>         if (kvm_pmd_huge(*pmd)) {
>                 pmd_clear(pmd);
>                 kvm_tlb_flush_vmid_ipa(kvm, addr);
>         } else {
>                   [....]
> }
> 
> I thought the purpose of this function was to clear PMD entry. Also
> ran hundreds of tests no problems. Hmmm confused.
> 

You need to rebase on kvm/next or linus/master, something that contains:

4f853a7 arm/arm64: KVM: Fix and refactor unmap_range
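
(After that commit, clear_pmd_entry() only deals with table entries and
reads roughly as follows; paraphrased from the refactored code, so treat
it as a sketch:)

static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);

	/* Huge PMDs are now torn down in unmap_pmds() instead. */
	VM_BUG_ON(kvm_pmd_huge(*pmd));
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}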

> > 
> >> +	}
> >> +
> >>  	/* Create stage-2 page mappings - Level 2 */
> >>  	if (pmd_none(*pmd)) {
> >>  		if (!cache)
> >> @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> >>  		if (ret)
> >>  			goto out;
> >>  		spin_lock(&kvm->mmu_lock);
> >> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> >> +		ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false);
> >>  		spin_unlock(&kvm->mmu_lock);
> >>  		if (ret)
> >>  			goto out;
> >> @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> >>  	struct vm_area_struct *vma;
> >>  	pfn_t pfn;
> >> +	/* Get logging status, if dirty_bitmap is not NULL then logging is on */
> >> +	#ifdef CONFIG_ARM
> >> +		bool logging_active = !!memslot->dirty_bitmap;
> >> +	#else
> >> +		bool logging_active = false;
> >> +	#endif
> > 
> > can you make this an inline in the header files for now please?
> 
> Yes definitely.
> 
> > 
> >>  
> >>  	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
> >>  	if (fault_status == FSC_PERM && !write_fault) {
> >> @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
> >>  	down_read(&current->mm->mmap_sem);
> >>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
> >> -	if (is_vm_hugetlb_page(vma)) {
> >> +	if (is_vm_hugetlb_page(vma) && !logging_active) {
> >>  		hugetlb = true;
> >>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
> >>  	} else {
> >> @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	spin_lock(&kvm->mmu_lock);
> >>  	if (mmu_notifier_retry(kvm, mmu_seq))
> >>  		goto out_unlock;
> >> -	if (!hugetlb && !force_pte)
> >> +	if (!hugetlb && !force_pte && !logging_active)
> >>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
> >>  
> >>  	if (hugetlb) {
> >> @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  			kvm_set_pfn_dirty(pfn);
> >>  		}
> >>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
> >> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
> >> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false,
> >> +					logging_active);
> >>  	}
> >>  
> >> +	if (write_fault)
> >> +		mark_page_dirty(kvm, gfn);
> >>  
> >>  out_unlock:
> >>  	spin_unlock(&kvm->mmu_lock);
> >> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
> >>  {
> >>  	pte_t *pte = (pte_t *)data;
> >>  
> >> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> >> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
> > 
> > why is logging never active if we are called from MMU notifiers?
> 
> mmu notifiers update sptes, but I don't see how these updates
> can result in guest dirty pages. Also guest pages are marked dirty
> from 2nd stage page fault handlers (searching through the code).
> 
Ok, then add:

/*
 * We can always call stage2_set_pte with logging_active == false,
 * because MMU notifiers will have unmapped a huge PMD before calling
 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore
 * stage2_set_pte() never needs to clear out a huge PMD through this
 * calling path.
 */
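
(With that comment applied, the handler would read roughly as follows,
combining the hunk quoted above with the suggested comment:)

static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pte_t *pte = (pte_t *)data;

	/*
	 * We can always call stage2_set_pte with logging_active == false,
	 * because MMU notifiers will have unmapped a huge PMD before
	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva())
	 * and therefore stage2_set_pte() never needs to clear out a huge
	 * PMD through this calling path.
	 */
	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
}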

Thanks,
-Christoffer
Mario Smarduch Aug. 13, 2014, 1:27 a.m. UTC | #4
On 08/12/2014 02:50 AM, Christoffer Dall wrote:
> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote:
>> On 08/11/2014 12:13 PM, Christoffer Dall wrote:
>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
>>>> This patch adds support for handling 2nd stage page faults during migration,
>>>> it disables faulting in huge pages, and dissolves huge pages to page tables.
>>>> In case migration is canceled huge pages will be used again.
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>>>> ---
>>>>  arch/arm/kvm/mmu.c |   31 +++++++++++++++++++++++++------
>>>>  1 file changed, 25 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index ca84331..a17812a 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>>>  }
>>>>  
>>>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>>>> +			  phys_addr_t addr, const pte_t *new_pte, bool iomap,
>>>> +			  bool logging_active)
>>>>  {
>>>>  	pmd_t *pmd;
>>>>  	pte_t *pte, old_pte;
>>>> @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>>  		return 0;
>>>>  	}
>>>>  
>>>> +	/*
>>>> +	 * While dirty memory logging, clear PMD entry for huge page and split
>>>> +	 * into smaller pages, to track dirty memory at page granularity.
>>>> +	 */
>>>> +	if (logging_active && kvm_pmd_huge(*pmd)) {
>>>> +		phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT;
>>>> +		clear_pmd_entry(kvm, pmd, ipa);
>>>
>>> clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is
>>> definitely not the right thing to call.
>>
>> I don't see that in 3.15rc1/rc4 -
>>
>> static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
>> {
>>         if (kvm_pmd_huge(*pmd)) {
>>                 pmd_clear(pmd);
>>                 kvm_tlb_flush_vmid_ipa(kvm, addr);
>>         } else {
>>                   [....]
>> }
>>
>> I thought the purpose of this function was to clear PMD entry. Also
>> ran hundreds of tests no problems. Hmmm confused.
>>
> 
> You need to rebase on kvm/next or linus/master, something that contains:
> 
> 4f853a7 arm/arm64: KVM: Fix and refactor unmap_range
> 
>>>
>>>> +	}
>>>> +
>>>>  	/* Create stage-2 page mappings - Level 2 */
>>>>  	if (pmd_none(*pmd)) {
>>>>  		if (!cache)
>>>> @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>>>  		if (ret)
>>>>  			goto out;
>>>>  		spin_lock(&kvm->mmu_lock);
>>>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>>>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false);
>>>>  		spin_unlock(&kvm->mmu_lock);
>>>>  		if (ret)
>>>>  			goto out;
>>>> @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>>>>  	struct vm_area_struct *vma;
>>>>  	pfn_t pfn;
>>>> +	/* Get logging status, if dirty_bitmap is not NULL then logging is on */
>>>> +	#ifdef CONFIG_ARM
>>>> +		bool logging_active = !!memslot->dirty_bitmap;
>>>> +	#else
>>>> +		bool logging_active = false;
>>>> +	#endif
>>>
>>> can you make this an inline in the header files for now please?
>>
>> Yes definitely.
>>
>>>
>>>>  
>>>>  	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
>>>>  	if (fault_status == FSC_PERM && !write_fault) {
>>>> @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>>>  	down_read(&current->mm->mmap_sem);
>>>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>>>> -	if (is_vm_hugetlb_page(vma)) {
>>>> +	if (is_vm_hugetlb_page(vma) && !logging_active) {
>>>>  		hugetlb = true;
>>>>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>>>>  	} else {
>>>> @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>>>  		goto out_unlock;
>>>> -	if (!hugetlb && !force_pte)
>>>> +	if (!hugetlb && !force_pte && !logging_active)
>>>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>>>  
>>>>  	if (hugetlb) {
>>>> @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  			kvm_set_pfn_dirty(pfn);
>>>>  		}
>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
>>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
>>>> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false,
>>>> +					logging_active);
>>>>  	}
>>>>  
>>>> +	if (write_fault)
>>>> +		mark_page_dirty(kvm, gfn);
>>>>  
>>>>  out_unlock:
>>>>  	spin_unlock(&kvm->mmu_lock);
>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>>>  {
>>>>  	pte_t *pte = (pte_t *)data;
>>>>  
>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
>>>
>>> why is logging never active if we are called from MMU notifiers?
>>
>> mmu notifiers update sptes, but I don't see how these updates
>> can result in guest dirty pages. Also guest pages are marked dirty
>> from 2nd stage page fault handlers (searching through the code).
>>
> Ok, then add:
> 
> /*
>  * We can always call stage2_set_pte with logging_active == false,
>  * because MMU notifiers will have unmapped a huge PMD before calling
>  * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore
>  * stage2_set_pte() never needs to clear out a huge PMD through this
>  * calling path.
>  */

So here, on a permission change to primary PTEs, the kernel first
invalidates the related s2ptes, followed by ->change_pte calls to
synchronize the s2ptes. As a consequence of the invalidation we unmap
huge PMDs, if a page falls in that range.

Is the comment meant to point out the use of the logging flag under
various scenarios?

Should I add comments on flag use in other places as well?

> 
> Thanks,
> -Christoffer
> 

Christoffer Dall Aug. 13, 2014, 7:30 a.m. UTC | #5
On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote:
> On 08/12/2014 02:50 AM, Christoffer Dall wrote:
> > On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote:
> >> On 08/11/2014 12:13 PM, Christoffer Dall wrote:
> >>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:

[...]

> >>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
> >>>>  {
> >>>>  	pte_t *pte = (pte_t *)data;
> >>>>  
> >>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> >>>> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
> >>>
> >>> why is logging never active if we are called from MMU notifiers?
> >>
> >> mmu notifiers update sptes, but I don't see how these updates
> >> can result in guest dirty pages. Also guest pages are marked dirty
> >> from 2nd stage page fault handlers (searching through the code).
> >>
> > Ok, then add:
> > 
> > /*
> >  * We can always call stage2_set_pte with logging_active == false,
> >  * because MMU notifiers will have unmapped a huge PMD before calling
> >  * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore
> >  * stage2_set_pte() never needs to clear out a huge PMD through this
> >  * calling path.
> >  */
> 
> So here on permission change to primary pte's kernel first invalidates
> related s2ptes followed by ->change_pte calls to synchronize s2ptes. As
> consequence of invalidation we unmap huge PMDs, if a page falls in that
> range.
> 
> Is the comment to point out use of logging flag under various scenarios?

The comment is there because, when you look at this function, it is not
obvious why we pass logging_active=false even though logging may actually
be active.  This could suggest that the parameter to stage2_set_pte()
should be named differently (break_huge_pmds, or something like that),
but we can also be satisfied with the comment.
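
(For comparison, the alternative naming floated here would make the
prototype read as below; a sketch of the suggestion only:)

static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte, bool iomap,
			  bool break_huge_pmds);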

> 
> Should I add comments on flag use in other places as well?
> 

It's always a judgement call.  I didn't find it necessary to put a
comment elsewhere because I think it's pretty obvious that we would
never care about logging writes to device regions.

However, this made me think: are we making sure that we are not marking
device mappings as read-only in the wp_range functions?  I think it's
quite bad if we mark the VCPU interface as read-only, for example.

-Christoffer
Mario Smarduch Aug. 14, 2014, 1:20 a.m. UTC | #6
On 08/13/2014 12:30 AM, Christoffer Dall wrote:
> On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote:
>> On 08/12/2014 02:50 AM, Christoffer Dall wrote:
>>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote:
>>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote:
>>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
> 
> [...]
> 
>>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>>>>>  {
>>>>>>  	pte_t *pte = (pte_t *)data;
>>>>>>  
>>>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>>>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
>>>>>
>>>>> why is logging never active if we are called from MMU notifiers?
>>>>
>>>> mmu notifiers update sptes, but I don't see how these updates
>>>> can result in guest dirty pages. Also guest pages are marked dirty
>>>> from 2nd stage page fault handlers (searching through the code).
>>>>
>>> Ok, then add:
>>>
>>> /*
>>>  * We can always call stage2_set_pte with logging_active == false,
>>>  * because MMU notifiers will have unmapped a huge PMD before calling
>>>  * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore
>>>  * stage2_set_pte() never needs to clear out a huge PMD through this
>>>  * calling path.
>>>  */
>>
>> So here on permission change to primary pte's kernel first invalidates
>> related s2ptes followed by ->change_pte calls to synchronize s2ptes. As
>> consequence of invalidation we unmap huge PMDs, if a page falls in that
>> range.
>>
>> Is the comment to point out use of logging flag under various scenarios?
> 
> The comment is because when you look at this function it is not obvious
> why we pass logging_active=false, despite logging may actually be
> active.  This could suggest that the parameter to stage2_set_pte()
> should be named differently (break_huge_pmds) or something like that,
> but we can also be satisfied with the comment.

Ok, I see; I was thinking you thought it was breaking something.
Yeah, I'll add the comment. In reality this is another use case
where a PMD may need to be converted to a page table, so it makes
sense to contrast the use cases.

> 
>>
>> Should I add comments on flag use in other places as well?
>>
> 
> It's always a judgement call.  I didn't find it necessarry to put a
> comment elsewhere because I think it's pretty obivous that we would
> never care about logging writes to device regions.
> 
> However, this made me think, are we making sure that we are not marking
> device mappings as read-only in the wp_range functions?  I think it's

The KVM_SET_USER_MEMORY_REGION ioctl doesn't check the type of region
being installed/operated on (KVM_MEM_LOG_DIRTY_PAGES). In the case of
QEMU these regions wind up in KVMState->KVMSlot[]; when
memory_region_add_subregion() is called, the KVM listener installs it.
For migration and dirty page logging QEMU walks the KVMSlot[] array.

For QEMU VFIO (PCI), mmap()ing a BAR of type IORESOURCE_MEM
causes the memory region to be added to KVMState->KVMSlot[].
In that case it's possible to walk KVMState->KVMSlot[], issue
the ioctl, and come across a device mapping with normal memory and
WP its s2ptes (VFIO sets unmigratable state though).

But I'm not sure what's there to stop someone from calling the ioctl
and installing a region with device memory type. Most likely, though,
if you installed that kind of region, migration would be disabled.

But just for logging use, not checking the memory type could be an issue.

> quite bad if we mark the VCPU interface as read-only for example.
> 
> -Christoffer
> 

Mario Smarduch Aug. 15, 2014, 12:01 a.m. UTC | #7
On 08/13/2014 06:20 PM, Mario Smarduch wrote:
> On 08/13/2014 12:30 AM, Christoffer Dall wrote:
>> On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote:
>>> On 08/12/2014 02:50 AM, Christoffer Dall wrote:
>>>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote:
>>>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote:
>>>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
>>
>> [...]
>>
>>>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>>>>>>  {
>>>>>>>  	pte_t *pte = (pte_t *)data;
>>>>>>>  
>>>>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>>>>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
>>>>>>
>>>>>> why is logging never active if we are called from MMU notifiers?
>>>>>
[...]

>> The comment is because when you look at this function it is not obvious
>> why we pass logging_active=false, despite logging may actually be
>> active.  This could suggest that the parameter to stage2_set_pte()
>> should be named differently (break_huge_pmds) or something like that,
>> but we can also be satisfied with the comment.
> 
> Ok I see, I was thinking you thought it was breaking something.
> Yeah I'll add the comment, in reality this is another use case
> where a PMD may need to be converted to page table so it makes sense
> to contrast use cases.
> 
>>
>>>
>>> Should I add comments on flag use in other places as well?
>>>
>>
>> It's always a judgement call.  I didn't find it necessarry to put a
>> comment elsewhere because I think it's pretty obivous that we would
>> never care about logging writes to device regions.
>>
>> However, this made me think, are we making sure that we are not marking
>> device mappings as read-only in the wp_range functions?  I think it's
> 
> KVM_SET_USER_MEMORY_REGION ioctl doesn't check type of region being
> installed/operated on (KVM_MEM_LOG_DIRTY_PAGES), in case of QEMU
> these regions wind up in KVMState->KVMSlot[], when
> memory_region_add_subregion() is called KVM listener installs it.
> For migration and dirty page logging QEMU walks the KVMSlot[] array.
> 
> For QEMU VFIO (PCI) mmap()ing BAR of type IORESOURCE_MEM,
> causes the memory region to be added to KVMState->KVMSlot[].
> In that case it's possible to walk KVMState->KVMSlot[] issue
> the ioctl and  come across  a device mapping with normal memory and
> WP it's s2ptes (VFIO sets unmigrateble state though).
> 
> But I'm not sure what's there to stop someone calling the ioctl and
> install a region with device memory type. Most likely though if you
> installed that kind of region migration would be disabled.
> 
> But just for logging use not checking memory type could be an issue.

Clarifying the above a bit: KVM structures like kvm_run or the vgic
don't go through the KVM_SET_USER_MEMORY_REGION interface (can't think
of any other KVM structures). VFIO uses KVM_SET_USER_MEMORY_REGION, and
user_mem_abort() should resolve the fault. I recall the VFIO patch
series adds that support.

It should be ok to write protect MMIO regions installed through
KVM_SET_USER_MEMORY_REGION, although at this time I don't know of a
use case for logging without migration, so this may not be an issue
at all.

> 
>> quite bad if we mark the VCPU interface as read-only for example.
>>
>> -Christoffer
>>
> 

Christoffer Dall Aug. 18, 2014, 12:51 p.m. UTC | #8
On Wed, Aug 13, 2014 at 06:20:19PM -0700, Mario Smarduch wrote:
> On 08/13/2014 12:30 AM, Christoffer Dall wrote:
> > On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote:
> >> On 08/12/2014 02:50 AM, Christoffer Dall wrote:
> >>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote:
> >>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote:
> >>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
> > 
> > [...]
> > 
> >>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
> >>>>>>  {
> >>>>>>  	pte_t *pte = (pte_t *)data;
> >>>>>>  
> >>>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> >>>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
> >>>>>
> >>>>> why is logging never active if we are called from MMU notifiers?
> >>>>
> >>>> mmu notifiers update sptes, but I don't see how these updates
> >>>> can result in guest dirty pages. Also guest pages are marked dirty
> >>>> from 2nd stage page fault handlers (searching through the code).
> >>>>
> >>> Ok, then add:
> >>>
> >>> /*
> >>>  * We can always call stage2_set_pte with logging_active == false,
> >>>  * because MMU notifiers will have unmapped a huge PMD before calling
> >>>  * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore
> >>>  * stage2_set_pte() never needs to clear out a huge PMD through this
> >>>  * calling path.
> >>>  */
> >>
> >> So here on permission change to primary pte's kernel first invalidates
> >> related s2ptes followed by ->change_pte calls to synchronize s2ptes. As
> >> consequence of invalidation we unmap huge PMDs, if a page falls in that
> >> range.
> >>
> >> Is the comment to point out use of logging flag under various scenarios?
> > 
> > The comment is because when you look at this function it is not obvious
> > why we pass logging_active=false, despite logging may actually be
> > active.  This could suggest that the parameter to stage2_set_pte()
> > should be named differently (break_huge_pmds) or something like that,
> > but we can also be satisfied with the comment.
> 
> Ok I see, I was thinking you thought it was breaking something.
> Yeah I'll add the comment, in reality this is another use case
> where a PMD may need to be converted to page table so it makes sense
> to contrast use cases.
> 

The hidden knowledge is that MMU notifiers will ensure a huge PMD gets
unmapped before trying to change the physical backing of the underlying
PTEs, so it's a gigantic kernel bug if this gets called on something
mapped with a huge PMD.
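
(The ordering guarantee enters KVM through the ->change_pte() notifier;
on the ARM side the path is roughly the following sketch of the existing
kvm_set_spte_hva(), minus tracing:)

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return;

	/*
	 * The primary MM has already unmapped any huge PMD covering hva,
	 * so the handler below only ever replaces PTE-level mappings.
	 */
	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}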


> > 
> >>
> >> Should I add comments on flag use in other places as well?
> >>
> > 
> > It's always a judgement call.  I didn't find it necessarry to put a
> > comment elsewhere because I think it's pretty obivous that we would
> > never care about logging writes to device regions.
> > 
> > However, this made me think, are we making sure that we are not marking
> > device mappings as read-only in the wp_range functions?  I think it's
> 
> KVM_SET_USER_MEMORY_REGION ioctl doesn't check type of region being
> installed/operated on (KVM_MEM_LOG_DIRTY_PAGES), in case of QEMU
> these regions wind up in KVMState->KVMSlot[], when
> memory_region_add_subregion() is called KVM listener installs it.
> For migration and dirty page logging QEMU walks the KVMSlot[] array.
> 
> For QEMU VFIO (PCI) mmap()ing BAR of type IORESOURCE_MEM,
> causes the memory region to be added to KVMState->KVMSlot[].
> In that case it's possible to walk KVMState->KVMSlot[] issue
> the ioctl and  come across  a device mapping with normal memory and
> WP it's s2ptes (VFIO sets unmigrateble state though).
> 
> But I'm not sure what's there to stop someone calling the ioctl and
> install a region with device memory type. Most likely though if you
> installed that kind of region migration would be disabled.
> 
> But just for logging use not checking memory type could be an issue.
> 
I forgot that the current write-protecting is limited to the memory
region boundaries, so everything should be fine.

If user-space write-protects device memory regions, the worst
consequence is that it breaks the guest, but that's its own
responsibility, so I don't think you need to change anything.
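
(The boundary limiting referred to here is visible in the series'
write-protect entry point, which only walks the memslot's own IPA range;
shaped roughly like this sketch taken from the companion patches, not
this one:)

void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	/* Only s2ptes inside [start, end) are made read-only. */
	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}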

-Christoffer
Mario Smarduch Aug. 18, 2014, 5:42 p.m. UTC | #9
On 08/18/2014 05:51 AM, Christoffer Dall wrote:
> On Wed, Aug 13, 2014 at 06:20:19PM -0700, Mario Smarduch wrote:
>> On 08/13/2014 12:30 AM, Christoffer Dall wrote:
>>> On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote:
>>>> On 08/12/2014 02:50 AM, Christoffer Dall wrote:
>>>>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote:
>>>>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote:
>>>>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote:
>>>
>>> [...]
>>>
>>>>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)

>>
>> Ok I see, I was thinking you thought it was breaking something.
>> Yeah I'll add the comment, in reality this is another use case
>> where a PMD may need to be converted to page table so it makes sense
>> to contrast use cases.
>>
> 
> the hidden knowledge is that MMU notifiers will ensure a huge PMD gets
> unmapped before trying to change the physical backing of the underlying
> PTEs, so it's a gigantic kernel bug if this gets called on something
> mapped with a huge PMD.
> 

That's a good way of putting it; luckily I was already aware of that
from looking at some other features.

> 
>>>
>>>>
>>>> Should I add comments on flag use in other places as well?
>>>>
>>>
>>> It's always a judgement call.  I didn't find it necessarry to put a
>>> comment elsewhere because I think it's pretty obivous that we would
>>> never care about logging writes to device regions.
>>>
>>> However, this made me think, are we making sure that we are not marking
>>> device mappings as read-only in the wp_range functions?  I think it's
>>
>> KVM_SET_USER_MEMORY_REGION ioctl doesn't check type of region being
>> installed/operated on (KVM_MEM_LOG_DIRTY_PAGES), in case of QEMU
>> these regions wind up in KVMState->KVMSlot[], when
>> memory_region_add_subregion() is called KVM listener installs it.
>> For migration and dirty page logging QEMU walks the KVMSlot[] array.
>>
>> For QEMU VFIO (PCI) mmap()ing BAR of type IORESOURCE_MEM,
>> causes the memory region to be added to KVMState->KVMSlot[].
>> In that case it's possible to walk KVMState->KVMSlot[] issue
>> the ioctl and  come across  a device mapping with normal memory and
>> WP it's s2ptes (VFIO sets unmigrateble state though).
>>
>> But I'm not sure what's there to stop someone calling the ioctl and
>> install a region with device memory type. Most likely though if you
>> installed that kind of region migration would be disabled.
>>
>> But just for logging use not checking memory type could be an issue.
>>
> I forgot that the current write-protect'ing is limited to the memory
> region boundaries, so everything should be fine.

I looked through this way back, but it was worth revisiting.
> 
> If user-space write-protects device memory regions, the worst
> consequence is that it breaks the guest, but that's its own
> responsibility, so I don't think you need to change anything.

Thanks for the detailed review. I'll go off now, rebase, and make
the needed changes.

> 
> -Christoffer
> 


Patch

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index ca84331..a17812a 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -642,7 +642,8 @@  static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+			  phys_addr_t addr, const pte_t *new_pte, bool iomap,
+			  bool logging_active)
 {
 	pmd_t *pmd;
 	pte_t *pte, old_pte;
@@ -657,6 +658,15 @@  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		return 0;
 	}
 
+	/*
+	 * While dirty memory logging, clear PMD entry for huge page and split
+	 * into smaller pages, to track dirty memory at page granularity.
+	 */
+	if (logging_active && kvm_pmd_huge(*pmd)) {
+		phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT;
+		clear_pmd_entry(kvm, pmd, ipa);
+	}
+
 	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
@@ -709,7 +719,7 @@  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		if (ret)
 			goto out;
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			goto out;
@@ -926,6 +936,12 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	struct vm_area_struct *vma;
 	pfn_t pfn;
+	/* Get logging status, if dirty_bitmap is not NULL then logging is on */
+	#ifdef CONFIG_ARM
+		bool logging_active = !!memslot->dirty_bitmap;
+	#else
+		bool logging_active = false;
+	#endif
 
 	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -936,7 +952,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
-	if (is_vm_hugetlb_page(vma)) {
+	if (is_vm_hugetlb_page(vma) && !logging_active) {
 		hugetlb = true;
 		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
 	} else {
@@ -979,7 +995,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
-	if (!hugetlb && !force_pte)
+	if (!hugetlb && !force_pte && !logging_active)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
 	if (hugetlb) {
@@ -998,9 +1014,12 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false,
+					logging_active);
 	}
 
+	if (write_fault)
+		mark_page_dirty(kvm, gfn);
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
@@ -1151,7 +1170,7 @@  static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
 	pte_t *pte = (pte_t *)data;
 
-	stage2_set_pte(kvm, NULL, gpa, pte, false);
+	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
 }