
[RFC,2/7] KVM: arm64: Set DBM bit of PTEs if hw DBM enabled

Message ID 20200525112406.28224-3-zhukeqian1@huawei.com (mailing list archive)
State New, archived
Series kvm: arm64: Support stage2 hardware DBM

Commit Message

zhukeqian May 25, 2020, 11:24 a.m. UTC
In user_mem_abort(), for the normal case (mem_type is PAGE_S2), set the
DBM bit of PTEs if hardware DBM is enabled. We also check and set the
DBM bit while write-protecting PTEs, so that it still works if we miss
some cases.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Peng Liang <liangpeng10@huawei.com>
---
 arch/arm64/include/asm/pgtable-prot.h |  1 +
 virt/kvm/arm/mmu.c                    | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

Comments

Catalin Marinas May 26, 2020, 11:49 a.m. UTC | #1
On Mon, May 25, 2020 at 07:24:01PM +0800, Keqian Zhu wrote:
> diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
> index 1305e28225fc..f9910ba2afd8 100644
> --- a/arch/arm64/include/asm/pgtable-prot.h
> +++ b/arch/arm64/include/asm/pgtable-prot.h
> @@ -79,6 +79,7 @@ extern bool arm64_use_ng_mappings;
>  	})
>  
>  #define PAGE_S2			__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
> +#define PAGE_S2_DBM		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN | PTE_DBM)

You don't need a new page permission (see below).

>  #define PAGE_S2_DEVICE		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
>  
>  #define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index e3b9ee268823..dc97988eb2e0 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1426,6 +1426,10 @@ static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
>  	pte = pte_offset_kernel(pmd, addr);
>  	do {
>  		if (!pte_none(*pte)) {
> +#ifdef CONFIG_ARM64_HW_AFDBM
> +			if (kvm_hw_dbm_enabled() && !kvm_s2pte_dbm(pte))
> +				kvm_set_s2pte_dbm(pte);
> +#endif
>  			if (!kvm_s2pte_readonly(pte))
>  				kvm_set_s2pte_readonly(pte);
>  		}

Setting the DBM bit is equivalent to marking the page writable. The
actual writable pte bit (S2AP[1] or HAP[2] as we call them in Linux for
legacy reasons) tells you whether the page has been dirtied but it is
still writable if you set DBM. Doing this in stage2_wp_ptes()
practically means that you no longer have read-only pages at S2. There
are several good reasons why you don't want to break this. For example,
the S2 pte may already be read-only for other reasons (CoW).

I think you should only set the DBM bit if the pte was previously
writable. In addition, any permission change to the S2 pte must take
into account the DBM bit and clear it while transferring the dirty
status to the underlying page. I'm not deeply familiar with all these
callbacks into KVM but two such paths are kvm_unmap_hva_range() and
kvm_mmu_notifier_change_pte().
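
Put concretely, a minimal sketch of that suggestion (not the final
patch; it relies on the kvm_hw_dbm_enabled()/kvm_s2pte_dbm()/
kvm_set_s2pte_dbm() helpers introduced earlier in this series) would
arm DBM only on ptes that are currently writable:

static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		/* Only writable ptes need (and may take) the DBM bit. */
		if (!pte_none(*pte) && !kvm_s2pte_readonly(pte)) {
#ifdef CONFIG_ARM64_HW_AFDBM
			/*
			 * Let hardware re-dirty the page instead of
			 * faulting; genuinely read-only ptes (e.g. CoW)
			 * are left untouched.
			 */
			if (kvm_hw_dbm_enabled() && !kvm_s2pte_dbm(pte))
				kvm_set_s2pte_dbm(pte);
#endif
			kvm_set_s2pte_readonly(pte);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
}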


> @@ -1827,7 +1831,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
> -		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
> +		pte_t new_pte;
> +
> +#ifdef CONFIG_ARM64_HW_AFDBM
> +		if (kvm_hw_dbm_enabled() &&
> +		    pgprot_val(mem_type) == pgprot_val(PAGE_S2)) {
> +			mem_type = PAGE_S2_DBM;
> +		}
> +#endif
> +		new_pte = kvm_pfn_pte(pfn, mem_type);
>  
>  		if (writable) {
>  			new_pte = kvm_s2pte_mkwrite(new_pte);

That's wrong here. Basically for any fault you get, you just turn the S2
page writable. The point of DBM is that you don't get write faults at
all if you have a writable page. So, as I said above, only set the DBM
bit if you stored a writable S2 pte (kvm_s2pte_mkwrite()).
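
In other words, a sketch of the corrected hunk (no PAGE_S2_DBM prot;
the DBM bit rides on the writable path, with kvm_set_s2pte_dbm() being
the helper this series adds) might be:

		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);

		if (writable) {
			new_pte = kvm_s2pte_mkwrite(new_pte);
#ifdef CONFIG_ARM64_HW_AFDBM
			/* DBM only ever accompanies a writable S2 pte. */
			if (kvm_hw_dbm_enabled())
				kvm_set_s2pte_dbm(&new_pte);
#endif
			mark_page_dirty(kvm, gfn);
		}
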
zhukeqian May 27, 2020, 9:28 a.m. UTC | #2
Hi Catalin,

On 2020/5/26 19:49, Catalin Marinas wrote:
> On Mon, May 25, 2020 at 07:24:01PM +0800, Keqian Zhu wrote:
>> diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
>> index 1305e28225fc..f9910ba2afd8 100644
>> --- a/arch/arm64/include/asm/pgtable-prot.h
>> +++ b/arch/arm64/include/asm/pgtable-prot.h
>> @@ -79,6 +79,7 @@ extern bool arm64_use_ng_mappings;
>>  	})
>>  
>>  #define PAGE_S2			__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
>> +#define PAGE_S2_DBM		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN | PTE_DBM)
> 
> You don't need a new page permission (see below).
> 
>>  #define PAGE_S2_DEVICE		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
>>  
>>  #define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
>> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
>> index e3b9ee268823..dc97988eb2e0 100644
>> --- a/virt/kvm/arm/mmu.c
>> +++ b/virt/kvm/arm/mmu.c
>> @@ -1426,6 +1426,10 @@ static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
>>  	pte = pte_offset_kernel(pmd, addr);
>>  	do {
>>  		if (!pte_none(*pte)) {
>> +#ifdef CONFIG_ARM64_HW_AFDBM
>> +			if (kvm_hw_dbm_enabled() && !kvm_s2pte_dbm(pte))
>> +				kvm_set_s2pte_dbm(pte);
>> +#endif
>>  			if (!kvm_s2pte_readonly(pte))
>>  				kvm_set_s2pte_readonly(pte);
>>  		}
> 
> Setting the DBM bit is equivalent to marking the page writable. The
> actual writable pte bit (S2AP[1] or HAP[2] as we call them in Linux for
> legacy reasons) tells you whether the page has been dirtied but it is
> still writable if you set DBM. Doing this in stage2_wp_ptes()
> practically means that you no longer have read-only pages at S2. There
> are several good reasons why you don't want to break this. For example,
> the S2 pte may already be read-only for other reasons (CoW).
> 
Thanks, your comments help solve the first problem in the cover letter.

> I think you should only set the DBM bit if the pte was previously
> writable. In addition, any permission change to the S2 pte must take
> into account the DBM bit and clear it while transferring the dirty
> status to the underlying page. I'm not deeply familiar with all these
> callbacks into KVM but two such paths are kvm_unmap_hva_range() and
> kvm_mmu_notifier_change_pte().
Yes, I agree.
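
For the dirty-status transfer, a hypothetical helper to call before any
S2 permission change could look like the following (the helper name and
kvm_clear_s2pte_dbm() are illustrative only, not part of this series):

#ifdef CONFIG_ARM64_HW_AFDBM
/*
 * Illustrative sketch: fold hardware-dirty state back into the dirty
 * log and drop DBM before changing S2 permissions (e.g. on the
 * kvm_unmap_hva_range() or kvm_mmu_notifier_change_pte() paths).
 */
static void stage2_transfer_dbm_dirty(struct kvm *kvm, pte_t *pte, gfn_t gfn)
{
	if (!kvm_hw_dbm_enabled() || !kvm_s2pte_dbm(pte))
		return;

	/* With DBM set, a writable pte means the page has been dirtied. */
	if (!kvm_s2pte_readonly(pte))
		mark_page_dirty(kvm, gfn);

	kvm_clear_s2pte_dbm(pte);	/* illustrative clear helper */
}
#endif
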
> 
> 
>> @@ -1827,7 +1831,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  
>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>  	} else {
>> -		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
>> +		pte_t new_pte;
>> +
>> +#ifdef CONFIG_ARM64_HW_AFDBM
>> +		if (kvm_hw_dbm_enabled() &&
>> +		    pgprot_val(mem_type) == pgprot_val(PAGE_S2)) {
>> +			mem_type = PAGE_S2_DBM;
>> +		}
>> +#endif
>> +		new_pte = kvm_pfn_pte(pfn, mem_type);
>>  
>>  		if (writable) {
>>  			new_pte = kvm_s2pte_mkwrite(new_pte);
> 
> That's wrong here. Basically for any fault you get, you just turn the S2
> page writable. The point of DBM is that you don't get write faults at
> all if you have a writable page. So, as I said above, only set the DBM
> bit if you stored a writable S2 pte (kvm_s2pte_mkwrite()).
Yeah, you are right. I will correct it in Patch v1.
> 

Thanks,
Keqian

Patch

diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 1305e28225fc..f9910ba2afd8 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -79,6 +79,7 @@  extern bool arm64_use_ng_mappings;
 	})
 
 #define PAGE_S2			__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
+#define PAGE_S2_DBM		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN | PTE_DBM)
 #define PAGE_S2_DEVICE		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
 
 #define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index e3b9ee268823..dc97988eb2e0 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1426,6 +1426,10 @@  static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
 	pte = pte_offset_kernel(pmd, addr);
 	do {
 		if (!pte_none(*pte)) {
+#ifdef CONFIG_ARM64_HW_AFDBM
+			if (kvm_hw_dbm_enabled() && !kvm_s2pte_dbm(pte))
+				kvm_set_s2pte_dbm(pte);
+#endif
 			if (!kvm_s2pte_readonly(pte))
 				kvm_set_s2pte_readonly(pte);
 		}
@@ -1827,7 +1831,15 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
-		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
+		pte_t new_pte;
+
+#ifdef CONFIG_ARM64_HW_AFDBM
+		if (kvm_hw_dbm_enabled() &&
+		    pgprot_val(mem_type) == pgprot_val(PAGE_S2)) {
+			mem_type = PAGE_S2_DBM;
+		}
+#endif
+		new_pte = kvm_pfn_pte(pfn, mem_type);
 
 		if (writable) {
 			new_pte = kvm_s2pte_mkwrite(new_pte);