
[v19,11/17] RISC-V: KVM: Implement MMU notifiers

Message ID 20210727055450.2742868-12-anup.patel@wdc.com (mailing list archive)
State New, archived
Series KVM RISC-V Support

Commit Message

Anup Patel July 27, 2021, 5:54 a.m. UTC
This patch implements MMU notifiers for KVM RISC-V so that the guest
physical address space stays in sync with the host physical address space.

This allows swapping, page migration, etc. to work transparently with
KVM RISC-V.

Signed-off-by: Anup Patel <anup.patel@wdc.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Alexander Graf <graf@amazon.com>
---
 arch/riscv/include/asm/kvm_host.h |  2 +
 arch/riscv/kvm/Kconfig            |  1 +
 arch/riscv/kvm/mmu.c              | 90 +++++++++++++++++++++++++++++--
 arch/riscv/kvm/vm.c               |  1 +
 4 files changed, 89 insertions(+), 5 deletions(-)
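
For orientation, the hooks an architecture provides once it defines KVM_ARCH_WANT_MMU_NOTIFIER are sketched below. The prototypes match the ones added by this patch; the kvm_gfn_range layout shown is the generic definition of this kernel generation and is included only as an approximation.

struct kvm_gfn_range {
	struct kvm_memory_slot *slot;	/* memslot covering the range */
	gfn_t start;			/* first guest frame affected */
	gfn_t end;			/* exclusive end frame */
	pte_t pte;			/* new host PTE, used by kvm_set_spte_gfn() */
	bool may_block;			/* callee may cond_resched_lock(mmu_lock) */
};

/* Implemented in arch/riscv/kvm/mmu.c by this patch: */
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);  /* true: flush TLBs */
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);     /* true: flush TLBs */
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);          /* true: page was young */
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);     /* true: page is young */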

Comments

MingWang Li Aug. 3, 2021, 1:19 p.m. UTC | #1
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index fa9a4f9b9542..4b294113c63b 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -300,7 +300,8 @@ static void stage2_op_pte(struct kvm *kvm, gpa_t addr,
>  	}
>  }
> 
> -static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size)
> +static void stage2_unmap_range(struct kvm *kvm, gpa_t start,
> +			       gpa_t size, bool may_block)
>  {
>  	int ret;
>  	pte_t *ptep;
> @@ -325,6 +326,13 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size)
> 
>  next:
>  		addr += page_size;
> +
> +		/*
> +		 * If the range is too large, release the kvm->mmu_lock
> +		 * to prevent starvation and lockup detector warnings.
> +		 */
> +		if (may_block && addr < end)
> +			cond_resched_lock(&kvm->mmu_lock);
>  	}
>  }
> 
> @@ -405,7 +413,6 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>  out:
>  	stage2_cache_flush(&pcache);
>  	return ret;
> -
>  }
> 
>  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
> @@ -547,7 +554,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	spin_lock(&kvm->mmu_lock);
>  	if (ret)
>  		stage2_unmap_range(kvm, mem->guest_phys_addr,
> -				   mem->memory_size);
> +				   mem->memory_size, false);
>  	spin_unlock(&kvm->mmu_lock);
> 
>  out:
> @@ -555,6 +562,73 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	return ret;
>  }
> 
> +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	if (!kvm->arch.pgd)
> +		return 0;
> +
> +	stage2_unmap_range(kvm, range->start << PAGE_SHIFT,
> +			   (range->end - range->start) << PAGE_SHIFT,
> +			   range->may_block);
> +	return 0;
> +}
> +
> +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	int ret;
> +	kvm_pfn_t pfn = pte_pfn(range->pte);
> +
> +	if (!kvm->arch.pgd)
> +		return 0;
> +
> +	WARN_ON(range->end - range->start != 1);
> +
> +	ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT,
> +			      __pfn_to_phys(pfn), PAGE_SIZE, true, true);
> +	if (ret) {
> +		kvm_err("Failed to map stage2 page (error %d)\n", ret);
> +		return 1;
> +	}

Hi, Anup

I think it is not appropriate to add kvm_err here, because the stage2_set_pte function
may allocate memory from the pcache parameter. If pcache is NULL, stage2_set_pte
treats this as an out-of-memory condition, and a misleading error log is generated here.

For example, this error log is printed while a VM is migrating, yet the migration
ultimately succeeds. If a kvm_err were added at the same position in the ARM
implementation, the same error log would be printed there as well.

Mingwang

> +	return 0;
> +}
> +
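
To make the failure mode concrete, here is the hunk under discussion again as an annotated sketch (the code is as in this patch; the reading of the error case follows Mingwang's description rather than an inspection of stage2_set_pte() itself):

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	int ret;
	kvm_pfn_t pfn = pte_pfn(range->pte);

	if (!kvm->arch.pgd)
		return 0;

	WARN_ON(range->end - range->start != 1);

	/* NULL pcache: no pre-allocated page-table pages in notifier context. */
	ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT,
			      __pfn_to_phys(pfn), PAGE_SIZE, true, true);
	if (ret) {
		/*
		 * With pcache == NULL, a missing intermediate page-table
		 * level is reported as -ENOMEM even though it is not real
		 * memory pressure; the guest refaults later and the page is
		 * mapped via kvm_riscv_stage2_map() with a filled pcache.
		 * This is why the message shows up during live migration
		 * although the migration still succeeds.
		 */
		kvm_err("Failed to map stage2 page (error %d)\n", ret);
		return 1;
	}

	return 0;
}
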
Anup Patel Aug. 4, 2021, 7:16 a.m. UTC | #2
On Tue, Aug 3, 2021 at 6:49 PM limingwang (A) <limingwang@huawei.com> wrote:
>
> [diff snipped]
>
> Hi, Anup
>
> I think it is not appropriate to add kvm_err here, because the stage2_set_pte function
> may allocate memory from the pcache parameter. If pcache is NULL, stage2_set_pte
> treats this as an out-of-memory condition, and a misleading error log is generated here.
>
> For example, this error log is printed while a VM is migrating, yet the migration
> ultimately succeeds. If a kvm_err were added at the same position in the ARM
> implementation, the same error log would be printed there as well.

Okay, I have converted kvm_err() to kvm_debug(). In the future, we can
remove it entirely.

Please try riscv_kvm_v20 branch at:
https://github.com/avpatel/linux.git

Regards,
Anup

>
> Mingwang
>
> > +     return 0;
> > +}
> > +
>
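
For reference, the conversion Anup describes for v20 presumably amounts to a one-line change in kvm_set_spte_gfn() along these lines (sketch only; not copied from the riscv_kvm_v20 branch):

 	if (ret) {
-		kvm_err("Failed to map stage2 page (error %d)\n", ret);
+		kvm_debug("Failed to map stage2 page (error %d)\n", ret);
 		return 1;
 	}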

Patch

diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 33255c5dd555..a54a58a4026d 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -196,6 +196,8 @@  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+
 void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa, unsigned long vmid);
 void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid);
 void __kvm_riscv_hfence_gvma_gpa(unsigned long gpa);
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index 633063edaee8..a712bb910cda 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -20,6 +20,7 @@  if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
 	depends on RISCV_SBI && MMU
+	select MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select KVM_MMIO
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index fa9a4f9b9542..4b294113c63b 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -300,7 +300,8 @@  static void stage2_op_pte(struct kvm *kvm, gpa_t addr,
 	}
 }
 
-static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size)
+static void stage2_unmap_range(struct kvm *kvm, gpa_t start,
+			       gpa_t size, bool may_block)
 {
 	int ret;
 	pte_t *ptep;
@@ -325,6 +326,13 @@  static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size)
 
 next:
 		addr += page_size;
+
+		/*
+		 * If the range is too large, release the kvm->mmu_lock
+		 * to prevent starvation and lockup detector warnings.
+		 */
+		if (may_block && addr < end)
+			cond_resched_lock(&kvm->mmu_lock);
 	}
 }
 
@@ -405,7 +413,6 @@  static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 out:
 	stage2_cache_flush(&pcache);
 	return ret;
-
 }
 
 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
@@ -547,7 +554,7 @@  int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	spin_lock(&kvm->mmu_lock);
 	if (ret)
 		stage2_unmap_range(kvm, mem->guest_phys_addr,
-				   mem->memory_size);
+				   mem->memory_size, false);
 	spin_unlock(&kvm->mmu_lock);
 
 out:
@@ -555,6 +562,73 @@  int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	return ret;
 }
 
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	if (!kvm->arch.pgd)
+		return 0;
+
+	stage2_unmap_range(kvm, range->start << PAGE_SHIFT,
+			   (range->end - range->start) << PAGE_SHIFT,
+			   range->may_block);
+	return 0;
+}
+
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	int ret;
+	kvm_pfn_t pfn = pte_pfn(range->pte);
+
+	if (!kvm->arch.pgd)
+		return 0;
+
+	WARN_ON(range->end - range->start != 1);
+
+	ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT,
+			      __pfn_to_phys(pfn), PAGE_SIZE, true, true);
+	if (ret) {
+		kvm_err("Failed to map stage2 page (error %d)\n", ret);
+		return 1;
+	}
+
+	return 0;
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	pte_t *ptep;
+	u32 ptep_level = 0;
+	u64 size = (range->end - range->start) << PAGE_SHIFT;
+
+	if (!kvm->arch.pgd)
+		return 0;
+
+	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
+
+	if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
+				   &ptep, &ptep_level))
+		return 0;
+
+	return ptep_test_and_clear_young(NULL, 0, ptep);
+}
+
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	pte_t *ptep;
+	u32 ptep_level = 0;
+	u64 size = (range->end - range->start) << PAGE_SHIFT;
+
+	if (!kvm->arch.pgd)
+		return 0;
+
+	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
+
+	if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
+				   &ptep, &ptep_level))
+		return 0;
+
+	return pte_young(*ptep);
+}
+
 int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 			 struct kvm_memory_slot *memslot,
 			 gpa_t gpa, unsigned long hva, bool is_write)
@@ -569,7 +643,7 @@  int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache;
 	bool logging = (memslot->dirty_bitmap &&
 			!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
-	unsigned long vma_pagesize;
+	unsigned long vma_pagesize, mmu_seq;
 
 	mmap_read_lock(current->mm);
 
@@ -608,6 +682,8 @@  int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 		return ret;
 	}
 
+	mmu_seq = kvm->mmu_notifier_seq;
+
 	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable);
 	if (hfn == KVM_PFN_ERR_HWPOISON) {
 		send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
@@ -626,6 +702,9 @@  int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 
 	spin_lock(&kvm->mmu_lock);
 
+	if (mmu_notifier_retry(kvm, mmu_seq))
+		goto out_unlock;
+
 	if (writeable) {
 		kvm_set_pfn_dirty(hfn);
 		mark_page_dirty(kvm, gfn);
@@ -639,6 +718,7 @@  int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
 	if (ret)
 		kvm_err("Failed to map in stage2\n");
 
+out_unlock:
 	spin_unlock(&kvm->mmu_lock);
 	kvm_set_pfn_accessed(hfn);
 	kvm_release_pfn_clean(hfn);
@@ -675,7 +755,7 @@  void kvm_riscv_stage2_free_pgd(struct kvm *kvm)
 
 	spin_lock(&kvm->mmu_lock);
 	if (kvm->arch.pgd) {
-		stage2_unmap_range(kvm, 0UL, stage2_gpa_size);
+		stage2_unmap_range(kvm, 0UL, stage2_gpa_size, false);
 		pgd = READ_ONCE(kvm->arch.pgd);
 		kvm->arch.pgd = NULL;
 		kvm->arch.pgd_phys = 0;
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index 38a644417627..0110267eb7e3 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -64,6 +64,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
 	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_SYNC_MMU:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_READONLY_MEM: