diff mbox series

[2/7] KVM: Add KVM_MAP_MEMORY vcpu ioctl to pre-populate guest memory

Message ID 20240417153450.3608097-3-pbonzini@redhat.com (mailing list archive)
State New
Headers show
Series KVM: Guest Memory Pre-Population API | expand

Commit Message

Paolo Bonzini April 17, 2024, 3:34 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

Add a new ioctl KVM_MAP_MEMORY in the KVM common code. It iterates on the
memory range and calls the arch-specific function.  Add stub arch function
as a weak symbol.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  5 ++++
 include/uapi/linux/kvm.h | 10 +++++++
 virt/kvm/Kconfig         |  3 ++
 virt/kvm/kvm_main.c      | 61 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)

Comments

Isaku Yamahata April 17, 2024, 7:36 p.m. UTC | #1
On Wed, Apr 17, 2024 at 11:34:45AM -0400,
Paolo Bonzini <pbonzini@redhat.com> wrote:

> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> Add a new ioctl KVM_MAP_MEMORY in the KVM common code. It iterates on the
> memory range and calls the arch-specific function.  Add stub arch function
> as a weak symbol.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  include/linux/kvm_host.h |  5 ++++
>  include/uapi/linux/kvm.h | 10 +++++++
>  virt/kvm/Kconfig         |  3 ++
>  virt/kvm/kvm_main.c      | 61 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 79 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 8dea11701ab2..2b0f0240a64c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
>  void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
>  #endif
>  
> +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
> +int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +			     struct kvm_map_memory *mapping);
> +#endif
> +
>  #endif
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 2190adbe3002..4d233f44c613 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -917,6 +917,7 @@ struct kvm_enable_cap {
>  #define KVM_CAP_MEMORY_ATTRIBUTES 233
>  #define KVM_CAP_GUEST_MEMFD 234
>  #define KVM_CAP_VM_TYPES 235
> +#define KVM_CAP_MAP_MEMORY 236
>  
>  struct kvm_irq_routing_irqchip {
>  	__u32 irqchip;
> @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
>  	__u64 reserved[6];
>  };
>  
> +#define KVM_MAP_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_map_memory)
> +
> +struct kvm_map_memory {
> +	__u64 base_address;
> +	__u64 size;
> +	__u64 flags;
> +	__u64 padding[5];
> +};
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 754c6c923427..1b94126622e8 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
>  config KVM_GENERIC_DIRTYLOG_READ_PROTECT
>         bool
>  
> +config KVM_GENERIC_MAP_MEMORY
> +       bool
> +
>  config KVM_COMPAT
>         def_bool y
>         depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 38b498669ef9..350ead98e9a6 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4379,6 +4379,47 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
>  	return fd;
>  }
>  
> +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
> +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +			       struct kvm_map_memory *mapping)
> +{
> +	int idx, r;
> +	u64 full_size;
> +
> +	if (mapping->flags)
> +		return -EINVAL;
> +
> +	if (!PAGE_ALIGNED(mapping->base_address) ||
> +	    !PAGE_ALIGNED(mapping->size) ||
> +	    mapping->base_address + mapping->size <= mapping->base_address)
> +		return -EINVAL;
> +
> +	vcpu_load(vcpu);
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	r = 0;
> +	full_size = mapping->size;
> +	while (mapping->size) {
> +		if (signal_pending(current)) {
> +			r = -EINTR;
> +			break;
> +		}
> +
> +		r = kvm_arch_vcpu_map_memory(vcpu, mapping);
> +		if (r)
> +			break;
> +
> +		cond_resched();
> +	}
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +	vcpu_put(vcpu);
> +
> +	/* Return success if at least one page was mapped successfully.  */
> +	return full_size == mapping->size ? r : 0;
> +}
> +#endif
> +
>  static long kvm_vcpu_ioctl(struct file *filp,
>  			   unsigned int ioctl, unsigned long arg)
>  {
> @@ -4580,6 +4621,20 @@ static long kvm_vcpu_ioctl(struct file *filp,
>  		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
>  		break;
>  	}
> +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
> +	case KVM_MAP_MEMORY: {
> +		struct kvm_map_memory mapping;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mapping, argp, sizeof(mapping)))
> +			break;
> +		r = kvm_vcpu_map_memory(vcpu, &mapping);
> +		/* Pass back leftover range. */
> +		if (copy_to_user(argp, &mapping, sizeof(mapping)))
> +			r = -EFAULT;
> +		break;
> +	}
> +#endif
>  	default:
>  		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>  	}
> @@ -4863,6 +4918,12 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  #ifdef CONFIG_KVM_PRIVATE_MEM
>  	case KVM_CAP_GUEST_MEMFD:
>  		return !kvm || kvm_arch_has_private_mem(kvm);
> +#endif
> +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
> +	case KVM_CAP_MAP_MEMORY:
> +		if (!kvm)
> +			return 1;
> +		/* Leave per-VM implementation to kvm_vm_ioctl_check_extension().  */

nitpick:
                fallthrough;

>  #endif
>  	default:
>  		break;
> -- 
> 2.43.0
> 
> 
>
Sean Christopherson April 17, 2024, 9:07 p.m. UTC | #2
On Wed, Apr 17, 2024, Isaku Yamahata wrote:
> > +	vcpu_load(vcpu);
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +
> > +	r = 0;
> > +	full_size = mapping->size;
> > +	while (mapping->size) {

Maybe pre-check !mapping->size?  E.g. there's no reason to load the vCPU and
acquire SRCU just to do nothing.  Then this can be a do-while loop and doesn't
need to explicitly initialize 'r'.

> > +		if (signal_pending(current)) {
> > +			r = -EINTR;
> > +			break;
> > +		}
> > +
> > +		r = kvm_arch_vcpu_map_memory(vcpu, mapping);

Requiring arch code to address @mapping is cumbersome.  If the arch call returns
a long, then can't we do?

		if (r < 0)
			break;

		mapping->size -= r;
		mapping->gpa += r;

> > +		if (r)
> > +			break;
> > +
> > +		cond_resched();
> > +	}
> > +
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +	vcpu_put(vcpu);
> > +
> > +	/* Return success if at least one page was mapped successfully.  */
> > +	return full_size == mapping->size ? r : 0;

I just saw Paolo's update that this is intentional, but this strikes me as odd,
as it requires userspace to redo the ioctl() to figure out why the last one failed.

Not a sticking point, just odd to my eyes.
Paolo Bonzini April 17, 2024, 9:13 p.m. UTC | #3
On Wed, Apr 17, 2024 at 11:07 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Apr 17, 2024, Isaku Yamahata wrote:
> > > +   vcpu_load(vcpu);
> > > +   idx = srcu_read_lock(&vcpu->kvm->srcu);
> > > +
> > > +   r = 0;
> > > +   full_size = mapping->size;
> > > +   while (mapping->size) {
>
> Maybe pre-check !mapping->size?  E.g. there's no reason to load the vCPU and
> acquire SRCU just to do nothing.  Then this can be a do-while loop and doesn't
> need to explicitly initialize 'r'.

It's unlikely to make any difference but okay---easy enough.

> > > +           if (signal_pending(current)) {
> > > +                   r = -EINTR;
> > > +                   break;
> > > +           }
> > > +
> > > +           r = kvm_arch_vcpu_map_memory(vcpu, mapping);
>
> Requiring arch code to address @mapping is cumbersome.  If the arch call returns
> a long, then can't we do?
>
>                 if (r < 0)
>                         break;
>
>                 mapping->size -= r;
>                 mapping->gpa += r;

Ok, I thought the same for the return value. I didn't expand the
arguments to arch code in case in the future we have flags or other
expansions of the struct.

> > > +           if (r)
> > > +                   break;
> > > +
> > > +           cond_resched();
> > > +   }
> > > +
> > > +   srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > > +   vcpu_put(vcpu);
> > > +
> > > +   /* Return success if at least one page was mapped successfully.  */
> > > +   return full_size == mapping->size ? r : 0;
>
> I just saw Paolo's update that this is intentional, but this strikes me as odd,
> as it requires userspace to redo the ioctl() to figure out why the last one failed.

Yeah, the same is true of read() but I don't think it's a problem. If
we get an EINTR, then (unlike KVM_RUN which can change the signal
mask) the signal will be delivered right after the ioctl() returns and
you can just proceed. For EAGAIN you can just proceed in general.

And of course, if RET_PF_RETRY is handled in the kernel then EAGAIN
goes away and the only cause of premature exit can be EINTR.

Paolo

> Not a sticking point, just odd to my eyes.
>
Xu Yilun April 19, 2024, 1:59 p.m. UTC | #4
> > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
> > +	case KVM_CAP_MAP_MEMORY:
> > +		if (!kvm)
> > +			return 1;
> > +		/* Leave per-VM implementation to kvm_vm_ioctl_check_extension().  */
> 
> nitpick:
>                 fallthrough;

A little tricky. 'break;' should be more straightforward.  

Thanks,
Yilun

> 
> >  #endif
> >  	default:
> >  		break;
> > -- 
> > 2.43.0
> > 
> > 
> > 
> 
> -- 
> Isaku Yamahata <isaku.yamahata@intel.com>
>
Xu Yilun April 19, 2024, 2:01 p.m. UTC | #5
On Wed, Apr 17, 2024 at 11:34:45AM -0400, Paolo Bonzini wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> Add a new ioctl KVM_MAP_MEMORY in the KVM common code. It iterates on the
> memory range and calls the arch-specific function.  Add stub arch function
> as a weak symbol.

The weak symbol is gone.

Thanks,
Yilun
Sean Christopherson April 19, 2024, 2:08 p.m. UTC | #6
On Fri, Apr 19, 2024, Xu Yilun wrote:
> > > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
> > > +	case KVM_CAP_MAP_MEMORY:
> > > +		if (!kvm)
> > > +			return 1;
> > > +		/* Leave per-VM implementation to kvm_vm_ioctl_check_extension().  */
> > 
> > nitpick:
> >                 fallthough;
> 
> A little tricky. 'break;' should be more straightforward.  

+1, though it's a moot point as Paolo dropped this code in v4.
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8dea11701ab2..2b0f0240a64c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2478,4 +2478,9 @@  long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
 void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
 #endif
 
+#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
+int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
+			     struct kvm_map_memory *mapping);
+#endif
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2190adbe3002..4d233f44c613 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -917,6 +917,7 @@  struct kvm_enable_cap {
 #define KVM_CAP_MEMORY_ATTRIBUTES 233
 #define KVM_CAP_GUEST_MEMFD 234
 #define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_MAP_MEMORY 236
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
@@ -1548,4 +1549,13 @@  struct kvm_create_guest_memfd {
 	__u64 reserved[6];
 };
 
+#define KVM_MAP_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_map_memory)
+
+struct kvm_map_memory {
+	__u64 base_address;
+	__u64 size;
+	__u64 flags;
+	__u64 padding[5];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 754c6c923427..1b94126622e8 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -67,6 +67,9 @@  config HAVE_KVM_INVALID_WAKEUPS
 config KVM_GENERIC_DIRTYLOG_READ_PROTECT
        bool
 
+config KVM_GENERIC_MAP_MEMORY
+       bool
+
 config KVM_COMPAT
        def_bool y
        depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 38b498669ef9..350ead98e9a6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4379,6 +4379,47 @@  static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
 	return fd;
 }
 
+#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
+static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
+			       struct kvm_map_memory *mapping)
+{
+	int idx, r;
+	u64 full_size;
+
+	if (mapping->flags)
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(mapping->base_address) ||
+	    !PAGE_ALIGNED(mapping->size) ||
+	    mapping->base_address + mapping->size <= mapping->base_address)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	r = 0;
+	full_size = mapping->size;
+	while (mapping->size) {
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		r = kvm_arch_vcpu_map_memory(vcpu, mapping);
+		if (r)
+			break;
+
+		cond_resched();
+	}
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	vcpu_put(vcpu);
+
+	/* Return success if at least one page was mapped successfully.  */
+	return full_size == mapping->size ? r : 0;
+}
+#endif
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -4580,6 +4621,20 @@  static long kvm_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
 		break;
 	}
+#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
+	case KVM_MAP_MEMORY: {
+		struct kvm_map_memory mapping;
+
+		r = -EFAULT;
+		if (copy_from_user(&mapping, argp, sizeof(mapping)))
+			break;
+		r = kvm_vcpu_map_memory(vcpu, &mapping);
+		/* Pass back leftover range. */
+		if (copy_to_user(argp, &mapping, sizeof(mapping)))
+			r = -EFAULT;
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
@@ -4863,6 +4918,12 @@  static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #ifdef CONFIG_KVM_PRIVATE_MEM
 	case KVM_CAP_GUEST_MEMFD:
 		return !kvm || kvm_arch_has_private_mem(kvm);
+#endif
+#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY
+	case KVM_CAP_MAP_MEMORY:
+		if (!kvm)
+			return 1;
+		/* Leave per-VM implementation to kvm_vm_ioctl_check_extension().  */
 #endif
 	default:
 		break;