diff mbox series

[2/6] KVM: Add KVM_PRE_FAULT_MEMORY vcpu ioctl to pre-populate guest memory

Message ID 20240419085927.3648704-3-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series KVM: Guest Memory Pre-Population API | expand

Commit Message

Paolo Bonzini April 19, 2024, 8:59 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the
memory range and calls the arch-specific function.  Add stub arch function
as a weak symbol.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  5 ++++
 include/uapi/linux/kvm.h | 10 +++++++
 virt/kvm/Kconfig         |  3 ++
 virt/kvm/kvm_main.c      | 63 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+)

Comments

Binbin Wu April 22, 2024, 5:39 a.m. UTC | #1
On 4/19/2024 4:59 PM, Paolo Bonzini wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the
> memory range and calls the arch-specific function.  Add stub arch function
> as a weak symbol.
>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   include/linux/kvm_host.h |  5 ++++
>   include/uapi/linux/kvm.h | 10 +++++++
>   virt/kvm/Kconfig         |  3 ++
>   virt/kvm/kvm_main.c      | 63 ++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 81 insertions(+)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 8dea11701ab2..9e9943e5e37c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
>   void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
>   #endif
>   
> +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
> +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> +				    struct kvm_pre_fault_memory *range);
> +#endif
> +
>   #endif
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 2190adbe3002..917d2964947d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -917,6 +917,7 @@ struct kvm_enable_cap {
>   #define KVM_CAP_MEMORY_ATTRIBUTES 233
>   #define KVM_CAP_GUEST_MEMFD 234
>   #define KVM_CAP_VM_TYPES 235
> +#define KVM_CAP_PRE_FAULT_MEMORY 236
>   
>   struct kvm_irq_routing_irqchip {
>   	__u32 irqchip;
> @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
>   	__u64 reserved[6];
>   };
>   
> +#define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
> +
> +struct kvm_pre_fault_memory {
> +	__u64 gpa;
> +	__u64 size;
> +	__u64 flags;
> +	__u64 padding[5];
> +};
> +
>   #endif /* __LINUX_KVM_H */
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 754c6c923427..b14e14cdbfb9 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
>   config KVM_GENERIC_DIRTYLOG_READ_PROTECT
>          bool
>   
> +config KVM_GENERIC_PRE_FAULT_MEMORY
> +       bool
> +
>   config KVM_COMPAT
>          def_bool y
>          depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 38b498669ef9..51d8dbe7e93b 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4379,6 +4379,55 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
>   	return fd;
>   }
>   
> +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
> +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> +				     struct kvm_pre_fault_memory *range)
> +{
> +	int idx;
> +	long r;
> +	u64 full_size;
> +
> +	if (range->flags)
> +		return -EINVAL;
> +
> +	if (!PAGE_ALIGNED(range->gpa) ||
> +	    !PAGE_ALIGNED(range->size) ||
> +	    range->gpa + range->size <= range->gpa)
> +		return -EINVAL;
> +
> +	if (!range->size)
> +		return 0;

The case where range->size equals 0 is already caught by the
"range->gpa + range->size <= range->gpa" check above.

If we want to return success when size is 0 (though I am not sure it's
needed), we need to use "range->gpa + range->size < range->gpa" instead.


> +
> +	vcpu_load(vcpu);
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	full_size = range->size;
> +	do {
> +		if (signal_pending(current)) {
> +			r = -EINTR;
> +			break;
> +		}
> +
> +		r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
> +		if (r < 0)
> +			break;
> +
> +		if (WARN_ON_ONCE(r == 0))
> +			break;
> +
> +		range->size -= r;
> +		range->gpa += r;
> +		cond_resched();
> +	} while (range->size);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +	vcpu_put(vcpu);
> +
> +	/* Return success if at least one page was mapped successfully.  */
> +	return full_size == range->size ? r : 0;
> +}
> +#endif
> +
>   static long kvm_vcpu_ioctl(struct file *filp,
>   			   unsigned int ioctl, unsigned long arg)
>   {
> @@ -4580,6 +4629,20 @@ static long kvm_vcpu_ioctl(struct file *filp,
>   		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
>   		break;
>   	}
> +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
> +	case KVM_PRE_FAULT_MEMORY: {
> +		struct kvm_pre_fault_memory range;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&range, argp, sizeof(range)))
> +			break;
> +		r = kvm_vcpu_pre_fault_memory(vcpu, &range);
> +		/* Pass back leftover range. */
> +		if (copy_to_user(argp, &range, sizeof(range)))
> +			r = -EFAULT;
> +		break;
> +	}
> +#endif
>   	default:
>   		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>   	}
Binbin Wu April 22, 2024, 7:19 a.m. UTC | #2
On 4/19/2024 4:59 PM, Paolo Bonzini wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the
> memory range and calls the arch-specific function.  Add stub arch function
> as a weak symbol.

The description is stale. The weak symbol was removed since v3.
Isaku Yamahata April 22, 2024, 6 p.m. UTC | #3
On Fri, Apr 19, 2024 at 04:59:23AM -0400,
Paolo Bonzini <pbonzini@redhat.com> wrote:

> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the
> memory range and calls the arch-specific function.  Add stub arch function
> as a weak symbol.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  include/linux/kvm_host.h |  5 ++++
>  include/uapi/linux/kvm.h | 10 +++++++
>  virt/kvm/Kconfig         |  3 ++
>  virt/kvm/kvm_main.c      | 63 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 81 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 8dea11701ab2..9e9943e5e37c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
>  void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
>  #endif
>  
> +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
> +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> +				    struct kvm_pre_fault_memory *range);
> +#endif
> +
>  #endif
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 2190adbe3002..917d2964947d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -917,6 +917,7 @@ struct kvm_enable_cap {
>  #define KVM_CAP_MEMORY_ATTRIBUTES 233
>  #define KVM_CAP_GUEST_MEMFD 234
>  #define KVM_CAP_VM_TYPES 235
> +#define KVM_CAP_PRE_FAULT_MEMORY 236
>  
>  struct kvm_irq_routing_irqchip {
>  	__u32 irqchip;
> @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
>  	__u64 reserved[6];
>  };
>  
> +#define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
> +
> +struct kvm_pre_fault_memory {
> +	__u64 gpa;
> +	__u64 size;
> +	__u64 flags;
> +	__u64 padding[5];
> +};
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 754c6c923427..b14e14cdbfb9 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
>  config KVM_GENERIC_DIRTYLOG_READ_PROTECT
>         bool
>  
> +config KVM_GENERIC_PRE_FAULT_MEMORY
> +       bool
> +
>  config KVM_COMPAT
>         def_bool y
>         depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 38b498669ef9..51d8dbe7e93b 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4379,6 +4379,55 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
>  	return fd;
>  }
>  
> +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
> +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> +				     struct kvm_pre_fault_memory *range)
> +{
> +	int idx;
> +	long r;
> +	u64 full_size;
> +
> +	if (range->flags)
> +		return -EINVAL;

To keep this extensible in the future, check that the padding is zero.
Or will we rely on flags?

        if (!memchr_inv(range->padding, 0, sizeof(range->padding)))
                return -EINVAL;
Paolo Bonzini April 24, 2024, 4:05 p.m. UTC | #4
On Mon, Apr 22, 2024 at 7:39 AM Binbin Wu <binbin.wu@linux.intel.com> wrote:
> range->size equals 0 can be covered by "range->gpa + range->size <=
> range->gpa"
>
> If we want to return success when size is 0 (, though I am not sure it's
> needed),
> we need to use "range->gpa + range->size < range->gpa" instead.

I think it's not needed because it could cause an infinite loop in
(buggy) userspace. Better return -EINVAL.

Paolo

>
> > +
> > +     vcpu_load(vcpu);
> > +     idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +
> > +     full_size = range->size;
> > +     do {
> > +             if (signal_pending(current)) {
> > +                     r = -EINTR;
> > +                     break;
> > +             }
> > +
> > +             r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
> > +             if (r < 0)
> > +                     break;
> > +
> > +             if (WARN_ON_ONCE(r == 0))
> > +                     break;
> > +
> > +             range->size -= r;
> > +             range->gpa += r;
> > +             cond_resched();
> > +     } while (range->size);
> > +
> > +     srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +     vcpu_put(vcpu);
> > +
> > +     /* Return success if at least one page was mapped successfully.  */
> > +     return full_size == range->size ? r : 0;
> > +}
> > +#endif
> > +
> >   static long kvm_vcpu_ioctl(struct file *filp,
> >                          unsigned int ioctl, unsigned long arg)
> >   {
> > @@ -4580,6 +4629,20 @@ static long kvm_vcpu_ioctl(struct file *filp,
> >               r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
> >               break;
> >       }
> > +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
> > +     case KVM_PRE_FAULT_MEMORY: {
> > +             struct kvm_pre_fault_memory range;
> > +
> > +             r = -EFAULT;
> > +             if (copy_from_user(&range, argp, sizeof(range)))
> > +                     break;
> > +             r = kvm_vcpu_pre_fault_memory(vcpu, &range);
> > +             /* Pass back leftover range. */
> > +             if (copy_to_user(argp, &range, sizeof(range)))
> > +                     r = -EFAULT;
> > +             break;
> > +     }
> > +#endif
> >       default:
> >               r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
> >       }
>
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8dea11701ab2..9e9943e5e37c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2478,4 +2478,9 @@  long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
 void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
 #endif
 
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				    struct kvm_pre_fault_memory *range);
+#endif
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2190adbe3002..917d2964947d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -917,6 +917,7 @@  struct kvm_enable_cap {
 #define KVM_CAP_MEMORY_ATTRIBUTES 233
 #define KVM_CAP_GUEST_MEMFD 234
 #define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_PRE_FAULT_MEMORY 236
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
@@ -1548,4 +1549,13 @@  struct kvm_create_guest_memfd {
 	__u64 reserved[6];
 };
 
+#define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
+
+struct kvm_pre_fault_memory {
+	__u64 gpa;
+	__u64 size;
+	__u64 flags;
+	__u64 padding[5];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 754c6c923427..b14e14cdbfb9 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -67,6 +67,9 @@  config HAVE_KVM_INVALID_WAKEUPS
 config KVM_GENERIC_DIRTYLOG_READ_PROTECT
        bool
 
+config KVM_GENERIC_PRE_FAULT_MEMORY
+       bool
+
 config KVM_COMPAT
        def_bool y
        depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 38b498669ef9..51d8dbe7e93b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4379,6 +4379,55 @@  static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
 	return fd;
 }
 
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				     struct kvm_pre_fault_memory *range)
+{
+	int idx;
+	long r;
+	u64 full_size;
+
+	if (range->flags)
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(range->gpa) ||
+	    !PAGE_ALIGNED(range->size) ||
+	    range->gpa + range->size <= range->gpa)
+		return -EINVAL;
+
+	if (!range->size)
+		return 0;
+
+	vcpu_load(vcpu);
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	full_size = range->size;
+	do {
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
+		if (r < 0)
+			break;
+
+		if (WARN_ON_ONCE(r == 0))
+			break;
+
+		range->size -= r;
+		range->gpa += r;
+		cond_resched();
+	} while (range->size);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	vcpu_put(vcpu);
+
+	/* Return success if at least one page was mapped successfully.  */
+	return full_size == range->size ? r : 0;
+}
+#endif
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -4580,6 +4629,20 @@  static long kvm_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
 		break;
 	}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+	case KVM_PRE_FAULT_MEMORY: {
+		struct kvm_pre_fault_memory range;
+
+		r = -EFAULT;
+		if (copy_from_user(&range, argp, sizeof(range)))
+			break;
+		r = kvm_vcpu_pre_fault_memory(vcpu, &range);
+		/* Pass back leftover range. */
+		if (copy_to_user(argp, &range, sizeof(range)))
+			r = -EFAULT;
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}