Message ID | 012b59708114ba121735769de94756fa5af3204d.1709288671.git.isaku.yamahata@intel.com (mailing list archive)
---|---
State | New, archived
Series | KVM: Prepopulate guest memory API
On 2024-03-01 09:28 AM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d1fd9cb5d037..d77c9b79d76b 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +                               struct kvm_memory_mapping *mapping)
> +{
> +        bool added = false;
> +        int idx, r = 0;
> +
> +        if (mapping->flags & ~(KVM_MEMORY_MAPPING_FLAG_WRITE |
> +                               KVM_MEMORY_MAPPING_FLAG_EXEC |
> +                               KVM_MEMORY_MAPPING_FLAG_USER |
> +                               KVM_MEMORY_MAPPING_FLAG_PRIVATE))
> +                return -EINVAL;
> +        if ((mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) &&
> +            !kvm_arch_has_private_mem(vcpu->kvm))
> +                return -EINVAL;
> +
> +        /* Sanity check */
> +        if (!IS_ALIGNED(mapping->source, PAGE_SIZE) ||
> +            !mapping->nr_pages ||
> +            mapping->base_gfn + mapping->nr_pages <= mapping->base_gfn)
> +                return -EINVAL;
> +
> +        vcpu_load(vcpu);
> +        idx = srcu_read_lock(&vcpu->kvm->srcu);
> +        r = kvm_arch_vcpu_pre_map_memory(vcpu);
> +        if (r)
> +                return r;
> +
> +        while (mapping->nr_pages) {
> +                if (signal_pending(current)) {
> +                        r = -ERESTARTSYS;
> +                        break;
> +                }
> +
> +                if (need_resched())

nit: Is need_resched() superfluous when calling cond_resched()?

> +                        cond_resched();
> +
> +                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
> +                if (r)
> +                        break;
> +
> +                added = true;
> +        }
> +
> +        srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +        vcpu_put(vcpu);
> +
> +        if (added && mapping->nr_pages > 0)
> +                r = -EAGAIN;

This can overwrite the return value from kvm_arch_vcpu_map_memory().
On Wed, Mar 06, 2024 at 04:49:19PM -0800, David Matlack <dmatlack@google.com> wrote:
> > +                        cond_resched();
> > +
> > +                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
> > +                if (r)
> > +                        break;
> > +
> > +                added = true;
> > +        }
> > +
> > +        srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +        vcpu_put(vcpu);
> > +
> > +        if (added && mapping->nr_pages > 0)
> > +                r = -EAGAIN;
>
> This can overwrite the return value from kvm_arch_vcpu_map_memory().

This is to convert ERESTART into EAGAIN. I'll drop this check and document
EINTR so that the caller is aware of partial population.
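As a minimal sketch of the contract settled on here (assuming this series is
applied; vcpu_fd, gpa and size are hypothetical and not part of the patch),
userspace would retry on -EINTR and rely on base_gfn/nr_pages being advanced
to reflect partial progress:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: vcpu_fd, gpa and size are hypothetical. */
static int map_guest_range(int vcpu_fd, __u64 gpa, __u64 size)
{
        struct kvm_memory_mapping mapping = {
                .base_gfn = gpa >> 12,          /* 4KiB pages */
                .nr_pages = size >> 12,
        };

        while (mapping.nr_pages) {
                if (ioctl(vcpu_fd, KVM_MAP_MEMORY, &mapping) == 0)
                        break;                  /* whole range populated */
                if (errno != EINTR)
                        return -errno;          /* fatal error from the arch hook */
                /* Partial progress: base_gfn/nr_pages were updated, retry. */
        }
        return 0;
}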
> +int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu);

No explanation of why this is needed, and why it only takes @vcpu as input w/o
having the @mapping.

> +int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +                             struct kvm_memory_mapping *mapping);
> +

[...]

> +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +                               struct kvm_memory_mapping *mapping)
> +{
> +        bool added = false;
> +        int idx, r = 0;
> +
> +        if (mapping->flags & ~(KVM_MEMORY_MAPPING_FLAG_WRITE |
> +                               KVM_MEMORY_MAPPING_FLAG_EXEC |
> +                               KVM_MEMORY_MAPPING_FLAG_USER |
> +                               KVM_MEMORY_MAPPING_FLAG_PRIVATE))
> +                return -EINVAL;
> +        if ((mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) &&
> +            !kvm_arch_has_private_mem(vcpu->kvm))
> +                return -EINVAL;
> +
> +        /* Sanity check */
> +        if (!IS_ALIGNED(mapping->source, PAGE_SIZE) ||
> +            !mapping->nr_pages ||
> +            mapping->base_gfn + mapping->nr_pages <= mapping->base_gfn)
> +                return -EINVAL;
> +
> +        vcpu_load(vcpu);
> +        idx = srcu_read_lock(&vcpu->kvm->srcu);
> +        r = kvm_arch_vcpu_pre_map_memory(vcpu);
> +        if (r)
> +                return r;

Returning w/o unloading the vcpu and releasing the SRCU.

> +
> +        while (mapping->nr_pages) {
> +                if (signal_pending(current)) {
> +                        r = -ERESTARTSYS;
> +                        break;
> +                }
> +
> +                if (need_resched())
> +                        cond_resched();

need_resched() is not needed.

And normally I think we just put it at the end of the loop.

> +
> +                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
> +                if (r)
> +                        break;
> +
> +                added = true;
> +        }
> +
> +        srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +        vcpu_put(vcpu);
> +
> +        if (added && mapping->nr_pages > 0)
> +                r = -EAGAIN;

Why do we need @added?

I assume the kvm_arch_vcpu_map_memory() can internally update the
mapping->nr_pages but still return -E<WHATEVER>. So when that happens in the
first call of kvm_arch_vcpu_map_memory(), @added won't get chance to turn to
true.

> +
> +        return r;
> +}
> +
>  static long kvm_vcpu_ioctl(struct file *filp,
>                             unsigned int ioctl, unsigned long arg)
>  {
> @@ -4620,6 +4683,17 @@ static long kvm_vcpu_ioctl(struct file *filp,
>                  r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
>                  break;
>          }
> +        case KVM_MAP_MEMORY: {
> +                struct kvm_memory_mapping mapping;
> +
> +                r = -EFAULT;
> +                if (copy_from_user(&mapping, argp, sizeof(mapping)))
> +                        break;
> +                r = kvm_vcpu_map_memory(vcpu, &mapping);
> +                if (copy_to_user(argp, &mapping, sizeof(mapping)))
> +                        r = -EFAULT;
> +                break;
> +        }
>          default:
>                  r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>          }
On Thu, Mar 07, 2024 at 12:45:16PM +0000, "Huang, Kai" <kai.huang@intel.com> wrote:
> > +int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu);
>
> No explanation of why this is needed, and why it only takes @vcpu as input w/o
> having the @mapping.
>
> > +int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
> > +                             struct kvm_memory_mapping *mapping);
> > +
>
> [...]
>
> > +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
> > +                               struct kvm_memory_mapping *mapping)
> > +{
> > +        bool added = false;
> > +        int idx, r = 0;
> > +
> > +        if (mapping->flags & ~(KVM_MEMORY_MAPPING_FLAG_WRITE |
> > +                               KVM_MEMORY_MAPPING_FLAG_EXEC |
> > +                               KVM_MEMORY_MAPPING_FLAG_USER |
> > +                               KVM_MEMORY_MAPPING_FLAG_PRIVATE))
> > +                return -EINVAL;
> > +        if ((mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) &&
> > +            !kvm_arch_has_private_mem(vcpu->kvm))
> > +                return -EINVAL;
> > +
> > +        /* Sanity check */
> > +        if (!IS_ALIGNED(mapping->source, PAGE_SIZE) ||
> > +            !mapping->nr_pages ||
> > +            mapping->base_gfn + mapping->nr_pages <= mapping->base_gfn)
> > +                return -EINVAL;
> > +
> > +        vcpu_load(vcpu);
> > +        idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +        r = kvm_arch_vcpu_pre_map_memory(vcpu);
> > +        if (r)
> > +                return r;
>
> Returning w/o unloading the vcpu and releasing the SRCU.

Oops, will fix.

> > +
> > +        while (mapping->nr_pages) {
> > +                if (signal_pending(current)) {
> > +                        r = -ERESTARTSYS;
> > +                        break;
> > +                }
> > +
> > +                if (need_resched())
> > +                        cond_resched();
>
> need_resched() is not needed.
>
> And normally I think we just put it at the end of the loop.

Ok, will move it.

> > +
> > +                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
> > +                if (r)
> > +                        break;
> > +
> > +                added = true;
> > +        }
> > +
> > +        srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +        vcpu_put(vcpu);
> > +
> > +        if (added && mapping->nr_pages > 0)
> > +                r = -EAGAIN;
>
> Why do we need @added?
>
> I assume the kvm_arch_vcpu_map_memory() can internally update the
> mapping->nr_pages but still return -E<WHATEVER>. So when that happens in the
> first call of kvm_arch_vcpu_map_memory(), @added won't get chance to turn to
> true.

I intend to tell the caller whether the range was partially processed or not.
Anyway this seems moot. Let's drop this if clause. Then it's the caller's
responsibility to check for errors and partial conversion, and to optionally
loop over the remaining region.
On Fri, Mar 01, 2024, isaku.yamahata@intel.com wrote:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d1fd9cb5d037..d77c9b79d76b 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4419,6 +4419,69 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
>          return fd;
>  }
>
> +__weak int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu)
> +{
> +        return -EOPNOTSUPP;
> +}
> +
> +__weak int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +                                    struct kvm_memory_mapping *mapping)
> +{
> +        return -EOPNOTSUPP;
> +}
> +
> +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +                               struct kvm_memory_mapping *mapping)
> +{
> +        bool added = false;
> +        int idx, r = 0;

Pointless initialization of 'r'.

> +
> +        if (mapping->flags & ~(KVM_MEMORY_MAPPING_FLAG_WRITE |
> +                               KVM_MEMORY_MAPPING_FLAG_EXEC |
> +                               KVM_MEMORY_MAPPING_FLAG_USER |
> +                               KVM_MEMORY_MAPPING_FLAG_PRIVATE))
> +                return -EINVAL;
> +        if ((mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) &&
> +            !kvm_arch_has_private_mem(vcpu->kvm))
> +                return -EINVAL;
> +
> +        /* Sanity check */

Pointless comment.

> +        if (!IS_ALIGNED(mapping->source, PAGE_SIZE) ||
> +            !mapping->nr_pages ||
> +            mapping->base_gfn + mapping->nr_pages <= mapping->base_gfn)
> +                return -EINVAL;
> +
> +        vcpu_load(vcpu);
> +        idx = srcu_read_lock(&vcpu->kvm->srcu);
> +        r = kvm_arch_vcpu_pre_map_memory(vcpu);

This hook is unnecessary, x86's kvm_mmu_reload() is optimized for the happy path
where the MMU is already loaded. Just make the call from kvm_arch_vcpu_map_memory().

> +        if (r)
> +                return r;

Which is a good thing, because this leaks the SRCU lock.

> +
> +        while (mapping->nr_pages) {
> +                if (signal_pending(current)) {
> +                        r = -ERESTARTSYS;

Why -ERESTARTSYS instead of -EINTR? The latter is KVM's typical response to a
pending signal.

> +                        break;
> +                }
> +
> +                if (need_resched())

No need to manually check need_resched(), the below is a _conditional_ resched.
The reason KVM explicitly checks need_resched() in MMU flows is because KVM needs
to drop mmu_lock before rescheduling, i.e. calling cond_resched() directly would
try to schedule() while holding a spinlock.

> +                        cond_resched();
> +
> +                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
> +                if (r)
> +                        break;
> +
> +                added = true;
> +        }
> +
> +        srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +        vcpu_put(vcpu);
> +
> +        if (added && mapping->nr_pages > 0)
> +                r = -EAGAIN;

No, this clobbers 'r', which might hold a fatal error code. I don't see any
reason for common code to ever force -EAGAIN, it can't possibly know if trying
again is reasonable.

> +
> +        return r;
> +}
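As a rough sketch of the mmu_lock pattern being referred to (x86-flavored,
where mmu_lock is an rwlock; more_work() and do_one_step() are hypothetical
placeholders, not KVM functions), the explicit need_resched() check lets the
lock be dropped before rescheduling, which the new ioctl loop does not need
because it holds no spinlock:

#include <linux/kvm_host.h>
#include <linux/sched.h>

/*
 * Hypothetical illustration: cond_resched() may call schedule(), which must
 * not happen while mmu_lock is held, so the lock is dropped first and then
 * reacquired.  more_work() and do_one_step() are made-up stand-ins.
 */
static void walk_mmu_with_lock(struct kvm *kvm)
{
        write_lock(&kvm->mmu_lock);

        while (more_work(kvm)) {
                do_one_step(kvm);

                if (need_resched()) {
                        write_unlock(&kvm->mmu_lock);
                        cond_resched();         /* safe: no lock held here */
                        write_lock(&kvm->mmu_lock);
                }
        }

        write_unlock(&kvm->mmu_lock);
}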
On Mon, Mar 11, 2024 at 10:23:28AM -0700, Sean Christopherson <seanjc@google.com> wrote:
> On Fri, Mar 01, 2024, isaku.yamahata@intel.com wrote:
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index d1fd9cb5d037..d77c9b79d76b 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -4419,6 +4419,69 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
> >          return fd;
> >  }
> >
> > +__weak int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu)
> > +{
> > +        return -EOPNOTSUPP;
> > +}
> > +
> > +__weak int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
> > +                                    struct kvm_memory_mapping *mapping)
> > +{
> > +        return -EOPNOTSUPP;
> > +}
> > +
> > +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
> > +                               struct kvm_memory_mapping *mapping)
> > +{
> > +        bool added = false;
> > +        int idx, r = 0;
>
> Pointless initialization of 'r'.
>
> > +
> > +        if (mapping->flags & ~(KVM_MEMORY_MAPPING_FLAG_WRITE |
> > +                               KVM_MEMORY_MAPPING_FLAG_EXEC |
> > +                               KVM_MEMORY_MAPPING_FLAG_USER |
> > +                               KVM_MEMORY_MAPPING_FLAG_PRIVATE))
> > +                return -EINVAL;
> > +        if ((mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) &&
> > +            !kvm_arch_has_private_mem(vcpu->kvm))
> > +                return -EINVAL;
> > +
> > +        /* Sanity check */
>
> Pointless comment.
>
> > +        if (!IS_ALIGNED(mapping->source, PAGE_SIZE) ||
> > +            !mapping->nr_pages ||
> > +            mapping->base_gfn + mapping->nr_pages <= mapping->base_gfn)
> > +                return -EINVAL;
> > +
> > +        vcpu_load(vcpu);
> > +        idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +        r = kvm_arch_vcpu_pre_map_memory(vcpu);
>
> This hook is unnecessary, x86's kvm_mmu_reload() is optimized for the happy path
> where the MMU is already loaded. Just make the call from kvm_arch_vcpu_map_memory().
>
> > +        if (r)
> > +                return r;
>
> Which is a good thing, because this leaks the SRCU lock.
>
> > +
> > +        while (mapping->nr_pages) {
> > +                if (signal_pending(current)) {
> > +                        r = -ERESTARTSYS;
>
> Why -ERESTARTSYS instead of -EINTR? The latter is KVM's typical response to a
> pending signal.
>
> > +                        break;
> > +                }
> > +
> > +                if (need_resched())
>
> No need to manually check need_resched(), the below is a _conditional_ resched.
> The reason KVM explicitly checks need_resched() in MMU flows is because KVM needs
> to drop mmu_lock before rescheduling, i.e. calling cond_resched() directly would
> try to schedule() while holding a spinlock.
>
> > +                        cond_resched();
> > +
> > +                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
> > +                if (r)
> > +                        break;
> > +
> > +                added = true;
> > +        }
> > +
> > +        srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +        vcpu_put(vcpu);
> > +
> > +        if (added && mapping->nr_pages > 0)
> > +                r = -EAGAIN;
>
> No, this clobbers 'r', which might hold a fatal error code. I don't see any
> reason for common code to ever force -EAGAIN, it can't possibly know if trying
> again is reasonable.

Thanks for the review. With those included, the hunk is as follows.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d1fd9cb5d037..342269ef9f13 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4419,6 +4419,47 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
         return fd;
 }
 
+__weak int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
+                                    struct kvm_memory_mapping *mapping)
+{
+        return -EOPNOTSUPP;
+}
+
+static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
+                               struct kvm_memory_mapping *mapping)
+{
+        int idx, r;
+
+        if (mapping->flags)
+                return -EINVAL;
+
+        if (!IS_ALIGNED(mapping->source, PAGE_SIZE) ||
+            mapping->base_gfn + mapping->nr_pages <= mapping->base_gfn)
+                return -EINVAL;
+
+        vcpu_load(vcpu);
+        idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+        r = 0;
+        while (mapping->nr_pages) {
+                if (signal_pending(current)) {
+                        r = -EINTR;
+                        break;
+                }
+
+                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
+                if (r)
+                        break;
+
+                cond_resched();
+        }
+
+        srcu_read_unlock(&vcpu->kvm->srcu, idx);
+        vcpu_put(vcpu);
+
+        return r;
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
                            unsigned int ioctl, unsigned long arg)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9807ea98b568..afbed288d625 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2445,4 +2445,8 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
+int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu);
+int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
+                             struct kvm_memory_mapping *mapping);
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2190adbe3002..f5d6b481244f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -917,6 +917,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_MEMORY_ATTRIBUTES 233
 #define KVM_CAP_GUEST_MEMFD 234
 #define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_MAP_MEMORY 236
 
 struct kvm_irq_routing_irqchip {
         __u32 irqchip;
@@ -1548,4 +1549,18 @@ struct kvm_create_guest_memfd {
         __u64 reserved[6];
 };
 
+#define KVM_MAP_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_memory_mapping)
+
+#define KVM_MEMORY_MAPPING_FLAG_WRITE   _BITULL(0)
+#define KVM_MEMORY_MAPPING_FLAG_EXEC    _BITULL(1)
+#define KVM_MEMORY_MAPPING_FLAG_USER    _BITULL(2)
+#define KVM_MEMORY_MAPPING_FLAG_PRIVATE _BITULL(3)
+
+struct kvm_memory_mapping {
+        __u64 base_gfn;
+        __u64 nr_pages;
+        __u64 flags;
+        __u64 source;
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d1fd9cb5d037..d77c9b79d76b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4419,6 +4419,69 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
         return fd;
 }
 
+__weak int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu)
+{
+        return -EOPNOTSUPP;
+}
+
+__weak int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
+                                    struct kvm_memory_mapping *mapping)
+{
+        return -EOPNOTSUPP;
+}
+
+static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
+                               struct kvm_memory_mapping *mapping)
+{
+        bool added = false;
+        int idx, r = 0;
+
+        if (mapping->flags & ~(KVM_MEMORY_MAPPING_FLAG_WRITE |
+                               KVM_MEMORY_MAPPING_FLAG_EXEC |
+                               KVM_MEMORY_MAPPING_FLAG_USER |
+                               KVM_MEMORY_MAPPING_FLAG_PRIVATE))
+                return -EINVAL;
+        if ((mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) &&
+            !kvm_arch_has_private_mem(vcpu->kvm))
+                return -EINVAL;
+
+        /* Sanity check */
+        if (!IS_ALIGNED(mapping->source, PAGE_SIZE) ||
+            !mapping->nr_pages ||
+            mapping->base_gfn + mapping->nr_pages <= mapping->base_gfn)
+                return -EINVAL;
+
+        vcpu_load(vcpu);
+        idx = srcu_read_lock(&vcpu->kvm->srcu);
+        r = kvm_arch_vcpu_pre_map_memory(vcpu);
+        if (r)
+                return r;
+
+        while (mapping->nr_pages) {
+                if (signal_pending(current)) {
+                        r = -ERESTARTSYS;
+                        break;
+                }
+
+                if (need_resched())
+                        cond_resched();
+
+                r = kvm_arch_vcpu_map_memory(vcpu, mapping);
+                if (r)
+                        break;
+
+                added = true;
+        }
+
+        srcu_read_unlock(&vcpu->kvm->srcu, idx);
+        vcpu_put(vcpu);
+
+        if (added && mapping->nr_pages > 0)
+                r = -EAGAIN;
+
+        return r;
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
                            unsigned int ioctl, unsigned long arg)
 {
@@ -4620,6 +4683,17 @@ static long kvm_vcpu_ioctl(struct file *filp,
                 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
                 break;
         }
+        case KVM_MAP_MEMORY: {
+                struct kvm_memory_mapping mapping;
+
+                r = -EFAULT;
+                if (copy_from_user(&mapping, argp, sizeof(mapping)))
+                        break;
+                r = kvm_vcpu_map_memory(vcpu, &mapping);
+                if (copy_to_user(argp, &mapping, sizeof(mapping)))
+                        r = -EFAULT;
+                break;
+        }
         default:
                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
         }
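As a rough userspace sketch of the uapi proposed above (assuming this version
of the series is applied; vm_fd, vcpu_fd and the GFN/size values are
hypothetical, and note that the revised hunk earlier in the thread rejects
nonzero flags, so .flags would then be left 0), a VMM would probe
KVM_CAP_MAP_MEMORY and then issue the ioctl on a vCPU:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: vm_fd and vcpu_fd are hypothetical, already-created KVM fds. */
static int prepopulate_2m(int vm_fd, int vcpu_fd)
{
        struct kvm_memory_mapping mapping = {
                .base_gfn = 0x100000,   /* GPA 4GiB, hypothetical */
                .nr_pages = 512,        /* 2MiB worth of 4KiB pages */
                .flags    = KVM_MEMORY_MAPPING_FLAG_WRITE,
                .source   = 0,
        };

        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAP_MEMORY) <= 0)
                return -ENOTSUP;        /* kernel doesn't have this API */

        if (ioctl(vcpu_fd, KVM_MAP_MEMORY, &mapping) < 0) {
                fprintf(stderr, "KVM_MAP_MEMORY: %llu pages left, errno %d\n",
                        (unsigned long long)mapping.nr_pages, errno);
                return -errno;
        }

        return 0;
}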