Message ID | 20190409141347.3029-2-clg@kaod.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | None | expand |
On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote: > When the VM boots, the CAS negotiation process determines which > interrupt mode to use and invokes a machine reset. At that time, any > links to the previous KVM interrupt device should be 'destroyed' > before the new chosen one is created. > > To perform the necessary cleanups in KVM, we extend the KVM device > interface with a new 'release' operation which is called when the file > descriptor of the device is closed. > > Such operations are defined for the XICS-on-XIVE and the XIVE native > KVM devices. They clear the vCPU interrupt presenters that could be > attached and then destroy the device. > > Signed-off-by: Cédric Le Goater <clg@kaod.org> > --- > include/linux/kvm_host.h | 1 + > arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- > arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ > virt/kvm/kvm_main.c | 13 +++++++ > 4 files changed, 85 insertions(+), 2 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 831d963451d8..3b444620d8fc 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1246,6 +1246,7 @@ struct kvm_device_ops { > long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, > unsigned long arg); > int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); > + void (*release)(struct kvm_device *dev); > }; > > void kvm_device_get(struct kvm_device *dev); > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > index 4d4e1730de84..ba777db849d7 100644 > --- a/arch/powerpc/kvm/book3s_xive.c > +++ b/arch/powerpc/kvm/book3s_xive.c > @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) > void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > { > struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > - struct kvmppc_xive *xive = xc->xive; > + struct kvmppc_xive *xive; > int i; > > + if (!kvmppc_xics_enabled(vcpu)) > + return; > + > + if (!xc) > + return; > + > pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); > > + xive = xc->xive; > + > /* Ensure no interrupt is still routed to that VP */ > xc->valid = false; > kvmppc_xive_disable_vcpu_interrupts(vcpu); > @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > } > /* Free the VP */ > kfree(xc); > + > + /* Cleanup the vcpu */ > + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; > + vcpu->arch.xive_vcpu = NULL; > } > > int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > } > if (xive->kvm != vcpu->kvm) > return -EPERM; > - if (vcpu->arch.irq_type) > + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) > return -EBUSY; > if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { > pr_devel("Duplicate !\n"); > @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) > kfree(dev); > } > > +static void kvmppc_xive_release(struct kvm_device *dev) > +{ > + struct kvmppc_xive *xive = dev->private; > + struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > + int i; > + > + pr_devel("Releasing xive device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + if (atomic_read(&kvm->online_vcpus) != 0) { What prevents online_vcpus from becoming non-zero after this test, but before the kvmppc_xive_free()? Is the test actually necessary? The operations below should be safe even if there are no online cpus, yes? > + /* > + * call kick_all_cpus_sync() to ensure that all CPUs > + * have executed any pending interrupts > + */ > + if (is_kvmppc_hv_enabled(kvm)) > + kick_all_cpus_sync(); > + > + /* > + * TODO: There is still a race window with the early > + * checks in kvmppc_native_connect_vcpu() > + */ That's... not reassuring. What are the consequences of that race, and what do you plan to do about it? > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_cleanup_vcpu(vcpu); > + } > + > + kvmppc_xive_free(dev); > +} > + > struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) > { > struct kvmppc_xive *xive; > @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = { > .name = "kvm-xive", > .create = kvmppc_xive_create, > .init = kvmppc_xive_init, > + .release = kvmppc_xive_release, > .destroy = kvmppc_xive_free, > .set_attr = xive_set_attr, > .get_attr = xive_get_attr, > diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c > index 092db0efe628..629da7bf2a89 100644 > --- a/arch/powerpc/kvm/book3s_xive_native.c > +++ b/arch/powerpc/kvm/book3s_xive_native.c > @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) > kfree(dev); > } > > +static void kvmppc_xive_native_release(struct kvm_device *dev) > +{ > + struct kvmppc_xive *xive = dev->private; > + struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > + int i; > + > + pr_devel("Releasing xive native device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + if (atomic_read(&kvm->online_vcpus) != 0) { Likewise here. > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_native_cleanup_vcpu(vcpu); > + } > + > + kvmppc_xive_native_free(dev); > +} > + > static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) > { > struct kvmppc_xive *xive; > @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = { > .name = "kvm-xive-native", > .create = kvmppc_xive_native_create, > .init = kvmppc_xive_native_init, > + .release = kvmppc_xive_native_release, > .destroy = kvmppc_xive_native_free, > .set_attr = kvmppc_xive_native_set_attr, > .get_attr = kvmppc_xive_native_get_attr, > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index ea2018ae1cd7..ea2619d5ca98 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp) > struct kvm_device *dev = filp->private_data; > struct kvm *kvm = dev->kvm; > > + if (!dev) > + return -ENODEV; > + > + if (dev->kvm != kvm) > + return -EPERM; > + > + if (dev->ops->release) { > + mutex_lock(&kvm->lock); > + list_del(&dev->vm_node); > + dev->ops->release(dev); > + mutex_unlock(&kvm->lock); > + } > + Wasn't there a big comment that explained that release replaced destroy somewhere? > kvm_put_kvm(kvm); > return 0; > }
On Mon, Apr 15, 2019 at 01:32:19PM +1000, David Gibson wrote: > On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote: > > When the VM boots, the CAS negotiation process determines which > > interrupt mode to use and invokes a machine reset. At that time, any > > links to the previous KVM interrupt device should be 'destroyed' > > before the new chosen one is created. > > > > To perform the necessary cleanups in KVM, we extend the KVM device > > interface with a new 'release' operation which is called when the file > > descriptor of the device is closed. > > > > Such operations are defined for the XICS-on-XIVE and the XIVE native > > KVM devices. They clear the vCPU interrupt presenters that could be > > attached and then destroy the device. > > > > Signed-off-by: Cédric Le Goater <clg@kaod.org> > > --- > > include/linux/kvm_host.h | 1 + > > arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- > > arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ > > virt/kvm/kvm_main.c | 13 +++++++ > > 4 files changed, 85 insertions(+), 2 deletions(-) > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index 831d963451d8..3b444620d8fc 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -1246,6 +1246,7 @@ struct kvm_device_ops { > > long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, > > unsigned long arg); > > int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); > > + void (*release)(struct kvm_device *dev); > > }; > > > > void kvm_device_get(struct kvm_device *dev); > > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > > index 4d4e1730de84..ba777db849d7 100644 > > --- a/arch/powerpc/kvm/book3s_xive.c > > +++ b/arch/powerpc/kvm/book3s_xive.c > > @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) > > void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > > { > > struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > > - struct kvmppc_xive *xive = xc->xive; > > + struct kvmppc_xive *xive; > > int i; > > > > + if (!kvmppc_xics_enabled(vcpu)) > > + return; > > + > > + if (!xc) > > + return; > > + > > pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); > > > > + xive = xc->xive; > > + > > /* Ensure no interrupt is still routed to that VP */ > > xc->valid = false; > > kvmppc_xive_disable_vcpu_interrupts(vcpu); > > @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > > } > > /* Free the VP */ > > kfree(xc); > > + > > + /* Cleanup the vcpu */ > > + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; > > + vcpu->arch.xive_vcpu = NULL; > > } > > > > int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > > @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > > } > > if (xive->kvm != vcpu->kvm) > > return -EPERM; > > - if (vcpu->arch.irq_type) > > + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) > > return -EBUSY; > > if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { > > pr_devel("Duplicate !\n"); > > @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) > > kfree(dev); > > } > > > > +static void kvmppc_xive_release(struct kvm_device *dev) > > +{ > > + struct kvmppc_xive *xive = dev->private; > > + struct kvm *kvm = xive->kvm; > > + struct kvm_vcpu *vcpu; > > + int i; > > + > > + pr_devel("Releasing xive device\n"); > > + > > + /* > > + * When releasing the KVM device fd, the vCPUs can still be > > + * running and we should clean up the vCPU interrupt > > + * presenters first. > > + */ > > + if (atomic_read(&kvm->online_vcpus) != 0) { > > What prevents online_vcpus from becoming non-zero after this test, but > before the kvmppc_xive_free()? > > Is the test actually necessary? The operations below should be safe > even if there are no online cpus, yes? Right... Similarly, the kick_all_cpus_sync() without anything having been done before it that we want the other vcpus to notice made me wonder what the point of it was. In other places where it is used we have done something such as set kvm->arch.mmu_ready to 0 first. > > + /* > > + * call kick_all_cpus_sync() to ensure that all CPUs > > + * have executed any pending interrupts > > + */ > > + if (is_kvmppc_hv_enabled(kvm)) > > + kick_all_cpus_sync(); Paul.
On 4/15/19 5:32 AM, David Gibson wrote: > On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote: >> When the VM boots, the CAS negotiation process determines which >> interrupt mode to use and invokes a machine reset. At that time, any >> links to the previous KVM interrupt device should be 'destroyed' >> before the new chosen one is created. >> >> To perform the necessary cleanups in KVM, we extend the KVM device >> interface with a new 'release' operation which is called when the file >> descriptor of the device is closed. >> >> Such operations are defined for the XICS-on-XIVE and the XIVE native >> KVM devices. They clear the vCPU interrupt presenters that could be >> attached and then destroy the device. >> >> Signed-off-by: Cédric Le Goater <clg@kaod.org> >> --- >> include/linux/kvm_host.h | 1 + >> arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- >> arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ >> virt/kvm/kvm_main.c | 13 +++++++ >> 4 files changed, 85 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >> index 831d963451d8..3b444620d8fc 100644 >> --- a/include/linux/kvm_host.h >> +++ b/include/linux/kvm_host.h >> @@ -1246,6 +1246,7 @@ struct kvm_device_ops { >> long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, >> unsigned long arg); >> int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); >> + void (*release)(struct kvm_device *dev); >> }; >> >> void kvm_device_get(struct kvm_device *dev); >> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c >> index 4d4e1730de84..ba777db849d7 100644 >> --- a/arch/powerpc/kvm/book3s_xive.c >> +++ b/arch/powerpc/kvm/book3s_xive.c >> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) >> void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) >> { >> struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; >> - struct kvmppc_xive *xive = xc->xive; >> + struct kvmppc_xive *xive; >> int i; >> >> + if (!kvmppc_xics_enabled(vcpu)) >> + return; >> + >> + if (!xc) >> + return; >> + >> pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); >> >> + xive = xc->xive; >> + >> /* Ensure no interrupt is still routed to that VP */ >> xc->valid = false; >> kvmppc_xive_disable_vcpu_interrupts(vcpu); >> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) >> } >> /* Free the VP */ >> kfree(xc); >> + >> + /* Cleanup the vcpu */ >> + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; >> + vcpu->arch.xive_vcpu = NULL; >> } >> >> int kvmppc_xive_connect_vcpu(struct kvm_device *dev, >> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, >> } >> if (xive->kvm != vcpu->kvm) >> return -EPERM; >> - if (vcpu->arch.irq_type) >> + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) >> return -EBUSY; >> if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { >> pr_devel("Duplicate !\n"); >> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) >> kfree(dev); >> } >> >> +static void kvmppc_xive_release(struct kvm_device *dev) >> +{ >> + struct kvmppc_xive *xive = dev->private; >> + struct kvm *kvm = xive->kvm; >> + struct kvm_vcpu *vcpu; >> + int i; >> + >> + pr_devel("Releasing xive device\n"); >> + >> + /* >> + * When releasing the KVM device fd, the vCPUs can still be >> + * running and we should clean up the vCPU interrupt >> + * presenters first. >> + */ >> + if (atomic_read(&kvm->online_vcpus) != 0) { > > What prevents online_vcpus from becoming non-zero after this test, but > before the kvmppc_xive_free()? I am not sure what you mean. kvmppc_xive_free() is gone with this patch. It has been replaced by kvmppc_xive_release(). > Is the test actually necessary? The operations below should be safe > even if there are no online cpus, yes? ah, yes. kvm_for_each_vcpu() should be safe to use anyhow. >> + /* >> + * call kick_all_cpus_sync() to ensure that all CPUs >> + * have executed any pending interrupts >> + */ >> + if (is_kvmppc_hv_enabled(kvm)) >> + kick_all_cpus_sync();>> + /* >> + * TODO: There is still a race window with the early >> + * checks in kvmppc_native_connect_vcpu() >> + */ > > That's... not reassuring. What are the consequences of that race, a bogus ->xive pointer under the XIVE vCPU > and what do you plan to do about it? I don't think this is true any more with the release operation which will be called by the last user of the device file. Anyhow, xc->xive does not seem very useful (just like xc->valid) We should try to use only vcpu->kvm->arch.xive instead. I will propose some preliminary cleanups before introducing the new release operation. >> + kvm_for_each_vcpu(i, vcpu, kvm) >> + kvmppc_xive_cleanup_vcpu(vcpu); >> + } >> + >> + kvmppc_xive_free(dev); >> +} >> + >> struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) >> { >> struct kvmppc_xive *xive; >> @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = { >> .name = "kvm-xive", >> .create = kvmppc_xive_create, >> .init = kvmppc_xive_init, >> + .release = kvmppc_xive_release, >> .destroy = kvmppc_xive_free, >> .set_attr = xive_set_attr, >> .get_attr = xive_get_attr, >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c >> index 092db0efe628..629da7bf2a89 100644 >> --- a/arch/powerpc/kvm/book3s_xive_native.c >> +++ b/arch/powerpc/kvm/book3s_xive_native.c >> @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) >> kfree(dev); >> } >> >> +static void kvmppc_xive_native_release(struct kvm_device *dev) >> +{ >> + struct kvmppc_xive *xive = dev->private; >> + struct kvm *kvm = xive->kvm; >> + struct kvm_vcpu *vcpu; >> + int i; >> + >> + pr_devel("Releasing xive native device\n"); >> + >> + /* >> + * When releasing the KVM device fd, the vCPUs can still be >> + * running and we should clean up the vCPU interrupt >> + * presenters first. >> + */ >> + if (atomic_read(&kvm->online_vcpus) != 0) { > > Likewise here. > >> + kvm_for_each_vcpu(i, vcpu, kvm) >> + kvmppc_xive_native_cleanup_vcpu(vcpu); >> + } >> + >> + kvmppc_xive_native_free(dev); >> +} >> + >> static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) >> { >> struct kvmppc_xive *xive; >> @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = { >> .name = "kvm-xive-native", >> .create = kvmppc_xive_native_create, >> .init = kvmppc_xive_native_init, >> + .release = kvmppc_xive_native_release, >> .destroy = kvmppc_xive_native_free, >> .set_attr = kvmppc_xive_native_set_attr, >> .get_attr = kvmppc_xive_native_get_attr, >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c >> index ea2018ae1cd7..ea2619d5ca98 100644 >> --- a/virt/kvm/kvm_main.c >> +++ b/virt/kvm/kvm_main.c >> @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp) >> struct kvm_device *dev = filp->private_data; >> struct kvm *kvm = dev->kvm; >> >> + if (!dev) >> + return -ENODEV; >> + >> + if (dev->kvm != kvm) >> + return -EPERM; >> + >> + if (dev->ops->release) { >> + mutex_lock(&kvm->lock); >> + list_del(&dev->vm_node); >> + dev->ops->release(dev); >> + mutex_unlock(&kvm->lock); >> + } >> + > > Wasn't there a big comment that explained that release replaced > destroy somewhere? Yes. I did add a comment in the "V5 errata" series. I should be sending a v6 this week, to clarify all these attempts to solve the device switching. Thanks, C. > >> kvm_put_kvm(kvm); >> return 0; >> } >
On 4/15/19 11:25 AM, Paul Mackerras wrote: > On Mon, Apr 15, 2019 at 01:32:19PM +1000, David Gibson wrote: >> On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote: >>> When the VM boots, the CAS negotiation process determines which >>> interrupt mode to use and invokes a machine reset. At that time, any >>> links to the previous KVM interrupt device should be 'destroyed' >>> before the new chosen one is created. >>> >>> To perform the necessary cleanups in KVM, we extend the KVM device >>> interface with a new 'release' operation which is called when the file >>> descriptor of the device is closed. >>> >>> Such operations are defined for the XICS-on-XIVE and the XIVE native >>> KVM devices. They clear the vCPU interrupt presenters that could be >>> attached and then destroy the device. >>> >>> Signed-off-by: Cédric Le Goater <clg@kaod.org> >>> --- >>> include/linux/kvm_host.h | 1 + >>> arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- >>> arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ >>> virt/kvm/kvm_main.c | 13 +++++++ >>> 4 files changed, 85 insertions(+), 2 deletions(-) >>> >>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >>> index 831d963451d8..3b444620d8fc 100644 >>> --- a/include/linux/kvm_host.h >>> +++ b/include/linux/kvm_host.h >>> @@ -1246,6 +1246,7 @@ struct kvm_device_ops { >>> long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, >>> unsigned long arg); >>> int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); >>> + void (*release)(struct kvm_device *dev); >>> }; >>> >>> void kvm_device_get(struct kvm_device *dev); >>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c >>> index 4d4e1730de84..ba777db849d7 100644 >>> --- a/arch/powerpc/kvm/book3s_xive.c >>> +++ b/arch/powerpc/kvm/book3s_xive.c >>> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) >>> void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) >>> { >>> struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; >>> - struct kvmppc_xive *xive = xc->xive; >>> + struct kvmppc_xive *xive; >>> int i; >>> >>> + if (!kvmppc_xics_enabled(vcpu)) >>> + return; >>> + >>> + if (!xc) >>> + return; >>> + >>> pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); >>> >>> + xive = xc->xive; >>> + >>> /* Ensure no interrupt is still routed to that VP */ >>> xc->valid = false; >>> kvmppc_xive_disable_vcpu_interrupts(vcpu); >>> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) >>> } >>> /* Free the VP */ >>> kfree(xc); >>> + >>> + /* Cleanup the vcpu */ >>> + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; >>> + vcpu->arch.xive_vcpu = NULL; >>> } >>> >>> int kvmppc_xive_connect_vcpu(struct kvm_device *dev, >>> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, >>> } >>> if (xive->kvm != vcpu->kvm) >>> return -EPERM; >>> - if (vcpu->arch.irq_type) >>> + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) >>> return -EBUSY; >>> if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { >>> pr_devel("Duplicate !\n"); >>> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) >>> kfree(dev); >>> } >>> >>> +static void kvmppc_xive_release(struct kvm_device *dev) >>> +{ >>> + struct kvmppc_xive *xive = dev->private; >>> + struct kvm *kvm = xive->kvm; >>> + struct kvm_vcpu *vcpu; >>> + int i; >>> + >>> + pr_devel("Releasing xive device\n"); >>> + >>> + /* >>> + * When releasing the KVM device fd, the vCPUs can still be >>> + * running and we should clean up the vCPU interrupt >>> + * presenters first. >>> + */ >>> + if (atomic_read(&kvm->online_vcpus) != 0) { >> >> What prevents online_vcpus from becoming non-zero after this test, but >> before the kvmppc_xive_free()? >> >> Is the test actually necessary? The operations below should be safe >> even if there are no online cpus, yes? > > Right... Similarly, the kick_all_cpus_sync() without anything having > been done before it that we want the other vcpus to notice made me > wonder what the point of it was. In other places where it is used we > have done something such as set kvm->arch.mmu_ready to 0 first. This part is more dubious. It comes from my understanding of the routine kvm_arch_destroy_vm() that makes sure all IPIs have been handled before clearing the VCPUs structures. commit e17769eb8c89 is a bit cryptic and looks like an optimization that the release operation can ignore ? Thanks, C. >>> + /* >>> + * call kick_all_cpus_sync() to ensure that all CPUs >>> + * have executed any pending interrupts >>> + */ >>> + if (is_kvmppc_hv_enabled(kvm)) >>> + kick_all_cpus_sync(); > > Paul. >
On Mon, Apr 15, 2019 at 03:48:58PM +0200, Cédric Le Goater wrote: > On 4/15/19 5:32 AM, David Gibson wrote: > > On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote: > >> When the VM boots, the CAS negotiation process determines which > >> interrupt mode to use and invokes a machine reset. At that time, any > >> links to the previous KVM interrupt device should be 'destroyed' > >> before the new chosen one is created. > >> > >> To perform the necessary cleanups in KVM, we extend the KVM device > >> interface with a new 'release' operation which is called when the file > >> descriptor of the device is closed. > >> > >> Such operations are defined for the XICS-on-XIVE and the XIVE native > >> KVM devices. They clear the vCPU interrupt presenters that could be > >> attached and then destroy the device. > >> > >> Signed-off-by: Cédric Le Goater <clg@kaod.org> > >> --- > >> include/linux/kvm_host.h | 1 + > >> arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- > >> arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ > >> virt/kvm/kvm_main.c | 13 +++++++ > >> 4 files changed, 85 insertions(+), 2 deletions(-) > >> > >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > >> index 831d963451d8..3b444620d8fc 100644 > >> --- a/include/linux/kvm_host.h > >> +++ b/include/linux/kvm_host.h > >> @@ -1246,6 +1246,7 @@ struct kvm_device_ops { > >> long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, > >> unsigned long arg); > >> int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); > >> + void (*release)(struct kvm_device *dev); > >> }; > >> > >> void kvm_device_get(struct kvm_device *dev); > >> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > >> index 4d4e1730de84..ba777db849d7 100644 > >> --- a/arch/powerpc/kvm/book3s_xive.c > >> +++ b/arch/powerpc/kvm/book3s_xive.c > >> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) > >> void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > >> { > >> struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > >> - struct kvmppc_xive *xive = xc->xive; > >> + struct kvmppc_xive *xive; > >> int i; > >> > >> + if (!kvmppc_xics_enabled(vcpu)) > >> + return; > >> + > >> + if (!xc) > >> + return; > >> + > >> pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); > >> > >> + xive = xc->xive; > >> + > >> /* Ensure no interrupt is still routed to that VP */ > >> xc->valid = false; > >> kvmppc_xive_disable_vcpu_interrupts(vcpu); > >> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > >> } > >> /* Free the VP */ > >> kfree(xc); > >> + > >> + /* Cleanup the vcpu */ > >> + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; > >> + vcpu->arch.xive_vcpu = NULL; > >> } > >> > >> int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > >> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > >> } > >> if (xive->kvm != vcpu->kvm) > >> return -EPERM; > >> - if (vcpu->arch.irq_type) > >> + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) > >> return -EBUSY; > >> if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { > >> pr_devel("Duplicate !\n"); > >> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) > >> kfree(dev); > >> } > >> > >> +static void kvmppc_xive_release(struct kvm_device *dev) > >> +{ > >> + struct kvmppc_xive *xive = dev->private; > >> + struct kvm *kvm = xive->kvm; > >> + struct kvm_vcpu *vcpu; > >> + int i; > >> + > >> + pr_devel("Releasing xive device\n"); > >> + > >> + /* > >> + * When releasing the KVM device fd, the vCPUs can still be > >> + * running and we should clean up the vCPU interrupt > >> + * presenters first. > >> + */ > >> + if (atomic_read(&kvm->online_vcpus) != 0) { > > > > What prevents online_vcpus from becoming non-zero after this test, but > > before the kvmppc_xive_free()? > > I am not sure what you mean. kvmppc_xive_free() is gone with this patch. > It has been replaced by kvmppc_xive_release(). > > > Is the test actually necessary? The operations below should be safe > > even if there are no online cpus, yes? > > ah, yes. kvm_for_each_vcpu() should be safe to use anyhow. > > >> + /* > >> + * call kick_all_cpus_sync() to ensure that all CPUs > >> + * have executed any pending interrupts > >> + */ > >> + if (is_kvmppc_hv_enabled(kvm)) > >> + kick_all_cpus_sync();>> + /* > >> + * TODO: There is still a race window with the early > >> + * checks in kvmppc_native_connect_vcpu() > >> + */ > > > > That's... not reassuring. What are the consequences of that race, > > a bogus ->xive pointer under the XIVE vCPU > > > and what do you plan to do about it? > > I don't think this is true any more with the release operation > which will be called by the last user of the device file. Ok, so the comment needs updating. > Anyhow, xc->xive does not seem very useful (just like xc->valid) > We should try to use only vcpu->kvm->arch.xive instead. > > I will propose some preliminary cleanups before introducing the > new release operation. > > >> + kvm_for_each_vcpu(i, vcpu, kvm) > >> + kvmppc_xive_cleanup_vcpu(vcpu); > >> + } > >> + > >> + kvmppc_xive_free(dev); > >> +} > >> + > >> struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) > >> { > >> struct kvmppc_xive *xive; > >> @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = { > >> .name = "kvm-xive", > >> .create = kvmppc_xive_create, > >> .init = kvmppc_xive_init, > >> + .release = kvmppc_xive_release, > >> .destroy = kvmppc_xive_free, > >> .set_attr = xive_set_attr, > >> .get_attr = xive_get_attr, > >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c > >> index 092db0efe628..629da7bf2a89 100644 > >> --- a/arch/powerpc/kvm/book3s_xive_native.c > >> +++ b/arch/powerpc/kvm/book3s_xive_native.c > >> @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) > >> kfree(dev); > >> } > >> > >> +static void kvmppc_xive_native_release(struct kvm_device *dev) > >> +{ > >> + struct kvmppc_xive *xive = dev->private; > >> + struct kvm *kvm = xive->kvm; > >> + struct kvm_vcpu *vcpu; > >> + int i; > >> + > >> + pr_devel("Releasing xive native device\n"); > >> + > >> + /* > >> + * When releasing the KVM device fd, the vCPUs can still be > >> + * running and we should clean up the vCPU interrupt > >> + * presenters first. > >> + */ > >> + if (atomic_read(&kvm->online_vcpus) != 0) { > > > > Likewise here. > > > >> + kvm_for_each_vcpu(i, vcpu, kvm) > >> + kvmppc_xive_native_cleanup_vcpu(vcpu); > >> + } > >> + > >> + kvmppc_xive_native_free(dev); > >> +} > >> + > >> static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) > >> { > >> struct kvmppc_xive *xive; > >> @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = { > >> .name = "kvm-xive-native", > >> .create = kvmppc_xive_native_create, > >> .init = kvmppc_xive_native_init, > >> + .release = kvmppc_xive_native_release, > >> .destroy = kvmppc_xive_native_free, > >> .set_attr = kvmppc_xive_native_set_attr, > >> .get_attr = kvmppc_xive_native_get_attr, > >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > >> index ea2018ae1cd7..ea2619d5ca98 100644 > >> --- a/virt/kvm/kvm_main.c > >> +++ b/virt/kvm/kvm_main.c > >> @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp) > >> struct kvm_device *dev = filp->private_data; > >> struct kvm *kvm = dev->kvm; > >> > >> + if (!dev) > >> + return -ENODEV; > >> + > >> + if (dev->kvm != kvm) > >> + return -EPERM; > >> + > >> + if (dev->ops->release) { > >> + mutex_lock(&kvm->lock); > >> + list_del(&dev->vm_node); > >> + dev->ops->release(dev); > >> + mutex_unlock(&kvm->lock); > >> + } > >> + > > > > Wasn't there a big comment that explained that release replaced > > destroy somewhere? > > Yes. I did add a comment in the "V5 errata" series. > > I should be sending a v6 this week, to clarify all these attempts > to solve the device switching. > > Thanks, > > C. > > > > >> kvm_put_kvm(kvm); > >> return 0; > >> } > > >
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 831d963451d8..3b444620d8fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1246,6 +1246,7 @@ struct kvm_device_ops { long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, unsigned long arg); int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); + void (*release)(struct kvm_device *dev); }; void kvm_device_get(struct kvm_device *dev); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 4d4e1730de84..ba777db849d7 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - struct kvmppc_xive *xive = xc->xive; + struct kvmppc_xive *xive; int i; + if (!kvmppc_xics_enabled(vcpu)) + return; + + if (!xc) + return; + pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); + xive = xc->xive; + /* Ensure no interrupt is still routed to that VP */ xc->valid = false; kvmppc_xive_disable_vcpu_interrupts(vcpu); @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) } /* Free the VP */ kfree(xc); + + /* Cleanup the vcpu */ + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; + vcpu->arch.xive_vcpu = NULL; } int kvmppc_xive_connect_vcpu(struct kvm_device *dev, @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, } if (xive->kvm != vcpu->kvm) return -EPERM; - if (vcpu->arch.irq_type) + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { pr_devel("Duplicate !\n"); @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) kfree(dev); } +static void kvmppc_xive_release(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = dev->private; + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + int i; + + pr_devel("Releasing xive device\n"); + + /* + * When releasing the KVM device fd, the vCPUs can still be + * running and we should clean up the vCPU interrupt + * presenters first. + */ + if (atomic_read(&kvm->online_vcpus) != 0) { + /* + * call kick_all_cpus_sync() to ensure that all CPUs + * have executed any pending interrupts + */ + if (is_kvmppc_hv_enabled(kvm)) + kick_all_cpus_sync(); + + /* + * TODO: There is still a race window with the early + * checks in kvmppc_native_connect_vcpu() + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_xive_cleanup_vcpu(vcpu); + } + + kvmppc_xive_free(dev); +} + struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) { struct kvmppc_xive *xive; @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = { .name = "kvm-xive", .create = kvmppc_xive_create, .init = kvmppc_xive_init, + .release = kvmppc_xive_release, .destroy = kvmppc_xive_free, .set_attr = xive_set_attr, .get_attr = xive_get_attr, diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 092db0efe628..629da7bf2a89 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) kfree(dev); } +static void kvmppc_xive_native_release(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = dev->private; + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + int i; + + pr_devel("Releasing xive native device\n"); + + /* + * When releasing the KVM device fd, the vCPUs can still be + * running and we should clean up the vCPU interrupt + * presenters first. + */ + if (atomic_read(&kvm->online_vcpus) != 0) { + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_xive_native_cleanup_vcpu(vcpu); + } + + kvmppc_xive_native_free(dev); +} + static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) { struct kvmppc_xive *xive; @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = { .name = "kvm-xive-native", .create = kvmppc_xive_native_create, .init = kvmppc_xive_native_init, + .release = kvmppc_xive_native_release, .destroy = kvmppc_xive_native_free, .set_attr = kvmppc_xive_native_set_attr, .get_attr = kvmppc_xive_native_get_attr, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ea2018ae1cd7..ea2619d5ca98 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp) struct kvm_device *dev = filp->private_data; struct kvm *kvm = dev->kvm; + if (!dev) + return -ENODEV; + + if (dev->kvm != kvm) + return -EPERM; + + if (dev->ops->release) { + mutex_lock(&kvm->lock); + list_del(&dev->vm_node); + dev->ops->release(dev); + mutex_unlock(&kvm->lock); + } + kvm_put_kvm(kvm); return 0; }
When the VM boots, the CAS negotiation process determines which interrupt mode to use and invokes a machine reset. At that time, any links to the previous KVM interrupt device should be 'destroyed' before the new chosen one is created. To perform the necessary cleanups in KVM, we extend the KVM device interface with a new 'release' operation which is called when the file descriptor of the device is closed. Such operations are defined for the XICS-on-XIVE and the XIVE native KVM devices. They clear the vCPU interrupt presenters that could be attached and then destroy the device. Signed-off-by: Cédric Le Goater <clg@kaod.org> --- include/linux/kvm_host.h | 1 + arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ virt/kvm/kvm_main.c | 13 +++++++ 4 files changed, 85 insertions(+), 2 deletions(-)