diff mbox series

[RFC,v4,17/17] KVM: PPC: Book3S HV: XIVE: introduce a 'release' device operation

Message ID 20190409141347.3029-2-clg@kaod.org (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Cédric Le Goater April 9, 2019, 2:13 p.m. UTC
When the VM boots, the CAS negotiation process determines which
interrupt mode to use and invokes a machine reset. At that time, any
links to the previous KVM interrupt device should be 'destroyed'
before the new chosen one is created.

To perform the necessary cleanups in KVM, we extend the KVM device
interface with a new 'release' operation which is called when the file
descriptor of the device is closed.

Such operations are defined for the XICS-on-XIVE and the XIVE native
KVM devices. They clear the vCPU interrupt presenters that could be
attached and then destroy the device.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/linux/kvm_host.h              |  1 +
 arch/powerpc/kvm/book3s_xive.c        | 50 +++++++++++++++++++++++++--
 arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++
 virt/kvm/kvm_main.c                   | 13 +++++++
 4 files changed, 85 insertions(+), 2 deletions(-)

Comments

David Gibson April 15, 2019, 3:32 a.m. UTC | #1
On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote:
> When the VM boots, the CAS negotiation process determines which
> interrupt mode to use and invokes a machine reset. At that time, any
> links to the previous KVM interrupt device should be 'destroyed'
> before the new chosen one is created.
> 
> To perform the necessary cleanups in KVM, we extend the KVM device
> interface with a new 'release' operation which is called when the file
> descriptor of the device is closed.
> 
> Such operations are defined for the XICS-on-XIVE and the XIVE native
> KVM devices. They clear the vCPU interrupt presenters that could be
> attached and then destroy the device.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  include/linux/kvm_host.h              |  1 +
>  arch/powerpc/kvm/book3s_xive.c        | 50 +++++++++++++++++++++++++--
>  arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++
>  virt/kvm/kvm_main.c                   | 13 +++++++
>  4 files changed, 85 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 831d963451d8..3b444620d8fc 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1246,6 +1246,7 @@ struct kvm_device_ops {
>  	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
>  		      unsigned long arg);
>  	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
> +	void (*release)(struct kvm_device *dev);
>  };
>  
>  void kvm_device_get(struct kvm_device *dev);
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index 4d4e1730de84..ba777db849d7 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
>  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>  {
>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> -	struct kvmppc_xive *xive = xc->xive;
> +	struct kvmppc_xive *xive;
>  	int i;
>  
> +	if (!kvmppc_xics_enabled(vcpu))
> +		return;
> +
> +	if (!xc)
> +		return;
> +
>  	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
>  
> +	xive = xc->xive;
> +
>  	/* Ensure no interrupt is still routed to that VP */
>  	xc->valid = false;
>  	kvmppc_xive_disable_vcpu_interrupts(vcpu);
> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>  	}
>  	/* Free the VP */
>  	kfree(xc);
> +
> +	/* Cleanup the vcpu */
> +	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
> +	vcpu->arch.xive_vcpu = NULL;
>  }
>  
>  int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>  	}
>  	if (xive->kvm != vcpu->kvm)
>  		return -EPERM;
> -	if (vcpu->arch.irq_type)
> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>  		return -EBUSY;
>  	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
>  		pr_devel("Duplicate !\n");
> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev)
>  	kfree(dev);
>  }
>  
> +static void kvmppc_xive_release(struct kvm_device *dev)
> +{
> +	struct kvmppc_xive *xive = dev->private;
> +	struct kvm *kvm = xive->kvm;
> +	struct kvm_vcpu *vcpu;
> +	int i;
> +
> +	pr_devel("Releasing xive device\n");
> +
> +	/*
> +	 * When releasing the KVM device fd, the vCPUs can still be
> +	 * running and we should clean up the vCPU interrupt
> +	 * presenters first.
> +	 */
> +	if (atomic_read(&kvm->online_vcpus) != 0) {

What prevents online_vcpus from becoming non-zero after this test, but
before the kvmppc_xive_free()?

Is the test actually necessary?  The operations below should be safe
even if there are no online cpus, yes?

> +		/*
> +		 * call kick_all_cpus_sync() to ensure that all CPUs
> +		 * have executed any pending interrupts
> +		 */
> +		if (is_kvmppc_hv_enabled(kvm))
> +			kick_all_cpus_sync();
> +
> +		/*
> +		 * TODO: There is still a race window with the early
> +		 * checks in kvmppc_native_connect_vcpu()
> +		 */

That's... not reassuring.  What are the consequences of that race, and
what do you plan to do about it?

> +		kvm_for_each_vcpu(i, vcpu, kvm)
> +			kvmppc_xive_cleanup_vcpu(vcpu);
> +	}
> +
> +	kvmppc_xive_free(dev);
> +}
> +
>  struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
>  {
>  	struct kvmppc_xive *xive;
> @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = {
>  	.name = "kvm-xive",
>  	.create = kvmppc_xive_create,
>  	.init = kvmppc_xive_init,
> +	.release = kvmppc_xive_release,
>  	.destroy = kvmppc_xive_free,
>  	.set_attr = xive_set_attr,
>  	.get_attr = xive_get_attr,
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> index 092db0efe628..629da7bf2a89 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
>  	kfree(dev);
>  }
>  
> +static void kvmppc_xive_native_release(struct kvm_device *dev)
> +{
> +	struct kvmppc_xive *xive = dev->private;
> +	struct kvm *kvm = xive->kvm;
> +	struct kvm_vcpu *vcpu;
> +	int i;
> +
> +	pr_devel("Releasing xive native device\n");
> +
> +	/*
> +	 * When releasing the KVM device fd, the vCPUs can still be
> +	 * running and we should clean up the vCPU interrupt
> +	 * presenters first.
> +	 */
> +	if (atomic_read(&kvm->online_vcpus) != 0) {

Likewise here.

> +		kvm_for_each_vcpu(i, vcpu, kvm)
> +			kvmppc_xive_native_cleanup_vcpu(vcpu);
> +	}
> +
> +	kvmppc_xive_native_free(dev);
> +}
> +
>  static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
>  {
>  	struct kvmppc_xive *xive;
> @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = {
>  	.name = "kvm-xive-native",
>  	.create = kvmppc_xive_native_create,
>  	.init = kvmppc_xive_native_init,
> +	.release = kvmppc_xive_native_release,
>  	.destroy = kvmppc_xive_native_free,
>  	.set_attr = kvmppc_xive_native_set_attr,
>  	.get_attr = kvmppc_xive_native_get_attr,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ea2018ae1cd7..ea2619d5ca98 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
>  	struct kvm_device *dev = filp->private_data;
>  	struct kvm *kvm = dev->kvm;
>  
> +	if (!dev)
> +		return -ENODEV;
> +
> +	if (dev->kvm != kvm)
> +		return -EPERM;
> +
> +	if (dev->ops->release) {
> +		mutex_lock(&kvm->lock);
> +		list_del(&dev->vm_node);
> +		dev->ops->release(dev);
> +		mutex_unlock(&kvm->lock);
> +	}
> +

Wasn't there a big comment that explained that release replaced
destroy somewhere?

>  	kvm_put_kvm(kvm);
>  	return 0;
>  }
Paul Mackerras April 15, 2019, 9:25 a.m. UTC | #2
On Mon, Apr 15, 2019 at 01:32:19PM +1000, David Gibson wrote:
> On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote:
> > When the VM boots, the CAS negotiation process determines which
> > interrupt mode to use and invokes a machine reset. At that time, any
> > links to the previous KVM interrupt device should be 'destroyed'
> > before the new chosen one is created.
> > 
> > To perform the necessary cleanups in KVM, we extend the KVM device
> > interface with a new 'release' operation which is called when the file
> > descriptor of the device is closed.
> > 
> > Such operations are defined for the XICS-on-XIVE and the XIVE native
> > KVM devices. They clear the vCPU interrupt presenters that could be
> > attached and then destroy the device.
> > 
> > Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > ---
> >  include/linux/kvm_host.h              |  1 +
> >  arch/powerpc/kvm/book3s_xive.c        | 50 +++++++++++++++++++++++++--
> >  arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++
> >  virt/kvm/kvm_main.c                   | 13 +++++++
> >  4 files changed, 85 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 831d963451d8..3b444620d8fc 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1246,6 +1246,7 @@ struct kvm_device_ops {
> >  	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
> >  		      unsigned long arg);
> >  	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
> > +	void (*release)(struct kvm_device *dev);
> >  };
> >  
> >  void kvm_device_get(struct kvm_device *dev);
> > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> > index 4d4e1730de84..ba777db849d7 100644
> > --- a/arch/powerpc/kvm/book3s_xive.c
> > +++ b/arch/powerpc/kvm/book3s_xive.c
> > @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
> >  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
> >  {
> >  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> > -	struct kvmppc_xive *xive = xc->xive;
> > +	struct kvmppc_xive *xive;
> >  	int i;
> >  
> > +	if (!kvmppc_xics_enabled(vcpu))
> > +		return;
> > +
> > +	if (!xc)
> > +		return;
> > +
> >  	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
> >  
> > +	xive = xc->xive;
> > +
> >  	/* Ensure no interrupt is still routed to that VP */
> >  	xc->valid = false;
> >  	kvmppc_xive_disable_vcpu_interrupts(vcpu);
> > @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
> >  	}
> >  	/* Free the VP */
> >  	kfree(xc);
> > +
> > +	/* Cleanup the vcpu */
> > +	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
> > +	vcpu->arch.xive_vcpu = NULL;
> >  }
> >  
> >  int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> > @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> >  	}
> >  	if (xive->kvm != vcpu->kvm)
> >  		return -EPERM;
> > -	if (vcpu->arch.irq_type)
> > +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
> >  		return -EBUSY;
> >  	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
> >  		pr_devel("Duplicate !\n");
> > @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev)
> >  	kfree(dev);
> >  }
> >  
> > +static void kvmppc_xive_release(struct kvm_device *dev)
> > +{
> > +	struct kvmppc_xive *xive = dev->private;
> > +	struct kvm *kvm = xive->kvm;
> > +	struct kvm_vcpu *vcpu;
> > +	int i;
> > +
> > +	pr_devel("Releasing xive device\n");
> > +
> > +	/*
> > +	 * When releasing the KVM device fd, the vCPUs can still be
> > +	 * running and we should clean up the vCPU interrupt
> > +	 * presenters first.
> > +	 */
> > +	if (atomic_read(&kvm->online_vcpus) != 0) {
> 
> What prevents online_vcpus from becoming non-zero after this test, but
> before the kvmppc_xive_free()?
> 
> Is the test actually necessary?  The operations below should be safe
> even if there are no online cpus, yes?

Right... Similarly, the kick_all_cpus_sync() without anything having
been done before it that we want the other vcpus to notice made me
wonder what the point of it was.  In other places where it is used we
have done something such as set kvm->arch.mmu_ready to 0 first.

> > +		/*
> > +		 * call kick_all_cpus_sync() to ensure that all CPUs
> > +		 * have executed any pending interrupts
> > +		 */
> > +		if (is_kvmppc_hv_enabled(kvm))
> > +			kick_all_cpus_sync();

Paul.
Cédric Le Goater April 15, 2019, 1:48 p.m. UTC | #3
On 4/15/19 5:32 AM, David Gibson wrote:
> On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote:
>> When the VM boots, the CAS negotiation process determines which
>> interrupt mode to use and invokes a machine reset. At that time, any
>> links to the previous KVM interrupt device should be 'destroyed'
>> before the new chosen one is created.
>>
>> To perform the necessary cleanups in KVM, we extend the KVM device
>> interface with a new 'release' operation which is called when the file
>> descriptor of the device is closed.
>>
>> Such operations are defined for the XICS-on-XIVE and the XIVE native
>> KVM devices. They clear the vCPU interrupt presenters that could be
>> attached and then destroy the device.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  include/linux/kvm_host.h              |  1 +
>>  arch/powerpc/kvm/book3s_xive.c        | 50 +++++++++++++++++++++++++--
>>  arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++
>>  virt/kvm/kvm_main.c                   | 13 +++++++
>>  4 files changed, 85 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 831d963451d8..3b444620d8fc 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -1246,6 +1246,7 @@ struct kvm_device_ops {
>>  	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
>>  		      unsigned long arg);
>>  	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
>> +	void (*release)(struct kvm_device *dev);
>>  };
>>  
>>  void kvm_device_get(struct kvm_device *dev);
>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>> index 4d4e1730de84..ba777db849d7 100644
>> --- a/arch/powerpc/kvm/book3s_xive.c
>> +++ b/arch/powerpc/kvm/book3s_xive.c
>> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
>>  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>>  {
>>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> -	struct kvmppc_xive *xive = xc->xive;
>> +	struct kvmppc_xive *xive;
>>  	int i;
>>  
>> +	if (!kvmppc_xics_enabled(vcpu))
>> +		return;
>> +
>> +	if (!xc)
>> +		return;
>> +
>>  	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
>>  
>> +	xive = xc->xive;
>> +
>>  	/* Ensure no interrupt is still routed to that VP */
>>  	xc->valid = false;
>>  	kvmppc_xive_disable_vcpu_interrupts(vcpu);
>> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>>  	}
>>  	/* Free the VP */
>>  	kfree(xc);
>> +
>> +	/* Cleanup the vcpu */
>> +	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
>> +	vcpu->arch.xive_vcpu = NULL;
>>  }
>>  
>>  int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>>  	}
>>  	if (xive->kvm != vcpu->kvm)
>>  		return -EPERM;
>> -	if (vcpu->arch.irq_type)
>> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>>  		return -EBUSY;
>>  	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
>>  		pr_devel("Duplicate !\n");
>> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev)
>>  	kfree(dev);
>>  }
>>  
>> +static void kvmppc_xive_release(struct kvm_device *dev)
>> +{
>> +	struct kvmppc_xive *xive = dev->private;
>> +	struct kvm *kvm = xive->kvm;
>> +	struct kvm_vcpu *vcpu;
>> +	int i;
>> +
>> +	pr_devel("Releasing xive device\n");
>> +
>> +	/*
>> +	 * When releasing the KVM device fd, the vCPUs can still be
>> +	 * running and we should clean up the vCPU interrupt
>> +	 * presenters first.
>> +	 */
>> +	if (atomic_read(&kvm->online_vcpus) != 0) {
> 
> What prevents online_vcpus from becoming non-zero after this test, but
> before the kvmppc_xive_free()?

I am not sure what you mean. kvmppc_xive_free() is gone with this patch. 
It has been replaced by kvmppc_xive_release().

> Is the test actually necessary?  The operations below should be safe
> even if there are no online cpus, yes?

ah, yes. kvm_for_each_vcpu() should be safe to use anyhow.

>> +		/*
>> +		 * call kick_all_cpus_sync() to ensure that all CPUs
>> +		 * have executed any pending interrupts
>> +		 */
>> +		if (is_kvmppc_hv_enabled(kvm))
>> +			kick_all_cpus_sync();>> +		/*
>> +		 * TODO: There is still a race window with the early
>> +		 * checks in kvmppc_native_connect_vcpu()
>> +		 */
> 
> That's... not reassuring.  What are the consequences of that race, 

a bogus ->xive pointer under the XIVE vCPU

> and what do you plan to do about it?

I don't think this is true any more with the release operation
which will be called by the last user of the device file. 

Anyhow, xc->xive does not seem very useful (just like xc->valid) 
We should try to use only vcpu->kvm->arch.xive instead.

I will propose some preliminary cleanups before introducing the
new release operation.

>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			kvmppc_xive_cleanup_vcpu(vcpu);
>> +	}
>> +
>> +	kvmppc_xive_free(dev);
>> +}
>> +
>>  struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
>>  {
>>  	struct kvmppc_xive *xive;
>> @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = {
>>  	.name = "kvm-xive",
>>  	.create = kvmppc_xive_create,
>>  	.init = kvmppc_xive_init,
>> +	.release = kvmppc_xive_release,
>>  	.destroy = kvmppc_xive_free,
>>  	.set_attr = xive_set_attr,
>>  	.get_attr = xive_get_attr,
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index 092db0efe628..629da7bf2a89 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
>>  	kfree(dev);
>>  }
>>  
>> +static void kvmppc_xive_native_release(struct kvm_device *dev)
>> +{
>> +	struct kvmppc_xive *xive = dev->private;
>> +	struct kvm *kvm = xive->kvm;
>> +	struct kvm_vcpu *vcpu;
>> +	int i;
>> +
>> +	pr_devel("Releasing xive native device\n");
>> +
>> +	/*
>> +	 * When releasing the KVM device fd, the vCPUs can still be
>> +	 * running and we should clean up the vCPU interrupt
>> +	 * presenters first.
>> +	 */
>> +	if (atomic_read(&kvm->online_vcpus) != 0) {
> 
> Likewise here.
> 
>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			kvmppc_xive_native_cleanup_vcpu(vcpu);
>> +	}
>> +
>> +	kvmppc_xive_native_free(dev);
>> +}
>> +
>>  static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
>>  {
>>  	struct kvmppc_xive *xive;
>> @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = {
>>  	.name = "kvm-xive-native",
>>  	.create = kvmppc_xive_native_create,
>>  	.init = kvmppc_xive_native_init,
>> +	.release = kvmppc_xive_native_release,
>>  	.destroy = kvmppc_xive_native_free,
>>  	.set_attr = kvmppc_xive_native_set_attr,
>>  	.get_attr = kvmppc_xive_native_get_attr,
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index ea2018ae1cd7..ea2619d5ca98 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
>>  	struct kvm_device *dev = filp->private_data;
>>  	struct kvm *kvm = dev->kvm;
>>  
>> +	if (!dev)
>> +		return -ENODEV;
>> +
>> +	if (dev->kvm != kvm)
>> +		return -EPERM;
>> +
>> +	if (dev->ops->release) {
>> +		mutex_lock(&kvm->lock);
>> +		list_del(&dev->vm_node);
>> +		dev->ops->release(dev);
>> +		mutex_unlock(&kvm->lock);
>> +	}
>> +
> 
> Wasn't there a big comment that explained that release replaced
> destroy somewhere?

Yes. I did add a comment in the "V5 errata" series. 

I should be sending a v6 this week, to clarify all these attempts 
to solve the device switching.

Thanks,

C. 

> 
>>  	kvm_put_kvm(kvm);
>>  	return 0;
>>  }
>
Cédric Le Goater April 15, 2019, 1:56 p.m. UTC | #4
On 4/15/19 11:25 AM, Paul Mackerras wrote:
> On Mon, Apr 15, 2019 at 01:32:19PM +1000, David Gibson wrote:
>> On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote:
>>> When the VM boots, the CAS negotiation process determines which
>>> interrupt mode to use and invokes a machine reset. At that time, any
>>> links to the previous KVM interrupt device should be 'destroyed'
>>> before the new chosen one is created.
>>>
>>> To perform the necessary cleanups in KVM, we extend the KVM device
>>> interface with a new 'release' operation which is called when the file
>>> descriptor of the device is closed.
>>>
>>> Such operations are defined for the XICS-on-XIVE and the XIVE native
>>> KVM devices. They clear the vCPU interrupt presenters that could be
>>> attached and then destroy the device.
>>>
>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>> ---
>>>  include/linux/kvm_host.h              |  1 +
>>>  arch/powerpc/kvm/book3s_xive.c        | 50 +++++++++++++++++++++++++--
>>>  arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++
>>>  virt/kvm/kvm_main.c                   | 13 +++++++
>>>  4 files changed, 85 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>> index 831d963451d8..3b444620d8fc 100644
>>> --- a/include/linux/kvm_host.h
>>> +++ b/include/linux/kvm_host.h
>>> @@ -1246,6 +1246,7 @@ struct kvm_device_ops {
>>>  	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
>>>  		      unsigned long arg);
>>>  	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
>>> +	void (*release)(struct kvm_device *dev);
>>>  };
>>>  
>>>  void kvm_device_get(struct kvm_device *dev);
>>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>>> index 4d4e1730de84..ba777db849d7 100644
>>> --- a/arch/powerpc/kvm/book3s_xive.c
>>> +++ b/arch/powerpc/kvm/book3s_xive.c
>>> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
>>>  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>>>  {
>>>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>>> -	struct kvmppc_xive *xive = xc->xive;
>>> +	struct kvmppc_xive *xive;
>>>  	int i;
>>>  
>>> +	if (!kvmppc_xics_enabled(vcpu))
>>> +		return;
>>> +
>>> +	if (!xc)
>>> +		return;
>>> +
>>>  	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
>>>  
>>> +	xive = xc->xive;
>>> +
>>>  	/* Ensure no interrupt is still routed to that VP */
>>>  	xc->valid = false;
>>>  	kvmppc_xive_disable_vcpu_interrupts(vcpu);
>>> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>>>  	}
>>>  	/* Free the VP */
>>>  	kfree(xc);
>>> +
>>> +	/* Cleanup the vcpu */
>>> +	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
>>> +	vcpu->arch.xive_vcpu = NULL;
>>>  }
>>>  
>>>  int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>>> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>>>  	}
>>>  	if (xive->kvm != vcpu->kvm)
>>>  		return -EPERM;
>>> -	if (vcpu->arch.irq_type)
>>> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>>>  		return -EBUSY;
>>>  	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
>>>  		pr_devel("Duplicate !\n");
>>> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev)
>>>  	kfree(dev);
>>>  }
>>>  
>>> +static void kvmppc_xive_release(struct kvm_device *dev)
>>> +{
>>> +	struct kvmppc_xive *xive = dev->private;
>>> +	struct kvm *kvm = xive->kvm;
>>> +	struct kvm_vcpu *vcpu;
>>> +	int i;
>>> +
>>> +	pr_devel("Releasing xive device\n");
>>> +
>>> +	/*
>>> +	 * When releasing the KVM device fd, the vCPUs can still be
>>> +	 * running and we should clean up the vCPU interrupt
>>> +	 * presenters first.
>>> +	 */
>>> +	if (atomic_read(&kvm->online_vcpus) != 0) {
>>
>> What prevents online_vcpus from becoming non-zero after this test, but
>> before the kvmppc_xive_free()?
>>
>> Is the test actually necessary?  The operations below should be safe
>> even if there are no online cpus, yes?
> 
> Right... Similarly, the kick_all_cpus_sync() without anything having
> been done before it that we want the other vcpus to notice made me
> wonder what the point of it was.  In other places where it is used we
> have done something such as set kvm->arch.mmu_ready to 0 first.

This part is more dubious. It comes from my understanding of the routine 
kvm_arch_destroy_vm() that makes sure all IPIs have been handled before 
clearing  the VCPUs structures. commit e17769eb8c89 is a bit cryptic and 
looks like an optimization that the release operation can ignore ?

Thanks,

C.   
 
>>> +		/*
>>> +		 * call kick_all_cpus_sync() to ensure that all CPUs
>>> +		 * have executed any pending interrupts
>>> +		 */
>>> +		if (is_kvmppc_hv_enabled(kvm))
>>> +			kick_all_cpus_sync();
> 
> Paul.
>
David Gibson April 17, 2019, 2:05 a.m. UTC | #5
On Mon, Apr 15, 2019 at 03:48:58PM +0200, Cédric Le Goater wrote:
> On 4/15/19 5:32 AM, David Gibson wrote:
> > On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote:
> >> When the VM boots, the CAS negotiation process determines which
> >> interrupt mode to use and invokes a machine reset. At that time, any
> >> links to the previous KVM interrupt device should be 'destroyed'
> >> before the new chosen one is created.
> >>
> >> To perform the necessary cleanups in KVM, we extend the KVM device
> >> interface with a new 'release' operation which is called when the file
> >> descriptor of the device is closed.
> >>
> >> Such operations are defined for the XICS-on-XIVE and the XIVE native
> >> KVM devices. They clear the vCPU interrupt presenters that could be
> >> attached and then destroy the device.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  include/linux/kvm_host.h              |  1 +
> >>  arch/powerpc/kvm/book3s_xive.c        | 50 +++++++++++++++++++++++++--
> >>  arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++
> >>  virt/kvm/kvm_main.c                   | 13 +++++++
> >>  4 files changed, 85 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >> index 831d963451d8..3b444620d8fc 100644
> >> --- a/include/linux/kvm_host.h
> >> +++ b/include/linux/kvm_host.h
> >> @@ -1246,6 +1246,7 @@ struct kvm_device_ops {
> >>  	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
> >>  		      unsigned long arg);
> >>  	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
> >> +	void (*release)(struct kvm_device *dev);
> >>  };
> >>  
> >>  void kvm_device_get(struct kvm_device *dev);
> >> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> >> index 4d4e1730de84..ba777db849d7 100644
> >> --- a/arch/powerpc/kvm/book3s_xive.c
> >> +++ b/arch/powerpc/kvm/book3s_xive.c
> >> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
> >>  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
> >>  {
> >>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> >> -	struct kvmppc_xive *xive = xc->xive;
> >> +	struct kvmppc_xive *xive;
> >>  	int i;
> >>  
> >> +	if (!kvmppc_xics_enabled(vcpu))
> >> +		return;
> >> +
> >> +	if (!xc)
> >> +		return;
> >> +
> >>  	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
> >>  
> >> +	xive = xc->xive;
> >> +
> >>  	/* Ensure no interrupt is still routed to that VP */
> >>  	xc->valid = false;
> >>  	kvmppc_xive_disable_vcpu_interrupts(vcpu);
> >> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
> >>  	}
> >>  	/* Free the VP */
> >>  	kfree(xc);
> >> +
> >> +	/* Cleanup the vcpu */
> >> +	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
> >> +	vcpu->arch.xive_vcpu = NULL;
> >>  }
> >>  
> >>  int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> >> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> >>  	}
> >>  	if (xive->kvm != vcpu->kvm)
> >>  		return -EPERM;
> >> -	if (vcpu->arch.irq_type)
> >> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
> >>  		return -EBUSY;
> >>  	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
> >>  		pr_devel("Duplicate !\n");
> >> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev)
> >>  	kfree(dev);
> >>  }
> >>  
> >> +static void kvmppc_xive_release(struct kvm_device *dev)
> >> +{
> >> +	struct kvmppc_xive *xive = dev->private;
> >> +	struct kvm *kvm = xive->kvm;
> >> +	struct kvm_vcpu *vcpu;
> >> +	int i;
> >> +
> >> +	pr_devel("Releasing xive device\n");
> >> +
> >> +	/*
> >> +	 * When releasing the KVM device fd, the vCPUs can still be
> >> +	 * running and we should clean up the vCPU interrupt
> >> +	 * presenters first.
> >> +	 */
> >> +	if (atomic_read(&kvm->online_vcpus) != 0) {
> > 
> > What prevents online_vcpus from becoming non-zero after this test, but
> > before the kvmppc_xive_free()?
> 
> I am not sure what you mean. kvmppc_xive_free() is gone with this patch. 
> It has been replaced by kvmppc_xive_release().
> 
> > Is the test actually necessary?  The operations below should be safe
> > even if there are no online cpus, yes?
> 
> ah, yes. kvm_for_each_vcpu() should be safe to use anyhow.
> 
> >> +		/*
> >> +		 * call kick_all_cpus_sync() to ensure that all CPUs
> >> +		 * have executed any pending interrupts
> >> +		 */
> >> +		if (is_kvmppc_hv_enabled(kvm))
> >> +			kick_all_cpus_sync();>> +		/*
> >> +		 * TODO: There is still a race window with the early
> >> +		 * checks in kvmppc_native_connect_vcpu()
> >> +		 */
> > 
> > That's... not reassuring.  What are the consequences of that race, 
> 
> a bogus ->xive pointer under the XIVE vCPU
> 
> > and what do you plan to do about it?
> 
> I don't think this is true any more with the release operation
> which will be called by the last user of the device file.

Ok, so the comment needs updating.

> Anyhow, xc->xive does not seem very useful (just like xc->valid) 
> We should try to use only vcpu->kvm->arch.xive instead.
> 
> I will propose some preliminary cleanups before introducing the
> new release operation.
> 
> >> +		kvm_for_each_vcpu(i, vcpu, kvm)
> >> +			kvmppc_xive_cleanup_vcpu(vcpu);
> >> +	}
> >> +
> >> +	kvmppc_xive_free(dev);
> >> +}
> >> +
> >>  struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
> >>  {
> >>  	struct kvmppc_xive *xive;
> >> @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = {
> >>  	.name = "kvm-xive",
> >>  	.create = kvmppc_xive_create,
> >>  	.init = kvmppc_xive_init,
> >> +	.release = kvmppc_xive_release,
> >>  	.destroy = kvmppc_xive_free,
> >>  	.set_attr = xive_set_attr,
> >>  	.get_attr = xive_get_attr,
> >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> >> index 092db0efe628..629da7bf2a89 100644
> >> --- a/arch/powerpc/kvm/book3s_xive_native.c
> >> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> >> @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
> >>  	kfree(dev);
> >>  }
> >>  
> >> +static void kvmppc_xive_native_release(struct kvm_device *dev)
> >> +{
> >> +	struct kvmppc_xive *xive = dev->private;
> >> +	struct kvm *kvm = xive->kvm;
> >> +	struct kvm_vcpu *vcpu;
> >> +	int i;
> >> +
> >> +	pr_devel("Releasing xive native device\n");
> >> +
> >> +	/*
> >> +	 * When releasing the KVM device fd, the vCPUs can still be
> >> +	 * running and we should clean up the vCPU interrupt
> >> +	 * presenters first.
> >> +	 */
> >> +	if (atomic_read(&kvm->online_vcpus) != 0) {
> > 
> > Likewise here.
> > 
> >> +		kvm_for_each_vcpu(i, vcpu, kvm)
> >> +			kvmppc_xive_native_cleanup_vcpu(vcpu);
> >> +	}
> >> +
> >> +	kvmppc_xive_native_free(dev);
> >> +}
> >> +
> >>  static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
> >>  {
> >>  	struct kvmppc_xive *xive;
> >> @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = {
> >>  	.name = "kvm-xive-native",
> >>  	.create = kvmppc_xive_native_create,
> >>  	.init = kvmppc_xive_native_init,
> >> +	.release = kvmppc_xive_native_release,
> >>  	.destroy = kvmppc_xive_native_free,
> >>  	.set_attr = kvmppc_xive_native_set_attr,
> >>  	.get_attr = kvmppc_xive_native_get_attr,
> >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> >> index ea2018ae1cd7..ea2619d5ca98 100644
> >> --- a/virt/kvm/kvm_main.c
> >> +++ b/virt/kvm/kvm_main.c
> >> @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
> >>  	struct kvm_device *dev = filp->private_data;
> >>  	struct kvm *kvm = dev->kvm;
> >>  
> >> +	if (!dev)
> >> +		return -ENODEV;
> >> +
> >> +	if (dev->kvm != kvm)
> >> +		return -EPERM;
> >> +
> >> +	if (dev->ops->release) {
> >> +		mutex_lock(&kvm->lock);
> >> +		list_del(&dev->vm_node);
> >> +		dev->ops->release(dev);
> >> +		mutex_unlock(&kvm->lock);
> >> +	}
> >> +
> > 
> > Wasn't there a big comment that explained that release replaced
> > destroy somewhere?
> 
> Yes. I did add a comment in the "V5 errata" series. 
> 
> I should be sending a v6 this week, to clarify all these attempts 
> to solve the device switching.
> 
> Thanks,
> 
> C. 
> 
> > 
> >>  	kvm_put_kvm(kvm);
> >>  	return 0;
> >>  }
> > 
>
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 831d963451d8..3b444620d8fc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1246,6 +1246,7 @@  struct kvm_device_ops {
 	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
 		      unsigned long arg);
 	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
+	void (*release)(struct kvm_device *dev);
 };
 
 void kvm_device_get(struct kvm_device *dev);
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 4d4e1730de84..ba777db849d7 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1100,11 +1100,19 @@  void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
 void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
-	struct kvmppc_xive *xive = xc->xive;
+	struct kvmppc_xive *xive;
 	int i;
 
+	if (!kvmppc_xics_enabled(vcpu))
+		return;
+
+	if (!xc)
+		return;
+
 	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
 
+	xive = xc->xive;
+
 	/* Ensure no interrupt is still routed to that VP */
 	xc->valid = false;
 	kvmppc_xive_disable_vcpu_interrupts(vcpu);
@@ -1141,6 +1149,10 @@  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
 	}
 	/* Free the VP */
 	kfree(xc);
+
+	/* Cleanup the vcpu */
+	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+	vcpu->arch.xive_vcpu = NULL;
 }
 
 int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
@@ -1158,7 +1170,7 @@  int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 	}
 	if (xive->kvm != vcpu->kvm)
 		return -EPERM;
-	if (vcpu->arch.irq_type)
+	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
 		return -EBUSY;
 	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
 		pr_devel("Duplicate !\n");
@@ -1855,6 +1867,39 @@  static void kvmppc_xive_free(struct kvm_device *dev)
 	kfree(dev);
 }
 
+static void kvmppc_xive_release(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	pr_devel("Releasing xive device\n");
+
+	/*
+	 * When releasing the KVM device fd, the vCPUs can still be
+	 * running and we should clean up the vCPU interrupt
+	 * presenters first.
+	 */
+	if (atomic_read(&kvm->online_vcpus) != 0) {
+		/*
+		 * call kick_all_cpus_sync() to ensure that all CPUs
+		 * have executed any pending interrupts
+		 */
+		if (is_kvmppc_hv_enabled(kvm))
+			kick_all_cpus_sync();
+
+		/*
+		 * TODO: There is still a race window with the early
+		 * checks in kvmppc_native_connect_vcpu()
+		 */
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvmppc_xive_cleanup_vcpu(vcpu);
+	}
+
+	kvmppc_xive_free(dev);
+}
+
 struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
 {
 	struct kvmppc_xive *xive;
@@ -2043,6 +2088,7 @@  struct kvm_device_ops kvm_xive_ops = {
 	.name = "kvm-xive",
 	.create = kvmppc_xive_create,
 	.init = kvmppc_xive_init,
+	.release = kvmppc_xive_release,
 	.destroy = kvmppc_xive_free,
 	.set_attr = xive_set_attr,
 	.get_attr = xive_get_attr,
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 092db0efe628..629da7bf2a89 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -996,6 +996,28 @@  static void kvmppc_xive_native_free(struct kvm_device *dev)
 	kfree(dev);
 }
 
+static void kvmppc_xive_native_release(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	pr_devel("Releasing xive native device\n");
+
+	/*
+	 * When releasing the KVM device fd, the vCPUs can still be
+	 * running and we should clean up the vCPU interrupt
+	 * presenters first.
+	 */
+	if (atomic_read(&kvm->online_vcpus) != 0) {
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvmppc_xive_native_cleanup_vcpu(vcpu);
+	}
+
+	kvmppc_xive_native_free(dev);
+}
+
 static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
 {
 	struct kvmppc_xive *xive;
@@ -1187,6 +1209,7 @@  struct kvm_device_ops kvm_xive_native_ops = {
 	.name = "kvm-xive-native",
 	.create = kvmppc_xive_native_create,
 	.init = kvmppc_xive_native_init,
+	.release = kvmppc_xive_native_release,
 	.destroy = kvmppc_xive_native_free,
 	.set_attr = kvmppc_xive_native_set_attr,
 	.get_attr = kvmppc_xive_native_get_attr,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ea2018ae1cd7..ea2619d5ca98 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2938,6 +2938,19 @@  static int kvm_device_release(struct inode *inode, struct file *filp)
 	struct kvm_device *dev = filp->private_data;
 	struct kvm *kvm = dev->kvm;
 
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->kvm != kvm)
+		return -EPERM;
+
+	if (dev->ops->release) {
+		mutex_lock(&kvm->lock);
+		list_del(&dev->vm_node);
+		dev->ops->release(dev);
+		mutex_unlock(&kvm->lock);
+	}
+
 	kvm_put_kvm(kvm);
 	return 0;
 }