
[v6,15/21] KVM: s390: pci: add routines to start/stop interpretive execution

Message ID 20220426200842.98655-16-mjrosato@linux.ibm.com (mailing list archive)
State New, archived
Series KVM: s390: enable zPCI for interpretive execution

Commit Message

Matthew Rosato April 26, 2022, 8:08 p.m. UTC
These routines will be invoked at the time an s390x vfio-pci device is
associated with a KVM (or when the association is removed), allowing
the zPCI device to enable or disable load/store interpretation mode;
this requires the host zPCI device to inform firmware of the unique
token (GISA designation) that is associated with the owning KVM.

Furthermore, add/remove these devices from a list associated with the
KVM and ensure proper cleanup always occurs during VM exit.

Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  13 +++
 arch/s390/kvm/kvm-s390.c         |   9 ++
 arch/s390/kvm/pci.c              | 158 +++++++++++++++++++++++++++++++
 arch/s390/kvm/pci.h              |   5 +
 arch/s390/pci/pci.c              |   3 +
 5 files changed, 188 insertions(+)

Comments

Jason Gunthorpe April 27, 2022, 3:14 p.m. UTC | #1
On Tue, Apr 26, 2022 at 04:08:36PM -0400, Matthew Rosato wrote:

> +int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm)
> +{
> +	if (!zdev)
> +		return 0;
> +
> +	/*
> +	 * Register device with this KVM (or remove the KVM association if 0).
> +	 * If interpretation facilities are available, enable them and let
> +	 * userspace indicate whether or not they will be used (specify SHM bit
> +	 * to disable).
> +	 */
> +	if (kvm)
> +		return register_kvm(zdev, kvm);
> +	else
> +		return unregister_kvm(zdev);
> +}
> +EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm);

I think it is cleaner to expose both the register/unregister APIs and
not multiplex them like this
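
e.g. a possible shape (just a sketch, not a demand on naming), with the
!CONFIG_PCI stubs adjusted to match:

	int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm);
	void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev);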

> +void kvm_s390_pci_clear_list(struct kvm *kvm)
> +{
> +	struct kvm_zdev *tmp, *kzdev;
> +	LIST_HEAD(remove);
> +
> +	spin_lock(&kvm->arch.kzdev_list_lock);
> +	list_for_each_entry_safe(kzdev, tmp, &kvm->arch.kzdev_list, entry)
> +		list_move_tail(&kzdev->entry, &remove);
> +	spin_unlock(&kvm->arch.kzdev_list_lock);
> +
> +	list_for_each_entry_safe(kzdev, tmp, &remove, entry)
> +		unregister_kvm(kzdev->zdev);

Hum, I wonder if this is a mistake in kvm:

static void kvm_destroy_vm(struct kvm *kvm)
{
[..]
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);

kvm_destroy_devices() triggers the VFIO notifier with NULL. Indeed for
correctness I would expect the VFIO users to have been notified to
release the kvm before the kvm object becomes partially destroyed?

Maybe you should investigate re-ordering this at the KVM side and just
WARN_ON(!list_empty) in the arch code?
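
For instance (sketch only), kvm_arch_destroy_vm() could then do:

	/*
	 * Instead of kvm_s390_pci_clear_list(): by this point the notifier
	 * path should already have torn down every kzdev association.
	 */
	if (IS_ENABLED(CONFIG_VFIO_PCI))
		WARN_ON(!list_empty(&kvm->arch.kzdev_list));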

(vfio has this odd usage model where it should use the kvm pointer
without taking a ref on it so long as the unregister hasn't been
called)

If you keep it like this then the locking in register/unregister doesn't
look broad enough and has to cover zdev->kzdev as well.

Overall I think it is OK designed like this, aside from the ugly
symbol_get in vfio which I hope you can resolve.

Jason
Matthew Rosato April 27, 2022, 8:20 p.m. UTC | #2
On 4/27/22 11:14 AM, Jason Gunthorpe wrote:
> On Tue, Apr 26, 2022 at 04:08:36PM -0400, Matthew Rosato wrote:
> 
>> +int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm)
>> +{
>> +	if (!zdev)
>> +		return 0;
>> +
>> +	/*
>> +	 * Register device with this KVM (or remove the KVM association if 0).
>> +	 * If interpretation facilities are available, enable them and let
>> +	 * userspace indicate whether or not they will be used (specify SHM bit
>> +	 * to disable).
>> +	 */
>> +	if (kvm)
>> +		return register_kvm(zdev, kvm);
>> +	else
>> +		return unregister_kvm(zdev);
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm);
> 
> I think it is cleaner to expose both the register/unregister APIs and
> not multiplex them like this
> 

OK

>> +void kvm_s390_pci_clear_list(struct kvm *kvm)
>> +{
>> +	struct kvm_zdev *tmp, *kzdev;
>> +	LIST_HEAD(remove);
>> +
>> +	spin_lock(&kvm->arch.kzdev_list_lock);
>> +	list_for_each_entry_safe(kzdev, tmp, &kvm->arch.kzdev_list, entry)
>> +		list_move_tail(&kzdev->entry, &remove);
>> +	spin_unlock(&kvm->arch.kzdev_list_lock);
>> +
>> +	list_for_each_entry_safe(kzdev, tmp, &remove, entry)
>> +		unregister_kvm(kzdev->zdev);
> 
> Hum, I wonder if this is a mistake in kvm:
> 
> static void kvm_destroy_vm(struct kvm *kvm)
> {
> [..]
> 	kvm_arch_destroy_vm(kvm);
> 	kvm_destroy_devices(kvm);
> 
> kvm_destroy_devices() triggers the VFIO notifier with NULL. Indeed for
> correctness I would expect the VFIO users to have been notified to
> release the kvm before the kvm object becomes partially destroyed?
> 
> Maybe you should investigate re-ordering this at the KVM side and just
> WARN_ON(!list_empty) in the arch code?
> 
> (vfio has this odd usage model where it should use the kvm pointer
> without taking a ref on it so long as the unregister hasn't been
> called)
>

The issue there is that I am unregistering the notifier during 
close_device for each zPCI dev, which will have already happened -- so 
by the time we get to kvm_destroy_devices(), whether it's before or 
after kvm_arch_destroy_vm, there are no longer any zPCI notifiers 
registered that will trigger.

One way to solve this is to have the zpci close_device hook also trigger 
the work that a KVM_DEV_VFIO_GROUP_DEL would (if the device is being 
closed, the KVM association for that device isn't applicable anymore so 
go ahead and clean up).
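
Roughly something like this in the zdev close path (sketch only -- the hook
name is made up, and it reuses the existing entry point with a NULL kvm
until/unless that gets split):

	static void vfio_pci_zdev_release_kvm(struct vfio_pci_core_device *vdev)
	{
		struct zpci_dev *zdev = to_zpci(vdev->pdev);

		/*
		 * The device FD is going away, so any KVM association for
		 * this device is no longer applicable; drop it now rather
		 * than waiting on a GROUP_DEL notification that may never
		 * arrive.
		 */
		if (zdev && zdev->kzdev)
			kvm_s390_pci_register_kvm(zdev, NULL);
	}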

Then, since we know each device will get closed (either by userspace or 
by kvm), I don't need something like kvm_s390_pci_clear_list at all.

> If you keep it like this then the locking in register/unregister doesn't
> look broad enough and has to cover zdev->kzdev as well.

But I would still need to revisit the locking with the above idea.

> 
> Overall I think it is OK designed like this, aside from the ugly
> symbol_get in vfio which I hope you can resolve.
> 
> Jason
Jason Gunthorpe April 28, 2022, 12:28 p.m. UTC | #3
On Wed, Apr 27, 2022 at 04:20:10PM -0400, Matthew Rosato wrote:
> > > +void kvm_s390_pci_clear_list(struct kvm *kvm)
> > > +{
> > > +	struct kvm_zdev *tmp, *kzdev;
> > > +	LIST_HEAD(remove);
> > > +
> > > +	spin_lock(&kvm->arch.kzdev_list_lock);
> > > +	list_for_each_entry_safe(kzdev, tmp, &kvm->arch.kzdev_list, entry)
> > > +		list_move_tail(&kzdev->entry, &remove);
> > > +	spin_unlock(&kvm->arch.kzdev_list_lock);
> > > +
> > > +	list_for_each_entry_safe(kzdev, tmp, &remove, entry)
> > > +		unregister_kvm(kzdev->zdev);
> > 
> > Hum, I wonder if this is a mistake in kvm:
> > 
> > static void kvm_destroy_vm(struct kvm *kvm)
> > {
> > [..]
> > 	kvm_arch_destroy_vm(kvm);
> > 	kvm_destroy_devices(kvm);
> > 
> > kvm_destroy_devices() triggers the VFIO notifier with NULL. Indeed for
> > correctness I would expect the VFIO users to have been notified to
> > release the kvm before the kvm object becomes partially destroyed?
> > 
> > Maybe you should investigate re-ordering this at the KVM side and just
> > WARN_ON(!list_empty) in the arch code?
> > 
> > (vfio has this odd usage model where it should use the kvm pointer
> > without taking a ref on it so long as the unregister hasn't been
> > called)
> > 
> 
> The issue there is that I am unregistering the notifier during close_device
> for each zPCI dev, which will have already happened

And at that moment you have to clean up the arch stuff too; it
shouldn't be left floating around once the driver that installed it
loses access to the kvm.

> -- so by the time we get to kvm_destroy_devices(), whether it's
> before or after kvm_arch_destroy_vm, there are no longer any zPCI
> notifiers registered that will trigger.

I don't think that is strictly true; there is no enforced linkage
between the lifetime of the kvm FD and the lifetime of the VFIO device
FD. qemu probably orders them the way you say.

> One way to solve this is to have the zpci close_device hook also trigger the
> work that a KVM_DEV_VFIO_GROUP_DEL would (if the device is being closed, the
> KVM association for that device isn't applicable anymore so go ahead and
> clean up).

That makes the most sense - but think about what happens if the KVM fd
is closed while the device FD is still open.

Jason

Patch

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8e381603b6a7..cd58f204305e 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -19,6 +19,7 @@ 
 #include <linux/kvm.h>
 #include <linux/seqlock.h>
 #include <linux/module.h>
+#include <linux/pci.h>
 #include <asm/debug.h>
 #include <asm/cpu.h>
 #include <asm/fpu/api.h>
@@ -967,6 +968,8 @@  struct kvm_arch{
 	DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
 	struct kvm_s390_gisa_interrupt gisa_int;
 	struct kvm_s390_pv pv;
+	struct list_head kzdev_list;
+	spinlock_t kzdev_list_lock;
 };
 
 #define KVM_HVA_ERR_BAD		(-1UL)
@@ -1017,4 +1020,14 @@  static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
+#ifdef CONFIG_PCI
+int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm);
+#else
+static inline int kvm_s390_pci_register_kvm(struct zpci_dev *dev,
+					    struct kvm *kvm)
+{
+	return -EPERM;
+}
+#endif
+
 #endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 504f312c1c27..bd6c8aeabc24 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2866,6 +2866,13 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	kvm_s390_crypto_init(kvm);
 
+	if (IS_ENABLED(CONFIG_VFIO_PCI)) {
+		mutex_lock(&kvm->lock);
+		kvm_s390_pci_init_list(kvm);
+		kvm_s390_vcpu_pci_enable_interp(kvm);
+		mutex_unlock(&kvm->lock);
+	}
+
 	mutex_init(&kvm->arch.float_int.ais_lock);
 	spin_lock_init(&kvm->arch.float_int.lock);
 	for (i = 0; i < FIRQ_LIST_COUNT; i++)
@@ -2951,6 +2958,8 @@  void kvm_arch_destroy_vm(struct kvm *kvm)
 	if (!kvm_is_ucontrol(kvm))
 		gmap_remove(kvm->arch.gmap);
 	kvm_s390_destroy_adapters(kvm);
+	if (IS_ENABLED(CONFIG_VFIO_PCI))
+		kvm_s390_pci_clear_list(kvm);
 	kvm_s390_clear_float_irqs(kvm);
 	kvm_s390_vsie_destroy(kvm);
 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index 72a9c6bfae86..2aee7f11db1d 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -12,7 +12,9 @@ 
 #include <asm/pci.h>
 #include <asm/pci_insn.h>
 #include <asm/pci_io.h>
+#include <asm/sclp.h>
 #include "pci.h"
+#include "kvm-s390.h"
 
 struct zpci_aift *aift;
 
@@ -424,6 +426,162 @@  void kvm_s390_pci_dev_release(struct zpci_dev *zdev)
 }
 EXPORT_SYMBOL_GPL(kvm_s390_pci_dev_release);
 
+static inline int register_kvm(struct zpci_dev *zdev, struct kvm *kvm)
+{
+	int rc;
+
+	if (zdev->kzdev || zdev->gisa != 0)
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	rc = kvm_s390_pci_dev_open(zdev);
+	if (rc)
+		goto err;
+
+	/*
+	 * If interpretation facilities aren't available, add the device to
+	 * the kzdev list but don't enable for interpretation.
+	 */
+	if (!kvm_s390_pci_interp_allowed())
+		goto out;
+
+	/*
+	 * If this is the first request to use an interpreted device, make the
+	 * necessary vcpu changes
+	 */
+	if (!kvm->arch.use_zpci_interp)
+		kvm_s390_vcpu_pci_enable_interp(kvm);
+
+	if (zdev_enabled(zdev)) {
+		rc = zpci_disable_device(zdev);
+		if (rc)
+			goto err;
+	}
+
+	/*
+	 * Store information about the identity of the kvm guest allowed to
+	 * access this device via interpretation to be used by host CLP
+	 */
+	zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+
+	rc = zpci_enable_device(zdev);
+	if (rc)
+		goto clear_gisa;
+
+	/* Re-register the IOMMU that was already created */
+	rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+				virt_to_phys(zdev->dma_table));
+	if (rc)
+		goto clear_gisa;
+
+out:
+	zdev->kzdev->kvm = kvm;
+
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list);
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+
+	mutex_unlock(&kvm->lock);
+	return 0;
+
+clear_gisa:
+	zdev->gisa = 0;
+err:
+	if (zdev->kzdev)
+		kvm_s390_pci_dev_release(zdev);
+	mutex_unlock(&kvm->lock);
+	return rc;
+}
+
+static inline int unregister_kvm(struct zpci_dev *zdev)
+{
+	struct kvm *kvm;
+	int rc;
+
+	if (!zdev->kzdev)
+		return -EINVAL;
+
+	kvm = zdev->kzdev->kvm;
+	mutex_lock(&kvm->lock);
+
+	/*
+	 * A 0 gisa means interpretation was never enabled, just remove the
+	 * device from the list.
+	 */
+	if (zdev->gisa == 0)
+		goto out;
+
+	/* Forwarding must be turned off before interpretation */
+	if (zdev->kzdev->fib.fmt0.aibv != 0)
+		kvm_s390_pci_aif_disable(zdev, true);
+
+	/* Remove the host CLP guest designation */
+	zdev->gisa = 0;
+
+	if (zdev_enabled(zdev)) {
+		rc = zpci_disable_device(zdev);
+		if (rc)
+			goto out;
+	}
+
+	rc = zpci_enable_device(zdev);
+	if (rc)
+		goto out;
+
+	/* Re-register the IOMMU that was already created */
+	rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+				virt_to_phys(zdev->dma_table));
+
+out:
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	list_del(&zdev->kzdev->entry);
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+	kvm_s390_pci_dev_release(zdev);
+
+	mutex_unlock(&kvm->lock);
+
+	return rc;
+}
+
+int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm)
+{
+	if (!zdev)
+		return 0;
+
+	/*
+	 * Register device with this KVM (or remove the KVM association if 0).
+	 * If interpretation facilities are available, enable them and let
+	 * userspace indicate whether or not they will be used (specify SHM bit
+	 * to disable).
+	 */
+	if (kvm)
+		return register_kvm(zdev, kvm);
+	else
+		return unregister_kvm(zdev);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm);
+
+void kvm_s390_pci_init_list(struct kvm *kvm)
+{
+	spin_lock_init(&kvm->arch.kzdev_list_lock);
+	INIT_LIST_HEAD(&kvm->arch.kzdev_list);
+}
+
+void kvm_s390_pci_clear_list(struct kvm *kvm)
+{
+	struct kvm_zdev *tmp, *kzdev;
+	LIST_HEAD(remove);
+
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	list_for_each_entry_safe(kzdev, tmp, &kvm->arch.kzdev_list, entry)
+		list_move_tail(&kzdev->entry, &remove);
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+
+	list_for_each_entry_safe(kzdev, tmp, &remove, entry)
+		unregister_kvm(kzdev->zdev);
+}
+
 int kvm_s390_pci_init(void)
 {
 	aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
index f567c3189a40..d11662fadb78 100644
--- a/arch/s390/kvm/pci.h
+++ b/arch/s390/kvm/pci.h
@@ -13,6 +13,7 @@ 
 #include <linux/kvm_host.h>
 #include <linux/pci.h>
 #include <linux/mutex.h>
+#include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <asm/airq.h>
 #include <asm/cpu.h>
@@ -21,6 +22,7 @@  struct kvm_zdev {
 	struct zpci_dev *zdev;
 	struct kvm *kvm;
 	struct zpci_fib fib;
+	struct list_head entry;
 };
 
 struct zpci_gaite {
@@ -54,6 +56,9 @@  static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
 int kvm_s390_pci_aen_init(u8 nisc);
 void kvm_s390_pci_aen_exit(void);
 
+void kvm_s390_pci_init_list(struct kvm *kvm);
+void kvm_s390_pci_clear_list(struct kvm *kvm);
+
 int kvm_s390_pci_init(void);
 void kvm_s390_pci_exit(void);
 
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index f0a439c43395..d9b021fb84d5 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -132,6 +132,7 @@  int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
 		zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
 	return cc;
 }
+EXPORT_SYMBOL_GPL(zpci_register_ioat);
 
 /* Modify PCI: Unregister I/O address translation parameters */
 int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
@@ -712,6 +713,7 @@  int zpci_enable_device(struct zpci_dev *zdev)
 		zpci_update_fh(zdev, fh);
 	return rc;
 }
+EXPORT_SYMBOL_GPL(zpci_enable_device);
 
 int zpci_disable_device(struct zpci_dev *zdev)
 {
@@ -735,6 +737,7 @@  int zpci_disable_device(struct zpci_dev *zdev)
 	}
 	return rc;
 }
+EXPORT_SYMBOL_GPL(zpci_disable_device);
 
 /**
  * zpci_hot_reset_device - perform a reset of the given zPCI function