diff mbox series

[v9,3/4] s390: ap: implement PAPQ AQIC interception in kernel

Message ID 1558452877-27822-4-git-send-email-pmorel@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series vfio: ap: AP Queue Interrupt Control | expand

Commit Message

Pierre Morel May 21, 2019, 3:34 p.m. UTC
We register a AP PQAP instruction hook during the open
of the mediated device. And unregister it on release.

During the probe of the AP device, we allocate a vfio_ap_queue
structure to keep track of the information we need for the
PQAP/AQIC instruction interception.

In the AP PQAP instruction hook, if we receive a demand to
enable IRQs,
- we retrieve the vfio_ap_queue based on the APQN we receive
  in REG1,
- we retrieve the page of the guest address, (NIB), from
  register REG2
- we retrieve the mediated device to use the VFIO pinning
  infrastructure to pin the page of the guest address,
- we retrieve the pointer to KVM to register the guest ISC
  and retrieve the host ISC
- finaly we activate GISA

If we receive a demand to disable IRQs,
- we deactivate GISA
- unregister from the GIB
- unpin the NIB

When removing the AP device from the driver the device is
reseted and this process unregisters the GISA from the GIB,
and unpins the NIB address then we free the vfio_ap_queue
structure.

Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
---
 drivers/s390/crypto/vfio_ap_drv.c     |  34 +++-
 drivers/s390/crypto/vfio_ap_ops.c     | 336 +++++++++++++++++++++++++++++++++-
 drivers/s390/crypto/vfio_ap_private.h |  11 ++
 3 files changed, 374 insertions(+), 7 deletions(-)

Comments

Anthony Krowiak May 23, 2019, 3:43 p.m. UTC | #1
On 5/21/19 11:34 AM, Pierre Morel wrote:
> We register a AP PQAP instruction hook during the open
> of the mediated device. And unregister it on release.
> 
> During the probe of the AP device, we allocate a vfio_ap_queue
> structure to keep track of the information we need for the
> PQAP/AQIC instruction interception.
> 
> In the AP PQAP instruction hook, if we receive a demand to
> enable IRQs,
> - we retrieve the vfio_ap_queue based on the APQN we receive
>    in REG1,
> - we retrieve the page of the guest address, (NIB), from
>    register REG2
> - we retrieve the mediated device to use the VFIO pinning
>    infrastructure to pin the page of the guest address,
> - we retrieve the pointer to KVM to register the guest ISC
>    and retrieve the host ISC
> - finaly we activate GISA
> 
> If we receive a demand to disable IRQs,
> - we deactivate GISA
> - unregister from the GIB
> - unpin the NIB
> 
> When removing the AP device from the driver the device is
> reseted and this process unregisters the GISA from the GIB,
> and unpins the NIB address then we free the vfio_ap_queue
> structure.
> 
> Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
> ---
>   drivers/s390/crypto/vfio_ap_drv.c     |  34 +++-
>   drivers/s390/crypto/vfio_ap_ops.c     | 336 +++++++++++++++++++++++++++++++++-
>   drivers/s390/crypto/vfio_ap_private.h |  11 ++
>   3 files changed, 374 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
> index e9824c3..003662a 100644
> --- a/drivers/s390/crypto/vfio_ap_drv.c
> +++ b/drivers/s390/crypto/vfio_ap_drv.c
> @@ -5,6 +5,7 @@
>    * Copyright IBM Corp. 2018
>    *
>    * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
> + *	      Pierre Morel <pmorel@linux.ibm.com>
>    */
>   
>   #include <linux/module.h>
> @@ -40,14 +41,45 @@ static struct ap_device_id ap_queue_ids[] = {
>   
>   MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids);
>   
> +/**
> + * vfio_ap_queue_dev_probe:
> + *
> + * Allocate a vfio_ap_queue structure and associate it
> + * with the device as driver_data.
> + */
>   static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
>   {
> +	struct vfio_ap_queue *q;
> +
> +	q = kzalloc(sizeof(*q), GFP_KERNEL);
> +	if (!q)
> +		return -ENOMEM;
> +	dev_set_drvdata(&apdev->device, q);
> +	q->apqn = to_ap_queue(&apdev->device)->qid;
> +	q->saved_isc = VFIO_AP_ISC_INVALID;

As per my comments in the cover letter, you would embed the code
above in the vfio_ap_mdev_probe_queue function and call it here.


>   	return 0;
>   }
>   
> +/**
> + * vfio_ap_queue_dev_remove:
> + *
> + * Takes the matrix lock to avoid actions on this device while removing
> + * Free the associated vfio_ap_queue structure
> + */
>   static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
>   {
> -	/* Nothing to do yet */
> +	struct vfio_ap_queue *q;
> +	int apid, apqi;
> +
> +	mutex_lock(&matrix_dev->lock);
> +	q = dev_get_drvdata(&apdev->device);
> +	dev_set_drvdata(&apdev->device, NULL);
> +	apid = AP_QID_CARD(q->apqn);
> +	apqi = AP_QID_QUEUE(q->apqn);
> +	vfio_ap_mdev_reset_queue(apid, apqi, 1);
> +	vfio_ap_irq_disable(q);
> +	kfree(q);

As per my comments in the cover letter, you would embed the code
above in the vfio_ap_mdev_remove_queue function and call it here.

> +	mutex_unlock(&matrix_dev->lock);
>   }
>   
>   static void vfio_ap_matrix_dev_release(struct device *dev)
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
> index e8e87bf..015174f 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -24,6 +24,295 @@
>   #define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough"
>   #define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device"
>   
> +static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev);
> +
> +static int match_apqn(struct device *dev, void *data)
> +{
> +	struct vfio_ap_queue *q = dev_get_drvdata(dev);
> +
> +	return (q->apqn == *(int *)(data)) ? 1 : 0;
> +}
> +
> +/**
> + * vfio_ap_get_queue: Retrieve a queue with a specific APQN from a list
> + * @matrix_mdev: the associated mediated matrix
> + * @apqn: The queue APQN
> + *
> + * Retrieve a queue with a specific APQN from the list of the
> + * devices of the vfio_ap_drv.
> + * Verify that the APID and the APQI are set in the matrix.
> + *
> + * Returns the pointer to the associated vfio_ap_queue
> + */
> +struct vfio_ap_queue *vfio_ap_get_queue(struct ap_matrix_mdev *matrix_mdev,
> +					int apqn)
> +{
> +	struct vfio_ap_queue *q;
> +	struct device *dev;
> +
> +	if (!test_bit_inv(AP_QID_CARD(apqn), matrix_mdev->matrix.apm))
> +		return NULL;
> +	if (!test_bit_inv(AP_QID_QUEUE(apqn), matrix_mdev->matrix.aqm))
> +		return NULL;
> +
> +	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
> +				 &apqn, match_apqn);
> +	if (!dev)
> +		return NULL;
> +	q = dev_get_drvdata(dev);
> +	q->matrix_mdev = matrix_mdev;
> +	put_device(dev);
> +
> +	return q;
> +}
> +
> +/**
> + * vfio_ap_wait_for_irqclear
> + * @apqn: The AP Queue number
> + *
> + * Checks the IRQ bit for the status of this APQN using ap_tapq.
> + * Returns if the ap_tapq function succedded and the bit is clear.
> + * Returns if ap_tapq function failed with invalid, deconfigured or
> + * checkstopped AP.
> + * Otherwise retries up to 5 times after waiting 20ms.
> + *
> + */
> +static void vfio_ap_wait_for_irqclear(int apqn)
> +{
> +	struct ap_queue_status status;
> +	int retry = 5;
> +
> +	do {
> +		status = ap_tapq(apqn, NULL);
> +		switch (status.response_code) {
> +		case AP_RESPONSE_NORMAL:
> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> +			if (!status.irq_enabled)
> +				return;
> +			/* Fall through */
> +		case AP_RESPONSE_BUSY:
> +			msleep(20);
> +			break;
> +		case AP_RESPONSE_Q_NOT_AVAIL:
> +		case AP_RESPONSE_DECONFIGURED:
> +		case AP_RESPONSE_CHECKSTOPPED:
> +		default:
> +			WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__,
> +				  status.response_code, apqn);
> +			return;
> +		}
> +	} while (--retry);
> +
> +	WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n",
> +		  __func__, status.response_code, apqn);
> +}
> +
> +/**
> + * vfio_ap_free_aqic_resources
> + * @q: The vfio_ap_queue
> + *
> + * Unregisters the ISC in the GIB when the saved ISC not invalid.
> + * Unpin the guest's page holding the NIB when it exist.
> + * Reset the saved_pfn and saved_isc to invalid values.
> + * Clear the pointer to the matrix mediated device.
> + *
> + */
> +static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q)
> +{
> +	if (q->saved_isc != VFIO_AP_ISC_INVALID && q->matrix_mdev)
> +		kvm_s390_gisc_unregister(q->matrix_mdev->kvm, q->saved_isc);
> +	if (q->saved_pfn && q->matrix_mdev)
> +		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev),
> +				 &q->saved_pfn, 1);
> +	q->saved_pfn = 0;
> +	q->saved_isc = VFIO_AP_ISC_INVALID;
> +	q->matrix_mdev = NULL;
> +}
> +
> +/**
> + * vfio_ap_irq_disable
> + * @q: The vfio_ap_queue
> + *
> + * Uses ap_aqic to disable the interruption and in case of success, reset
> + * in progress or IRQ disable command already proceeded: calls
> + * vfio_ap_wait_for_irqclear() to check for the IRQ bit to be clear
> + * and calls vfio_ap_free_aqic_resources() to free the resources associated
> + * with the AP interrupt handling.
> + *
> + * In the case the AP is busy, or a reset is in progress,
> + * retries after 20ms, up to 5 times.
> + *
> + * Returns if ap_aqic function failed with invalid, deconfigured or
> + * checkstopped AP.
> + */
> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
> +{
> +	struct ap_qirq_ctrl aqic_gisa = {};
> +	struct ap_queue_status status;
> +	int retries = 5;
> +
> +	do {
> +		status = ap_aqic(q->apqn, aqic_gisa, NULL);
> +		switch (status.response_code) {
> +		case AP_RESPONSE_OTHERWISE_CHANGED:
> +		case AP_RESPONSE_NORMAL:
> +			vfio_ap_wait_for_irqclear(q->apqn);
> +			goto end_free;
> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> +		case AP_RESPONSE_BUSY:
> +			msleep(20);
> +			break;
> +		case AP_RESPONSE_Q_NOT_AVAIL:
> +		case AP_RESPONSE_DECONFIGURED:
> +		case AP_RESPONSE_CHECKSTOPPED:
> +		case AP_RESPONSE_INVALID_ADDRESS:
> +		default:
> +			/* All cases in default means AP not operational */
> +			WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> +				  status.response_code);
> +			goto end_free;
> +		}
> +	} while (retries--);
> +
> +	WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> +		  status.response_code);
> +end_free:
> +	vfio_ap_free_aqic_resources(q);
> +	return status;
> +}
> +
> +/**
> + * vfio_ap_setirq: Enable Interruption for a APQN
> + *
> + * @dev: the device associated with the ap_queue
> + * @q:   the vfio_ap_queue holding AQIC parameters
> + *
> + * Pin the NIB saved in *q
> + * Register the guest ISC to GIB interface and retrieve the
> + * host ISC to issue the host side PQAP/AQIC
> + *
> + * Response.status may be set to AP_RESPONSE_INVALID_ADDRESS in case the
> + * vfio_pin_pages failed.
> + *
> + * Otherwise return the ap_queue_status returned by the ap_aqic(),
> + * all retry handling will be done by the guest.
> + */
> +static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
> +						 int isc,
> +						 unsigned long nib)
> +{
> +	struct ap_qirq_ctrl aqic_gisa = {};
> +	struct ap_queue_status status = {};
> +	struct kvm_s390_gisa *gisa;
> +	struct kvm *kvm;
> +	unsigned long h_nib, g_pfn, h_pfn;
> +	int ret;
> +
> +	g_pfn = nib >> PAGE_SHIFT;
> +	ret = vfio_pin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1,
> +			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
> +	switch (ret) {
> +	case 1:
> +		break;
> +	default:
> +		status.response_code = AP_RESPONSE_INVALID_ADDRESS;
> +		return status;
> +	}
> +
> +	kvm = q->matrix_mdev->kvm;
> +	gisa = kvm->arch.gisa_int.origin;
> +
> +	h_nib = (h_pfn << PAGE_SHIFT) | (nib & ~PAGE_MASK);
> +	aqic_gisa.gisc = isc;
> +	aqic_gisa.isc = kvm_s390_gisc_register(kvm, isc);
> +	aqic_gisa.ir = 1;
> +	aqic_gisa.gisa = (uint64_t)gisa >> 4;
> +
> +	status = ap_aqic(q->apqn, aqic_gisa, (void *)h_nib);
> +	switch (status.response_code) {
> +	case AP_RESPONSE_NORMAL:
> +		/* See if we did clear older IRQ configuration */
> +		vfio_ap_free_aqic_resources(q);
> +		q->saved_pfn = g_pfn;
> +		q->saved_isc = isc;
> +		break;
> +	case AP_RESPONSE_OTHERWISE_CHANGED:
> +		/* We could not modify IRQ setings: clear new configuration */
> +		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1);
> +		kvm_s390_gisc_unregister(kvm, isc);
> +		break;
> +	default:
> +		pr_warn("%s: apqn %04x: response: %02x\n", __func__, q->apqn,
> +			status.response_code);
> +		vfio_ap_irq_disable(q);
> +		break;
> +	}
> +
> +	return status;
> +}
> +
> +/**
> + * handle_pqap: PQAP instruction callback
> + *
> + * @vcpu: The vcpu on which we received the PQAP instruction
> + *
> + * Get the general register contents to initialize internal variables.
> + * REG[0]: APQN
> + * REG[1]: IR and ISC
> + * REG[2]: NIB
> + *
> + * Response.status may be set to following Response Code:
> + * - AP_RESPONSE_Q_NOT_AVAIL: if the queue is not available
> + * - AP_RESPONSE_DECONFIGURED: if the queue is not configured
> + * - AP_RESPONSE_NORMAL (0) : in case of successs
> + *   Check vfio_ap_setirq() and vfio_ap_clrirq() for other possible RC.
> + * We take the matrix_dev lock to ensure serialization on queues and
> + * mediated device access.
> + *
> + * Return 0 if we could handle the request inside KVM.
> + * otherwise, returns -EOPNOTSUPP to let QEMU handle the fault.
> + */
> +static int handle_pqap(struct kvm_vcpu *vcpu)
> +{
> +	uint64_t status;
> +	uint16_t apqn;
> +	struct vfio_ap_queue *q;
> +	struct ap_queue_status qstatus = {
> +			       .response_code = AP_RESPONSE_Q_NOT_AVAIL, };
> +	struct ap_matrix_mdev *matrix_mdev;
> +
> +	/* If we do not use the AIV facility just go to userland */
> +	if (!(vcpu->arch.sie_block->eca & ECA_AIV))
> +		return -EOPNOTSUPP;
> +
> +	apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
> +	mutex_lock(&matrix_dev->lock);
> +
> +	if (!vcpu->kvm->arch.crypto.pqap_hook)
> +		goto out_unlock;
> +	matrix_mdev = container_of(vcpu->kvm->arch.crypto.pqap_hook,
> +				   struct ap_matrix_mdev, pqap_hook);
> +
> +	q = vfio_ap_get_queue(matrix_mdev, apqn);
> +	if (!q)
> +		goto out_unlock;
> +
> +	status = vcpu->run->s.regs.gprs[1];
> +
> +	/* If IR bit(16) is set we enable the interrupt */
> +	if ((status >> (63 - 16)) & 0x01)
> +		qstatus = vfio_ap_irq_enable(q, status & 0x07,
> +					     vcpu->run->s.regs.gprs[2]);
> +	else
> +		qstatus = vfio_ap_irq_disable(q);
> +
> +out_unlock:
> +	memcpy(&vcpu->run->s.regs.gprs[1], &qstatus, sizeof(qstatus));
> +	vcpu->run->s.regs.gprs[1] >>= 32;
> +	mutex_unlock(&matrix_dev->lock);
> +	return 0;
> +}
> +
>   static void vfio_ap_matrix_init(struct ap_config_info *info,
>   				struct ap_matrix *matrix)
>   {
> @@ -45,8 +334,11 @@ static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
>   		return -ENOMEM;
>   	}
>   
> +	matrix_mdev->mdev = mdev;
>   	vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix);
>   	mdev_set_drvdata(mdev, matrix_mdev);
> +	matrix_mdev->pqap_hook.hook = handle_pqap;
> +	matrix_mdev->pqap_hook.owner = THIS_MODULE;
>   	mutex_lock(&matrix_dev->lock);
>   	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
>   	mutex_unlock(&matrix_dev->lock);
> @@ -62,6 +354,7 @@ static int vfio_ap_mdev_remove(struct mdev_device *mdev)
>   		return -EBUSY;
>   
>   	mutex_lock(&matrix_dev->lock);
> +	vfio_ap_mdev_reset_queues(mdev);
>   	list_del(&matrix_mdev->node);
>   	mutex_unlock(&matrix_dev->lock);
>   
> @@ -754,6 +1047,8 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
>   	}
>   
>   	matrix_mdev->kvm = kvm;
> +	kvm_get_kvm(kvm);
> +	kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook;
>   	mutex_unlock(&matrix_dev->lock);
>   
>   	return 0;
> @@ -819,15 +1114,36 @@ static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
>   	return NOTIFY_OK;
>   }
>   
> -static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> -				    unsigned int retry)
> +static void vfio_ap_irq_disable_apqn(int apqn)
> +{
> +	struct device *dev;
> +	struct vfio_ap_queue *q;
> +
> +	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
> +				 &apqn, match_apqn);
> +	if (dev) {
> +		q = dev_get_drvdata(dev);
> +		vfio_ap_irq_disable(q);
> +		put_device(dev);
> +	}
> +}
> +
> +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> +			     unsigned int retry)
>   {
>   	struct ap_queue_status status;
> +	int retry2 = 2;
> +	int apqn = AP_MKQID(apid, apqi);
>   
>   	do {
> -		status = ap_zapq(AP_MKQID(apid, apqi));
> +		status = ap_zapq(apqn);
>   		switch (status.response_code) {
>   		case AP_RESPONSE_NORMAL:
> +			while (!status.queue_empty && retry2--) {
> +				msleep(20);
> +				status = ap_tapq(apqn, NULL);
> +			}
> +			WARN_ON_ONCE(retry <= 0);
>   			return 0;
>   		case AP_RESPONSE_RESET_IN_PROGRESS:
>   		case AP_RESPONSE_BUSY:
> @@ -861,6 +1177,7 @@ static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev)
>   			 */
>   			if (ret)
>   				rc = ret;
> +			vfio_ap_irq_disable_apqn(AP_MKQID(apid, apqi));
>   		}
>   	}
>   
> @@ -904,15 +1221,20 @@ static void vfio_ap_mdev_release(struct mdev_device *mdev)
>   {
>   	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
>   
> -	if (matrix_mdev->kvm)
> +	mutex_lock(&matrix_dev->lock);
> +	if (matrix_mdev->kvm) {
>   		kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
> +		matrix_mdev->kvm->arch.crypto.pqap_hook = NULL;
> +		vfio_ap_mdev_reset_queues(mdev);
> +		kvm_put_kvm(matrix_mdev->kvm);
> +		matrix_mdev->kvm = NULL;
> +	}
> +	mutex_unlock(&matrix_dev->lock);
>   
> -	vfio_ap_mdev_reset_queues(mdev);
>   	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
>   				 &matrix_mdev->iommu_notifier);
>   	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
>   				 &matrix_mdev->group_notifier);
> -	matrix_mdev->kvm = NULL;
>   	module_put(THIS_MODULE);
>   }
>   
> @@ -941,6 +1263,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
>   {
>   	int ret;
>   
> +	mutex_lock(&matrix_dev->lock);
>   	switch (cmd) {
>   	case VFIO_DEVICE_GET_INFO:
>   		ret = vfio_ap_mdev_get_device_info(arg);
> @@ -952,6 +1275,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
>   		ret = -EOPNOTSUPP;
>   		break;
>   	}
> +	mutex_unlock(&matrix_dev->lock);
>   
>   	return ret;
>   }
> diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
> index 18dcc4d..f46dde5 100644
> --- a/drivers/s390/crypto/vfio_ap_private.h
> +++ b/drivers/s390/crypto/vfio_ap_private.h
> @@ -4,6 +4,7 @@
>    *
>    * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
>    *	      Halil Pasic <pasic@linux.ibm.com>
> + *	      Pierre Morel <pmorel@linux.ibm.com>
>    *
>    * Copyright IBM Corp. 2018
>    */
> @@ -89,5 +90,15 @@ struct ap_matrix_mdev {
>   
>   extern int vfio_ap_mdev_register(void);
>   extern void vfio_ap_mdev_unregister(void);
> +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> +			     unsigned int retry);

As per my comments in the cover letter, get rid of this.

>   
> +struct vfio_ap_queue {
> +	struct ap_matrix_mdev *matrix_mdev;
> +	unsigned long saved_pfn;
> +	int	apqn;
> +#define VFIO_AP_ISC_INVALID 0xff
> +	unsigned char saved_isc;
> +};
> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q);
>   #endif /* _VFIO_AP_PRIVATE_H_ */

As per my comments in the cover letter, get rid of this and make it a
static function in vfio_ap_ops.c.

You'll have to add the following function defs instead:

int vfio_ap_mdev_probe_queue(struct ap_queue *queue);
void vfio_ap_mdev_remove_queue(struct ap_queue *queue);

>
Anthony Krowiak June 4, 2019, 7:38 p.m. UTC | #2
On 5/21/19 11:34 AM, Pierre Morel wrote:
> We register a AP PQAP instruction hook during the open
> of the mediated device. And unregister it on release.
> 
> During the probe of the AP device, we allocate a vfio_ap_queue
> structure to keep track of the information we need for the
> PQAP/AQIC instruction interception.
> 
> In the AP PQAP instruction hook, if we receive a demand to
> enable IRQs,
> - we retrieve the vfio_ap_queue based on the APQN we receive
>    in REG1,
> - we retrieve the page of the guest address, (NIB), from
>    register REG2
> - we retrieve the mediated device to use the VFIO pinning
>    infrastructure to pin the page of the guest address,
> - we retrieve the pointer to KVM to register the guest ISC
>    and retrieve the host ISC
> - finaly we activate GISA
> 
> If we receive a demand to disable IRQs,
> - we deactivate GISA
> - unregister from the GIB
> - unpin the NIB
> 
> When removing the AP device from the driver the device is
> reseted and this process unregisters the GISA from the GIB,
> and unpins the NIB address then we free the vfio_ap_queue
> structure.
> 
> Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>

I made a few comments, but it looks sane to me:
Acked-by: Tony Krowiak <akrowiak@linux.ibm.com>

> ---
>   drivers/s390/crypto/vfio_ap_drv.c     |  34 +++-
>   drivers/s390/crypto/vfio_ap_ops.c     | 336 +++++++++++++++++++++++++++++++++-
>   drivers/s390/crypto/vfio_ap_private.h |  11 ++
>   3 files changed, 374 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
> index e9824c3..003662a 100644
> --- a/drivers/s390/crypto/vfio_ap_drv.c
> +++ b/drivers/s390/crypto/vfio_ap_drv.c
> @@ -5,6 +5,7 @@
>    * Copyright IBM Corp. 2018
>    *
>    * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
> + *	      Pierre Morel <pmorel@linux.ibm.com>
>    */
>   
>   #include <linux/module.h>
> @@ -40,14 +41,45 @@ static struct ap_device_id ap_queue_ids[] = {
>   
>   MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids);
>   
> +/**
> + * vfio_ap_queue_dev_probe:
> + *
> + * Allocate a vfio_ap_queue structure and associate it
> + * with the device as driver_data.
> + */
>   static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
>   {
> +	struct vfio_ap_queue *q;
> +
> +	q = kzalloc(sizeof(*q), GFP_KERNEL);
> +	if (!q)
> +		return -ENOMEM;
> +	dev_set_drvdata(&apdev->device, q);
> +	q->apqn = to_ap_queue(&apdev->device)->qid;
> +	q->saved_isc = VFIO_AP_ISC_INVALID;
>   	return 0;
>   }
>   
> +/**
> + * vfio_ap_queue_dev_remove:
> + *
> + * Takes the matrix lock to avoid actions on this device while removing
> + * Free the associated vfio_ap_queue structure
> + */
>   static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
>   {
> -	/* Nothing to do yet */
> +	struct vfio_ap_queue *q;
> +	int apid, apqi;
> +
> +	mutex_lock(&matrix_dev->lock);
> +	q = dev_get_drvdata(&apdev->device);
> +	dev_set_drvdata(&apdev->device, NULL);
> +	apid = AP_QID_CARD(q->apqn);
> +	apqi = AP_QID_QUEUE(q->apqn);
> +	vfio_ap_mdev_reset_queue(apid, apqi, 1);
> +	vfio_ap_irq_disable(q);
> +	kfree(q);
> +	mutex_unlock(&matrix_dev->lock);
>   }
>   
>   static void vfio_ap_matrix_dev_release(struct device *dev)
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
> index e8e87bf..015174f 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -24,6 +24,295 @@
>   #define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough"
>   #define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device"
>   
> +static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev);
> +
> +static int match_apqn(struct device *dev, void *data)
> +{
> +	struct vfio_ap_queue *q = dev_get_drvdata(dev);
> +
> +	return (q->apqn == *(int *)(data)) ? 1 : 0;
> +}
> +
> +/**
> + * vfio_ap_get_queue: Retrieve a queue with a specific APQN from a list
> + * @matrix_mdev: the associated mediated matrix
> + * @apqn: The queue APQN
> + *
> + * Retrieve a queue with a specific APQN from the list of the
> + * devices of the vfio_ap_drv.
> + * Verify that the APID and the APQI are set in the matrix.
> + *
> + * Returns the pointer to the associated vfio_ap_queue
> + */
> +struct vfio_ap_queue *vfio_ap_get_queue(struct ap_matrix_mdev *matrix_mdev,
> +					int apqn)
> +{
> +	struct vfio_ap_queue *q;
> +	struct device *dev;
> +
> +	if (!test_bit_inv(AP_QID_CARD(apqn), matrix_mdev->matrix.apm))
> +		return NULL;
> +	if (!test_bit_inv(AP_QID_QUEUE(apqn), matrix_mdev->matrix.aqm))
> +		return NULL;
> +
> +	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
> +				 &apqn, match_apqn);
> +	if (!dev)
> +		return NULL;
> +	q = dev_get_drvdata(dev);
> +	q->matrix_mdev = matrix_mdev;
> +	put_device(dev);
> +
> +	return q;
> +}
> +
> +/**
> + * vfio_ap_wait_for_irqclear
> + * @apqn: The AP Queue number
> + *
> + * Checks the IRQ bit for the status of this APQN using ap_tapq.
> + * Returns if the ap_tapq function succedded and the bit is clear.

s/succedded/succeeded/

> + * Returns if ap_tapq function failed with invalid, deconfigured or
> + * checkstopped AP.
> + * Otherwise retries up to 5 times after waiting 20ms.
> + *
> + */
> +static void vfio_ap_wait_for_irqclear(int apqn)
> +{
> +	struct ap_queue_status status;
> +	int retry = 5;
> +
> +	do {
> +		status = ap_tapq(apqn, NULL);
> +		switch (status.response_code) {
> +		case AP_RESPONSE_NORMAL:
> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> +			if (!status.irq_enabled)
> +				return;
> +			/* Fall through */
> +		case AP_RESPONSE_BUSY:
> +			msleep(20);
> +			break;
> +		case AP_RESPONSE_Q_NOT_AVAIL:
> +		case AP_RESPONSE_DECONFIGURED:
> +		case AP_RESPONSE_CHECKSTOPPED:
> +		default:
> +			WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__,
> +				  status.response_code, apqn);
> +			return;

Why not just break out of the loop and just use the WARN_ONCE
outside of the loop?

> +		}
> +	} while (--retry);
> +
> +	WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n",
> +		  __func__, status.response_code, apqn);
> +}
> +
> +/**
> + * vfio_ap_free_aqic_resources
> + * @q: The vfio_ap_queue
> + *
> + * Unregisters the ISC in the GIB when the saved ISC not invalid.
> + * Unpin the guest's page holding the NIB when it exist.
> + * Reset the saved_pfn and saved_isc to invalid values.
> + * Clear the pointer to the matrix mediated device.
> + *
> + */
> +static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q)
> +{
> +	if (q->saved_isc != VFIO_AP_ISC_INVALID && q->matrix_mdev)
> +		kvm_s390_gisc_unregister(q->matrix_mdev->kvm, q->saved_isc);
> +	if (q->saved_pfn && q->matrix_mdev)
> +		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev),
> +				 &q->saved_pfn, 1);
> +	q->saved_pfn = 0;
> +	q->saved_isc = VFIO_AP_ISC_INVALID;
> +	q->matrix_mdev = NULL;
> +}
> +
> +/**
> + * vfio_ap_irq_disable
> + * @q: The vfio_ap_queue
> + *
> + * Uses ap_aqic to disable the interruption and in case of success, reset
> + * in progress or IRQ disable command already proceeded: calls
> + * vfio_ap_wait_for_irqclear() to check for the IRQ bit to be clear
> + * and calls vfio_ap_free_aqic_resources() to free the resources associated
> + * with the AP interrupt handling.
> + *
> + * In the case the AP is busy, or a reset is in progress,
> + * retries after 20ms, up to 5 times.
> + *
> + * Returns if ap_aqic function failed with invalid, deconfigured or
> + * checkstopped AP.
> + */
> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
> +{
> +	struct ap_qirq_ctrl aqic_gisa = {};
> +	struct ap_queue_status status;
> +	int retries = 5;
> +
> +	do {
> +		status = ap_aqic(q->apqn, aqic_gisa, NULL);
> +		switch (status.response_code) {
> +		case AP_RESPONSE_OTHERWISE_CHANGED:
> +		case AP_RESPONSE_NORMAL:
> +			vfio_ap_wait_for_irqclear(q->apqn);
> +			goto end_free;
> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> +		case AP_RESPONSE_BUSY:
> +			msleep(20);
> +			break;
> +		case AP_RESPONSE_Q_NOT_AVAIL:
> +		case AP_RESPONSE_DECONFIGURED:
> +		case AP_RESPONSE_CHECKSTOPPED:
> +		case AP_RESPONSE_INVALID_ADDRESS:
> +		default:
> +			/* All cases in default means AP not operational */
> +			WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> +				  status.response_code);
> +			goto end_free;

Why not just break out of the loop instead of repeating the WARN_ONCE
message?

> +		}
> +	} while (retries--);
> +
> +	WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> +		  status.response_code);
> +end_free:
> +	vfio_ap_free_aqic_resources(q);
> +	return status;
> +}
> +
> +/**
> + * vfio_ap_setirq: Enable Interruption for a APQN
> + *
> + * @dev: the device associated with the ap_queue
> + * @q:   the vfio_ap_queue holding AQIC parameters
> + *
> + * Pin the NIB saved in *q
> + * Register the guest ISC to GIB interface and retrieve the
> + * host ISC to issue the host side PQAP/AQIC
> + *
> + * Response.status may be set to AP_RESPONSE_INVALID_ADDRESS in case the
> + * vfio_pin_pages failed.
> + *
> + * Otherwise return the ap_queue_status returned by the ap_aqic(),
> + * all retry handling will be done by the guest.
> + */
> +static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
> +						 int isc,
> +						 unsigned long nib)
> +{
> +	struct ap_qirq_ctrl aqic_gisa = {};
> +	struct ap_queue_status status = {};
> +	struct kvm_s390_gisa *gisa;
> +	struct kvm *kvm;
> +	unsigned long h_nib, g_pfn, h_pfn;
> +	int ret;
> +
> +	g_pfn = nib >> PAGE_SHIFT;
> +	ret = vfio_pin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1,
> +			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
> +	switch (ret) {
> +	case 1:
> +		break;
> +	default:
> +		status.response_code = AP_RESPONSE_INVALID_ADDRESS;
> +		return status;
> +	}
> +
> +	kvm = q->matrix_mdev->kvm;
> +	gisa = kvm->arch.gisa_int.origin;
> +
> +	h_nib = (h_pfn << PAGE_SHIFT) | (nib & ~PAGE_MASK);
> +	aqic_gisa.gisc = isc;
> +	aqic_gisa.isc = kvm_s390_gisc_register(kvm, isc);
> +	aqic_gisa.ir = 1;
> +	aqic_gisa.gisa = (uint64_t)gisa >> 4;
> +
> +	status = ap_aqic(q->apqn, aqic_gisa, (void *)h_nib);
> +	switch (status.response_code) {
> +	case AP_RESPONSE_NORMAL:
> +		/* See if we did clear older IRQ configuration */
> +		vfio_ap_free_aqic_resources(q);
> +		q->saved_pfn = g_pfn;
> +		q->saved_isc = isc;
> +		break;
> +	case AP_RESPONSE_OTHERWISE_CHANGED:
> +		/* We could not modify IRQ setings: clear new configuration */
> +		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1);
> +		kvm_s390_gisc_unregister(kvm, isc);
> +		break;
> +	default:
> +		pr_warn("%s: apqn %04x: response: %02x\n", __func__, q->apqn,
> +			status.response_code);
> +		vfio_ap_irq_disable(q);
> +		break;
> +	}
> +
> +	return status;
> +}
> +
> +/**
> + * handle_pqap: PQAP instruction callback
> + *
> + * @vcpu: The vcpu on which we received the PQAP instruction
> + *
> + * Get the general register contents to initialize internal variables.
> + * REG[0]: APQN
> + * REG[1]: IR and ISC
> + * REG[2]: NIB
> + *
> + * Response.status may be set to following Response Code:
> + * - AP_RESPONSE_Q_NOT_AVAIL: if the queue is not available
> + * - AP_RESPONSE_DECONFIGURED: if the queue is not configured
> + * - AP_RESPONSE_NORMAL (0) : in case of successs
> + *   Check vfio_ap_setirq() and vfio_ap_clrirq() for other possible RC.
> + * We take the matrix_dev lock to ensure serialization on queues and
> + * mediated device access.
> + *
> + * Return 0 if we could handle the request inside KVM.
> + * otherwise, returns -EOPNOTSUPP to let QEMU handle the fault.
> + */
> +static int handle_pqap(struct kvm_vcpu *vcpu)
> +{
> +	uint64_t status;
> +	uint16_t apqn;
> +	struct vfio_ap_queue *q;
> +	struct ap_queue_status qstatus = {
> +			       .response_code = AP_RESPONSE_Q_NOT_AVAIL, };
> +	struct ap_matrix_mdev *matrix_mdev;
> +
> +	/* If we do not use the AIV facility just go to userland */
> +	if (!(vcpu->arch.sie_block->eca & ECA_AIV))
> +		return -EOPNOTSUPP;
> +
> +	apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
> +	mutex_lock(&matrix_dev->lock);
> +
> +	if (!vcpu->kvm->arch.crypto.pqap_hook)

Wasn't this already checked in patch 2 prior to calling this
function? In fact, doesn't the hook point to this function?

> +		goto out_unlock;
> +	matrix_mdev = container_of(vcpu->kvm->arch.crypto.pqap_hook,
> +				   struct ap_matrix_mdev, pqap_hook);
> +
> +	q = vfio_ap_get_queue(matrix_mdev, apqn);
> +	if (!q)
> +		goto out_unlock;
> +
> +	status = vcpu->run->s.regs.gprs[1];
> +
> +	/* If IR bit(16) is set we enable the interrupt */
> +	if ((status >> (63 - 16)) & 0x01)
> +		qstatus = vfio_ap_irq_enable(q, status & 0x07,
> +					     vcpu->run->s.regs.gprs[2]);
> +	else
> +		qstatus = vfio_ap_irq_disable(q);
> +
> +out_unlock:
> +	memcpy(&vcpu->run->s.regs.gprs[1], &qstatus, sizeof(qstatus));
> +	vcpu->run->s.regs.gprs[1] >>= 32;
> +	mutex_unlock(&matrix_dev->lock);
> +	return 0;
> +}
> +
>   static void vfio_ap_matrix_init(struct ap_config_info *info,
>   				struct ap_matrix *matrix)
>   {
> @@ -45,8 +334,11 @@ static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
>   		return -ENOMEM;
>   	}
>   
> +	matrix_mdev->mdev = mdev;
>   	vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix);
>   	mdev_set_drvdata(mdev, matrix_mdev);
> +	matrix_mdev->pqap_hook.hook = handle_pqap;
> +	matrix_mdev->pqap_hook.owner = THIS_MODULE;
>   	mutex_lock(&matrix_dev->lock);
>   	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
>   	mutex_unlock(&matrix_dev->lock);
> @@ -62,6 +354,7 @@ static int vfio_ap_mdev_remove(struct mdev_device *mdev)
>   		return -EBUSY;
>   
>   	mutex_lock(&matrix_dev->lock);
> +	vfio_ap_mdev_reset_queues(mdev);
>   	list_del(&matrix_mdev->node);
>   	mutex_unlock(&matrix_dev->lock);
>   
> @@ -754,6 +1047,8 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
>   	}
>   
>   	matrix_mdev->kvm = kvm;
> +	kvm_get_kvm(kvm);
> +	kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook;
>   	mutex_unlock(&matrix_dev->lock);
>   
>   	return 0;
> @@ -819,15 +1114,36 @@ static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
>   	return NOTIFY_OK;
>   }
>   
> -static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> -				    unsigned int retry)
> +static void vfio_ap_irq_disable_apqn(int apqn)
> +{
> +	struct device *dev;
> +	struct vfio_ap_queue *q;
> +
> +	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
> +				 &apqn, match_apqn);
> +	if (dev) {
> +		q = dev_get_drvdata(dev);
> +		vfio_ap_irq_disable(q);
> +		put_device(dev);
> +	}
> +}
> +
> +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> +			     unsigned int retry)
>   {
>   	struct ap_queue_status status;
> +	int retry2 = 2;
> +	int apqn = AP_MKQID(apid, apqi);
>   
>   	do {
> -		status = ap_zapq(AP_MKQID(apid, apqi));
> +		status = ap_zapq(apqn);
>   		switch (status.response_code) {
>   		case AP_RESPONSE_NORMAL:
> +			while (!status.queue_empty && retry2--) {
> +				msleep(20);
> +				status = ap_tapq(apqn, NULL);
> +			}
> +			WARN_ON_ONCE(retry <= 0);
>   			return 0;
>   		case AP_RESPONSE_RESET_IN_PROGRESS:
>   		case AP_RESPONSE_BUSY:
> @@ -861,6 +1177,7 @@ static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev)
>   			 */
>   			if (ret)
>   				rc = ret;
> +			vfio_ap_irq_disable_apqn(AP_MKQID(apid, apqi));
>   		}
>   	}
>   
> @@ -904,15 +1221,20 @@ static void vfio_ap_mdev_release(struct mdev_device *mdev)
>   {
>   	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
>   
> -	if (matrix_mdev->kvm)
> +	mutex_lock(&matrix_dev->lock);
> +	if (matrix_mdev->kvm) {
>   		kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
> +		matrix_mdev->kvm->arch.crypto.pqap_hook = NULL;
> +		vfio_ap_mdev_reset_queues(mdev);
> +		kvm_put_kvm(matrix_mdev->kvm);
> +		matrix_mdev->kvm = NULL;
> +	}
> +	mutex_unlock(&matrix_dev->lock);
>   
> -	vfio_ap_mdev_reset_queues(mdev);
>   	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
>   				 &matrix_mdev->iommu_notifier);
>   	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
>   				 &matrix_mdev->group_notifier);
> -	matrix_mdev->kvm = NULL;
>   	module_put(THIS_MODULE);
>   }
>   
> @@ -941,6 +1263,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
>   {
>   	int ret;
>   
> +	mutex_lock(&matrix_dev->lock);
>   	switch (cmd) {
>   	case VFIO_DEVICE_GET_INFO:
>   		ret = vfio_ap_mdev_get_device_info(arg);
> @@ -952,6 +1275,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
>   		ret = -EOPNOTSUPP;
>   		break;
>   	}
> +	mutex_unlock(&matrix_dev->lock);
>   
>   	return ret;
>   }
> diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
> index 18dcc4d..f46dde5 100644
> --- a/drivers/s390/crypto/vfio_ap_private.h
> +++ b/drivers/s390/crypto/vfio_ap_private.h
> @@ -4,6 +4,7 @@
>    *
>    * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
>    *	      Halil Pasic <pasic@linux.ibm.com>
> + *	      Pierre Morel <pmorel@linux.ibm.com>
>    *
>    * Copyright IBM Corp. 2018
>    */
> @@ -89,5 +90,15 @@ struct ap_matrix_mdev {
>   
>   extern int vfio_ap_mdev_register(void);
>   extern void vfio_ap_mdev_unregister(void);
> +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> +			     unsigned int retry);
>   
> +struct vfio_ap_queue {
> +	struct ap_matrix_mdev *matrix_mdev;
> +	unsigned long saved_pfn;
> +	int	apqn;
> +#define VFIO_AP_ISC_INVALID 0xff
> +	unsigned char saved_isc;
> +};
> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q);
>   #endif /* _VFIO_AP_PRIVATE_H_ */
>
Halil Pasic June 7, 2019, 2:29 p.m. UTC | #3
On Tue, 4 Jun 2019 15:38:51 -0400
Tony Krowiak <akrowiak@linux.ibm.com> wrote:

> On 5/21/19 11:34 AM, Pierre Morel wrote:
> > We register a AP PQAP instruction hook during the open
> > of the mediated device. And unregister it on release.

[..]

> > +/**
> > + * vfio_ap_wait_for_irqclear
> > + * @apqn: The AP Queue number
> > + *
> > + * Checks the IRQ bit for the status of this APQN using ap_tapq.
> > + * Returns if the ap_tapq function succedded and the bit is clear.
> 
> s/succedded/succeeded/
> 

I'm gonna fix this up when picking.

> > + * Returns if ap_tapq function failed with invalid, deconfigured or
> > + * checkstopped AP.
> > + * Otherwise retries up to 5 times after waiting 20ms.
> > + *
> > + */
> > +static void vfio_ap_wait_for_irqclear(int apqn)
> > +{
> > +	struct ap_queue_status status;
> > +	int retry = 5;
> > +
> > +	do {
> > +		status = ap_tapq(apqn, NULL);
> > +		switch (status.response_code) {
> > +		case AP_RESPONSE_NORMAL:
> > +		case AP_RESPONSE_RESET_IN_PROGRESS:
> > +			if (!status.irq_enabled)
> > +				return;
> > +			/* Fall through */
> > +		case AP_RESPONSE_BUSY:
> > +			msleep(20);
> > +			break;
> > +		case AP_RESPONSE_Q_NOT_AVAIL:
> > +		case AP_RESPONSE_DECONFIGURED:
> > +		case AP_RESPONSE_CHECKSTOPPED:
> > +		default:
> > +			WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__,
> > +				  status.response_code, apqn);
> > +			return;
> 
> Why not just break out of the loop and just use the WARN_ONCE
> outside of the loop?
> 

AFAIU the idea was to differentiate between got a strange response_code
and ran out of retires.

Actually I suspect that we are fine in case of AP_RESPONSE_Q_NOT_AVAIL,
 AP_RESPONSE_DECONFIGURED and AP_RESPONSE_CHECKSTOPPED in a sense that
what should be the post-condition of this function is guaranteed to be
reached. What do you think?

While I think that we can do better here, I see this as something that
should be done on top.

> > +		}
> > +	} while (--retry);
> > +
> > +	WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n",
> > +		  __func__, status.response_code, apqn);
> > +}
> > +
> > +/**
> > + * vfio_ap_free_aqic_resources
> > + * @q: The vfio_ap_queue
> > + *
> > + * Unregisters the ISC in the GIB when the saved ISC not invalid.
> > + * Unpin the guest's page holding the NIB when it exist.
> > + * Reset the saved_pfn and saved_isc to invalid values.
> > + * Clear the pointer to the matrix mediated device.
> > + *
> > + */

[..]

> > +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
> > +{
> > +	struct ap_qirq_ctrl aqic_gisa = {};
> > +	struct ap_queue_status status;
> > +	int retries = 5;
> > +
> > +	do {
> > +		status = ap_aqic(q->apqn, aqic_gisa, NULL);
> > +		switch (status.response_code) {
> > +		case AP_RESPONSE_OTHERWISE_CHANGED:
> > +		case AP_RESPONSE_NORMAL:
> > +			vfio_ap_wait_for_irqclear(q->apqn);
> > +			goto end_free;
> > +		case AP_RESPONSE_RESET_IN_PROGRESS:
> > +		case AP_RESPONSE_BUSY:
> > +			msleep(20);
> > +			break;
> > +		case AP_RESPONSE_Q_NOT_AVAIL:
> > +		case AP_RESPONSE_DECONFIGURED:
> > +		case AP_RESPONSE_CHECKSTOPPED:
> > +		case AP_RESPONSE_INVALID_ADDRESS:
> > +		default:
> > +			/* All cases in default means AP not operational */
> > +			WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> > +				  status.response_code);
> > +			goto end_free;
> 
> Why not just break out of the loop instead of repeating the WARN_ONCE
> message?
> 

I suppose the reason is same as above. I'm not entirely happy with this
code myself. E.g. why do we do retries here -- shouldn't we just fail the
aqic by the guest?

[..]

> > +static int handle_pqap(struct kvm_vcpu *vcpu)
> > +{
> > +	uint64_t status;
> > +	uint16_t apqn;
> > +	struct vfio_ap_queue *q;
> > +	struct ap_queue_status qstatus = {
> > +			       .response_code = AP_RESPONSE_Q_NOT_AVAIL, };
> > +	struct ap_matrix_mdev *matrix_mdev;
> > +
> > +	/* If we do not use the AIV facility just go to userland */
> > +	if (!(vcpu->arch.sie_block->eca & ECA_AIV))
> > +		return -EOPNOTSUPP;
> > +
> > +	apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
> > +	mutex_lock(&matrix_dev->lock);
> > +
> > +	if (!vcpu->kvm->arch.crypto.pqap_hook)
> 
> Wasn't this already checked in patch 2 prior to calling this
> function? In fact, doesn't the hook point to this function?
> 

Let us benevolently call this defensive programming. We are actually
in that callback AFAICT, so it sure was set a moment ago, and I guess
the client code still holds the kvm.lock so it is guaranteed to stay
so unless somebody is playing foul.

We can address this with a patch on top.

Regards,
Halil
Anthony Krowiak June 11, 2019, 2:37 p.m. UTC | #4
On 6/7/19 10:29 AM, Halil Pasic wrote:
> On Tue, 4 Jun 2019 15:38:51 -0400
> Tony Krowiak <akrowiak@linux.ibm.com> wrote:
> 
>> On 5/21/19 11:34 AM, Pierre Morel wrote:
>>> We register a AP PQAP instruction hook during the open
>>> of the mediated device. And unregister it on release.
> 
> [..]
> 
>>> +/**
>>> + * vfio_ap_wait_for_irqclear
>>> + * @apqn: The AP Queue number
>>> + *
>>> + * Checks the IRQ bit for the status of this APQN using ap_tapq.
>>> + * Returns if the ap_tapq function succedded and the bit is clear.
>>
>> s/succedded/succeeded/
>>
> 
> I'm gonna fix this up when picking.
> 
>>> + * Returns if ap_tapq function failed with invalid, deconfigured or
>>> + * checkstopped AP.
>>> + * Otherwise retries up to 5 times after waiting 20ms.
>>> + *
>>> + */
>>> +static void vfio_ap_wait_for_irqclear(int apqn)
>>> +{
>>> +	struct ap_queue_status status;
>>> +	int retry = 5;
>>> +
>>> +	do {
>>> +		status = ap_tapq(apqn, NULL);
>>> +		switch (status.response_code) {
>>> +		case AP_RESPONSE_NORMAL:
>>> +		case AP_RESPONSE_RESET_IN_PROGRESS:
>>> +			if (!status.irq_enabled)
>>> +				return;
>>> +			/* Fall through */
>>> +		case AP_RESPONSE_BUSY:
>>> +			msleep(20);
>>> +			break;
>>> +		case AP_RESPONSE_Q_NOT_AVAIL:
>>> +		case AP_RESPONSE_DECONFIGURED:
>>> +		case AP_RESPONSE_CHECKSTOPPED:
>>> +		default:
>>> +			WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__,
>>> +				  status.response_code, apqn);
>>> +			return;
>>
>> Why not just break out of the loop and just use the WARN_ONCE
>> outside of the loop?
>>
> 
> AFAIU the idea was to differentiate between got a strange response_code
> and ran out of retires.

In both cases, the response code is placed into the message, so one
should be able to discern the reason in either case. This is not
critical, just an observation.

> 
> Actually I suspect that we are fine in case of AP_RESPONSE_Q_NOT_AVAIL,
>   AP_RESPONSE_DECONFIGURED and AP_RESPONSE_CHECKSTOPPED in a sense that
> what should be the post-condition of this function is guaranteed to be
> reached. What do you think?

That would seem to be the case given those response codes indicate the
queue is not accessible.

> 
> While I think that we can do better here, I see this as something that
> should be done on top.

Are you talking about a patch on top? What do you think needs to be
addressed?

> 
>>> +		}
>>> +	} while (--retry);
>>> +
>>> +	WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n",
>>> +		  __func__, status.response_code, apqn);
>>> +}
>>> +
>>> +/**
>>> + * vfio_ap_free_aqic_resources
>>> + * @q: The vfio_ap_queue
>>> + *
>>> + * Unregisters the ISC in the GIB when the saved ISC not invalid.
>>> + * Unpin the guest's page holding the NIB when it exist.
>>> + * Reset the saved_pfn and saved_isc to invalid values.
>>> + * Clear the pointer to the matrix mediated device.
>>> + *
>>> + */
> 
> [..]
> 
>>> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
>>> +{
>>> +	struct ap_qirq_ctrl aqic_gisa = {};
>>> +	struct ap_queue_status status;
>>> +	int retries = 5;
>>> +
>>> +	do {
>>> +		status = ap_aqic(q->apqn, aqic_gisa, NULL);
>>> +		switch (status.response_code) {
>>> +		case AP_RESPONSE_OTHERWISE_CHANGED:
>>> +		case AP_RESPONSE_NORMAL:
>>> +			vfio_ap_wait_for_irqclear(q->apqn);
>>> +			goto end_free;
>>> +		case AP_RESPONSE_RESET_IN_PROGRESS:
>>> +		case AP_RESPONSE_BUSY:
>>> +			msleep(20);
>>> +			break;
>>> +		case AP_RESPONSE_Q_NOT_AVAIL:
>>> +		case AP_RESPONSE_DECONFIGURED:
>>> +		case AP_RESPONSE_CHECKSTOPPED:
>>> +		case AP_RESPONSE_INVALID_ADDRESS:
>>> +		default:
>>> +			/* All cases in default means AP not operational */
>>> +			WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
>>> +				  status.response_code);
>>> +			goto end_free;
>>
>> Why not just break out of the loop instead of repeating the WARN_ONCE
>> message?
>>
> 
> I suppose the reason is same as above. I'm not entirely happy with this
> code myself. E.g. why do we do retries here -- shouldn't we just fail the
> aqic by the guest?

According to my reading of the code, it looks like the retries are for
response code AP_RESPONSE_BUSY. Why wouldn't we want to wait until the
queue was not busy anymore?

> 
> [..]
> 
>>> +static int handle_pqap(struct kvm_vcpu *vcpu)
>>> +{
>>> +	uint64_t status;
>>> +	uint16_t apqn;
>>> +	struct vfio_ap_queue *q;
>>> +	struct ap_queue_status qstatus = {
>>> +			       .response_code = AP_RESPONSE_Q_NOT_AVAIL, };
>>> +	struct ap_matrix_mdev *matrix_mdev;
>>> +
>>> +	/* If we do not use the AIV facility just go to userland */
>>> +	if (!(vcpu->arch.sie_block->eca & ECA_AIV))
>>> +		return -EOPNOTSUPP;
>>> +
>>> +	apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
>>> +	mutex_lock(&matrix_dev->lock);
>>> +
>>> +	if (!vcpu->kvm->arch.crypto.pqap_hook)
>>
>> Wasn't this already checked in patch 2 prior to calling this
>> function? In fact, doesn't the hook point to this function?
>>
> 
> Let us benevolently call this defensive programming. We are actually
> in that callback AFAICT, so it sure was set a moment ago, and I guess
> the client code still holds the kvm.lock so it is guaranteed to stay
> so unless somebody is playing foul.

Defensive, but completely unnecessary; however, it doesn't negatively
affect the logic in the least.

> 
> We can address this with a patch on top.
> 
> Regards,
> Halil
>
Halil Pasic June 11, 2019, 3 p.m. UTC | #5
On Tue, 11 Jun 2019 10:37:55 -0400
Tony Krowiak <akrowiak@linux.ibm.com> wrote:

> On 6/7/19 10:29 AM, Halil Pasic wrote:
> > On Tue, 4 Jun 2019 15:38:51 -0400
> > Tony Krowiak <akrowiak@linux.ibm.com> wrote:

[..]

> >>> +static void vfio_ap_wait_for_irqclear(int apqn)
> >>> +{
> >>> +	struct ap_queue_status status;
> >>> +	int retry = 5;
> >>> +
> >>> +	do {
> >>> +		status = ap_tapq(apqn, NULL);
> >>> +		switch (status.response_code) {
> >>> +		case AP_RESPONSE_NORMAL:
> >>> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> >>> +			if (!status.irq_enabled)
> >>> +				return;
> >>> +			/* Fall through */
> >>> +		case AP_RESPONSE_BUSY:
> >>> +			msleep(20);
> >>> +			break;
> >>> +		case AP_RESPONSE_Q_NOT_AVAIL:
> >>> +		case AP_RESPONSE_DECONFIGURED:
> >>> +		case AP_RESPONSE_CHECKSTOPPED:
> >>> +		default:
> >>> +			WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__,
> >>> +				  status.response_code, apqn);
> >>> +			return;
> >>
> >> Why not just break out of the loop and just use the WARN_ONCE
> >> outside of the loop?
> >>
> > 
> > AFAIU the idea was to differentiate between got a strange response_code
> > and ran out of retires.
> 
> In both cases, the response code is placed into the message, so one
> should be able to discern the reason in either case. This is not
> critical, just an observation.
> 

I understand, but the message below does say 'could not clear' while
the message above does not. One could infer that information, but I
could not do it without digging. So I think keeping these separate does
have a certain merit to it.

Let's keep it for now. We can change this later if we want.

> > 
> > Actually I suspect that we are fine in case of AP_RESPONSE_Q_NOT_AVAIL,
> >   AP_RESPONSE_DECONFIGURED and AP_RESPONSE_CHECKSTOPPED in a sense that
> > what should be the post-condition of this function is guaranteed to be
> > reached. What do you think?
> 
> That would seem to be the case given those response codes indicate the
> queue is not accessible.
> 
> > 
> > While I think that we can do better here, I see this as something that
> > should be done on top.
> 
> Are you talking about a patch on top? What do you think needs to be
> addressed?
> 

For starters, I'm not sure if the first warning is necessary or even
appropriate. See the paragraph starting with 'Actually I suspect that we
are fine in case ...'.

> > 
> >>> +		}
> >>> +	} while (--retry);
> >>> +
> >>> +	WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n",
> >>> +		  __func__, status.response_code, apqn);
> >>> +}
> >>> +
> >>> +/**
> >>> + * vfio_ap_free_aqic_resources
> >>> + * @q: The vfio_ap_queue
> >>> + *
> >>> + * Unregisters the ISC in the GIB when the saved ISC not invalid.
> >>> + * Unpin the guest's page holding the NIB when it exist.
> >>> + * Reset the saved_pfn and saved_isc to invalid values.
> >>> + * Clear the pointer to the matrix mediated device.
> >>> + *
> >>> + */
> > 
> > [..]
> > 
> >>> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
> >>> +{
> >>> +	struct ap_qirq_ctrl aqic_gisa = {};
> >>> +	struct ap_queue_status status;
> >>> +	int retries = 5;
> >>> +
> >>> +	do {
> >>> +		status = ap_aqic(q->apqn, aqic_gisa, NULL);
> >>> +		switch (status.response_code) {
> >>> +		case AP_RESPONSE_OTHERWISE_CHANGED:
> >>> +		case AP_RESPONSE_NORMAL:
> >>> +			vfio_ap_wait_for_irqclear(q->apqn);
> >>> +			goto end_free;
> >>> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> >>> +		case AP_RESPONSE_BUSY:
> >>> +			msleep(20);
> >>> +			break;
> >>> +		case AP_RESPONSE_Q_NOT_AVAIL:
> >>> +		case AP_RESPONSE_DECONFIGURED:
> >>> +		case AP_RESPONSE_CHECKSTOPPED:
> >>> +		case AP_RESPONSE_INVALID_ADDRESS:
> >>> +		default:
> >>> +			/* All cases in default means AP not operational */
> >>> +			WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> >>> +				  status.response_code);
> >>> +			goto end_free;
> >>
> >> Why not just break out of the loop instead of repeating the WARN_ONCE
> >> message?
> >>
> > 
> > I suppose the reason is same as above. I'm not entirely happy with this
> > code myself. E.g. why do we do retries here -- shouldn't we just fail the
> > aqic by the guest?
> 
> According to my reading of the code, it looks like the retries are for
> response code AP_RESPONSE_BUSY. Why wouldn't we want to wait until the
> queue was not busy anymore?
> 

Does HW/FW wait or does it present AP_RESPONSE_BUSY? (Rhetoric
question.)  It is for the guest to decide if and how does it wish to
wait or otherwise react to AP_RESPONSE_BUSY. Or am I missing something?

> > 
> > [..]
> > 
> >>> +static int handle_pqap(struct kvm_vcpu *vcpu)
> >>> +{
> >>> +	uint64_t status;
> >>> +	uint16_t apqn;
> >>> +	struct vfio_ap_queue *q;
> >>> +	struct ap_queue_status qstatus = {
> >>> +			       .response_code = AP_RESPONSE_Q_NOT_AVAIL, };
> >>> +	struct ap_matrix_mdev *matrix_mdev;
> >>> +
> >>> +	/* If we do not use the AIV facility just go to userland */
> >>> +	if (!(vcpu->arch.sie_block->eca & ECA_AIV))
> >>> +		return -EOPNOTSUPP;
> >>> +
> >>> +	apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
> >>> +	mutex_lock(&matrix_dev->lock);
> >>> +
> >>> +	if (!vcpu->kvm->arch.crypto.pqap_hook)
> >>
> >> Wasn't this already checked in patch 2 prior to calling this
> >> function? In fact, doesn't the hook point to this function?
> >>
> > 
> > Let us benevolently call this defensive programming. We are actually
> > in that callback AFAICT, so it sure was set a moment ago, and I guess
> > the client code still holds the kvm.lock so it is guaranteed to stay
> > so unless somebody is playing foul.
> 
> Defensive, but completely unnecessary; however, it doesn't negatively
> affect the logic in the least.
>

I agree it is unnecessary. We can get rid of it later. I'm not too keen
of altering somebody's patch without a really strong reason.
 
> > 
> > We can address this with a patch on top.
> > 

[..]

Regards,
Halil
Harald Freudenberger June 12, 2019, 1:56 p.m. UTC | #6
On 21.05.19 17:34, Pierre Morel wrote:
> We register a AP PQAP instruction hook during the open
> of the mediated device. And unregister it on release.
>
> During the probe of the AP device, we allocate a vfio_ap_queue
> structure to keep track of the information we need for the
> PQAP/AQIC instruction interception.
>
> In the AP PQAP instruction hook, if we receive a demand to
> enable IRQs,
> - we retrieve the vfio_ap_queue based on the APQN we receive
>   in REG1,
> - we retrieve the page of the guest address, (NIB), from
>   register REG2
> - we retrieve the mediated device to use the VFIO pinning
>   infrastructure to pin the page of the guest address,
> - we retrieve the pointer to KVM to register the guest ISC
>   and retrieve the host ISC
> - finaly we activate GISA
>
> If we receive a demand to disable IRQs,
> - we deactivate GISA
> - unregister from the GIB
> - unpin the NIB
>
> When removing the AP device from the driver the device is
> reseted and this process unregisters the GISA from the GIB,
> and unpins the NIB address then we free the vfio_ap_queue
> structure.
>
> Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
> ---
>  drivers/s390/crypto/vfio_ap_drv.c     |  34 +++-
>  drivers/s390/crypto/vfio_ap_ops.c     | 336 +++++++++++++++++++++++++++++++++-
>  drivers/s390/crypto/vfio_ap_private.h |  11 ++
>  3 files changed, 374 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
> index e9824c3..003662a 100644
> --- a/drivers/s390/crypto/vfio_ap_drv.c
> +++ b/drivers/s390/crypto/vfio_ap_drv.c
> @@ -5,6 +5,7 @@
>   * Copyright IBM Corp. 2018
>   *
>   * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
> + *	      Pierre Morel <pmorel@linux.ibm.com>
>   */
>  
>  #include <linux/module.h>
> @@ -40,14 +41,45 @@ static struct ap_device_id ap_queue_ids[] = {
>  
>  MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids);
>  
> +/**
> + * vfio_ap_queue_dev_probe:
> + *
> + * Allocate a vfio_ap_queue structure and associate it
> + * with the device as driver_data.
> + */
>  static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
>  {
> +	struct vfio_ap_queue *q;
> +
> +	q = kzalloc(sizeof(*q), GFP_KERNEL);
> +	if (!q)
> +		return -ENOMEM;
> +	dev_set_drvdata(&apdev->device, q);
> +	q->apqn = to_ap_queue(&apdev->device)->qid;
> +	q->saved_isc = VFIO_AP_ISC_INVALID;
>  	return 0;
>  }
>  
> +/**
> + * vfio_ap_queue_dev_remove:
> + *
> + * Takes the matrix lock to avoid actions on this device while removing
> + * Free the associated vfio_ap_queue structure
> + */
>  static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
>  {
> -	/* Nothing to do yet */
> +	struct vfio_ap_queue *q;
> +	int apid, apqi;
> +
> +	mutex_lock(&matrix_dev->lock);
> +	q = dev_get_drvdata(&apdev->device);
> +	dev_set_drvdata(&apdev->device, NULL);
> +	apid = AP_QID_CARD(q->apqn);
> +	apqi = AP_QID_QUEUE(q->apqn);
> +	vfio_ap_mdev_reset_queue(apid, apqi, 1);
> +	vfio_ap_irq_disable(q);
> +	kfree(q);
> +	mutex_unlock(&matrix_dev->lock);
>  }
>  
>  static void vfio_ap_matrix_dev_release(struct device *dev)
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
> index e8e87bf..015174f 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -24,6 +24,295 @@
>  #define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough"
>  #define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device"
>  
> +static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev);
> +
> +static int match_apqn(struct device *dev, void *data)
> +{
> +	struct vfio_ap_queue *q = dev_get_drvdata(dev);
> +
> +	return (q->apqn == *(int *)(data)) ? 1 : 0;
> +}
> +
> +/**
> + * vfio_ap_get_queue: Retrieve a queue with a specific APQN from a list
> + * @matrix_mdev: the associated mediated matrix
> + * @apqn: The queue APQN
> + *
> + * Retrieve a queue with a specific APQN from the list of the
> + * devices of the vfio_ap_drv.
> + * Verify that the APID and the APQI are set in the matrix.
> + *
> + * Returns the pointer to the associated vfio_ap_queue
> + */
> +struct vfio_ap_queue *vfio_ap_get_queue(struct ap_matrix_mdev *matrix_mdev,
> +					int apqn)
> +{
> +	struct vfio_ap_queue *q;
> +	struct device *dev;
> +
> +	if (!test_bit_inv(AP_QID_CARD(apqn), matrix_mdev->matrix.apm))
> +		return NULL;
> +	if (!test_bit_inv(AP_QID_QUEUE(apqn), matrix_mdev->matrix.aqm))
> +		return NULL;
> +
> +	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
> +				 &apqn, match_apqn);
> +	if (!dev)
> +		return NULL;
> +	q = dev_get_drvdata(dev);
> +	q->matrix_mdev = matrix_mdev;
> +	put_device(dev);
> +
> +	return q;
> +}
> +
> +/**
> + * vfio_ap_wait_for_irqclear
> + * @apqn: The AP Queue number
> + *
> + * Checks the IRQ bit for the status of this APQN using ap_tapq.
> + * Returns if the ap_tapq function succedded and the bit is clear.
> + * Returns if ap_tapq function failed with invalid, deconfigured or
> + * checkstopped AP.
> + * Otherwise retries up to 5 times after waiting 20ms.
> + *
> + */
> +static void vfio_ap_wait_for_irqclear(int apqn)
> +{
> +	struct ap_queue_status status;
> +	int retry = 5;
> +
> +	do {
> +		status = ap_tapq(apqn, NULL);
> +		switch (status.response_code) {
> +		case AP_RESPONSE_NORMAL:
> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> +			if (!status.irq_enabled)
> +				return;
> +			/* Fall through */
> +		case AP_RESPONSE_BUSY:
> +			msleep(20);
> +			break;
> +		case AP_RESPONSE_Q_NOT_AVAIL:
> +		case AP_RESPONSE_DECONFIGURED:
> +		case AP_RESPONSE_CHECKSTOPPED:
> +		default:
> +			WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__,
> +				  status.response_code, apqn);
> +			return;
> +		}
> +	} while (--retry);
> +
> +	WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n",
> +		  __func__, status.response_code, apqn);
> +}
> +
> +/**
> + * vfio_ap_free_aqic_resources
> + * @q: The vfio_ap_queue
> + *
> + * Unregisters the ISC in the GIB when the saved ISC not invalid.
> + * Unpin the guest's page holding the NIB when it exist.
> + * Reset the saved_pfn and saved_isc to invalid values.
> + * Clear the pointer to the matrix mediated device.
> + *
> + */
> +static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q)
> +{
> +	if (q->saved_isc != VFIO_AP_ISC_INVALID && q->matrix_mdev)
> +		kvm_s390_gisc_unregister(q->matrix_mdev->kvm, q->saved_isc);
> +	if (q->saved_pfn && q->matrix_mdev)
> +		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev),
> +				 &q->saved_pfn, 1);
> +	q->saved_pfn = 0;
> +	q->saved_isc = VFIO_AP_ISC_INVALID;
> +	q->matrix_mdev = NULL;
> +}
> +
> +/**
> + * vfio_ap_irq_disable
> + * @q: The vfio_ap_queue
> + *
> + * Uses ap_aqic to disable the interruption and in case of success, reset
> + * in progress or IRQ disable command already proceeded: calls
> + * vfio_ap_wait_for_irqclear() to check for the IRQ bit to be clear
> + * and calls vfio_ap_free_aqic_resources() to free the resources associated
> + * with the AP interrupt handling.
> + *
> + * In the case the AP is busy, or a reset is in progress,
> + * retries after 20ms, up to 5 times.
> + *
> + * Returns if ap_aqic function failed with invalid, deconfigured or
> + * checkstopped AP.
> + */
> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
> +{
> +	struct ap_qirq_ctrl aqic_gisa = {};
> +	struct ap_queue_status status;
> +	int retries = 5;
> +
> +	do {
> +		status = ap_aqic(q->apqn, aqic_gisa, NULL);
> +		switch (status.response_code) {
> +		case AP_RESPONSE_OTHERWISE_CHANGED:
> +		case AP_RESPONSE_NORMAL:
> +			vfio_ap_wait_for_irqclear(q->apqn);
> +			goto end_free;
> +		case AP_RESPONSE_RESET_IN_PROGRESS:
> +		case AP_RESPONSE_BUSY:
> +			msleep(20);
> +			break;
> +		case AP_RESPONSE_Q_NOT_AVAIL:
> +		case AP_RESPONSE_DECONFIGURED:
> +		case AP_RESPONSE_CHECKSTOPPED:
> +		case AP_RESPONSE_INVALID_ADDRESS:
> +		default:
> +			/* All cases in default means AP not operational */
> +			WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> +				  status.response_code);
> +			goto end_free;
> +		}
> +	} while (retries--);
> +
> +	WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
> +		  status.response_code);
> +end_free:
> +	vfio_ap_free_aqic_resources(q);
> +	return status;
> +}
> +
> +/**
> + * vfio_ap_setirq: Enable Interruption for a APQN
> + *
> + * @dev: the device associated with the ap_queue
> + * @q:   the vfio_ap_queue holding AQIC parameters
> + *
> + * Pin the NIB saved in *q
> + * Register the guest ISC to GIB interface and retrieve the
> + * host ISC to issue the host side PQAP/AQIC
> + *
> + * Response.status may be set to AP_RESPONSE_INVALID_ADDRESS in case the
> + * vfio_pin_pages failed.
> + *
> + * Otherwise return the ap_queue_status returned by the ap_aqic(),
> + * all retry handling will be done by the guest.
> + */
> +static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
> +						 int isc,
> +						 unsigned long nib)
> +{
> +	struct ap_qirq_ctrl aqic_gisa = {};
> +	struct ap_queue_status status = {};
> +	struct kvm_s390_gisa *gisa;
> +	struct kvm *kvm;
> +	unsigned long h_nib, g_pfn, h_pfn;
> +	int ret;
> +
> +	g_pfn = nib >> PAGE_SHIFT;
> +	ret = vfio_pin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1,
> +			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
> +	switch (ret) {
> +	case 1:
> +		break;
> +	default:
> +		status.response_code = AP_RESPONSE_INVALID_ADDRESS;
> +		return status;
> +	}
> +
> +	kvm = q->matrix_mdev->kvm;
> +	gisa = kvm->arch.gisa_int.origin;
> +
> +	h_nib = (h_pfn << PAGE_SHIFT) | (nib & ~PAGE_MASK);
> +	aqic_gisa.gisc = isc;
> +	aqic_gisa.isc = kvm_s390_gisc_register(kvm, isc);
> +	aqic_gisa.ir = 1;
> +	aqic_gisa.gisa = (uint64_t)gisa >> 4;
> +
> +	status = ap_aqic(q->apqn, aqic_gisa, (void *)h_nib);
> +	switch (status.response_code) {
> +	case AP_RESPONSE_NORMAL:
> +		/* See if we did clear older IRQ configuration */
> +		vfio_ap_free_aqic_resources(q);
> +		q->saved_pfn = g_pfn;
> +		q->saved_isc = isc;
> +		break;
> +	case AP_RESPONSE_OTHERWISE_CHANGED:
> +		/* We could not modify IRQ setings: clear new configuration */
> +		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1);
> +		kvm_s390_gisc_unregister(kvm, isc);
> +		break;
> +	default:
> +		pr_warn("%s: apqn %04x: response: %02x\n", __func__, q->apqn,
> +			status.response_code);
> +		vfio_ap_irq_disable(q);
> +		break;
> +	}
> +
> +	return status;
> +}
> +
> +/**
> + * handle_pqap: PQAP instruction callback
> + *
> + * @vcpu: The vcpu on which we received the PQAP instruction
> + *
> + * Get the general register contents to initialize internal variables.
> + * REG[0]: APQN
> + * REG[1]: IR and ISC
> + * REG[2]: NIB
> + *
> + * Response.status may be set to following Response Code:
> + * - AP_RESPONSE_Q_NOT_AVAIL: if the queue is not available
> + * - AP_RESPONSE_DECONFIGURED: if the queue is not configured
> + * - AP_RESPONSE_NORMAL (0) : in case of successs
> + *   Check vfio_ap_setirq() and vfio_ap_clrirq() for other possible RC.
> + * We take the matrix_dev lock to ensure serialization on queues and
> + * mediated device access.
> + *
> + * Return 0 if we could handle the request inside KVM.
> + * otherwise, returns -EOPNOTSUPP to let QEMU handle the fault.
> + */
> +static int handle_pqap(struct kvm_vcpu *vcpu)
> +{
> +	uint64_t status;
> +	uint16_t apqn;
> +	struct vfio_ap_queue *q;
> +	struct ap_queue_status qstatus = {
> +			       .response_code = AP_RESPONSE_Q_NOT_AVAIL, };
> +	struct ap_matrix_mdev *matrix_mdev;
> +
> +	/* If we do not use the AIV facility just go to userland */
> +	if (!(vcpu->arch.sie_block->eca & ECA_AIV))
> +		return -EOPNOTSUPP;
> +
> +	apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
> +	mutex_lock(&matrix_dev->lock);
> +
> +	if (!vcpu->kvm->arch.crypto.pqap_hook)
> +		goto out_unlock;
> +	matrix_mdev = container_of(vcpu->kvm->arch.crypto.pqap_hook,
> +				   struct ap_matrix_mdev, pqap_hook);
> +
> +	q = vfio_ap_get_queue(matrix_mdev, apqn);
> +	if (!q)
> +		goto out_unlock;
> +
> +	status = vcpu->run->s.regs.gprs[1];
> +
> +	/* If IR bit(16) is set we enable the interrupt */
> +	if ((status >> (63 - 16)) & 0x01)
> +		qstatus = vfio_ap_irq_enable(q, status & 0x07,
> +					     vcpu->run->s.regs.gprs[2]);
> +	else
> +		qstatus = vfio_ap_irq_disable(q);
> +
> +out_unlock:
> +	memcpy(&vcpu->run->s.regs.gprs[1], &qstatus, sizeof(qstatus));
> +	vcpu->run->s.regs.gprs[1] >>= 32;
> +	mutex_unlock(&matrix_dev->lock);
> +	return 0;
> +}
> +
>  static void vfio_ap_matrix_init(struct ap_config_info *info,
>  				struct ap_matrix *matrix)
>  {
> @@ -45,8 +334,11 @@ static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
>  		return -ENOMEM;
>  	}
>  
> +	matrix_mdev->mdev = mdev;
>  	vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix);
>  	mdev_set_drvdata(mdev, matrix_mdev);
> +	matrix_mdev->pqap_hook.hook = handle_pqap;
> +	matrix_mdev->pqap_hook.owner = THIS_MODULE;
>  	mutex_lock(&matrix_dev->lock);
>  	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
>  	mutex_unlock(&matrix_dev->lock);
> @@ -62,6 +354,7 @@ static int vfio_ap_mdev_remove(struct mdev_device *mdev)
>  		return -EBUSY;
>  
>  	mutex_lock(&matrix_dev->lock);
> +	vfio_ap_mdev_reset_queues(mdev);
>  	list_del(&matrix_mdev->node);
>  	mutex_unlock(&matrix_dev->lock);
>  
> @@ -754,6 +1047,8 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
>  	}
>  
>  	matrix_mdev->kvm = kvm;
> +	kvm_get_kvm(kvm);
> +	kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook;
>  	mutex_unlock(&matrix_dev->lock);
>  
>  	return 0;
> @@ -819,15 +1114,36 @@ static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
>  	return NOTIFY_OK;
>  }
>  
> -static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> -				    unsigned int retry)
> +static void vfio_ap_irq_disable_apqn(int apqn)
> +{
> +	struct device *dev;
> +	struct vfio_ap_queue *q;
> +
> +	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
> +				 &apqn, match_apqn);
> +	if (dev) {
> +		q = dev_get_drvdata(dev);
> +		vfio_ap_irq_disable(q);
> +		put_device(dev);
> +	}
> +}
> +
> +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> +			     unsigned int retry)
>  {
>  	struct ap_queue_status status;
> +	int retry2 = 2;
> +	int apqn = AP_MKQID(apid, apqi);
>  
>  	do {
> -		status = ap_zapq(AP_MKQID(apid, apqi));
> +		status = ap_zapq(apqn);
>  		switch (status.response_code) {
>  		case AP_RESPONSE_NORMAL:
> +			while (!status.queue_empty && retry2--) {
> +				msleep(20);
> +				status = ap_tapq(apqn, NULL);
> +			}
> +			WARN_ON_ONCE(retry <= 0);
>  			return 0;
>  		case AP_RESPONSE_RESET_IN_PROGRESS:
>  		case AP_RESPONSE_BUSY:
> @@ -861,6 +1177,7 @@ static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev)
>  			 */
>  			if (ret)
>  				rc = ret;
> +			vfio_ap_irq_disable_apqn(AP_MKQID(apid, apqi));
>  		}
>  	}
>  
> @@ -904,15 +1221,20 @@ static void vfio_ap_mdev_release(struct mdev_device *mdev)
>  {
>  	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
>  
> -	if (matrix_mdev->kvm)
> +	mutex_lock(&matrix_dev->lock);
> +	if (matrix_mdev->kvm) {
>  		kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
> +		matrix_mdev->kvm->arch.crypto.pqap_hook = NULL;
> +		vfio_ap_mdev_reset_queues(mdev);
> +		kvm_put_kvm(matrix_mdev->kvm);
> +		matrix_mdev->kvm = NULL;
> +	}
> +	mutex_unlock(&matrix_dev->lock);
>  
> -	vfio_ap_mdev_reset_queues(mdev);
>  	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
>  				 &matrix_mdev->iommu_notifier);
>  	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
>  				 &matrix_mdev->group_notifier);
> -	matrix_mdev->kvm = NULL;
>  	module_put(THIS_MODULE);
>  }
>  
> @@ -941,6 +1263,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
>  {
>  	int ret;
>  
> +	mutex_lock(&matrix_dev->lock);
>  	switch (cmd) {
>  	case VFIO_DEVICE_GET_INFO:
>  		ret = vfio_ap_mdev_get_device_info(arg);
> @@ -952,6 +1275,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
>  		ret = -EOPNOTSUPP;
>  		break;
>  	}
> +	mutex_unlock(&matrix_dev->lock);
>  
>  	return ret;
>  }
> diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
> index 18dcc4d..f46dde5 100644
> --- a/drivers/s390/crypto/vfio_ap_private.h
> +++ b/drivers/s390/crypto/vfio_ap_private.h
> @@ -4,6 +4,7 @@
>   *
>   * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
>   *	      Halil Pasic <pasic@linux.ibm.com>
> + *	      Pierre Morel <pmorel@linux.ibm.com>
>   *
>   * Copyright IBM Corp. 2018
>   */
> @@ -89,5 +90,15 @@ struct ap_matrix_mdev {
>  
>  extern int vfio_ap_mdev_register(void);
>  extern void vfio_ap_mdev_unregister(void);
> +int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
> +			     unsigned int retry);
>  
> +struct vfio_ap_queue {
> +	struct ap_matrix_mdev *matrix_mdev;
> +	unsigned long saved_pfn;
> +	int	apqn;
> +#define VFIO_AP_ISC_INVALID 0xff
> +	unsigned char saved_isc;
> +};
> +struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q);
>  #endif /* _VFIO_AP_PRIVATE_H_ */
acked-by: Harald Freudenberger <freude@linux.ibm.com>
diff mbox series

Patch

diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
index e9824c3..003662a 100644
--- a/drivers/s390/crypto/vfio_ap_drv.c
+++ b/drivers/s390/crypto/vfio_ap_drv.c
@@ -5,6 +5,7 @@ 
  * Copyright IBM Corp. 2018
  *
  * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
+ *	      Pierre Morel <pmorel@linux.ibm.com>
  */
 
 #include <linux/module.h>
@@ -40,14 +41,45 @@  static struct ap_device_id ap_queue_ids[] = {
 
 MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids);
 
+/**
+ * vfio_ap_queue_dev_probe:
+ *
+ * Allocate a vfio_ap_queue structure and associate it
+ * with the device as driver_data.
+ */
 static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
 {
+	struct vfio_ap_queue *q;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+	dev_set_drvdata(&apdev->device, q);
+	q->apqn = to_ap_queue(&apdev->device)->qid;
+	q->saved_isc = VFIO_AP_ISC_INVALID;
 	return 0;
 }
 
+/**
+ * vfio_ap_queue_dev_remove:
+ *
+ * Takes the matrix lock to avoid actions on this device while removing
+ * Free the associated vfio_ap_queue structure
+ */
 static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
 {
-	/* Nothing to do yet */
+	struct vfio_ap_queue *q;
+	int apid, apqi;
+
+	mutex_lock(&matrix_dev->lock);
+	q = dev_get_drvdata(&apdev->device);
+	dev_set_drvdata(&apdev->device, NULL);
+	apid = AP_QID_CARD(q->apqn);
+	apqi = AP_QID_QUEUE(q->apqn);
+	vfio_ap_mdev_reset_queue(apid, apqi, 1);
+	vfio_ap_irq_disable(q);
+	kfree(q);
+	mutex_unlock(&matrix_dev->lock);
 }
 
 static void vfio_ap_matrix_dev_release(struct device *dev)
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index e8e87bf..015174f 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -24,6 +24,295 @@ 
 #define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough"
 #define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device"
 
+static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev);
+
+static int match_apqn(struct device *dev, void *data)
+{
+	struct vfio_ap_queue *q = dev_get_drvdata(dev);
+
+	return (q->apqn == *(int *)(data)) ? 1 : 0;
+}
+
+/**
+ * vfio_ap_get_queue: Retrieve a queue with a specific APQN from a list
+ * @matrix_mdev: the associated mediated matrix
+ * @apqn: The queue APQN
+ *
+ * Retrieve a queue with a specific APQN from the list of the
+ * devices of the vfio_ap_drv.
+ * Verify that the APID and the APQI are set in the matrix.
+ *
+ * Returns the pointer to the associated vfio_ap_queue
+ */
+struct vfio_ap_queue *vfio_ap_get_queue(struct ap_matrix_mdev *matrix_mdev,
+					int apqn)
+{
+	struct vfio_ap_queue *q;
+	struct device *dev;
+
+	if (!test_bit_inv(AP_QID_CARD(apqn), matrix_mdev->matrix.apm))
+		return NULL;
+	if (!test_bit_inv(AP_QID_QUEUE(apqn), matrix_mdev->matrix.aqm))
+		return NULL;
+
+	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
+				 &apqn, match_apqn);
+	if (!dev)
+		return NULL;
+	q = dev_get_drvdata(dev);
+	q->matrix_mdev = matrix_mdev;
+	put_device(dev);
+
+	return q;
+}
+
+/**
+ * vfio_ap_wait_for_irqclear
+ * @apqn: The AP Queue number
+ *
+ * Checks the IRQ bit for the status of this APQN using ap_tapq.
+ * Returns if the ap_tapq function succedded and the bit is clear.
+ * Returns if ap_tapq function failed with invalid, deconfigured or
+ * checkstopped AP.
+ * Otherwise retries up to 5 times after waiting 20ms.
+ *
+ */
+static void vfio_ap_wait_for_irqclear(int apqn)
+{
+	struct ap_queue_status status;
+	int retry = 5;
+
+	do {
+		status = ap_tapq(apqn, NULL);
+		switch (status.response_code) {
+		case AP_RESPONSE_NORMAL:
+		case AP_RESPONSE_RESET_IN_PROGRESS:
+			if (!status.irq_enabled)
+				return;
+			/* Fall through */
+		case AP_RESPONSE_BUSY:
+			msleep(20);
+			break;
+		case AP_RESPONSE_Q_NOT_AVAIL:
+		case AP_RESPONSE_DECONFIGURED:
+		case AP_RESPONSE_CHECKSTOPPED:
+		default:
+			WARN_ONCE(1, "%s: tapq rc %02x: %04x\n", __func__,
+				  status.response_code, apqn);
+			return;
+		}
+	} while (--retry);
+
+	WARN_ONCE(1, "%s: tapq rc %02x: %04x could not clear IR bit\n",
+		  __func__, status.response_code, apqn);
+}
+
+/**
+ * vfio_ap_free_aqic_resources
+ * @q: The vfio_ap_queue
+ *
+ * Unregisters the ISC in the GIB when the saved ISC not invalid.
+ * Unpin the guest's page holding the NIB when it exist.
+ * Reset the saved_pfn and saved_isc to invalid values.
+ * Clear the pointer to the matrix mediated device.
+ *
+ */
+static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q)
+{
+	if (q->saved_isc != VFIO_AP_ISC_INVALID && q->matrix_mdev)
+		kvm_s390_gisc_unregister(q->matrix_mdev->kvm, q->saved_isc);
+	if (q->saved_pfn && q->matrix_mdev)
+		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev),
+				 &q->saved_pfn, 1);
+	q->saved_pfn = 0;
+	q->saved_isc = VFIO_AP_ISC_INVALID;
+	q->matrix_mdev = NULL;
+}
+
+/**
+ * vfio_ap_irq_disable
+ * @q: The vfio_ap_queue
+ *
+ * Uses ap_aqic to disable the interruption and in case of success, reset
+ * in progress or IRQ disable command already proceeded: calls
+ * vfio_ap_wait_for_irqclear() to check for the IRQ bit to be clear
+ * and calls vfio_ap_free_aqic_resources() to free the resources associated
+ * with the AP interrupt handling.
+ *
+ * In the case the AP is busy, or a reset is in progress,
+ * retries after 20ms, up to 5 times.
+ *
+ * Returns if ap_aqic function failed with invalid, deconfigured or
+ * checkstopped AP.
+ */
+struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q)
+{
+	struct ap_qirq_ctrl aqic_gisa = {};
+	struct ap_queue_status status;
+	int retries = 5;
+
+	do {
+		status = ap_aqic(q->apqn, aqic_gisa, NULL);
+		switch (status.response_code) {
+		case AP_RESPONSE_OTHERWISE_CHANGED:
+		case AP_RESPONSE_NORMAL:
+			vfio_ap_wait_for_irqclear(q->apqn);
+			goto end_free;
+		case AP_RESPONSE_RESET_IN_PROGRESS:
+		case AP_RESPONSE_BUSY:
+			msleep(20);
+			break;
+		case AP_RESPONSE_Q_NOT_AVAIL:
+		case AP_RESPONSE_DECONFIGURED:
+		case AP_RESPONSE_CHECKSTOPPED:
+		case AP_RESPONSE_INVALID_ADDRESS:
+		default:
+			/* All cases in default means AP not operational */
+			WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
+				  status.response_code);
+			goto end_free;
+		}
+	} while (retries--);
+
+	WARN_ONCE(1, "%s: ap_aqic status %d\n", __func__,
+		  status.response_code);
+end_free:
+	vfio_ap_free_aqic_resources(q);
+	return status;
+}
+
+/**
+ * vfio_ap_setirq: Enable Interruption for a APQN
+ *
+ * @dev: the device associated with the ap_queue
+ * @q:   the vfio_ap_queue holding AQIC parameters
+ *
+ * Pin the NIB saved in *q
+ * Register the guest ISC to GIB interface and retrieve the
+ * host ISC to issue the host side PQAP/AQIC
+ *
+ * Response.status may be set to AP_RESPONSE_INVALID_ADDRESS in case the
+ * vfio_pin_pages failed.
+ *
+ * Otherwise return the ap_queue_status returned by the ap_aqic(),
+ * all retry handling will be done by the guest.
+ */
+static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
+						 int isc,
+						 unsigned long nib)
+{
+	struct ap_qirq_ctrl aqic_gisa = {};
+	struct ap_queue_status status = {};
+	struct kvm_s390_gisa *gisa;
+	struct kvm *kvm;
+	unsigned long h_nib, g_pfn, h_pfn;
+	int ret;
+
+	g_pfn = nib >> PAGE_SHIFT;
+	ret = vfio_pin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1,
+			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
+	switch (ret) {
+	case 1:
+		break;
+	default:
+		status.response_code = AP_RESPONSE_INVALID_ADDRESS;
+		return status;
+	}
+
+	kvm = q->matrix_mdev->kvm;
+	gisa = kvm->arch.gisa_int.origin;
+
+	h_nib = (h_pfn << PAGE_SHIFT) | (nib & ~PAGE_MASK);
+	aqic_gisa.gisc = isc;
+	aqic_gisa.isc = kvm_s390_gisc_register(kvm, isc);
+	aqic_gisa.ir = 1;
+	aqic_gisa.gisa = (uint64_t)gisa >> 4;
+
+	status = ap_aqic(q->apqn, aqic_gisa, (void *)h_nib);
+	switch (status.response_code) {
+	case AP_RESPONSE_NORMAL:
+		/* See if we did clear older IRQ configuration */
+		vfio_ap_free_aqic_resources(q);
+		q->saved_pfn = g_pfn;
+		q->saved_isc = isc;
+		break;
+	case AP_RESPONSE_OTHERWISE_CHANGED:
+		/* We could not modify IRQ setings: clear new configuration */
+		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1);
+		kvm_s390_gisc_unregister(kvm, isc);
+		break;
+	default:
+		pr_warn("%s: apqn %04x: response: %02x\n", __func__, q->apqn,
+			status.response_code);
+		vfio_ap_irq_disable(q);
+		break;
+	}
+
+	return status;
+}
+
+/**
+ * handle_pqap: PQAP instruction callback
+ *
+ * @vcpu: The vcpu on which we received the PQAP instruction
+ *
+ * Get the general register contents to initialize internal variables.
+ * REG[0]: APQN
+ * REG[1]: IR and ISC
+ * REG[2]: NIB
+ *
+ * Response.status may be set to following Response Code:
+ * - AP_RESPONSE_Q_NOT_AVAIL: if the queue is not available
+ * - AP_RESPONSE_DECONFIGURED: if the queue is not configured
+ * - AP_RESPONSE_NORMAL (0) : in case of successs
+ *   Check vfio_ap_setirq() and vfio_ap_clrirq() for other possible RC.
+ * We take the matrix_dev lock to ensure serialization on queues and
+ * mediated device access.
+ *
+ * Return 0 if we could handle the request inside KVM.
+ * otherwise, returns -EOPNOTSUPP to let QEMU handle the fault.
+ */
+static int handle_pqap(struct kvm_vcpu *vcpu)
+{
+	uint64_t status;
+	uint16_t apqn;
+	struct vfio_ap_queue *q;
+	struct ap_queue_status qstatus = {
+			       .response_code = AP_RESPONSE_Q_NOT_AVAIL, };
+	struct ap_matrix_mdev *matrix_mdev;
+
+	/* If we do not use the AIV facility just go to userland */
+	if (!(vcpu->arch.sie_block->eca & ECA_AIV))
+		return -EOPNOTSUPP;
+
+	apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
+	mutex_lock(&matrix_dev->lock);
+
+	if (!vcpu->kvm->arch.crypto.pqap_hook)
+		goto out_unlock;
+	matrix_mdev = container_of(vcpu->kvm->arch.crypto.pqap_hook,
+				   struct ap_matrix_mdev, pqap_hook);
+
+	q = vfio_ap_get_queue(matrix_mdev, apqn);
+	if (!q)
+		goto out_unlock;
+
+	status = vcpu->run->s.regs.gprs[1];
+
+	/* If IR bit(16) is set we enable the interrupt */
+	if ((status >> (63 - 16)) & 0x01)
+		qstatus = vfio_ap_irq_enable(q, status & 0x07,
+					     vcpu->run->s.regs.gprs[2]);
+	else
+		qstatus = vfio_ap_irq_disable(q);
+
+out_unlock:
+	memcpy(&vcpu->run->s.regs.gprs[1], &qstatus, sizeof(qstatus));
+	vcpu->run->s.regs.gprs[1] >>= 32;
+	mutex_unlock(&matrix_dev->lock);
+	return 0;
+}
+
 static void vfio_ap_matrix_init(struct ap_config_info *info,
 				struct ap_matrix *matrix)
 {
@@ -45,8 +334,11 @@  static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
 		return -ENOMEM;
 	}
 
+	matrix_mdev->mdev = mdev;
 	vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix);
 	mdev_set_drvdata(mdev, matrix_mdev);
+	matrix_mdev->pqap_hook.hook = handle_pqap;
+	matrix_mdev->pqap_hook.owner = THIS_MODULE;
 	mutex_lock(&matrix_dev->lock);
 	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
 	mutex_unlock(&matrix_dev->lock);
@@ -62,6 +354,7 @@  static int vfio_ap_mdev_remove(struct mdev_device *mdev)
 		return -EBUSY;
 
 	mutex_lock(&matrix_dev->lock);
+	vfio_ap_mdev_reset_queues(mdev);
 	list_del(&matrix_mdev->node);
 	mutex_unlock(&matrix_dev->lock);
 
@@ -754,6 +1047,8 @@  static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
 	}
 
 	matrix_mdev->kvm = kvm;
+	kvm_get_kvm(kvm);
+	kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook;
 	mutex_unlock(&matrix_dev->lock);
 
 	return 0;
@@ -819,15 +1114,36 @@  static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
-				    unsigned int retry)
+static void vfio_ap_irq_disable_apqn(int apqn)
+{
+	struct device *dev;
+	struct vfio_ap_queue *q;
+
+	dev = driver_find_device(&matrix_dev->vfio_ap_drv->driver, NULL,
+				 &apqn, match_apqn);
+	if (dev) {
+		q = dev_get_drvdata(dev);
+		vfio_ap_irq_disable(q);
+		put_device(dev);
+	}
+}
+
+int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
+			     unsigned int retry)
 {
 	struct ap_queue_status status;
+	int retry2 = 2;
+	int apqn = AP_MKQID(apid, apqi);
 
 	do {
-		status = ap_zapq(AP_MKQID(apid, apqi));
+		status = ap_zapq(apqn);
 		switch (status.response_code) {
 		case AP_RESPONSE_NORMAL:
+			while (!status.queue_empty && retry2--) {
+				msleep(20);
+				status = ap_tapq(apqn, NULL);
+			}
+			WARN_ON_ONCE(retry <= 0);
 			return 0;
 		case AP_RESPONSE_RESET_IN_PROGRESS:
 		case AP_RESPONSE_BUSY:
@@ -861,6 +1177,7 @@  static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev)
 			 */
 			if (ret)
 				rc = ret;
+			vfio_ap_irq_disable_apqn(AP_MKQID(apid, apqi));
 		}
 	}
 
@@ -904,15 +1221,20 @@  static void vfio_ap_mdev_release(struct mdev_device *mdev)
 {
 	struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
 
-	if (matrix_mdev->kvm)
+	mutex_lock(&matrix_dev->lock);
+	if (matrix_mdev->kvm) {
 		kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
+		matrix_mdev->kvm->arch.crypto.pqap_hook = NULL;
+		vfio_ap_mdev_reset_queues(mdev);
+		kvm_put_kvm(matrix_mdev->kvm);
+		matrix_mdev->kvm = NULL;
+	}
+	mutex_unlock(&matrix_dev->lock);
 
-	vfio_ap_mdev_reset_queues(mdev);
 	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
 				 &matrix_mdev->iommu_notifier);
 	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
 				 &matrix_mdev->group_notifier);
-	matrix_mdev->kvm = NULL;
 	module_put(THIS_MODULE);
 }
 
@@ -941,6 +1263,7 @@  static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
 {
 	int ret;
 
+	mutex_lock(&matrix_dev->lock);
 	switch (cmd) {
 	case VFIO_DEVICE_GET_INFO:
 		ret = vfio_ap_mdev_get_device_info(arg);
@@ -952,6 +1275,7 @@  static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
 		ret = -EOPNOTSUPP;
 		break;
 	}
+	mutex_unlock(&matrix_dev->lock);
 
 	return ret;
 }
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index 18dcc4d..f46dde5 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -4,6 +4,7 @@ 
  *
  * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
  *	      Halil Pasic <pasic@linux.ibm.com>
+ *	      Pierre Morel <pmorel@linux.ibm.com>
  *
  * Copyright IBM Corp. 2018
  */
@@ -89,5 +90,15 @@  struct ap_matrix_mdev {
 
 extern int vfio_ap_mdev_register(void);
 extern void vfio_ap_mdev_unregister(void);
+int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
+			     unsigned int retry);
 
+struct vfio_ap_queue {
+	struct ap_matrix_mdev *matrix_mdev;
+	unsigned long saved_pfn;
+	int	apqn;
+#define VFIO_AP_ISC_INVALID 0xff
+	unsigned char saved_isc;
+};
+struct ap_queue_status vfio_ap_irq_disable(struct vfio_ap_queue *q);
 #endif /* _VFIO_AP_PRIVATE_H_ */