diff mbox series

[v1] s390/vfio-ap: Signal eventfd when guest AP configuration is changed

Message ID 20250107183645.90082-1-rreyes@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series [v1] s390/vfio-ap: Signal eventfd when guest AP configuration is changed | expand

Commit Message

Rorie Reyes Jan. 7, 2025, 6:36 p.m. UTC
In this patch, an eventfd object is created by the vfio_ap device driver
and used to notify userspace when a guests's AP configuration is
dynamically changed. Such changes may occur whenever:

* An adapter, domain or control domain is assigned to or unassigned from a
  mediated device that is attached to the guest.
* A queue assigned to the mediated device that is attached to a guest is
  bound to or unbound from the vfio_ap device driver. This can occur
  either by manually binding/unbinding the queue via the vfio_ap driver's
  sysfs bind/unbind attribute interfaces, or because an adapter, domain or
  control domain assigned to the mediated device is added to or removed
  from the host's AP configuration via an SE/HMC

The purpose of this patch is to provide immediate notification of changes
made to a guest's AP configuration by the vfio_ap driver. This will enable
the guest to take immediate action rather than relying on polling or some
other inefficient mechanism to detect changes to its AP configuration.

Note that there are corresponding QEMU patches that will be shipped along
with this patch (see vfio-ap: Report vfio-ap configuration changes) that
will pick up the eventfd signal.

Signed-off-by: Rorie Reyes <rreyes@linux.ibm.com>
Reviewed-by: Anthony Krowiak <akrowiak@linux.ibm.com>
Tested-by: Anthony Krowiak <akrowiak@linux.ibm.com>
---
 drivers/s390/crypto/vfio_ap_ops.c     | 52 ++++++++++++++++++++++++++-
 drivers/s390/crypto/vfio_ap_private.h |  2 ++
 include/uapi/linux/vfio.h             |  1 +
 3 files changed, 54 insertions(+), 1 deletion(-)

Comments

Alexander Gordeev Jan. 13, 2025, 4:08 p.m. UTC | #1
On Tue, Jan 07, 2025 at 01:36:45PM -0500, Rorie Reyes wrote:

Hi Rorie, Antony,

> Note that there are corresponding QEMU patches that will be shipped along
> with this patch (see vfio-ap: Report vfio-ap configuration changes) that
> will pick up the eventfd signal.

How this patch is synchronized with the mentioned QEMU series?
What is the series status, especially with the comment from Cédric Le Goater [1]?

1. https://lore.kernel.org/all/20250107184354.91079-1-rreyes@linux.ibm.com/T/#mb0d37909c5f69bdff96289094ac0bad0922a7cce

Thanks!
Harald Freudenberger Jan. 14, 2025, 9:03 a.m. UTC | #2
On 2025-01-07 19:36, Rorie Reyes wrote:
> In this patch, an eventfd object is created by the vfio_ap device 
> driver
> and used to notify userspace when a guests's AP configuration is
> dynamically changed. Such changes may occur whenever:
> 
> * An adapter, domain or control domain is assigned to or unassigned 
> from a
>   mediated device that is attached to the guest.
> * A queue assigned to the mediated device that is attached to a guest 
> is
>   bound to or unbound from the vfio_ap device driver. This can occur
>   either by manually binding/unbinding the queue via the vfio_ap 
> driver's
>   sysfs bind/unbind attribute interfaces, or because an adapter, domain 
> or
>   control domain assigned to the mediated device is added to or removed
>   from the host's AP configuration via an SE/HMC
> 
> The purpose of this patch is to provide immediate notification of 
> changes
> made to a guest's AP configuration by the vfio_ap driver. This will 
> enable
> the guest to take immediate action rather than relying on polling or 
> some
> other inefficient mechanism to detect changes to its AP configuration.
> 
> Note that there are corresponding QEMU patches that will be shipped 
> along
> with this patch (see vfio-ap: Report vfio-ap configuration changes) 
> that
> will pick up the eventfd signal.
> 
> Signed-off-by: Rorie Reyes <rreyes@linux.ibm.com>
> Reviewed-by: Anthony Krowiak <akrowiak@linux.ibm.com>
> Tested-by: Anthony Krowiak <akrowiak@linux.ibm.com>
> ---
>  drivers/s390/crypto/vfio_ap_ops.c     | 52 ++++++++++++++++++++++++++-
>  drivers/s390/crypto/vfio_ap_private.h |  2 ++
>  include/uapi/linux/vfio.h             |  1 +
>  3 files changed, 54 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c
> b/drivers/s390/crypto/vfio_ap_ops.c
> index a52c2690933f..c6ff4ab13f16 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -650,13 +650,22 @@ static void vfio_ap_matrix_init(struct
> ap_config_info *info,
>  	matrix->adm_max = info->apxa ? info->nd : 15;
>  }
> 
> +static void signal_guest_ap_cfg_changed(struct ap_matrix_mdev 
> *matrix_mdev)
> +{
> +		if (matrix_mdev->cfg_chg_trigger)
> +			eventfd_signal(matrix_mdev->cfg_chg_trigger);
> +}
> +
>  static void vfio_ap_mdev_update_guest_apcb(struct ap_matrix_mdev 
> *matrix_mdev)
>  {
> -	if (matrix_mdev->kvm)
> +	if (matrix_mdev->kvm) {
>  		kvm_arch_crypto_set_masks(matrix_mdev->kvm,
>  					  matrix_mdev->shadow_apcb.apm,
>  					  matrix_mdev->shadow_apcb.aqm,
>  					  matrix_mdev->shadow_apcb.adm);
> +
> +		signal_guest_ap_cfg_changed(matrix_mdev);
> +	}
>  }
> 
>  static bool vfio_ap_mdev_filter_cdoms(struct ap_matrix_mdev 
> *matrix_mdev)
> @@ -792,6 +801,7 @@ static int vfio_ap_mdev_probe(struct mdev_device 
> *mdev)
>  	if (ret)
>  		goto err_put_vdev;
>  	matrix_mdev->req_trigger = NULL;
> +	matrix_mdev->cfg_chg_trigger = NULL;
>  	dev_set_drvdata(&mdev->dev, matrix_mdev);
>  	mutex_lock(&matrix_dev->mdevs_lock);
>  	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
> @@ -1860,6 +1870,7 @@ static void vfio_ap_mdev_unset_kvm(struct
> ap_matrix_mdev *matrix_mdev)
>  		get_update_locks_for_kvm(kvm);
> 
>  		kvm_arch_crypto_clear_masks(kvm);
> +		signal_guest_ap_cfg_changed(matrix_mdev);
>  		vfio_ap_mdev_reset_queues(matrix_mdev);
>  		kvm_put_kvm(kvm);
>  		matrix_mdev->kvm = NULL;
> @@ -2097,6 +2108,10 @@ static ssize_t vfio_ap_get_irq_info(unsigned 
> long arg)
>  		info.count = 1;
>  		info.flags = VFIO_IRQ_INFO_EVENTFD;
>  		break;
> +	case VFIO_AP_CFG_CHG_IRQ_INDEX:
> +		info.count = 1;
> +		info.flags = VFIO_IRQ_INFO_EVENTFD;
> +		break;
>  	default:
>  		return -EINVAL;
>  	}
> @@ -2160,6 +2175,39 @@ static int vfio_ap_set_request_irq(struct
> ap_matrix_mdev *matrix_mdev,
>  	return 0;
>  }
> 
> +static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev
> *matrix_mdev, unsigned long arg)
> +{
> +	s32 fd;
> +	void __user *data;
> +	unsigned long minsz;
> +	struct eventfd_ctx *cfg_chg_trigger;
> +
> +	minsz = offsetofend(struct vfio_irq_set, count);
> +	data = (void __user *)(arg + minsz);
> +
> +	if (get_user(fd, (s32 __user *)data))
> +		return -EFAULT;
> +
> +	if (fd == -1) {
> +		if (matrix_mdev->cfg_chg_trigger)
> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> +		matrix_mdev->cfg_chg_trigger = NULL;
> +	} else if (fd >= 0) {
> +		cfg_chg_trigger = eventfd_ctx_fdget(fd);
> +		if (IS_ERR(cfg_chg_trigger))
> +			return PTR_ERR(cfg_chg_trigger);
> +
> +		if (matrix_mdev->cfg_chg_trigger)
> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> +
> +		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
>  static int vfio_ap_set_irqs(struct ap_matrix_mdev *matrix_mdev,
>  			    unsigned long arg)
>  {
> @@ -2175,6 +2223,8 @@ static int vfio_ap_set_irqs(struct
> ap_matrix_mdev *matrix_mdev,
>  		switch (irq_set.index) {
>  		case VFIO_AP_REQ_IRQ_INDEX:
>  			return vfio_ap_set_request_irq(matrix_mdev, arg);
> +		case VFIO_AP_CFG_CHG_IRQ_INDEX:
> +			return vfio_ap_set_cfg_change_irq(matrix_mdev, arg);
>  		default:
>  			return -EINVAL;
>  		}
> diff --git a/drivers/s390/crypto/vfio_ap_private.h
> b/drivers/s390/crypto/vfio_ap_private.h
> index 437a161c8659..37de9c69b6eb 100644
> --- a/drivers/s390/crypto/vfio_ap_private.h
> +++ b/drivers/s390/crypto/vfio_ap_private.h
> @@ -105,6 +105,7 @@ struct ap_queue_table {
>   * @mdev:	the mediated device
>   * @qtable:	table of queues (struct vfio_ap_queue) assigned to the 
> mdev
>   * @req_trigger eventfd ctx for signaling userspace to return a device
> + * @cfg_chg_trigger eventfd ctx to signal AP config changed to 
> userspace
>   * @apm_add:	bitmap of APIDs added to the host's AP configuration
>   * @aqm_add:	bitmap of APQIs added to the host's AP configuration
>   * @adm_add:	bitmap of control domain numbers added to the host's AP
> @@ -120,6 +121,7 @@ struct ap_matrix_mdev {
>  	struct mdev_device *mdev;
>  	struct ap_queue_table qtable;
>  	struct eventfd_ctx *req_trigger;
> +	struct eventfd_ctx *cfg_chg_trigger;
>  	DECLARE_BITMAP(apm_add, AP_DEVICES);
>  	DECLARE_BITMAP(aqm_add, AP_DOMAINS);
>  	DECLARE_BITMAP(adm_add, AP_DOMAINS);
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..a2d3e1ac6239 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -671,6 +671,7 @@ enum {
>   */
>  enum {
>  	VFIO_AP_REQ_IRQ_INDEX,
> +	VFIO_AP_CFG_CHG_IRQ_INDEX,
>  	VFIO_AP_NUM_IRQS
>  };

Rorie, this is to inform listeners on the host of the guest.
The guest itself already sees this "inside" with uevents triggered
by the AP bus code.

Do you have a consumer for these events?
Alex Williamson Jan. 14, 2025, 8:05 p.m. UTC | #3
On Tue,  7 Jan 2025 13:36:45 -0500
Rorie Reyes <rreyes@linux.ibm.com> wrote:

> In this patch, an eventfd object is created by the vfio_ap device driver
> and used to notify userspace when a guests's AP configuration is
> dynamically changed. Such changes may occur whenever:
> 
> * An adapter, domain or control domain is assigned to or unassigned from a
>   mediated device that is attached to the guest.
> * A queue assigned to the mediated device that is attached to a guest is
>   bound to or unbound from the vfio_ap device driver. This can occur
>   either by manually binding/unbinding the queue via the vfio_ap driver's
>   sysfs bind/unbind attribute interfaces, or because an adapter, domain or
>   control domain assigned to the mediated device is added to or removed
>   from the host's AP configuration via an SE/HMC
> 
> The purpose of this patch is to provide immediate notification of changes
> made to a guest's AP configuration by the vfio_ap driver. This will enable
> the guest to take immediate action rather than relying on polling or some
> other inefficient mechanism to detect changes to its AP configuration.
> 
> Note that there are corresponding QEMU patches that will be shipped along
> with this patch (see vfio-ap: Report vfio-ap configuration changes) that
> will pick up the eventfd signal.
> 
> Signed-off-by: Rorie Reyes <rreyes@linux.ibm.com>
> Reviewed-by: Anthony Krowiak <akrowiak@linux.ibm.com>
> Tested-by: Anthony Krowiak <akrowiak@linux.ibm.com>
> ---
>  drivers/s390/crypto/vfio_ap_ops.c     | 52 ++++++++++++++++++++++++++-
>  drivers/s390/crypto/vfio_ap_private.h |  2 ++
>  include/uapi/linux/vfio.h             |  1 +
>  3 files changed, 54 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
> index a52c2690933f..c6ff4ab13f16 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -650,13 +650,22 @@ static void vfio_ap_matrix_init(struct ap_config_info *info,
>  	matrix->adm_max = info->apxa ? info->nd : 15;
>  }
>  
> +static void signal_guest_ap_cfg_changed(struct ap_matrix_mdev *matrix_mdev)
> +{
> +		if (matrix_mdev->cfg_chg_trigger)
> +			eventfd_signal(matrix_mdev->cfg_chg_trigger);
> +}
> +
>  static void vfio_ap_mdev_update_guest_apcb(struct ap_matrix_mdev *matrix_mdev)
>  {
> -	if (matrix_mdev->kvm)
> +	if (matrix_mdev->kvm) {
>  		kvm_arch_crypto_set_masks(matrix_mdev->kvm,
>  					  matrix_mdev->shadow_apcb.apm,
>  					  matrix_mdev->shadow_apcb.aqm,
>  					  matrix_mdev->shadow_apcb.adm);
> +
> +		signal_guest_ap_cfg_changed(matrix_mdev);
> +	}
>  }
>  
>  static bool vfio_ap_mdev_filter_cdoms(struct ap_matrix_mdev *matrix_mdev)
> @@ -792,6 +801,7 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev)
>  	if (ret)
>  		goto err_put_vdev;
>  	matrix_mdev->req_trigger = NULL;
> +	matrix_mdev->cfg_chg_trigger = NULL;
>  	dev_set_drvdata(&mdev->dev, matrix_mdev);
>  	mutex_lock(&matrix_dev->mdevs_lock);
>  	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
> @@ -1860,6 +1870,7 @@ static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev)
>  		get_update_locks_for_kvm(kvm);
>  
>  		kvm_arch_crypto_clear_masks(kvm);
> +		signal_guest_ap_cfg_changed(matrix_mdev);
>  		vfio_ap_mdev_reset_queues(matrix_mdev);
>  		kvm_put_kvm(kvm);
>  		matrix_mdev->kvm = NULL;
> @@ -2097,6 +2108,10 @@ static ssize_t vfio_ap_get_irq_info(unsigned long arg)
>  		info.count = 1;
>  		info.flags = VFIO_IRQ_INFO_EVENTFD;
>  		break;
> +	case VFIO_AP_CFG_CHG_IRQ_INDEX:
> +		info.count = 1;
> +		info.flags = VFIO_IRQ_INFO_EVENTFD;
> +		break;
>  	default:
>  		return -EINVAL;
>  	}
> @@ -2160,6 +2175,39 @@ static int vfio_ap_set_request_irq(struct ap_matrix_mdev *matrix_mdev,
>  	return 0;
>  }
>  
> +static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev *matrix_mdev, unsigned long arg)
> +{
> +	s32 fd;
> +	void __user *data;
> +	unsigned long minsz;
> +	struct eventfd_ctx *cfg_chg_trigger;
> +
> +	minsz = offsetofend(struct vfio_irq_set, count);
> +	data = (void __user *)(arg + minsz);
> +
> +	if (get_user(fd, (s32 __user *)data))
> +		return -EFAULT;
> +
> +	if (fd == -1) {
> +		if (matrix_mdev->cfg_chg_trigger)
> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> +		matrix_mdev->cfg_chg_trigger = NULL;
> +	} else if (fd >= 0) {
> +		cfg_chg_trigger = eventfd_ctx_fdget(fd);
> +		if (IS_ERR(cfg_chg_trigger))
> +			return PTR_ERR(cfg_chg_trigger);
> +
> +		if (matrix_mdev->cfg_chg_trigger)
> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> +
> +		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}

How does this guard against a use after free, such as the eventfd being
disabled or swapped concurrent to a config change?  Thanks,

Alex

> +
>  static int vfio_ap_set_irqs(struct ap_matrix_mdev *matrix_mdev,
>  			    unsigned long arg)
>  {
> @@ -2175,6 +2223,8 @@ static int vfio_ap_set_irqs(struct ap_matrix_mdev *matrix_mdev,
>  		switch (irq_set.index) {
>  		case VFIO_AP_REQ_IRQ_INDEX:
>  			return vfio_ap_set_request_irq(matrix_mdev, arg);
> +		case VFIO_AP_CFG_CHG_IRQ_INDEX:
> +			return vfio_ap_set_cfg_change_irq(matrix_mdev, arg);
>  		default:
>  			return -EINVAL;
>  		}
> diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
> index 437a161c8659..37de9c69b6eb 100644
> --- a/drivers/s390/crypto/vfio_ap_private.h
> +++ b/drivers/s390/crypto/vfio_ap_private.h
> @@ -105,6 +105,7 @@ struct ap_queue_table {
>   * @mdev:	the mediated device
>   * @qtable:	table of queues (struct vfio_ap_queue) assigned to the mdev
>   * @req_trigger eventfd ctx for signaling userspace to return a device
> + * @cfg_chg_trigger eventfd ctx to signal AP config changed to userspace
>   * @apm_add:	bitmap of APIDs added to the host's AP configuration
>   * @aqm_add:	bitmap of APQIs added to the host's AP configuration
>   * @adm_add:	bitmap of control domain numbers added to the host's AP
> @@ -120,6 +121,7 @@ struct ap_matrix_mdev {
>  	struct mdev_device *mdev;
>  	struct ap_queue_table qtable;
>  	struct eventfd_ctx *req_trigger;
> +	struct eventfd_ctx *cfg_chg_trigger;
>  	DECLARE_BITMAP(apm_add, AP_DEVICES);
>  	DECLARE_BITMAP(aqm_add, AP_DOMAINS);
>  	DECLARE_BITMAP(adm_add, AP_DOMAINS);
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..a2d3e1ac6239 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -671,6 +671,7 @@ enum {
>   */
>  enum {
>  	VFIO_AP_REQ_IRQ_INDEX,
> +	VFIO_AP_CFG_CHG_IRQ_INDEX,
>  	VFIO_AP_NUM_IRQS
>  };
>
Anthony Krowiak Jan. 15, 2025, 7:35 p.m. UTC | #4
On 1/14/25 3:05 PM, Alex Williamson wrote:
> On Tue,  7 Jan 2025 13:36:45 -0500
> Rorie Reyes <rreyes@linux.ibm.com> wrote:
>
>> In this patch, an eventfd object is created by the vfio_ap device driver
>> and used to notify userspace when a guests's AP configuration is
>> dynamically changed. Such changes may occur whenever:
>>
>> * An adapter, domain or control domain is assigned to or unassigned from a
>>    mediated device that is attached to the guest.
>> * A queue assigned to the mediated device that is attached to a guest is
>>    bound to or unbound from the vfio_ap device driver. This can occur
>>    either by manually binding/unbinding the queue via the vfio_ap driver's
>>    sysfs bind/unbind attribute interfaces, or because an adapter, domain or
>>    control domain assigned to the mediated device is added to or removed
>>    from the host's AP configuration via an SE/HMC
>>
>> The purpose of this patch is to provide immediate notification of changes
>> made to a guest's AP configuration by the vfio_ap driver. This will enable
>> the guest to take immediate action rather than relying on polling or some
>> other inefficient mechanism to detect changes to its AP configuration.
>>
>> Note that there are corresponding QEMU patches that will be shipped along
>> with this patch (see vfio-ap: Report vfio-ap configuration changes) that
>> will pick up the eventfd signal.
>>
>> Signed-off-by: Rorie Reyes <rreyes@linux.ibm.com>
>> Reviewed-by: Anthony Krowiak <akrowiak@linux.ibm.com>
>> Tested-by: Anthony Krowiak <akrowiak@linux.ibm.com>
>> ---
>>   drivers/s390/crypto/vfio_ap_ops.c     | 52 ++++++++++++++++++++++++++-
>>   drivers/s390/crypto/vfio_ap_private.h |  2 ++
>>   include/uapi/linux/vfio.h             |  1 +
>>   3 files changed, 54 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
>> index a52c2690933f..c6ff4ab13f16 100644
>> --- a/drivers/s390/crypto/vfio_ap_ops.c
>> +++ b/drivers/s390/crypto/vfio_ap_ops.c
>> @@ -650,13 +650,22 @@ static void vfio_ap_matrix_init(struct ap_config_info *info,
>>   	matrix->adm_max = info->apxa ? info->nd : 15;
>>   }
>>   
>> +static void signal_guest_ap_cfg_changed(struct ap_matrix_mdev *matrix_mdev)
>> +{
>> +		if (matrix_mdev->cfg_chg_trigger)
>> +			eventfd_signal(matrix_mdev->cfg_chg_trigger);
>> +}
>> +
>>   static void vfio_ap_mdev_update_guest_apcb(struct ap_matrix_mdev *matrix_mdev)
>>   {
>> -	if (matrix_mdev->kvm)
>> +	if (matrix_mdev->kvm) {
>>   		kvm_arch_crypto_set_masks(matrix_mdev->kvm,
>>   					  matrix_mdev->shadow_apcb.apm,
>>   					  matrix_mdev->shadow_apcb.aqm,
>>   					  matrix_mdev->shadow_apcb.adm);
>> +
>> +		signal_guest_ap_cfg_changed(matrix_mdev);
>> +	}
>>   }
>>   
>>   static bool vfio_ap_mdev_filter_cdoms(struct ap_matrix_mdev *matrix_mdev)
>> @@ -792,6 +801,7 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev)
>>   	if (ret)
>>   		goto err_put_vdev;
>>   	matrix_mdev->req_trigger = NULL;
>> +	matrix_mdev->cfg_chg_trigger = NULL;
>>   	dev_set_drvdata(&mdev->dev, matrix_mdev);
>>   	mutex_lock(&matrix_dev->mdevs_lock);
>>   	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
>> @@ -1860,6 +1870,7 @@ static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev)
>>   		get_update_locks_for_kvm(kvm);
>>   
>>   		kvm_arch_crypto_clear_masks(kvm);
>> +		signal_guest_ap_cfg_changed(matrix_mdev);
>>   		vfio_ap_mdev_reset_queues(matrix_mdev);
>>   		kvm_put_kvm(kvm);
>>   		matrix_mdev->kvm = NULL;
>> @@ -2097,6 +2108,10 @@ static ssize_t vfio_ap_get_irq_info(unsigned long arg)
>>   		info.count = 1;
>>   		info.flags = VFIO_IRQ_INFO_EVENTFD;
>>   		break;
>> +	case VFIO_AP_CFG_CHG_IRQ_INDEX:
>> +		info.count = 1;
>> +		info.flags = VFIO_IRQ_INFO_EVENTFD;
>> +		break;
>>   	default:
>>   		return -EINVAL;
>>   	}
>> @@ -2160,6 +2175,39 @@ static int vfio_ap_set_request_irq(struct ap_matrix_mdev *matrix_mdev,
>>   	return 0;
>>   }
>>   
>> +static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev *matrix_mdev, unsigned long arg)
>> +{
>> +	s32 fd;
>> +	void __user *data;
>> +	unsigned long minsz;
>> +	struct eventfd_ctx *cfg_chg_trigger;
>> +
>> +	minsz = offsetofend(struct vfio_irq_set, count);
>> +	data = (void __user *)(arg + minsz);
>> +
>> +	if (get_user(fd, (s32 __user *)data))
>> +		return -EFAULT;
>> +
>> +	if (fd == -1) {
>> +		if (matrix_mdev->cfg_chg_trigger)
>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
>> +		matrix_mdev->cfg_chg_trigger = NULL;
>> +	} else if (fd >= 0) {
>> +		cfg_chg_trigger = eventfd_ctx_fdget(fd);
>> +		if (IS_ERR(cfg_chg_trigger))
>> +			return PTR_ERR(cfg_chg_trigger);
>> +
>> +		if (matrix_mdev->cfg_chg_trigger)
>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
>> +
>> +		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
>> +	} else {
>> +		return -EINVAL;
>> +	}
>> +
>> +	return 0;
>> +}
> How does this guard against a use after free, such as the eventfd being
> disabled or swapped concurrent to a config change?  Thanks,
>
> Alex

Hi Alex. I spent a great deal of time today trying to figure out exactly 
what
you are asking here; reading about eventfd and digging through code.
I looked at other places where eventfd is used to set up communication
of events targetting a vfio device from KVM to userspace (e.g., 
hw/vfio/ccw.c)
and do not find anything much different than what is done here. In fact,
this code looks identical to the code that sets up an eventfd for the
VFIO_AP_REQ_IRQ_INDEX.

Maybe you can explain how an eventfd is disabled or swapped, or maybe
explain how we can guard against its use after free. Thanks.

Anthony Krowiak

>
>> +
>>   static int vfio_ap_set_irqs(struct ap_matrix_mdev *matrix_mdev,
>>   			    unsigned long arg)
>>   {
>> @@ -2175,6 +2223,8 @@ static int vfio_ap_set_irqs(struct ap_matrix_mdev *matrix_mdev,
>>   		switch (irq_set.index) {
>>   		case VFIO_AP_REQ_IRQ_INDEX:
>>   			return vfio_ap_set_request_irq(matrix_mdev, arg);
>> +		case VFIO_AP_CFG_CHG_IRQ_INDEX:
>> +			return vfio_ap_set_cfg_change_irq(matrix_mdev, arg);
>>   		default:
>>   			return -EINVAL;
>>   		}
>> diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
>> index 437a161c8659..37de9c69b6eb 100644
>> --- a/drivers/s390/crypto/vfio_ap_private.h
>> +++ b/drivers/s390/crypto/vfio_ap_private.h
>> @@ -105,6 +105,7 @@ struct ap_queue_table {
>>    * @mdev:	the mediated device
>>    * @qtable:	table of queues (struct vfio_ap_queue) assigned to the mdev
>>    * @req_trigger eventfd ctx for signaling userspace to return a device
>> + * @cfg_chg_trigger eventfd ctx to signal AP config changed to userspace
>>    * @apm_add:	bitmap of APIDs added to the host's AP configuration
>>    * @aqm_add:	bitmap of APQIs added to the host's AP configuration
>>    * @adm_add:	bitmap of control domain numbers added to the host's AP
>> @@ -120,6 +121,7 @@ struct ap_matrix_mdev {
>>   	struct mdev_device *mdev;
>>   	struct ap_queue_table qtable;
>>   	struct eventfd_ctx *req_trigger;
>> +	struct eventfd_ctx *cfg_chg_trigger;
>>   	DECLARE_BITMAP(apm_add, AP_DEVICES);
>>   	DECLARE_BITMAP(aqm_add, AP_DOMAINS);
>>   	DECLARE_BITMAP(adm_add, AP_DOMAINS);
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index c8dbf8219c4f..a2d3e1ac6239 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -671,6 +671,7 @@ enum {
>>    */
>>   enum {
>>   	VFIO_AP_REQ_IRQ_INDEX,
>> +	VFIO_AP_CFG_CHG_IRQ_INDEX,
>>   	VFIO_AP_NUM_IRQS
>>   };
>>   
>
Halil Pasic Jan. 16, 2025, 12:17 a.m. UTC | #5
On Wed, 15 Jan 2025 14:35:02 -0500
Anthony Krowiak <akrowiak@linux.ibm.com> wrote:

> >> +static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev *matrix_mdev, unsigned long arg)
> >> +{
> >> +	s32 fd;
> >> +	void __user *data;
> >> +	unsigned long minsz;
> >> +	struct eventfd_ctx *cfg_chg_trigger;
> >> +
> >> +	minsz = offsetofend(struct vfio_irq_set, count);
> >> +	data = (void __user *)(arg + minsz);
> >> +
> >> +	if (get_user(fd, (s32 __user *)data))
> >> +		return -EFAULT;
> >> +
> >> +	if (fd == -1) {
> >> +		if (matrix_mdev->cfg_chg_trigger)
> >> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> >> +		matrix_mdev->cfg_chg_trigger = NULL;
> >> +	} else if (fd >= 0) {
> >> +		cfg_chg_trigger = eventfd_ctx_fdget(fd);
> >> +		if (IS_ERR(cfg_chg_trigger))
> >> +			return PTR_ERR(cfg_chg_trigger);
> >> +
> >> +		if (matrix_mdev->cfg_chg_trigger)
> >> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> >> +
> >> +		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
> >> +	} else {
> >> +		return -EINVAL;
> >> +	}
> >> +
> >> +	return 0;
> >> +}  
> > How does this guard against a use after free, such as the eventfd being
> > disabled or swapped concurrent to a config change?  Thanks,
> >
> > Alex  
> 
> Hi Alex. I spent a great deal of time today trying to figure out exactly 
> what
> you are asking here; reading about eventfd and digging through code.
> I looked at other places where eventfd is used to set up communication
> of events targetting a vfio device from KVM to userspace (e.g., 
> hw/vfio/ccw.c)
> and do not find anything much different than what is done here. In fact,
> this code looks identical to the code that sets up an eventfd for the
> VFIO_AP_REQ_IRQ_INDEX.
> 
> Maybe you can explain how an eventfd is disabled or swapped, or maybe
> explain how we can guard against its use after free. Thanks.

Maybe I will try! The value of matrix_mdev->cfg_chg_trigger is used in:
* vfio_ap_set_cfg_change_irq() (rw, with matrix_dev->mdevs_lock)
* signal_guest_ap_cfg_changed()(r, takes no locks itself, )
  * called by vfio_ap_mdev_update_guest_apcb() 
    * called at a bunch of places but AFAICT always with
      matrix_dev->mdevs_lock held
  * called by vfio_ap_mdev_unset_kvm() (with matrix_dev->mdevs_lock held
    via get_update_locks_for_kvm())
* vfio_ap_mdev_probe() (w, assigns NULL to it)

If vfio_ap_set_cfg_change_irq() could change/destroy
matrix_mdev->cfg_chg_trigger while another thread of execution
is using it e.g. with signal_guest_ap_cfg_changed() that would be a
possible UAF and thus BAD.

Now AFAICT matrix_mdev->cfg_chg_trigger is protected by
matrix_dev->mdevs_lock on each access except for in vfio_ap_mdev_probe()
which is AFAIK just an initialization in a safe state where we are
guaranteed to have exclusive access.

The eventfd is swapped and disabled in vfio_ap_set_cfg_change_irq() with
userspace supplying a new valid fd or -1 respectively.

Tony does that answer your question to Alex?

Alex, does the above answer your question on what guards against UAF (the
short answer is: matrix_dev->mdevs_lock)?

Regards,
Halil
Anthony Krowiak Jan. 16, 2025, 3:38 p.m. UTC | #6
On 1/15/25 7:17 PM, Halil Pasic wrote:
> On Wed, 15 Jan 2025 14:35:02 -0500
> Anthony Krowiak <akrowiak@linux.ibm.com> wrote:
>
>>>> +static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev *matrix_mdev, unsigned long arg)
>>>> +{
>>>> +	s32 fd;
>>>> +	void __user *data;
>>>> +	unsigned long minsz;
>>>> +	struct eventfd_ctx *cfg_chg_trigger;
>>>> +
>>>> +	minsz = offsetofend(struct vfio_irq_set, count);
>>>> +	data = (void __user *)(arg + minsz);
>>>> +
>>>> +	if (get_user(fd, (s32 __user *)data))
>>>> +		return -EFAULT;
>>>> +
>>>> +	if (fd == -1) {
>>>> +		if (matrix_mdev->cfg_chg_trigger)
>>>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
>>>> +		matrix_mdev->cfg_chg_trigger = NULL;
>>>> +	} else if (fd >= 0) {
>>>> +		cfg_chg_trigger = eventfd_ctx_fdget(fd);
>>>> +		if (IS_ERR(cfg_chg_trigger))
>>>> +			return PTR_ERR(cfg_chg_trigger);
>>>> +
>>>> +		if (matrix_mdev->cfg_chg_trigger)
>>>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
>>>> +
>>>> +		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
>>>> +	} else {
>>>> +		return -EINVAL;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>> How does this guard against a use after free, such as the eventfd being
>>> disabled or swapped concurrent to a config change?  Thanks,
>>>
>>> Alex
>> Hi Alex. I spent a great deal of time today trying to figure out exactly
>> what
>> you are asking here; reading about eventfd and digging through code.
>> I looked at other places where eventfd is used to set up communication
>> of events targetting a vfio device from KVM to userspace (e.g.,
>> hw/vfio/ccw.c)
>> and do not find anything much different than what is done here. In fact,
>> this code looks identical to the code that sets up an eventfd for the
>> VFIO_AP_REQ_IRQ_INDEX.
>>
>> Maybe you can explain how an eventfd is disabled or swapped, or maybe
>> explain how we can guard against its use after free. Thanks.
> Maybe I will try! The value of matrix_mdev->cfg_chg_trigger is used in:
> * vfio_ap_set_cfg_change_irq() (rw, with matrix_dev->mdevs_lock)
> * signal_guest_ap_cfg_changed()(r, takes no locks itself, )
>    * called by vfio_ap_mdev_update_guest_apcb()
>      * called at a bunch of places but AFAICT always with
>        matrix_dev->mdevs_lock held
>    * called by vfio_ap_mdev_unset_kvm() (with matrix_dev->mdevs_lock held
>      via get_update_locks_for_kvm())
> * vfio_ap_mdev_probe() (w, assigns NULL to it)
>
> If vfio_ap_set_cfg_change_irq() could change/destroy
> matrix_mdev->cfg_chg_trigger while another thread of execution
> is using it e.g. with signal_guest_ap_cfg_changed() that would be a
> possible UAF and thus BAD.
>
> Now AFAICT matrix_mdev->cfg_chg_trigger is protected by
> matrix_dev->mdevs_lock on each access except for in vfio_ap_mdev_probe()
> which is AFAIK just an initialization in a safe state where we are
> guaranteed to have exclusive access.
>
> The eventfd is swapped and disabled in vfio_ap_set_cfg_change_irq() with
> userspace supplying a new valid fd or -1 respectively.
>
> Tony does that answer your question to Alex?
>
> Alex, does the above answer your question on what guards against UAF (the
> short answer is: matrix_dev->mdevs_lock)?

I agree that the matrix_dev->mdevs_lock does prevent changes to
matrix_mdev->cfg_chg_trigger while it is being accessed by the
vfio_ap device driver. My confusion arises from my interpretation of
Alex's question; it seemed to me that he was talking its use outside
of the vfio_ap driver and how to guard against that.

>
> Regards,
> Halil
>
>
>
Anthony Krowiak Jan. 16, 2025, 4:46 p.m. UTC | #7
>
> Rorie, this is to inform listeners on the host of the guest.
> The guest itself already sees this "inside" with uevents triggered
> by the AP bus code.
>
> Do you have a consumer for these events?

There is a series of QEMU patches that register a notification handler 
for this
event. When that handler gets called, it generates and queues a CRW to 
the guest
indicating there is event information pending which will cause the AP 
bus driver
to get notified of the AP configuration change via its ap_bus_cfg_chg 
notifier call.

The QEMU series can be seen at:
https://lore.kernel.org/qemu-devel/7171c479-5cb4-4748-ba37-da4cf2fac35b@linux.ibm.com/T/ 


Kind regards,
A Krowiak
Alex Williamson Jan. 16, 2025, 4:52 p.m. UTC | #8
On Thu, 16 Jan 2025 10:38:41 -0500
Anthony Krowiak <akrowiak@linux.ibm.com> wrote:

> On 1/15/25 7:17 PM, Halil Pasic wrote:
> > On Wed, 15 Jan 2025 14:35:02 -0500
> > Anthony Krowiak <akrowiak@linux.ibm.com> wrote:
> >  
> >>>> +static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev *matrix_mdev, unsigned long arg)
> >>>> +{
> >>>> +	s32 fd;
> >>>> +	void __user *data;
> >>>> +	unsigned long minsz;
> >>>> +	struct eventfd_ctx *cfg_chg_trigger;
> >>>> +
> >>>> +	minsz = offsetofend(struct vfio_irq_set, count);
> >>>> +	data = (void __user *)(arg + minsz);
> >>>> +
> >>>> +	if (get_user(fd, (s32 __user *)data))
> >>>> +		return -EFAULT;
> >>>> +
> >>>> +	if (fd == -1) {
> >>>> +		if (matrix_mdev->cfg_chg_trigger)
> >>>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> >>>> +		matrix_mdev->cfg_chg_trigger = NULL;
> >>>> +	} else if (fd >= 0) {
> >>>> +		cfg_chg_trigger = eventfd_ctx_fdget(fd);
> >>>> +		if (IS_ERR(cfg_chg_trigger))
> >>>> +			return PTR_ERR(cfg_chg_trigger);
> >>>> +
> >>>> +		if (matrix_mdev->cfg_chg_trigger)
> >>>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
> >>>> +
> >>>> +		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
> >>>> +	} else {
> >>>> +		return -EINVAL;
> >>>> +	}
> >>>> +
> >>>> +	return 0;
> >>>> +}  
> >>> How does this guard against a use after free, such as the eventfd being
> >>> disabled or swapped concurrent to a config change?  Thanks,
> >>>
> >>> Alex  
> >> Hi Alex. I spent a great deal of time today trying to figure out exactly
> >> what
> >> you are asking here; reading about eventfd and digging through code.
> >> I looked at other places where eventfd is used to set up communication
> >> of events targetting a vfio device from KVM to userspace (e.g.,
> >> hw/vfio/ccw.c)
> >> and do not find anything much different than what is done here. In fact,
> >> this code looks identical to the code that sets up an eventfd for the
> >> VFIO_AP_REQ_IRQ_INDEX.
> >>
> >> Maybe you can explain how an eventfd is disabled or swapped, or maybe
> >> explain how we can guard against its use after free. Thanks.  
> > Maybe I will try! The value of matrix_mdev->cfg_chg_trigger is used in:
> > * vfio_ap_set_cfg_change_irq() (rw, with matrix_dev->mdevs_lock)
> > * signal_guest_ap_cfg_changed()(r, takes no locks itself, )
> >    * called by vfio_ap_mdev_update_guest_apcb()
> >      * called at a bunch of places but AFAICT always with
> >        matrix_dev->mdevs_lock held
> >    * called by vfio_ap_mdev_unset_kvm() (with matrix_dev->mdevs_lock held
> >      via get_update_locks_for_kvm())
> > * vfio_ap_mdev_probe() (w, assigns NULL to it)
> >
> > If vfio_ap_set_cfg_change_irq() could change/destroy
> > matrix_mdev->cfg_chg_trigger while another thread of execution
> > is using it e.g. with signal_guest_ap_cfg_changed() that would be a
> > possible UAF and thus BAD.
> >
> > Now AFAICT matrix_mdev->cfg_chg_trigger is protected by
> > matrix_dev->mdevs_lock on each access except for in vfio_ap_mdev_probe()
> > which is AFAIK just an initialization in a safe state where we are
> > guaranteed to have exclusive access.
> >
> > The eventfd is swapped and disabled in vfio_ap_set_cfg_change_irq() with
> > userspace supplying a new valid fd or -1 respectively.
> >
> > Tony does that answer your question to Alex?
> >
> > Alex, does the above answer your question on what guards against UAF (the
> > short answer is: matrix_dev->mdevs_lock)?  

Yes, that answers my question, thanks for untangling it.  We might
consider a lockdep_assert_held() in the new
signal_guest_ap_cfg_changed() since it does get called from a variety
of paths and we need that lock to prevent the UAF.

> I agree that the matrix_dev->mdevs_lock does prevent changes to
> matrix_mdev->cfg_chg_trigger while it is being accessed by the
> vfio_ap device driver. My confusion arises from my interpretation of
> Alex's question; it seemed to me that he was talking its use outside
> of the vfio_ap driver and how to guard against that.

Nope, Halil zeroed in on the UAF possibility that concerned me.  Thanks,

Alex
Halil Pasic Jan. 16, 2025, 7:18 p.m. UTC | #9
On Thu, 16 Jan 2025 11:52:28 -0500
Alex Williamson <alex.williamson@redhat.com> wrote:

> > > Alex, does the above answer your question on what guards against UAF (the
> > > short answer is: matrix_dev->mdevs_lock)?    
> 
> Yes, that answers my question, thanks for untangling it.  We might
> consider a lockdep_assert_held() in the new
> signal_guest_ap_cfg_changed() since it does get called from a variety
> of paths and we need that lock to prevent the UAF.

Yes I second that! I was thinking about it myself yesterday. And there
are also a couple of other functions that expect to be called with
certain locks held. I would love to see lockdep_assert_held() there
as well.

Since I went through that code last night I could spin a patch that
catches some of these at least. But if I don't within two weeks, I
won't be grumpy if somebody else picks that up.

Regards,
Halil
Halil Pasic Jan. 16, 2025, 7:30 p.m. UTC | #10
On Thu, 16 Jan 2025 10:38:41 -0500
Anthony Krowiak <akrowiak@linux.ibm.com> wrote:

> > Alex, does the above answer your question on what guards against UAF (the
> > short answer is: matrix_dev->mdevs_lock)?  
> 
> I agree that the matrix_dev->mdevs_lock does prevent changes to
> matrix_mdev->cfg_chg_trigger while it is being accessed by the
> vfio_ap device driver. My confusion arises from my interpretation of
> Alex's question; it seemed to me that he was talking its use outside
> of the vfio_ap driver and how to guard against that.

BTW the key for understanding how we are protected form something
like userspace closing he eventfd is that eventfd_ctx_fdget()
takes a reference to the internal eventfd context,  which makes
sure userspace can not shoot us in the foot and the context
remains to be safe to use until we have done our put. Generally
userspace is responsible for not shooting itself in the foot,
so how QEMU uses its end is mostly QEMUs problem in my understanding.

Regards,
Halil
Harald Freudenberger Jan. 17, 2025, 8:30 a.m. UTC | #11
On 2025-01-16 17:46, Anthony Krowiak wrote:
>> 
>> Rorie, this is to inform listeners on the host of the guest.
>> The guest itself already sees this "inside" with uevents triggered
>> by the AP bus code.
>> 
>> Do you have a consumer for these events?
> 
> There is a series of QEMU patches that register a notification handler 
> for this
> event. When that handler gets called, it generates and queues a CRW to 
> the guest
> indicating there is event information pending which will cause the AP 
> bus driver
> to get notified of the AP configuration change via its ap_bus_cfg_chg
> notifier call.
> 
> The QEMU series can be seen at:
> https://lore.kernel.org/qemu-devel/7171c479-5cb4-4748-ba37-da4cf2fac35b@linux.ibm.com/T/
> 
> 
> Kind regards,
> A Krowiak

Ok, so this way the AP bus of the guest finally get's it's config change 
callback invoked :-)
Anthony Krowiak Jan. 17, 2025, 12:42 p.m. UTC | #12
On 1/17/25 3:30 AM, Harald Freudenberger wrote:
> On 2025-01-16 17:46, Anthony Krowiak wrote:
>>>
>>> Rorie, this is to inform listeners on the host of the guest.
>>> The guest itself already sees this "inside" with uevents triggered
>>> by the AP bus code.
>>>
>>> Do you have a consumer for these events?
>>
>> There is a series of QEMU patches that register a notification 
>> handler for this
>> event. When that handler gets called, it generates and queues a CRW 
>> to the guest
>> indicating there is event information pending which will cause the AP 
>> bus driver
>> to get notified of the AP configuration change via its ap_bus_cfg_chg
>> notifier call.
>>
>> The QEMU series can be seen at:
>> https://lore.kernel.org/qemu-devel/7171c479-5cb4-4748-ba37-da4cf2fac35b@linux.ibm.com/T/ 
>>
>>
>>
>> Kind regards,
>> A Krowiak
>
> Ok, so this way the AP bus of the guest finally get's it's config 
> change callback invoked :-)

Yes, and it happens pretty quickly; at least in the test environment.
Anthony Krowiak Jan. 17, 2025, 12:50 p.m. UTC | #13
On 1/16/25 2:18 PM, Halil Pasic wrote:
> On Thu, 16 Jan 2025 11:52:28 -0500
> Alex Williamson <alex.williamson@redhat.com> wrote:
>
>>>> Alex, does the above answer your question on what guards against UAF (the
>>>> short answer is: matrix_dev->mdevs_lock)?
>> Yes, that answers my question, thanks for untangling it.  We might
>> consider a lockdep_assert_held() in the new
>> signal_guest_ap_cfg_changed() since it does get called from a variety
>> of paths and we need that lock to prevent the UAF.
> Yes I second that! I was thinking about it myself yesterday. And there
> are also a couple of other functions that expect to be called with
> certain locks held. I would love to see lockdep_assert_held() there
> as well.
>
> Since I went through that code last night I could spin a patch that
> catches some of these at least. But if I don't within two weeks, I
> won't be grumpy if somebody else picks that up.

Sure, sounds like a good idea. Don't worry about it, I can take care of 
it. Thanks.

>
> Regards,
> Halil
Anthony Krowiak Jan. 17, 2025, 12:53 p.m. UTC | #14
On 1/16/25 2:30 PM, Halil Pasic wrote:
> On Thu, 16 Jan 2025 10:38:41 -0500
> Anthony Krowiak <akrowiak@linux.ibm.com> wrote:
>
>>> Alex, does the above answer your question on what guards against UAF (the
>>> short answer is: matrix_dev->mdevs_lock)?
>> I agree that the matrix_dev->mdevs_lock does prevent changes to
>> matrix_mdev->cfg_chg_trigger while it is being accessed by the
>> vfio_ap device driver. My confusion arises from my interpretation of
>> Alex's question; it seemed to me that he was talking its use outside
>> of the vfio_ap driver and how to guard against that.
> BTW the key for understanding how we are protected form something
> like userspace closing he eventfd is that eventfd_ctx_fdget()
> takes a reference to the internal eventfd context,  which makes
> sure userspace can not shoot us in the foot and the context
> remains to be safe to use until we have done our put. Generally
> userspace is responsible for not shooting itself in the foot,
> so how QEMU uses its end is mostly QEMUs problem in my understanding.

I started digging through that code to try to find the reference to the
eventfd and whether/how it is protected, but got lost in the
twists and turns. Thanks for the info.

>
> Regards,
> Halil
Anthony Krowiak Jan. 17, 2025, 12:55 p.m. UTC | #15
On 1/16/25 11:52 AM, Alex Williamson wrote:
> On Thu, 16 Jan 2025 10:38:41 -0500
> Anthony Krowiak <akrowiak@linux.ibm.com> wrote:
>
>> On 1/15/25 7:17 PM, Halil Pasic wrote:
>>> On Wed, 15 Jan 2025 14:35:02 -0500
>>> Anthony Krowiak <akrowiak@linux.ibm.com> wrote:
>>>   
>>>>>> +static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev *matrix_mdev, unsigned long arg)
>>>>>> +{
>>>>>> +	s32 fd;
>>>>>> +	void __user *data;
>>>>>> +	unsigned long minsz;
>>>>>> +	struct eventfd_ctx *cfg_chg_trigger;
>>>>>> +
>>>>>> +	minsz = offsetofend(struct vfio_irq_set, count);
>>>>>> +	data = (void __user *)(arg + minsz);
>>>>>> +
>>>>>> +	if (get_user(fd, (s32 __user *)data))
>>>>>> +		return -EFAULT;
>>>>>> +
>>>>>> +	if (fd == -1) {
>>>>>> +		if (matrix_mdev->cfg_chg_trigger)
>>>>>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
>>>>>> +		matrix_mdev->cfg_chg_trigger = NULL;
>>>>>> +	} else if (fd >= 0) {
>>>>>> +		cfg_chg_trigger = eventfd_ctx_fdget(fd);
>>>>>> +		if (IS_ERR(cfg_chg_trigger))
>>>>>> +			return PTR_ERR(cfg_chg_trigger);
>>>>>> +
>>>>>> +		if (matrix_mdev->cfg_chg_trigger)
>>>>>> +			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
>>>>>> +
>>>>>> +		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
>>>>>> +	} else {
>>>>>> +		return -EINVAL;
>>>>>> +	}
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>> How does this guard against a use after free, such as the eventfd being
>>>>> disabled or swapped concurrent to a config change?  Thanks,
>>>>>
>>>>> Alex
>>>> Hi Alex. I spent a great deal of time today trying to figure out exactly
>>>> what
>>>> you are asking here; reading about eventfd and digging through code.
>>>> I looked at other places where eventfd is used to set up communication
>>>> of events targetting a vfio device from KVM to userspace (e.g.,
>>>> hw/vfio/ccw.c)
>>>> and do not find anything much different than what is done here. In fact,
>>>> this code looks identical to the code that sets up an eventfd for the
>>>> VFIO_AP_REQ_IRQ_INDEX.
>>>>
>>>> Maybe you can explain how an eventfd is disabled or swapped, or maybe
>>>> explain how we can guard against its use after free. Thanks.
>>> Maybe I will try! The value of matrix_mdev->cfg_chg_trigger is used in:
>>> * vfio_ap_set_cfg_change_irq() (rw, with matrix_dev->mdevs_lock)
>>> * signal_guest_ap_cfg_changed()(r, takes no locks itself, )
>>>     * called by vfio_ap_mdev_update_guest_apcb()
>>>       * called at a bunch of places but AFAICT always with
>>>         matrix_dev->mdevs_lock held
>>>     * called by vfio_ap_mdev_unset_kvm() (with matrix_dev->mdevs_lock held
>>>       via get_update_locks_for_kvm())
>>> * vfio_ap_mdev_probe() (w, assigns NULL to it)
>>>
>>> If vfio_ap_set_cfg_change_irq() could change/destroy
>>> matrix_mdev->cfg_chg_trigger while another thread of execution
>>> is using it e.g. with signal_guest_ap_cfg_changed() that would be a
>>> possible UAF and thus BAD.
>>>
>>> Now AFAICT matrix_mdev->cfg_chg_trigger is protected by
>>> matrix_dev->mdevs_lock on each access except for in vfio_ap_mdev_probe()
>>> which is AFAIK just an initialization in a safe state where we are
>>> guaranteed to have exclusive access.
>>>
>>> The eventfd is swapped and disabled in vfio_ap_set_cfg_change_irq() with
>>> userspace supplying a new valid fd or -1 respectively.
>>>
>>> Tony does that answer your question to Alex?
>>>
>>> Alex, does the above answer your question on what guards against UAF (the
>>> short answer is: matrix_dev->mdevs_lock)?
> Yes, that answers my question, thanks for untangling it.  We might
> consider a lockdep_assert_held() in the new
> signal_guest_ap_cfg_changed() since it does get called from a variety
> of paths and we need that lock to prevent the UAF.
>
>> I agree that the matrix_dev->mdevs_lock does prevent changes to
>> matrix_mdev->cfg_chg_trigger while it is being accessed by the
>> vfio_ap device driver. My confusion arises from my interpretation of
>> Alex's question; it seemed to me that he was talking its use outside
>> of the vfio_ap driver and how to guard against that.
> Nope, Halil zeroed in on the UAF possibility that concerned me.  Thanks,

He is a marksman!:)

>
> Alex
>
Rorie Reyes Feb. 5, 2025, 5:33 p.m. UTC | #16
On 1/13/25 11:08 AM, Alexander Gordeev wrote:
> On Tue, Jan 07, 2025 at 01:36:45PM -0500, Rorie Reyes wrote:
>
> Hi Rorie, Antony,
>
>> Note that there are corresponding QEMU patches that will be shipped along
>> with this patch (see vfio-ap: Report vfio-ap configuration changes) that
>> will pick up the eventfd signal.
> How this patch is synchronized with the mentioned QEMU series?
> What is the series status, especially with the comment from Cédric Le Goater [1]?
>
> 1. https://lore.kernel.org/all/20250107184354.91079-1-rreyes@linux.ibm.com/T/#mb0d37909c5f69bdff96289094ac0bad0922a7cce
>
> Thanks!

Hey Alex, sorry for the long delay. This patch is synchronized with the 
QEMU series by registering an event notifier handler to process AP 
configuration

change events. This is done by queuing the event and generating a CRW. 
The series status is currently going through a v2 RFC after implementing 
changes

requested by Cedric and Tony.

Let me know if there's anything else you're concerned with

Thanks!
Anthony Krowiak Feb. 5, 2025, 5:47 p.m. UTC | #17
On 2/5/25 12:33 PM, Rorie Reyes wrote:
>
> On 1/13/25 11:08 AM, Alexander Gordeev wrote:
>> On Tue, Jan 07, 2025 at 01:36:45PM -0500, Rorie Reyes wrote:
>>
>> Hi Rorie, Antony,
>>
>>> Note that there are corresponding QEMU patches that will be shipped 
>>> along
>>> with this patch (see vfio-ap: Report vfio-ap configuration changes) 
>>> that
>>> will pick up the eventfd signal.
>> How this patch is synchronized with the mentioned QEMU series?
>> What is the series status, especially with the comment from Cédric Le 
>> Goater [1]?
>>
>> 1. 
>> https://lore.kernel.org/all/20250107184354.91079-1-rreyes@linux.ibm.com/T/#mb0d37909c5f69bdff96289094ac0bad0922a7cce
>>
>> Thanks!
>
> Hey Alex, sorry for the long delay. This patch is synchronized with 
> the QEMU series by registering an event notifier handler to process AP 
> configuration
>
> change events. This is done by queuing the event and generating a CRW. 
> The series status is currently going through a v2 RFC after 
> implementing changes
>
> requested by Cedric and Tony.
>
> Let me know if there's anything else you're concerned with
>
> Thanks!

I don't think that is what Alex was asking. I believe he is asking how 
the QEMU and kernel patch series are going to be synchronized.
Given the kernel series changes a value in vfio.h which is used by QEMU, 
the two series need to be coordinated since the vfio.h file
used by QEMU can not be updated until the kernel code is available. So 
these two sets of code have
to be merged upstream during a merge window. which is different for the 
kernel and QEMU. At least I think that is what Alex is asking.
Alexander Gordeev Feb. 6, 2025, 7:40 a.m. UTC | #18
On Wed, Feb 05, 2025 at 12:47:55PM -0500, Anthony Krowiak wrote:
> > > How this patch is synchronized with the mentioned QEMU series?
> > > What is the series status, especially with the comment from Cédric
> > > Le Goater [1]?
> > > 
> > > 1. https://lore.kernel.org/all/20250107184354.91079-1-rreyes@linux.ibm.com/T/#mb0d37909c5f69bdff96289094ac0bad0922a7cce
...
> > Hey Alex, sorry for the long delay. This patch is synchronized with the
> > QEMU series by registering an event notifier handler to process AP
> > configuration
> > 
> > change events. This is done by queuing the event and generating a CRW.
> > The series status is currently going through a v2 RFC after implementing
> > changes
> > 
> > requested by Cedric and Tony.
> > 
> > Let me know if there's anything else you're concerned with
> > 
> > Thanks!
> 
> I don't think that is what Alex was asking. I believe he is asking how the
> QEMU and kernel patch series are going to be synchronized.
> Given the kernel series changes a value in vfio.h which is used by QEMU, the
> two series need to be coordinated since the vfio.h file
> used by QEMU can not be updated until the kernel code is available. So these
> two sets of code have
> to be merged upstream during a merge window. which is different for the
> kernel and QEMU. At least I think that is what Alex is asking.

Correct.
Thanks for the clarification, Anthony!
Rorie Reyes Feb. 6, 2025, 2:12 p.m. UTC | #19
On 2/6/25 2:40 AM, Alexander Gordeev wrote:
> On Wed, Feb 05, 2025 at 12:47:55PM -0500, Anthony Krowiak wrote:
>>>> How this patch is synchronized with the mentioned QEMU series?
>>>> What is the series status, especially with the comment from Cédric
>>>> Le Goater [1]?
>>>>
>>>> 1. https://lore.kernel.org/all/20250107184354.91079-1-rreyes@linux.ibm.com/T/#mb0d37909c5f69bdff96289094ac0bad0922a7cce
> ...
>>> Hey Alex, sorry for the long delay. This patch is synchronized with the
>>> QEMU series by registering an event notifier handler to process AP
>>> configuration
>>>
>>> change events. This is done by queuing the event and generating a CRW.
>>> The series status is currently going through a v2 RFC after implementing
>>> changes
>>>
>>> requested by Cedric and Tony.
>>>
>>> Let me know if there's anything else you're concerned with
>>>
>>> Thanks!
>> I don't think that is what Alex was asking. I believe he is asking how the
>> QEMU and kernel patch series are going to be synchronized.
>> Given the kernel series changes a value in vfio.h which is used by QEMU, the
>> two series need to be coordinated since the vfio.h file
>> used by QEMU can not be updated until the kernel code is available. So these
>> two sets of code have
>> to be merged upstream during a merge window. which is different for the
>> kernel and QEMU. At least I think that is what Alex is asking.
> Correct.
> Thanks for the clarification, Anthony!

Tony, thank you for the back up!

Alexander, is there anything else you need from my end for clarification?
diff mbox series

Patch

diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index a52c2690933f..c6ff4ab13f16 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -650,13 +650,22 @@  static void vfio_ap_matrix_init(struct ap_config_info *info,
 	matrix->adm_max = info->apxa ? info->nd : 15;
 }
 
+static void signal_guest_ap_cfg_changed(struct ap_matrix_mdev *matrix_mdev)
+{
+		if (matrix_mdev->cfg_chg_trigger)
+			eventfd_signal(matrix_mdev->cfg_chg_trigger);
+}
+
 static void vfio_ap_mdev_update_guest_apcb(struct ap_matrix_mdev *matrix_mdev)
 {
-	if (matrix_mdev->kvm)
+	if (matrix_mdev->kvm) {
 		kvm_arch_crypto_set_masks(matrix_mdev->kvm,
 					  matrix_mdev->shadow_apcb.apm,
 					  matrix_mdev->shadow_apcb.aqm,
 					  matrix_mdev->shadow_apcb.adm);
+
+		signal_guest_ap_cfg_changed(matrix_mdev);
+	}
 }
 
 static bool vfio_ap_mdev_filter_cdoms(struct ap_matrix_mdev *matrix_mdev)
@@ -792,6 +801,7 @@  static int vfio_ap_mdev_probe(struct mdev_device *mdev)
 	if (ret)
 		goto err_put_vdev;
 	matrix_mdev->req_trigger = NULL;
+	matrix_mdev->cfg_chg_trigger = NULL;
 	dev_set_drvdata(&mdev->dev, matrix_mdev);
 	mutex_lock(&matrix_dev->mdevs_lock);
 	list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
@@ -1860,6 +1870,7 @@  static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev)
 		get_update_locks_for_kvm(kvm);
 
 		kvm_arch_crypto_clear_masks(kvm);
+		signal_guest_ap_cfg_changed(matrix_mdev);
 		vfio_ap_mdev_reset_queues(matrix_mdev);
 		kvm_put_kvm(kvm);
 		matrix_mdev->kvm = NULL;
@@ -2097,6 +2108,10 @@  static ssize_t vfio_ap_get_irq_info(unsigned long arg)
 		info.count = 1;
 		info.flags = VFIO_IRQ_INFO_EVENTFD;
 		break;
+	case VFIO_AP_CFG_CHG_IRQ_INDEX:
+		info.count = 1;
+		info.flags = VFIO_IRQ_INFO_EVENTFD;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2160,6 +2175,39 @@  static int vfio_ap_set_request_irq(struct ap_matrix_mdev *matrix_mdev,
 	return 0;
 }
 
+static int vfio_ap_set_cfg_change_irq(struct ap_matrix_mdev *matrix_mdev, unsigned long arg)
+{
+	s32 fd;
+	void __user *data;
+	unsigned long minsz;
+	struct eventfd_ctx *cfg_chg_trigger;
+
+	minsz = offsetofend(struct vfio_irq_set, count);
+	data = (void __user *)(arg + minsz);
+
+	if (get_user(fd, (s32 __user *)data))
+		return -EFAULT;
+
+	if (fd == -1) {
+		if (matrix_mdev->cfg_chg_trigger)
+			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
+		matrix_mdev->cfg_chg_trigger = NULL;
+	} else if (fd >= 0) {
+		cfg_chg_trigger = eventfd_ctx_fdget(fd);
+		if (IS_ERR(cfg_chg_trigger))
+			return PTR_ERR(cfg_chg_trigger);
+
+		if (matrix_mdev->cfg_chg_trigger)
+			eventfd_ctx_put(matrix_mdev->cfg_chg_trigger);
+
+		matrix_mdev->cfg_chg_trigger = cfg_chg_trigger;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int vfio_ap_set_irqs(struct ap_matrix_mdev *matrix_mdev,
 			    unsigned long arg)
 {
@@ -2175,6 +2223,8 @@  static int vfio_ap_set_irqs(struct ap_matrix_mdev *matrix_mdev,
 		switch (irq_set.index) {
 		case VFIO_AP_REQ_IRQ_INDEX:
 			return vfio_ap_set_request_irq(matrix_mdev, arg);
+		case VFIO_AP_CFG_CHG_IRQ_INDEX:
+			return vfio_ap_set_cfg_change_irq(matrix_mdev, arg);
 		default:
 			return -EINVAL;
 		}
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index 437a161c8659..37de9c69b6eb 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -105,6 +105,7 @@  struct ap_queue_table {
  * @mdev:	the mediated device
  * @qtable:	table of queues (struct vfio_ap_queue) assigned to the mdev
  * @req_trigger eventfd ctx for signaling userspace to return a device
+ * @cfg_chg_trigger eventfd ctx to signal AP config changed to userspace
  * @apm_add:	bitmap of APIDs added to the host's AP configuration
  * @aqm_add:	bitmap of APQIs added to the host's AP configuration
  * @adm_add:	bitmap of control domain numbers added to the host's AP
@@ -120,6 +121,7 @@  struct ap_matrix_mdev {
 	struct mdev_device *mdev;
 	struct ap_queue_table qtable;
 	struct eventfd_ctx *req_trigger;
+	struct eventfd_ctx *cfg_chg_trigger;
 	DECLARE_BITMAP(apm_add, AP_DEVICES);
 	DECLARE_BITMAP(aqm_add, AP_DOMAINS);
 	DECLARE_BITMAP(adm_add, AP_DOMAINS);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index c8dbf8219c4f..a2d3e1ac6239 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -671,6 +671,7 @@  enum {
  */
 enum {
 	VFIO_AP_REQ_IRQ_INDEX,
+	VFIO_AP_CFG_CHG_IRQ_INDEX,
 	VFIO_AP_NUM_IRQS
 };