diff mbox series

[3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl

Message ID 20231127063909.129153-4-yi.l.liu@intel.com (mailing list archive)
State New
Headers show
Series vfio-pci support pasid attach/detach | expand

Commit Message

Yi Liu Nov. 27, 2023, 6:39 a.m. UTC
This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
hence userspace could probe PASID capability by it. This is a bit different
with other capabilities which are reported to userspace when the user reads
the device's PCI configuration space. There are two reasons for this.

 - First, Qemu by default exposes all available PCI capabilities in vfio-pci
   config space to the guest as read-only, so adding PASID capability in the
   vfio-pci config space will make it exposed to the guest automatically while
   an old Qemu doesn't really support it.

 - Second, PASID capability does not exit on VFs (instead shares the cap of
   the PF). Creating a virtual PASID capability in vfio-pci config space needs
   to find a hole to place it, but doing so may require device specific
   knowledge to avoid potential conflict with device specific registers like
   hiden bits in VF config space. It's simpler by moving this burden to the
   VMM instead of maintaining a quirk system in the kernel.

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        | 13 +++++++++
 2 files changed, 60 insertions(+)

Comments

Duan, Zhenzhong Nov. 27, 2023, 7:28 a.m. UTC | #1
>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Sent: Monday, November 27, 2023 2:39 PM
>Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>ioctl
>
>This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
>hence userspace could probe PASID capability by it. This is a bit different
>with other capabilities which are reported to userspace when the user reads
>the device's PCI configuration space. There are two reasons for this.
>
> - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>   config space to the guest as read-only, so adding PASID capability in the
>   vfio-pci config space will make it exposed to the guest automatically while
>   an old Qemu doesn't really support it.
>
> - Second, PASID capability does not exit on VFs (instead shares the cap of
>   the PF). Creating a virtual PASID capability in vfio-pci config space needs
>   to find a hole to place it, but doing so may require device specific
>   knowledge to avoid potential conflict with device specific registers like
>   hiden bits in VF config space. It's simpler by moving this burden to the
>   VMM instead of maintaining a quirk system in the kernel.
>
>Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>---
> drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h        | 13 +++++++++
> 2 files changed, 60 insertions(+)
>
>diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>index 1929103ee59a..8038aa45500e 100644
>--- a/drivers/vfio/pci/vfio_pci_core.c
>+++ b/drivers/vfio/pci/vfio_pci_core.c
>@@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct
>vfio_device *device, u32 flags,
> 	return 0;
> }
>
>+static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>+				       struct vfio_device_feature_pasid __user
>*arg,
>+				       size_t argsz)
>+{
>+	struct vfio_pci_core_device *vdev =
>+		container_of(device, struct vfio_pci_core_device, vdev);
>+	struct vfio_device_feature_pasid pasid = { 0 };
>+	struct pci_dev *pdev = vdev->pdev;
>+	u32 capabilities = 0;
>+	int ret;
>+
>+	/* We do not support SET of the PASID capability */
>+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>+				 sizeof(pasid));
>+	if (ret != 1)
>+		return ret;
>+
>+	/*
>+	 * Needs go to PF if the device is VF as VF shares its PF's
>+	 * PASID Capability.
>+	 */
>+	if (pdev->is_virtfn)
>+		pdev = pci_physfn(pdev);
>+
>+	if (!pdev->pasid_enabled)
>+		goto out;

Does a PF bound to VFIO have pasid enabled by default?
Isn't the guest kernel's responsibility to enable pasid cap of an assigned PF?

Thanks
Zhenzhong

>+
>+#ifdef CONFIG_PCI_PASID
>+	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>+			      &capabilities);
>+#endif
>+
>+	if (capabilities & PCI_PASID_CAP_EXEC)
>+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
>+	if (capabilities & PCI_PASID_CAP_PRIV)
>+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
>+
>+	pasid.width = (capabilities >> 8) & 0x1f;
>+
>+out:
>+	if (copy_to_user(arg, &pasid, sizeof(pasid)))
>+		return -EFAULT;
>+	return 0;
>+}
>+
> int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> 				void __user *arg, size_t argsz)
> {
>@@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device
>*device, u32 flags,
> 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
> 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
>+	case VFIO_DEVICE_FEATURE_PASID:
>+		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
> 	default:
> 		return -ENOTTY;
> 	}
>diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>index 495193629029..8326faf8622b 100644
>--- a/include/uapi/linux/vfio.h
>+++ b/include/uapi/linux/vfio.h
>@@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
> };
> #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>
>+/**
>+ * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
>+ * Zero width means no support for PASID.
>+ */
>+struct vfio_device_feature_pasid {
>+	__u16 capabilities;
>+#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
>+#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
>+	__u8 width;
>+	__u8 __reserved;
>+};
>+#define VFIO_DEVICE_FEATURE_PASID 11
>+
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
>--
>2.34.1
Yi Liu Nov. 28, 2023, 3:11 a.m. UTC | #2
On 2023/11/27 15:28, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Monday, November 27, 2023 2:39 PM
>> Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>> ioctl
>>
>> This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
>> hence userspace could probe PASID capability by it. This is a bit different
>> with other capabilities which are reported to userspace when the user reads
>> the device's PCI configuration space. There are two reasons for this.
>>
>> - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>    config space to the guest as read-only, so adding PASID capability in the
>>    vfio-pci config space will make it exposed to the guest automatically while
>>    an old Qemu doesn't really support it.
>>
>> - Second, PASID capability does not exit on VFs (instead shares the cap of
>>    the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>    to find a hole to place it, but doing so may require device specific
>>    knowledge to avoid potential conflict with device specific registers like
>>    hiden bits in VF config space. It's simpler by moving this burden to the
>>    VMM instead of maintaining a quirk system in the kernel.
>>
>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> ---
>> drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
>> include/uapi/linux/vfio.h        | 13 +++++++++
>> 2 files changed, 60 insertions(+)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>> index 1929103ee59a..8038aa45500e 100644
>> --- a/drivers/vfio/pci/vfio_pci_core.c
>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>> @@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct
>> vfio_device *device, u32 flags,
>> 	return 0;
>> }
>>
>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>> +				       struct vfio_device_feature_pasid __user
>> *arg,
>> +				       size_t argsz)
>> +{
>> +	struct vfio_pci_core_device *vdev =
>> +		container_of(device, struct vfio_pci_core_device, vdev);
>> +	struct vfio_device_feature_pasid pasid = { 0 };
>> +	struct pci_dev *pdev = vdev->pdev;
>> +	u32 capabilities = 0;
>> +	int ret;
>> +
>> +	/* We do not support SET of the PASID capability */
>> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>> +				 sizeof(pasid));
>> +	if (ret != 1)
>> +		return ret;
>> +
>> +	/*
>> +	 * Needs go to PF if the device is VF as VF shares its PF's
>> +	 * PASID Capability.
>> +	 */
>> +	if (pdev->is_virtfn)
>> +		pdev = pci_physfn(pdev);
>> +
>> +	if (!pdev->pasid_enabled)
>> +		goto out;
> 
> Does a PF bound to VFIO have pasid enabled by default?

Today, host iommu driver (at least intel iommu driver) enables it in the
time of device probe and seems not changed afterward. So yes, VFIO should
see it if pasid is enabled.

> Isn't the guest kernel's responsibility to enable pasid cap of an assigned PF?

guest kernel should not have the capability to change host's pasid
configuration. It can only write to its own vconfig emulated by
hypervisor.

> Thanks
> Zhenzhong
> 
>> +
>> +#ifdef CONFIG_PCI_PASID
>> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>> +			      &capabilities);
>> +#endif
>> +
>> +	if (capabilities & PCI_PASID_CAP_EXEC)
>> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
>> +	if (capabilities & PCI_PASID_CAP_PRIV)
>> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
>> +
>> +	pasid.width = (capabilities >> 8) & 0x1f;
>> +
>> +out:
>> +	if (copy_to_user(arg, &pasid, sizeof(pasid)))
>> +		return -EFAULT;
>> +	return 0;
>> +}
>> +
>> int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>> 				void __user *arg, size_t argsz)
>> {
>> @@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device
>> *device, u32 flags,
>> 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>> 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>> 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
>> +	case VFIO_DEVICE_FEATURE_PASID:
>> +		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
>> 	default:
>> 		return -ENOTTY;
>> 	}
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 495193629029..8326faf8622b 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
>> };
>> #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>>
>> +/**
>> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
>> + * Zero width means no support for PASID.
>> + */
>> +struct vfio_device_feature_pasid {
>> +	__u16 capabilities;
>> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
>> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
>> +	__u8 width;
>> +	__u8 __reserved;
>> +};
>> +#define VFIO_DEVICE_FEATURE_PASID 11
>> +
>> /* -------- API for Type1 VFIO IOMMU -------- */
>>
>> /**
>> --
>> 2.34.1
>
Duan, Zhenzhong Nov. 28, 2023, 4:23 a.m. UTC | #3
>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Sent: Tuesday, November 28, 2023 11:12 AM
>Subject: Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>ioctl
>
>On 2023/11/27 15:28, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Liu, Yi L <yi.l.liu@intel.com>
>>> Sent: Monday, November 27, 2023 2:39 PM
>>> Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>>> ioctl
>>>
>>> This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
>>> hence userspace could probe PASID capability by it. This is a bit different
>>> with other capabilities which are reported to userspace when the user reads
>>> the device's PCI configuration space. There are two reasons for this.
>>>
>>> - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>>    config space to the guest as read-only, so adding PASID capability in the
>>>    vfio-pci config space will make it exposed to the guest automatically while
>>>    an old Qemu doesn't really support it.
>>>
>>> - Second, PASID capability does not exit on VFs (instead shares the cap of
>>>    the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>>    to find a hole to place it, but doing so may require device specific
>>>    knowledge to avoid potential conflict with device specific registers like
>>>    hiden bits in VF config space. It's simpler by moving this burden to the
>>>    VMM instead of maintaining a quirk system in the kernel.
>>>
>>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>>> ---
>>> drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
>>> include/uapi/linux/vfio.h        | 13 +++++++++
>>> 2 files changed, 60 insertions(+)
>>>
>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>>> index 1929103ee59a..8038aa45500e 100644
>>> --- a/drivers/vfio/pci/vfio_pci_core.c
>>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>>> @@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct
>>> vfio_device *device, u32 flags,
>>> 	return 0;
>>> }
>>>
>>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>>> +				       struct vfio_device_feature_pasid __user
>>> *arg,
>>> +				       size_t argsz)
>>> +{
>>> +	struct vfio_pci_core_device *vdev =
>>> +		container_of(device, struct vfio_pci_core_device, vdev);
>>> +	struct vfio_device_feature_pasid pasid = { 0 };
>>> +	struct pci_dev *pdev = vdev->pdev;
>>> +	u32 capabilities = 0;
>>> +	int ret;
>>> +
>>> +	/* We do not support SET of the PASID capability */
>>> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>>> +				 sizeof(pasid));
>>> +	if (ret != 1)
>>> +		return ret;
>>> +
>>> +	/*
>>> +	 * Needs go to PF if the device is VF as VF shares its PF's
>>> +	 * PASID Capability.
>>> +	 */
>>> +	if (pdev->is_virtfn)
>>> +		pdev = pci_physfn(pdev);
>>> +
>>> +	if (!pdev->pasid_enabled)
>>> +		goto out;
>>
>> Does a PF bound to VFIO have pasid enabled by default?
>
>Today, host iommu driver (at least intel iommu driver) enables it in the
>time of device probe and seems not changed afterward. So yes, VFIO should
>see it if pasid is enabled.
>
>> Isn't the guest kernel's responsibility to enable pasid cap of an assigned PF?
>
>guest kernel should not have the capability to change host's pasid
>configuration. It can only write to its own vconfig emulated by
>hypervisor.

Understood, thanks Yi.

BRs.
Zhenzhong
Tian, Kevin Dec. 7, 2023, 8:47 a.m. UTC | #4
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, November 27, 2023 2:39 PM
> 
> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
> +				       struct vfio_device_feature_pasid __user
> *arg,
> +				       size_t argsz)
> +{
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	struct vfio_device_feature_pasid pasid = { 0 };
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 capabilities = 0;
> +	int ret;
> +
> +	/* We do not support SET of the PASID capability */

this line alone is meaningless. Please explain the reason e.g. due to
no PASID capability per VF...

> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(pasid));
> +	if (ret != 1)
> +		return ret;
> +
> +	/*
> +	 * Needs go to PF if the device is VF as VF shares its PF's
> +	 * PASID Capability.
> +	 */

/* VF shares the PASID capability of its PF */

> +	if (pdev->is_virtfn)
> +		pdev = pci_physfn(pdev);
> +
> +	if (!pdev->pasid_enabled)
> +		goto out;
> +
> +#ifdef CONFIG_PCI_PASID
> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> +			      &capabilities);
> +#endif

#ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
won't be set anyway.

and it should read from PCI_PASID_CTRL which indicates whether a
capability is actually enabled.

> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
> device.
> + * Zero width means no support for PASID.

also mention the encoding of this field according to PCIe spec.

or turn it to a plain number field.

> + */
> +struct vfio_device_feature_pasid {
> +	__u16 capabilities;
> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> +	__u8 width;
> +	__u8 __reserved;
> +};
> +#define VFIO_DEVICE_FEATURE_PASID 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
> 
>  /**
> --
> 2.34.1
Yi Liu Dec. 11, 2023, 8:08 a.m. UTC | #5
On 2023/12/7 16:47, Tian, Kevin wrote:
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Monday, November 27, 2023 2:39 PM
>>
>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>> +				       struct vfio_device_feature_pasid __user
>> *arg,
>> +				       size_t argsz)
>> +{
>> +	struct vfio_pci_core_device *vdev =
>> +		container_of(device, struct vfio_pci_core_device, vdev);
>> +	struct vfio_device_feature_pasid pasid = { 0 };
>> +	struct pci_dev *pdev = vdev->pdev;
>> +	u32 capabilities = 0;
>> +	int ret;
>> +
>> +	/* We do not support SET of the PASID capability */
> 
> this line alone is meaningless. Please explain the reason e.g. due to
> no PASID capability per VF...

sure. I think the major reason is we don't allow userspace to change the
PASID configuration. is it?

> 
>> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>> +				 sizeof(pasid));
>> +	if (ret != 1)
>> +		return ret;
>> +
>> +	/*
>> +	 * Needs go to PF if the device is VF as VF shares its PF's
>> +	 * PASID Capability.
>> +	 */
> 
> /* VF shares the PASID capability of its PF */

got it.

>> +	if (pdev->is_virtfn)
>> +		pdev = pci_physfn(pdev);
>> +
>> +	if (!pdev->pasid_enabled)
>> +		goto out;
>> +
>> +#ifdef CONFIG_PCI_PASID
>> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>> +			      &capabilities);
>> +#endif
> 
> #ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
> won't be set anyway.

it's sad that the pdev->pasid_cap is defined under #if CONFIG_PCI_PASID.
Perhaps we can have a wrapper for it.

> and it should read from PCI_PASID_CTRL which indicates whether a
> capability is actually enabled.

yes, for the EXEC and PRIV capability, needs to check if it's enabled or
not before reporting.

> 
>> +/**
>> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
>> device.
>> + * Zero width means no support for PASID.
> 
> also mention the encoding of this field according to PCIe spec.

yes.

> or turn it to a plain number field.

It is not exact the same as the spec since bit0 is reserved. But
here bit0 is used as well.

>> + */
>> +struct vfio_device_feature_pasid {
>> +	__u16 capabilities;
>> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
>> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
>> +	__u8 width;
>> +	__u8 __reserved;
>> +};
>> +#define VFIO_DEVICE_FEATURE_PASID 11
>> +
>>   /* -------- API for Type1 VFIO IOMMU -------- */
>>
>>   /**
>> --
>> 2.34.1
>
Alex Williamson Dec. 11, 2023, 6:03 p.m. UTC | #6
On Sun, 26 Nov 2023 22:39:09 -0800
Yi Liu <yi.l.liu@intel.com> wrote:

> This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
> hence userspace could probe PASID capability by it. This is a bit different
> with other capabilities which are reported to userspace when the user reads
> the device's PCI configuration space. There are two reasons for this.
> 
>  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>    config space to the guest as read-only, so adding PASID capability in the
>    vfio-pci config space will make it exposed to the guest automatically while
>    an old Qemu doesn't really support it.

Shouldn't we also be working on hiding the PASID capability in QEMU
ASAP?  This feature only allows QEMU to know PASID control is actually
available, not the guest.  Maybe we're hoping this is really only used
by VFs where there's no capability currently exposed to the guest?

>  - Second, PASID capability does not exit on VFs (instead shares the cap of

s/exit/exist/

>    the PF). Creating a virtual PASID capability in vfio-pci config space needs
>    to find a hole to place it, but doing so may require device specific
>    knowledge to avoid potential conflict with device specific registers like
>    hiden bits in VF config space. It's simpler by moving this burden to the
>    VMM instead of maintaining a quirk system in the kernel.

This feels a bit like an incomplete solution though and we might
already posses device specific knowledge in the form of a variant
driver.  Should this feature structure include a flag + field that
could serve to generically indicate to the VMM a location for
implementing the PASID capability?  The default core implementation
might fill this only for PFs where clearly an emualted PASID capability
can overlap the physical capability.  Thanks,

Alex

> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        | 13 +++++++++
>  2 files changed, 60 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 1929103ee59a..8038aa45500e 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
> +				       struct vfio_device_feature_pasid __user *arg,
> +				       size_t argsz)
> +{
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	struct vfio_device_feature_pasid pasid = { 0 };
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 capabilities = 0;
> +	int ret;
> +
> +	/* We do not support SET of the PASID capability */
> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(pasid));
> +	if (ret != 1)
> +		return ret;
> +
> +	/*
> +	 * Needs go to PF if the device is VF as VF shares its PF's
> +	 * PASID Capability.
> +	 */
> +	if (pdev->is_virtfn)
> +		pdev = pci_physfn(pdev);
> +
> +	if (!pdev->pasid_enabled)
> +		goto out;
> +
> +#ifdef CONFIG_PCI_PASID
> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> +			      &capabilities);
> +#endif
> +
> +	if (capabilities & PCI_PASID_CAP_EXEC)
> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
> +	if (capabilities & PCI_PASID_CAP_PRIV)
> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
> +
> +	pasid.width = (capabilities >> 8) & 0x1f;
> +
> +out:
> +	if (copy_to_user(arg, &pasid, sizeof(pasid)))
> +		return -EFAULT;
> +	return 0;
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PASID:
> +		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 495193629029..8326faf8622b 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
> + * Zero width means no support for PASID.
> + */
> +struct vfio_device_feature_pasid {
> +	__u16 capabilities;
> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> +	__u8 width;
> +	__u8 __reserved;
> +};
> +#define VFIO_DEVICE_FEATURE_PASID 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**
Jason Gunthorpe Dec. 11, 2023, 6:10 p.m. UTC | #7
On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> On Sun, 26 Nov 2023 22:39:09 -0800
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
> > hence userspace could probe PASID capability by it. This is a bit different
> > with other capabilities which are reported to userspace when the user reads
> > the device's PCI configuration space. There are two reasons for this.
> > 
> >  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
> >    config space to the guest as read-only, so adding PASID capability in the
> >    vfio-pci config space will make it exposed to the guest automatically while
> >    an old Qemu doesn't really support it.
> 
> Shouldn't we also be working on hiding the PASID capability in QEMU
> ASAP?  This feature only allows QEMU to know PASID control is actually
> available, not the guest.  Maybe we're hoping this is really only used
> by VFs where there's no capability currently exposed to the guest?

Makes sense, yes

> >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> >    to find a hole to place it, but doing so may require device specific
> >    knowledge to avoid potential conflict with device specific registers like
> >    hiden bits in VF config space. It's simpler by moving this burden to the
> >    VMM instead of maintaining a quirk system in the kernel.
> 
> This feels a bit like an incomplete solution though and we might
> already posses device specific knowledge in the form of a variant
> driver.  Should this feature structure include a flag + field that
> could serve to generically indicate to the VMM a location for
> implementing the PASID capability?  The default core implementation
> might fill this only for PFs where clearly an emualted PASID capability
> can overlap the physical capability.  Thanks,

In many ways I would perfer to solve this for good by having a way to
learn a range of available config space - I liked the suggestion to
use a DVSEC to mark empty space.

Jason
Alex Williamson Dec. 11, 2023, 6:49 p.m. UTC | #8
On Mon, 11 Dec 2023 14:10:28 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> > On Sun, 26 Nov 2023 22:39:09 -0800
> > Yi Liu <yi.l.liu@intel.com> wrote:

> > >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> > >    to find a hole to place it, but doing so may require device specific
> > >    knowledge to avoid potential conflict with device specific registers like
> > >    hiden bits in VF config space. It's simpler by moving this burden to the
> > >    VMM instead of maintaining a quirk system in the kernel.  
> > 
> > This feels a bit like an incomplete solution though and we might
> > already posses device specific knowledge in the form of a variant
> > driver.  Should this feature structure include a flag + field that
> > could serve to generically indicate to the VMM a location for
> > implementing the PASID capability?  The default core implementation
> > might fill this only for PFs where clearly an emualted PASID capability
> > can overlap the physical capability.  Thanks,  
> 
> In many ways I would perfer to solve this for good by having a way to
> learn a range of available config space - I liked the suggestion to
> use a DVSEC to mark empty space.

Yes, DVSEC is the most plausible option for the device itself to convey
unused config space, but that requires hardware adoption so presumably
we're going to need to fill the gaps with device specific code.  That
code might live in a variant driver or in the VMM.  If we have faith
that DVSEC is the way, it'd make sense for a variant driver to
implement a virtual DVSEC to work out the QEMU implementation and set a
precedent.

I mostly just want us to recognize that this feature structure also has
the possibility to fill this gap and we're consciously passing it over
and should maybe formally propose the DVSEC solution and reference it
in the commit log or comments here to provide a complete picture.
Thanks,

Alex
Tian, Kevin Dec. 12, 2023, 2:16 a.m. UTC | #9
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Tuesday, December 12, 2023 2:04 AM
> 
> On Sun, 26 Nov 2023 22:39:09 -0800
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This reports the PASID capability data to userspace via
> VFIO_DEVICE_FEATURE,
> > hence userspace could probe PASID capability by it. This is a bit different
> > with other capabilities which are reported to userspace when the user
> reads
> > the device's PCI configuration space. There are two reasons for this.
> >
> >  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
> >    config space to the guest as read-only, so adding PASID capability in the
> >    vfio-pci config space will make it exposed to the guest automatically while
> >    an old Qemu doesn't really support it.
> 
> Shouldn't we also be working on hiding the PASID capability in QEMU
> ASAP?  This feature only allows QEMU to know PASID control is actually
> available, not the guest.  Maybe we're hoping this is really only used
> by VFs where there's no capability currently exposed to the guest?

We expect this to be used by both PF/VF. It doesn't make sense to have
separate interfaces between them.

I'm not aware of that the PASID capability has been exported today. So
yes we should fix QEMU asap. and also remove the line exposing it
in vfio_pci_config.c.

> 
> >  - Second, PASID capability does not exit on VFs (instead shares the cap of
> 
> s/exit/exist/
> 
> >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> >    to find a hole to place it, but doing so may require device specific
> >    knowledge to avoid potential conflict with device specific registers like
> >    hiden bits in VF config space. It's simpler by moving this burden to the
> >    VMM instead of maintaining a quirk system in the kernel.
> 
> This feels a bit like an incomplete solution though and we might
> already posses device specific knowledge in the form of a variant
> driver.  Should this feature structure include a flag + field that
> could serve to generically indicate to the VMM a location for
> implementing the PASID capability?  The default core implementation
> might fill this only for PFs where clearly an emualted PASID capability
> can overlap the physical capability.  Thanks,
> 

make sense
Tian, Kevin Dec. 12, 2023, 2:20 a.m. UTC | #10
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, December 11, 2023 4:08 PM
> 
> On 2023/12/7 16:47, Tian, Kevin wrote:
> >> From: Liu, Yi L <yi.l.liu@intel.com>
> >> Sent: Monday, November 27, 2023 2:39 PM
> >>
> >> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
> flags,
> >> +				       struct vfio_device_feature_pasid __user
> >> *arg,
> >> +				       size_t argsz)
> >> +{
> >> +	struct vfio_pci_core_device *vdev =
> >> +		container_of(device, struct vfio_pci_core_device, vdev);
> >> +	struct vfio_device_feature_pasid pasid = { 0 };
> >> +	struct pci_dev *pdev = vdev->pdev;
> >> +	u32 capabilities = 0;
> >> +	int ret;
> >> +
> >> +	/* We do not support SET of the PASID capability */
> >
> > this line alone is meaningless. Please explain the reason e.g. due to
> > no PASID capability per VF...
> 
> sure. I think the major reason is we don't allow userspace to change the
> PASID configuration. is it?

if only PF it's still possible to develop a model allowing userspace to
change.

but with VF this is not possible in concept.

> >> +	if (pdev->is_virtfn)
> >> +		pdev = pci_physfn(pdev);
> >> +
> >> +	if (!pdev->pasid_enabled)
> >> +		goto out;
> >> +
> >> +#ifdef CONFIG_PCI_PASID
> >> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> >> +			      &capabilities);
> >> +#endif
> >
> > #ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
> > won't be set anyway.
> 
> it's sad that the pdev->pasid_cap is defined under #if CONFIG_PCI_PASID.
> Perhaps we can have a wrapper for it.

oh I didn't note it.

> 
> > and it should read from PCI_PASID_CTRL which indicates whether a
> > capability is actually enabled.
> 
> yes, for the EXEC and PRIV capability, needs to check if it's enabled or
> not before reporting.
> 
> >
> >> +/**
> >> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
> >> device.
> >> + * Zero width means no support for PASID.
> >
> > also mention the encoding of this field according to PCIe spec.
> 
> yes.
> 
> > or turn it to a plain number field.
> 
> It is not exact the same as the spec since bit0 is reserved. But
> here bit0 is used as well.
> 

what is bit0 used for?
Duan, Zhenzhong Dec. 12, 2023, 2:43 a.m. UTC | #11
>-----Original Message-----
>From: Alex Williamson <alex.williamson@redhat.com>
>Sent: Tuesday, December 12, 2023 2:04 AM
>Subject: Re: [PATCH 3/3] vfio: Report PASID capability via
>VFIO_DEVICE_FEATURE ioctl
>
>On Sun, 26 Nov 2023 22:39:09 -0800
>Yi Liu <yi.l.liu@intel.com> wrote:
>
>> This reports the PASID capability data to userspace via
>VFIO_DEVICE_FEATURE,
>> hence userspace could probe PASID capability by it. This is a bit different
>> with other capabilities which are reported to userspace when the user
>reads
>> the device's PCI configuration space. There are two reasons for this.
>>
>>  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>    config space to the guest as read-only, so adding PASID capability in the
>>    vfio-pci config space will make it exposed to the guest automatically
>while
>>    an old Qemu doesn't really support it.
>
>Shouldn't we also be working on hiding the PASID capability in QEMU
>ASAP?  This feature only allows QEMU to know PASID control is actually
>available, not the guest.  Maybe we're hoping this is really only used
>by VFs where there's no capability currently exposed to the guest?

PASID capability is not exposed to QEMU through config space,
VFIO_DEVICE_FEATURE ioctl is the only interface to expose PASID
cap to QEMU for both PF and VF.

/*
 * Lengths of PCIe/PCI-X Extended Config Capabilities
 *   0: Removed or masked from the user visible capability list
 *   FF: Variable length
 */
static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
...
        [PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
}

Thanks
Zhenzhong
Yi Liu Dec. 12, 2023, 3:26 a.m. UTC | #12
On 2023/12/12 10:20, Tian, Kevin wrote:
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Sent: Monday, December 11, 2023 4:08 PM
>>
>> On 2023/12/7 16:47, Tian, Kevin wrote:
>>>> From: Liu, Yi L <yi.l.liu@intel.com>
>>>> Sent: Monday, November 27, 2023 2:39 PM
>>>>
>>>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
>> flags,
>>>> +				       struct vfio_device_feature_pasid __user
>>>> *arg,
>>>> +				       size_t argsz)
>>>> +{
>>>> +	struct vfio_pci_core_device *vdev =
>>>> +		container_of(device, struct vfio_pci_core_device, vdev);
>>>> +	struct vfio_device_feature_pasid pasid = { 0 };
>>>> +	struct pci_dev *pdev = vdev->pdev;
>>>> +	u32 capabilities = 0;
>>>> +	int ret;
>>>> +
>>>> +	/* We do not support SET of the PASID capability */
>>>
>>> this line alone is meaningless. Please explain the reason e.g. due to
>>> no PASID capability per VF...
>>
>> sure. I think the major reason is we don't allow userspace to change the
>> PASID configuration. is it?
> 
> if only PF it's still possible to develop a model allowing userspace to
> change.
> 
> but with VF this is not possible in concept.

got it.

> 
>>>> +	if (pdev->is_virtfn)
>>>> +		pdev = pci_physfn(pdev);
>>>> +
>>>> +	if (!pdev->pasid_enabled)
>>>> +		goto out;
>>>> +
>>>> +#ifdef CONFIG_PCI_PASID
>>>> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
>>>> +			      &capabilities);
>>>> +#endif
>>>
>>> #ifdef is unnecessary. If CONFIG_PCI_PASID is false pdev->pasid_enabled
>>> won't be set anyway.
>>
>> it's sad that the pdev->pasid_cap is defined under #if CONFIG_PCI_PASID.
>> Perhaps we can have a wrapper for it.
> 
> oh I didn't note it.

If Alex feels better to have a wrapper, we may have one.

>>
>>> and it should read from PCI_PASID_CTRL which indicates whether a
>>> capability is actually enabled.
>>
>> yes, for the EXEC and PRIV capability, needs to check if it's enabled or
>> not before reporting.
>>
>>>
>>>> +/**
>>>> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the
>>>> device.
>>>> + * Zero width means no support for PASID.
>>>
>>> also mention the encoding of this field according to PCIe spec.
>>
>> yes.
>>
>>> or turn it to a plain number field.
>>
>> It is not exact the same as the spec since bit0 is reserved. But
>> here bit0 is used as well.
>>
> 
> what is bit0 used for?

it's just been reserved. No usage is mentioned in the latest spec. I don't
know the background neither.
Alex Williamson Dec. 12, 2023, 3:39 a.m. UTC | #13
On Tue, 12 Dec 2023 02:43:20 +0000
"Duan, Zhenzhong" <zhenzhong.duan@intel.com> wrote:

> >-----Original Message-----
> >From: Alex Williamson <alex.williamson@redhat.com>
> >Sent: Tuesday, December 12, 2023 2:04 AM
> >Subject: Re: [PATCH 3/3] vfio: Report PASID capability via
> >VFIO_DEVICE_FEATURE ioctl
> >
> >On Sun, 26 Nov 2023 22:39:09 -0800
> >Yi Liu <yi.l.liu@intel.com> wrote:
> >  
> >> This reports the PASID capability data to userspace via  
> >VFIO_DEVICE_FEATURE,  
> >> hence userspace could probe PASID capability by it. This is a bit different
> >> with other capabilities which are reported to userspace when the user  
> >reads  
> >> the device's PCI configuration space. There are two reasons for this.
> >>
> >>  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
> >>    config space to the guest as read-only, so adding PASID capability in the
> >>    vfio-pci config space will make it exposed to the guest automatically  
> >while  
> >>    an old Qemu doesn't really support it.  
> >
> >Shouldn't we also be working on hiding the PASID capability in QEMU
> >ASAP?  This feature only allows QEMU to know PASID control is actually
> >available, not the guest.  Maybe we're hoping this is really only used
> >by VFs where there's no capability currently exposed to the guest?  
> 
> PASID capability is not exposed to QEMU through config space,
> VFIO_DEVICE_FEATURE ioctl is the only interface to expose PASID
> cap to QEMU for both PF and VF.
> 
> /*
>  * Lengths of PCIe/PCI-X Extended Config Capabilities
>  *   0: Removed or masked from the user visible capability list
>  *   FF: Variable length
>  */
> static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
> ...
>         [PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
> }

Ah, thanks.  The comment made me think is was already exposed and I
didn't double check.  So we really just want to convey the information
of the PASID capability outside of config space because if we pass the
capability itself existing userspace will blindly expose a read-only
version to the guest.  That could be better explained in the commit log
and comments.

So how do we keep up with PCIe spec updates relative to the PASID
capability with this proposal?  Would it make more sense to report the
raw capability register and capability version rather that a translated
copy thereof?  Perhaps just masking the fields we're currently prepared
to expose.  Thanks,

Alex
Yi Liu Dec. 12, 2023, 3:44 a.m. UTC | #14
On 2023/12/12 10:16, Tian, Kevin wrote:
>> From: Alex Williamson <alex.williamson@redhat.com>
>> Sent: Tuesday, December 12, 2023 2:04 AM
>>
>> On Sun, 26 Nov 2023 22:39:09 -0800
>> Yi Liu <yi.l.liu@intel.com> wrote:
>>
>>> This reports the PASID capability data to userspace via
>> VFIO_DEVICE_FEATURE,
>>> hence userspace could probe PASID capability by it. This is a bit different
>>> with other capabilities which are reported to userspace when the user
>> reads
>>> the device's PCI configuration space. There are two reasons for this.
>>>
>>>   - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>>     config space to the guest as read-only, so adding PASID capability in the
>>>     vfio-pci config space will make it exposed to the guest automatically while
>>>     an old Qemu doesn't really support it.
>>
>> Shouldn't we also be working on hiding the PASID capability in QEMU
>> ASAP?  This feature only allows QEMU to know PASID control is actually
>> available, not the guest.  Maybe we're hoping this is really only used
>> by VFs where there's no capability currently exposed to the guest?
> 
> We expect this to be used by both PF/VF. It doesn't make sense to have
> separate interfaces between them.
> 
> I'm not aware of that the PASID capability has been exported today. So
> yes we should fix QEMU asap. and also remove the line exposing it
> in vfio_pci_config.c.

Kernel side hides the PASID capability by setting its length as 0
in the below array. As a result, QEMU wont see it in the cap chain.
Do you mean we need to let QEMU always ignore it even if kernel side
does not hide it?

static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
	...
	[PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
	...
};

So far, kernel is still hiding it.

> 
>>
>>>   - Second, PASID capability does not exit on VFs (instead shares the cap of
>>
>> s/exit/exist/
>>
>>>     the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>>     to find a hole to place it, but doing so may require device specific
>>>     knowledge to avoid potential conflict with device specific registers like
>>>     hiden bits in VF config space. It's simpler by moving this burden to the
>>>     VMM instead of maintaining a quirk system in the kernel.
>>
>> This feels a bit like an incomplete solution though and we might
>> already posses device specific knowledge in the form of a variant
>> driver.  Should this feature structure include a flag + field that
>> could serve to generically indicate to the VMM a location for
>> implementing the PASID capability?  The default core implementation
>> might fill this only for PFs where clearly an emualted PASID capability
>> can overlap the physical capability.  Thanks,
>>
> 
> make sense

A location maybe not enough, may also need to know if any successive
cap, so that we can insert the capability into the cap chain.
Yi Liu Dec. 12, 2023, 3:53 a.m. UTC | #15
On 2023/12/12 11:39, Alex Williamson wrote:
> On Tue, 12 Dec 2023 02:43:20 +0000
> "Duan, Zhenzhong" <zhenzhong.duan@intel.com> wrote:
> 
>>> -----Original Message-----
>>> From: Alex Williamson <alex.williamson@redhat.com>
>>> Sent: Tuesday, December 12, 2023 2:04 AM
>>> Subject: Re: [PATCH 3/3] vfio: Report PASID capability via
>>> VFIO_DEVICE_FEATURE ioctl
>>>
>>> On Sun, 26 Nov 2023 22:39:09 -0800
>>> Yi Liu <yi.l.liu@intel.com> wrote:
>>>   
>>>> This reports the PASID capability data to userspace via
>>> VFIO_DEVICE_FEATURE,
>>>> hence userspace could probe PASID capability by it. This is a bit different
>>>> with other capabilities which are reported to userspace when the user
>>> reads
>>>> the device's PCI configuration space. There are two reasons for this.
>>>>
>>>>   - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>>>     config space to the guest as read-only, so adding PASID capability in the
>>>>     vfio-pci config space will make it exposed to the guest automatically
>>> while
>>>>     an old Qemu doesn't really support it.
>>>
>>> Shouldn't we also be working on hiding the PASID capability in QEMU
>>> ASAP?  This feature only allows QEMU to know PASID control is actually
>>> available, not the guest.  Maybe we're hoping this is really only used
>>> by VFs where there's no capability currently exposed to the guest?
>>
>> PASID capability is not exposed to QEMU through config space,
>> VFIO_DEVICE_FEATURE ioctl is the only interface to expose PASID
>> cap to QEMU for both PF and VF.
>>
>> /*
>>   * Lengths of PCIe/PCI-X Extended Config Capabilities
>>   *   0: Removed or masked from the user visible capability list
>>   *   FF: Variable length
>>   */
>> static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
>> ...
>>          [PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
>> }
> 
> Ah, thanks.  The comment made me think is was already exposed and I
> didn't double check.  So we really just want to convey the information
> of the PASID capability outside of config space because if we pass the
> capability itself existing userspace will blindly expose a read-only
> version to the guest.  That could be better explained in the commit log
> and comments.

aha, yes. It was mentioned there, but seems not quite clear. Will refine. :)

  - First, Qemu by default exposes all available PCI capabilities in vfio-pci
    config space to the guest as read-only, so adding PASID capability in the
    vfio-pci config space will make it exposed to the guest automatically while
    an old Qemu doesn't really support it.


> So how do we keep up with PCIe spec updates relative to the PASID
> capability with this proposal?  Would it make more sense to report the
> raw capability register and capability version rather that a translated
> copy thereof?  Perhaps just masking the fields we're currently prepared
> to expose.  Thanks,

I have a minor concern on reporting raw capability register and capability
version. In this way, an old host kernel (supports version 1 pasid cap)
running on top of new hw which supports say version 2 pasid capability, the
VM would see the new capabilities that host kernel does not know. Is it
good?
Jason Gunthorpe Dec. 12, 2023, 3:27 p.m. UTC | #16
On Mon, Dec 11, 2023 at 08:39:46PM -0700, Alex Williamson wrote:

> So how do we keep up with PCIe spec updates relative to the PASID
> capability with this proposal?  Would it make more sense to report the
> raw capability register and capability version rather that a translated
> copy thereof?  Perhaps just masking the fields we're currently prepared
> to expose. 

I think the VMM must always create a cap based on the PCIe version it
understands. We don't know what future specs will put there so it
seems risky to forward it if we don't know that any possible
hypervisor support is present.

We have this problem on and off where stuff in PCI config space needs
explicit hypervisor support or it doesn't work in the VM and things
get confusing.

Jason
Jason Gunthorpe Dec. 12, 2023, 3:31 p.m. UTC | #17
On Tue, Dec 12, 2023 at 02:20:01AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Monday, December 11, 2023 4:08 PM
> > 
> > On 2023/12/7 16:47, Tian, Kevin wrote:
> > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > >> Sent: Monday, November 27, 2023 2:39 PM
> > >>
> > >> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
> > flags,
> > >> +				       struct vfio_device_feature_pasid __user
> > >> *arg,
> > >> +				       size_t argsz)
> > >> +{
> > >> +	struct vfio_pci_core_device *vdev =
> > >> +		container_of(device, struct vfio_pci_core_device, vdev);
> > >> +	struct vfio_device_feature_pasid pasid = { 0 };
> > >> +	struct pci_dev *pdev = vdev->pdev;
> > >> +	u32 capabilities = 0;
> > >> +	int ret;
> > >> +
> > >> +	/* We do not support SET of the PASID capability */
> > >
> > > this line alone is meaningless. Please explain the reason e.g. due to
> > > no PASID capability per VF...
> > 
> > sure. I think the major reason is we don't allow userspace to change the
> > PASID configuration. is it?
> 
> if only PF it's still possible to develop a model allowing userspace to
> change.

More importantly the primary purpose of setting the PASID width is
because of the physical properties of the IOMMU HW.

IOMMU HW that supports virtualization should do so in a way that the
PASID with can be globally set to some value the hypervisor is aware
the HW can decode in all cases.

The VM should have no way to make the HW ignore (vs check for zero)
upper bits of the PASID that would require the physical PASID bits to
be reduced.

So we should never allow programming of this, VMM just fakes it and
ignores sets.

Similar argument for enable, IOMMU HW supporting virtualization should
always be able to decode PASID and reject PASID TLPs if the VM hasn't
configured the vIOMMU to decode them. The purpose of the disable bit
is to accommodate IOMMU HW that cannot decode the PASID TLP at all and
would become confused.

Jason
Jason Gunthorpe Dec. 12, 2023, 3:35 p.m. UTC | #18
On Mon, Dec 11, 2023 at 11:49:49AM -0700, Alex Williamson wrote:
> On Mon, 11 Dec 2023 14:10:28 -0400
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> > > On Sun, 26 Nov 2023 22:39:09 -0800
> > > Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > > >    the PF). Creating a virtual PASID capability in vfio-pci config space needs
> > > >    to find a hole to place it, but doing so may require device specific
> > > >    knowledge to avoid potential conflict with device specific registers like
> > > >    hiden bits in VF config space. It's simpler by moving this burden to the
> > > >    VMM instead of maintaining a quirk system in the kernel.  
> > > 
> > > This feels a bit like an incomplete solution though and we might
> > > already posses device specific knowledge in the form of a variant
> > > driver.  Should this feature structure include a flag + field that
> > > could serve to generically indicate to the VMM a location for
> > > implementing the PASID capability?  The default core implementation
> > > might fill this only for PFs where clearly an emualted PASID capability
> > > can overlap the physical capability.  Thanks,  
> > 
> > In many ways I would perfer to solve this for good by having a way to
> > learn a range of available config space - I liked the suggestion to
> > use a DVSEC to mark empty space.
> 
> Yes, DVSEC is the most plausible option for the device itself to convey
> unused config space, but that requires hardware adoption so presumably
> we're going to need to fill the gaps with device specific code.  That
> code might live in a variant driver or in the VMM.  If we have faith
> that DVSEC is the way, it'd make sense for a variant driver to
> implement a virtual DVSEC to work out the QEMU implementation and set a
> precedent.

How hard do you think it would be for the kernel to synthesize the
dvsec if the varient driver can provide a range for it?

On the other hand I'm not so keen on having variant drivers that are
only doing this just to avoid a table in qemu :\ It seems like a
reasonable thing to add to existing drivers, though none of them
support PASID yet..

> I mostly just want us to recognize that this feature structure also has
> the possibility to fill this gap and we're consciously passing it over
> and should maybe formally propose the DVSEC solution and reference it
> in the commit log or comments here to provide a complete picture.

You mean by passing an explicit empty range or something in a feature
IOCTL?

Jason
Tian, Kevin Dec. 13, 2023, 1:59 a.m. UTC | #19
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, December 12, 2023 11:32 PM
> 
> On Tue, Dec 12, 2023 at 02:20:01AM +0000, Tian, Kevin wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Monday, December 11, 2023 4:08 PM
> > >
> > > On 2023/12/7 16:47, Tian, Kevin wrote:
> > > >> From: Liu, Yi L <yi.l.liu@intel.com>
> > > >> Sent: Monday, November 27, 2023 2:39 PM
> > > >>
> > > >> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32
> > > flags,
> > > >> +				       struct vfio_device_feature_pasid __user
> > > >> *arg,
> > > >> +				       size_t argsz)
> > > >> +{
> > > >> +	struct vfio_pci_core_device *vdev =
> > > >> +		container_of(device, struct vfio_pci_core_device, vdev);
> > > >> +	struct vfio_device_feature_pasid pasid = { 0 };
> > > >> +	struct pci_dev *pdev = vdev->pdev;
> > > >> +	u32 capabilities = 0;
> > > >> +	int ret;
> > > >> +
> > > >> +	/* We do not support SET of the PASID capability */
> > > >
> > > > this line alone is meaningless. Please explain the reason e.g. due to
> > > > no PASID capability per VF...
> > >
> > > sure. I think the major reason is we don't allow userspace to change the
> > > PASID configuration. is it?
> >
> > if only PF it's still possible to develop a model allowing userspace to
> > change.
> 
> More importantly the primary purpose of setting the PASID width is
> because of the physical properties of the IOMMU HW.
> 
> IOMMU HW that supports virtualization should do so in a way that the
> PASID with can be globally set to some value the hypervisor is aware
> the HW can decode in all cases.
> 
> The VM should have no way to make the HW ignore (vs check for zero)
> upper bits of the PASID that would require the physical PASID bits to
> be reduced.
> 
> So we should never allow programming of this, VMM just fakes it and
> ignores sets.

PASID width is read-only so certainly sets should be ignored

> 
> Similar argument for enable, IOMMU HW supporting virtualization should
> always be able to decode PASID and reject PASID TLPs if the VM hasn't
> configured the vIOMMU to decode them. The purpose of the disable bit
> is to accommodate IOMMU HW that cannot decode the PASID TLP at all and
> would become confused.
> 

Yes, this explains why disallowing userspace to change doesn't cause
problem in this series. My earlier point was just that allowing userspace
to change could be implemented for PF (though unnecessary with your
explanation) to mimic the hardware behavior.
Tian, Kevin Dec. 13, 2023, 2:10 a.m. UTC | #20
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, December 12, 2023 11:35 PM
> 
> On Mon, Dec 11, 2023 at 11:49:49AM -0700, Alex Williamson wrote:
> > On Mon, 11 Dec 2023 14:10:28 -0400
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > > On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
> > > > On Sun, 26 Nov 2023 22:39:09 -0800
> > > > Yi Liu <yi.l.liu@intel.com> wrote:
> >
> > > > >    the PF). Creating a virtual PASID capability in vfio-pci config space
> needs
> > > > >    to find a hole to place it, but doing so may require device specific
> > > > >    knowledge to avoid potential conflict with device specific registers
> like
> > > > >    hiden bits in VF config space. It's simpler by moving this burden to
> the
> > > > >    VMM instead of maintaining a quirk system in the kernel.
> > > >
> > > > This feels a bit like an incomplete solution though and we might
> > > > already posses device specific knowledge in the form of a variant
> > > > driver.  Should this feature structure include a flag + field that
> > > > could serve to generically indicate to the VMM a location for
> > > > implementing the PASID capability?  The default core implementation
> > > > might fill this only for PFs where clearly an emualted PASID capability
> > > > can overlap the physical capability.  Thanks,
> > >
> > > In many ways I would perfer to solve this for good by having a way to
> > > learn a range of available config space - I liked the suggestion to
> > > use a DVSEC to mark empty space.
> >
> > Yes, DVSEC is the most plausible option for the device itself to convey
> > unused config space, but that requires hardware adoption so presumably
> > we're going to need to fill the gaps with device specific code.  That
> > code might live in a variant driver or in the VMM.  If we have faith
> > that DVSEC is the way, it'd make sense for a variant driver to
> > implement a virtual DVSEC to work out the QEMU implementation and set
> a
> > precedent.
> 
> How hard do you think it would be for the kernel to synthesize the
> dvsec if the varient driver can provide a range for it?
> 
> On the other hand I'm not so keen on having variant drivers that are
> only doing this just to avoid a table in qemu :\ It seems like a

me too. If we really want something like this I'd prefer to tracking a
table of device specific ranges instead of requesting full-fledged
variant drivers.
Yi Liu Jan. 15, 2024, 8:20 a.m. UTC | #21
On 2023/12/12 23:27, Jason Gunthorpe wrote:
> On Mon, Dec 11, 2023 at 08:39:46PM -0700, Alex Williamson wrote:
> 
>> So how do we keep up with PCIe spec updates relative to the PASID
>> capability with this proposal?  Would it make more sense to report the
>> raw capability register and capability version rather that a translated
>> copy thereof?  Perhaps just masking the fields we're currently prepared
>> to expose.
> 
> I think the VMM must always create a cap based on the PCIe version it
> understands. We don't know what future specs will put there so it
> seems risky to forward it if we don't know that any possible
> hypervisor support is present.

This series parses the capability register and reports the known caps
to user. While this does not include the version number, userspace should
decide the proper version number. Seems like what you suggests here.

> 
> We have this problem on and off where stuff in PCI config space needs
> explicit hypervisor support or it doesn't work in the VM and things
> get confusing.
> 
> Jason
Yi Liu Jan. 15, 2024, 9:49 a.m. UTC | #22
On 2023/12/12 23:35, Jason Gunthorpe wrote:
> On Mon, Dec 11, 2023 at 11:49:49AM -0700, Alex Williamson wrote:
>> On Mon, 11 Dec 2023 14:10:28 -0400
>> Jason Gunthorpe <jgg@nvidia.com> wrote:
>>
>>> On Mon, Dec 11, 2023 at 11:03:45AM -0700, Alex Williamson wrote:
>>>> On Sun, 26 Nov 2023 22:39:09 -0800
>>>> Yi Liu <yi.l.liu@intel.com> wrote:
>>
>>>>>     the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>>>>     to find a hole to place it, but doing so may require device specific
>>>>>     knowledge to avoid potential conflict with device specific registers like
>>>>>     hiden bits in VF config space. It's simpler by moving this burden to the
>>>>>     VMM instead of maintaining a quirk system in the kernel.
>>>>
>>>> This feels a bit like an incomplete solution though and we might
>>>> already posses device specific knowledge in the form of a variant
>>>> driver.  Should this feature structure include a flag + field that
>>>> could serve to generically indicate to the VMM a location for
>>>> implementing the PASID capability?  The default core implementation
>>>> might fill this only for PFs where clearly an emualted PASID capability
>>>> can overlap the physical capability.  Thanks,
>>>
>>> In many ways I would perfer to solve this for good by having a way to
>>> learn a range of available config space - I liked the suggestion to
>>> use a DVSEC to mark empty space.
>>
>> Yes, DVSEC is the most plausible option for the device itself to convey
>> unused config space, but that requires hardware adoption so presumably
>> we're going to need to fill the gaps with device specific code.  That
>> code might live in a variant driver or in the VMM.  If we have faith
>> that DVSEC is the way, it'd make sense for a variant driver to
>> implement a virtual DVSEC to work out the QEMU implementation and set a
>> precedent.
> 
> How hard do you think it would be for the kernel to synthesize the
> dvsec if the varient driver can provide a range for it?
> 
> On the other hand I'm not so keen on having variant drivers that are
> only doing this just to avoid a table in qemu :\ It seems like a
> reasonable thing to add to existing drivers, though none of them
> support PASID yet..
> 
>> I mostly just want us to recognize that this feature structure also has
>> the possibility to fill this gap and we're consciously passing it over
>> and should maybe formally propose the DVSEC solution and reference it
>> in the commit log or comments here to provide a complete picture.
> 
> You mean by passing an explicit empty range or something in a feature
> IOCTL?

Hi Alex,

Any more suggestion on this? It appears to me that you are fine with PF
to implement the virtual PASID capability in the same offset with physical
PASID capability, while other cases need a way to know where to put the
virtual PASID capability. This may be done by a DVSEC or just pass empty
ranges through the VFIO_DEVICE_FEATURE ioctl?

Regards,
Yi Liu
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1929103ee59a..8038aa45500e 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1495,6 +1495,51 @@  static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
 	return 0;
 }
 
+static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
+				       struct vfio_device_feature_pasid __user *arg,
+				       size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	struct vfio_device_feature_pasid pasid = { 0 };
+	struct pci_dev *pdev = vdev->pdev;
+	u32 capabilities = 0;
+	int ret;
+
+	/* We do not support SET of the PASID capability */
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(pasid));
+	if (ret != 1)
+		return ret;
+
+	/*
+	 * Needs go to PF if the device is VF as VF shares its PF's
+	 * PASID Capability.
+	 */
+	if (pdev->is_virtfn)
+		pdev = pci_physfn(pdev);
+
+	if (!pdev->pasid_enabled)
+		goto out;
+
+#ifdef CONFIG_PCI_PASID
+	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
+			      &capabilities);
+#endif
+
+	if (capabilities & PCI_PASID_CAP_EXEC)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
+	if (capabilities & PCI_PASID_CAP_PRIV)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
+
+	pasid.width = (capabilities >> 8) & 0x1f;
+
+out:
+	if (copy_to_user(arg, &pasid, sizeof(pasid)))
+		return -EFAULT;
+	return 0;
+}
+
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz)
 {
@@ -1508,6 +1553,8 @@  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_PASID:
+		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 495193629029..8326faf8622b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1512,6 +1512,19 @@  struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
+ * Zero width means no support for PASID.
+ */
+struct vfio_device_feature_pasid {
+	__u16 capabilities;
+#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
+#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
+	__u8 width;
+	__u8 __reserved;
+};
+#define VFIO_DEVICE_FEATURE_PASID 11
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**