diff mbox series

[v2,4/4] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl

Message ID 20240412082121.33382-5-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series vfio-pci support pasid attach/detach | expand

Commit Message

Yi Liu April 12, 2024, 8:21 a.m. UTC
Today, vfio-pci hides the PASID capability of devices from userspace. Unlike
other PCI capabilities, the PASID capability is going to be reported to the user
via VFIO_DEVICE_FEATURE, so userspace can probe the PASID capability through it.
This is a bit different from the other capabilities which are reported to
userspace when the user reads the device's PCI configuration space. There
are two reasons for this.

 - First, userspace like Qemu by default exposes all the available PCI
   capabilities in vfio-pci config space to the guest as read-only, so
   adding PASID capability in the vfio-pci config space will make it
   exposed to the guest automatically while an old Qemu doesn't really
   support it.

 - Second, the PASID capability does not exist on VFs (instead shares the
   cap of the PF). Creating a virtual PASID capability in vfio-pci config
   space needs to find a hole to place it, but doing so may require device
   specific knowledge to avoid potential conflict with device specific
   registers like hidden bits in VF's config space. It's simpler to move
   this burden to the VMM instead of maintaining a quirk system in the kernel.

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 50 ++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        | 14 +++++++++
 2 files changed, 64 insertions(+)

Comments

Tian, Kevin April 16, 2024, 9:40 a.m. UTC | #1
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Friday, April 12, 2024 4:21 PM
> 
> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
> +				       struct vfio_device_feature_pasid __user
> *arg,
> +				       size_t argsz)
> +{
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	struct vfio_device_feature_pasid pasid = { 0 };
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 capabilities = 0;
> +	u16 ctrl = 0;
> +	int ret;
> +
> +	/*
> +	 * Due to no PASID capability per VF, to be consistent, we do not
> +	 * support SET of the PASID capability for both PF and VF.
> +	 */

/* Disallow SET of the PASID capability given it is shared by all VFs
 * and configured implicitly by the IOMMU driver.
 */

> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(pasid));
> +	if (ret != 1)
> +		return ret;
> +
> +	/* VF shares the PASID capability of its PF */
> +	if (pdev->is_virtfn)
> +		pdev = pci_physfn(pdev);
> +
> +	if (!pdev->pasid_enabled)
> +		goto out;
> +
> +#ifdef CONFIG_PCI_PASID
> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> +			      &capabilities);
> +	pci_read_config_word(pdev, pdev->pasid_cap + PCI_PASID_CTRL,
> +			     &ctrl);
> +#endif
> +
> +	pasid.width = (capabilities >> 8) & 0x1f;

it's cleaner to have helpers instead of directly checking CONFIG_PCI_XXX here.

there is an existing helper for the width: pci_max_pasids()

pci_pasid_features() can report supported features but not the actual
enabled set.

The enabled features are already stored in pdev->pasid_features, so what's
required here is probably a new pci_pasid_enabled_features() helper to return
that field.
Alex Williamson April 16, 2024, 5:57 p.m. UTC | #2
On Fri, 12 Apr 2024 01:21:21 -0700
Yi Liu <yi.l.liu@intel.com> wrote:

> Today, vfio-pci hides the PASID capability of devices from userspace. Unlike
> other PCI capabilities, PASID capability is going to be reported to user by
> VFIO_DEVICE_FEATURE. Hence userspace could probe PASID capability by it.
> This is a bit different from the other capabilities which are reported to
> userspace when the user reads the device's PCI configuration space. There
> are two reasons for this.
> 
>  - First, userspace like Qemu by default exposes all the available PCI
>    capabilities in vfio-pci config space to the guest as read-only, so
>    adding PASID capability in the vfio-pci config space will make it
>    exposed to the guest automatically while an old Qemu doesn't really
>    support it.
> 
>  - Second, the PASID capability does not exist on VFs (instead shares the
>    cap of the PF). Creating a virtual PASID capability in vfio-pci config
>    space needs to find a hole to place it, but doing so may require device
>    specific knowledge to avoid potential conflict with device specific
>    registers like hidden bits in VF's config space. It's simpler to move
>    this burden to the VMM instead of maintaining a quirk system in the kernel.
> 
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 50 ++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        | 14 +++++++++
>  2 files changed, 64 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index d94d61b92c1a..ca64e461d435 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1495,6 +1495,54 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
> +				       struct vfio_device_feature_pasid __user *arg,
> +				       size_t argsz)
> +{
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	struct vfio_device_feature_pasid pasid = { 0 };
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 capabilities = 0;
> +	u16 ctrl = 0;
> +	int ret;
> +
> +	/*
> +	 * Due to no PASID capability per VF, to be consistent, we do not
> +	 * support SET of the PASID capability for both PF and VF.
> +	 */
> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(pasid));
> +	if (ret != 1)
> +		return ret;
> +
> +	/* VF shares the PASID capability of its PF */
> +	if (pdev->is_virtfn)
> +		pdev = pci_physfn(pdev);
> +
> +	if (!pdev->pasid_enabled)
> +		goto out;
> +
> +#ifdef CONFIG_PCI_PASID
> +	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
> +			      &capabilities);
> +	pci_read_config_word(pdev, pdev->pasid_cap + PCI_PASID_CTRL,
> +			     &ctrl);
> +#endif
> +
> +	pasid.width = (capabilities >> 8) & 0x1f;
> +
> +	if (ctrl & PCI_PASID_CTRL_EXEC)
> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
> +	if (ctrl & PCI_PASID_CTRL_PRIV)
> +		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;

I agree with Kevin here, let's make use of and add helpers to avoid
#ifdef blocks of code.

> +
> +out:
> +	if (copy_to_user(arg, &pasid, sizeof(pasid)))
> +		return -EFAULT;
> +	return 0;
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1508,6 +1556,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PASID:
> +		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 9591dc24b75c..e50e55c67ab4 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1513,6 +1513,20 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
> + * Zero width means no support for PASID.

Why would we do that rather than reporting the feature as unsupported?
Just return -ENOTTY if PASID is not supported or enabled.

> + */
> +struct vfio_device_feature_pasid {
> +	__u16 capabilities;
> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> +	__u8 width;
> +	__u8 __reserved;
> +};

Building on Kevin's comment on the cover letter, if we could describe
an offset for emulating a PASID capability, this seems like the place
we'd do it.  I think we're not doing that because we'd like an in-band
mechanism for a device to report unused config space, such as a DVSEC
capability, so that it can be implemented on a physical device.  As
noted in the commit log here, we'd also prefer not to bloat the kernel
with more device quirks.

In an ideal world we might be able to jump start support of that DVSEC
option by emulating the DVSEC capability on top of the PASID capability
for PFs, but unfortunately the PASID capability is 8 bytes while the
DVSEC capability is at least 12 bytes, so we can't implement that
generically either.

I don't know if there's any good solution here or whether there's actually
any value to the PASID capability on a PF, but do we need to consider
leaving a field+flag here to describe the offset for that scenario?
Would we then allow variant drivers to take advantage of it?  Does this
then turn into the quirk that we're trying to avoid in the kernel
rather than userspace and is that a problem?  Thanks,

Alex

> +
> +#define VFIO_DEVICE_FEATURE_PASID 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**
Tian, Kevin April 17, 2024, 7:09 a.m. UTC | #3
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Wednesday, April 17, 2024 1:57 AM
> 
> On Fri, 12 Apr 2024 01:21:21 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > + */
> > +struct vfio_device_feature_pasid {
> > +	__u16 capabilities;
> > +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> > +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> > +	__u8 width;
> > +	__u8 __reserved;
> > +};
> 
> Building on Kevin's comment on the cover letter, if we could describe
> an offset for emulating a PASID capability, this seems like the place
> we'd do it.  I think we're not doing that because we'd like an in-band
> mechanism for a device to report unused config space, such as a DVSEC
> capability, so that it can be implemented on a physical device.  As
> noted in the commit log here, we'd also prefer not to bloat the kernel
> with more device quirks.
> 
> In an ideal world we might be able to jump start support of that DVSEC
> option by emulating the DVSEC capability on top of the PASID capability
> for PFs, but unfortunately the PASID capability is 8 bytes while the
> DVSEC capability is at least 12 bytes, so we can't implement that
> generically either.

Yeah, that's a problem.

> 
> I don't know there's any good solution here or whether there's actually
> any value to the PASID capability on a PF, but do we need to consider
> leaving a field+flag here to describe the offset for that scenario?

Yes, I prefer it this way.

> Would we then allow variant drivers to take advantage of it?  Does this
> then turn into the quirk that we're trying to avoid in the kernel
> rather than userspace and is that a problem?  Thanks,
> 

We don't want to proactively pursue quirks in the kernel.

But if a variant driver exists for other reasons, I don't see why it
should be prohibited from deciding an offset to make things easier for
userspace.
Alex Williamson April 17, 2024, 8:25 p.m. UTC | #4
On Wed, 17 Apr 2024 07:09:52 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Wednesday, April 17, 2024 1:57 AM
> > 
> > On Fri, 12 Apr 2024 01:21:21 -0700
> > Yi Liu <yi.l.liu@intel.com> wrote:
> >   
> > > + */
> > > +struct vfio_device_feature_pasid {
> > > +	__u16 capabilities;
> > > +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> > > +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> > > +	__u8 width;
> > > +	__u8 __reserved;
> > > +};  
> > 
> > Building on Kevin's comment on the cover letter, if we could describe
> > an offset for emulating a PASID capability, this seems like the place
> > we'd do it.  I think we're not doing that because we'd like an in-band
> > mechanism for a device to report unused config space, such as a DVSEC
> > capability, so that it can be implemented on a physical device.  As
> > noted in the commit log here, we'd also prefer not to bloat the kernel
> > with more device quirks.
> > 
> > In an ideal world we might be able to jump start support of that DVSEC
> > option by emulating the DVSEC capability on top of the PASID capability
> > for PFs, but unfortunately the PASID capability is 8 bytes while the
> > DVSEC capability is at least 12 bytes, so we can't implement that
> > generically either.  
> 
> Yeah, that's a problem.
> 
> > 
> > I don't know there's any good solution here or whether there's actually
> > any value to the PASID capability on a PF, but do we need to consider
> > leaving a field+flag here to describe the offset for that scenario?  
> 
> Yes, I prefer to this way.
> 
> > Would we then allow variant drivers to take advantage of it?  Does this
> > then turn into the quirk that we're trying to avoid in the kernel
> > rather than userspace and is that a problem?  Thanks,
> >   
> 
> We don't want to proactively pursue quirks in the kernel.
> 
> But if a variant driver exists for other reasons, I don't see why it 
> should be prohibited from deciding an offset to ease the
> userspace. 
Tian, Kevin April 18, 2024, 12:21 a.m. UTC | #5
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, April 18, 2024 4:26 AM
> 
> On Wed, 17 Apr 2024 07:09:52 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Wednesday, April 17, 2024 1:57 AM
> > >
> > > On Fri, 12 Apr 2024 01:21:21 -0700
> > > Yi Liu <yi.l.liu@intel.com> wrote:
> > >
> > > > + */
> > > > +struct vfio_device_feature_pasid {
> > > > +	__u16 capabilities;
> > > > +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> > > > +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> > > > +	__u8 width;
> > > > +	__u8 __reserved;
> > > > +};
> > >
> > > Building on Kevin's comment on the cover letter, if we could describe
> > > an offset for emulating a PASID capability, this seems like the place
> > > we'd do it.  I think we're not doing that because we'd like an in-band
> > > mechanism for a device to report unused config space, such as a DVSEC
> > > capability, so that it can be implemented on a physical device.  As
> > > noted in the commit log here, we'd also prefer not to bloat the kernel
> > > with more device quirks.
> > >
> > > In an ideal world we might be able to jump start support of that DVSEC
> > > option by emulating the DVSEC capability on top of the PASID capability
> > > for PFs, but unfortunately the PASID capability is 8 bytes while the
> > > DVSEC capability is at least 12 bytes, so we can't implement that
> > > generically either.
> >
> > Yeah, that's a problem.
> >
> > >
> > > I don't know there's any good solution here or whether there's actually
> > > any value to the PASID capability on a PF, but do we need to consider
> > > leaving a field+flag here to describe the offset for that scenario?
> >
> > Yes, I prefer to this way.
> >
> > > Would we then allow variant drivers to take advantage of it?  Does this
> > > then turn into the quirk that we're trying to avoid in the kernel
> > > rather than userspace and is that a problem?  Thanks,
> > >
> >
> > We don't want to proactively pursue quirks in the kernel.
> >
> > But if a variant driver exists for other reasons, I don't see why it
> > should be prohibited from deciding an offset to ease the
> > userspace. 
Yi Liu April 18, 2024, 8:23 a.m. UTC | #6
On 2024/4/18 08:21, Tian, Kevin wrote:
>> From: Alex Williamson <alex.williamson@redhat.com>
>> Sent: Thursday, April 18, 2024 4:26 AM
>>
>> On Wed, 17 Apr 2024 07:09:52 +0000
>> "Tian, Kevin" <kevin.tian@intel.com> wrote:
>>
>>>> From: Alex Williamson <alex.williamson@redhat.com>
>>>> Sent: Wednesday, April 17, 2024 1:57 AM
>>>>
>>>> On Fri, 12 Apr 2024 01:21:21 -0700
>>>> Yi Liu <yi.l.liu@intel.com> wrote:
>>>>
>>>>> + */
>>>>> +struct vfio_device_feature_pasid {
>>>>> +	__u16 capabilities;
>>>>> +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
>>>>> +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
>>>>> +	__u8 width;
>>>>> +	__u8 __reserved;
>>>>> +};
>>>>
>>>> Building on Kevin's comment on the cover letter, if we could describe
>>>> an offset for emulating a PASID capability, this seems like the place
>>>> we'd do it.  I think we're not doing that because we'd like an in-band
>>>> mechanism for a device to report unused config space, such as a DVSEC
>>>> capability, so that it can be implemented on a physical device.  As
>>>> noted in the commit log here, we'd also prefer not to bloat the kernel
>>>> with more device quirks.
>>>>
>>>> In an ideal world we might be able to jump start support of that DVSEC
>>>> option by emulating the DVSEC capability on top of the PASID capability
>>>> for PFs, but unfortunately the PASID capability is 8 bytes while the
>>>> DVSEC capability is at least 12 bytes, so we can't implement that
>>>> generically either.
>>>
>>> Yeah, that's a problem.
>>>
>>>>
>>>> I don't know there's any good solution here or whether there's actually
>>>> any value to the PASID capability on a PF, but do we need to consider
>>>> leaving a field+flag here to describe the offset for that scenario?
>>>
>>> Yes, I prefer to this way.
>>>
>>>> Would we then allow variant drivers to take advantage of it?  Does this
>>>> then turn into the quirk that we're trying to avoid in the kernel
>>>> rather than userspace and is that a problem?  Thanks,
>>>>
>>>
>>> We don't want to proactively pursue quirks in the kernel.
>>>
>>> But if a variant driver exists for other reasons, I don't see why it
>>> should be prohibited from deciding an offset to ease the
>>> userspace. 
Alex Williamson April 18, 2024, 4:34 p.m. UTC | #7
On Thu, 18 Apr 2024 00:21:36 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Thursday, April 18, 2024 4:26 AM
> > 
> > On Wed, 17 Apr 2024 07:09:52 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > Sent: Wednesday, April 17, 2024 1:57 AM
> > > >
> > > > On Fri, 12 Apr 2024 01:21:21 -0700
> > > > Yi Liu <yi.l.liu@intel.com> wrote:
> > > >  
> > > > > + */
> > > > > +struct vfio_device_feature_pasid {
> > > > > +	__u16 capabilities;
> > > > > +#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
> > > > > +#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
> > > > > +	__u8 width;
> > > > > +	__u8 __reserved;
> > > > > +};  
> > > >
> > > > Building on Kevin's comment on the cover letter, if we could describe
> > > > an offset for emulating a PASID capability, this seems like the place
> > > > we'd do it.  I think we're not doing that because we'd like an in-band
> > > > mechanism for a device to report unused config space, such as a DVSEC
> > > > capability, so that it can be implemented on a physical device.  As
> > > > noted in the commit log here, we'd also prefer not to bloat the kernel
> > > > with more device quirks.
> > > >
> > > > In an ideal world we might be able to jump start support of that DVSEC
> > > > option by emulating the DVSEC capability on top of the PASID capability
> > > > for PFs, but unfortunately the PASID capability is 8 bytes while the
> > > > DVSEC capability is at least 12 bytes, so we can't implement that
> > > > generically either.  
> > >
> > > Yeah, that's a problem.
> > >  
> > > >
> > > > I don't know there's any good solution here or whether there's actually
> > > > any value to the PASID capability on a PF, but do we need to consider
> > > > leaving a field+flag here to describe the offset for that scenario?  
> > >
> > > Yes, I prefer to this way.
> > >  
> > > > Would we then allow variant drivers to take advantage of it?  Does this
> > > > then turn into the quirk that we're trying to avoid in the kernel
> > > > rather than userspace and is that a problem?  Thanks,
> > > >  
> > >
> > > We don't want to proactively pursue quirks in the kernel.
> > >
> > > But if a variant driver exists for other reasons, I don't see why it
> > > should be prohibited from deciding an offset to ease the
> > > userspace. 
Jason Gunthorpe April 23, 2024, 12:39 p.m. UTC | #8
On Fri, Apr 12, 2024 at 01:21:21AM -0700, Yi Liu wrote:
> Today, vfio-pci hides the PASID capability of devices from userspace. Unlike
> other PCI capabilities, PASID capability is going to be reported to user by
> VFIO_DEVICE_FEATURE. Hence userspace could probe PASID capability by it.
> This is a bit different from the other capabilities which are reported to
> userspace when the user reads the device's PCI configuration space. There
> are two reasons for this.

I'm thinking this probably does not belong in VFIO; iommufd should
report what the device, driver and OS are able to do with this
device. PASID support is at least 50% an iommu property too.

This is a separate issue from forming the config space.

I didn't notice anything about SIOV in this, are we tackling it later?

IIRC we need the vIOMMU to specify a vPASID during attach and somehow
that gets mapped into a pPASID and synchronized with the KVM ENQCMD
translation?

Jason
Tian, Kevin April 24, 2024, 12:24 a.m. UTC | #9
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, April 23, 2024 8:40 PM
> 
> On Fri, Apr 12, 2024 at 01:21:21AM -0700, Yi Liu wrote:
> > Today, vfio-pci hides the PASID capability of devices from userspace. Unlike
> > other PCI capabilities, PASID capability is going to be reported to user by
> > VFIO_DEVICE_FEATURE. Hence userspace could probe PASID capability by
> it.
> > This is a bit different from the other capabilities which are reported to
> > userspace when the user reads the device's PCI configuration space. There
> > are two reasons for this.
> 
> I'm thinking this probably does not belong in VFIO, iommufd should
> report what the device, driver and OS is able to do with this
> device. PASID support is at least 50% an iommu property too.

We have a PASID capability on both the device side and the iommu side.

VFIO is for the former and iommufd is for the latter.

Both should report the capability only if that cap exists and is
enabled by the OS.

> 
> This is a seperate issue to forming the config space.
> 
> I didn't notice anything about SIOV in this, are we tackling it later?

yes.

> 
> IIRC we need the vIOMMU to specify a vPASID during attach and somehow
> that gets mapped into a pPASID and synchronized with the KVM ENQCMD
> translation?
> 

yes, that is the original plan. More accurately the vfio attach uAPI
is always about a pPASID. The mapping will be added separately to
iommufd and synced with KVM.

But internally we are evaluating whether there is enough value
to justify adding this complexity to the kernel. It's the main burden
in SIOVr1. Given the limited usage, we'll very likely only do the
basic SIOV support w/o the vPASID cap...
Jason Gunthorpe April 24, 2024, 1:59 p.m. UTC | #10
On Wed, Apr 24, 2024 at 12:24:19AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, April 23, 2024 8:40 PM
> > 
> > On Fri, Apr 12, 2024 at 01:21:21AM -0700, Yi Liu wrote:
> > > Today, vfio-pci hides the PASID capability of devices from userspace. Unlike
> > > other PCI capabilities, PASID capability is going to be reported to user by
> > > VFIO_DEVICE_FEATURE. Hence userspace could probe PASID capability by
> > it.
> > > This is a bit different from the other capabilities which are reported to
> > > userspace when the user reads the device's PCI configuration space. There
> > > are two reasons for this.
> > 
> > I'm thinking this probably does not belong in VFIO, iommufd should
> > report what the device, driver and OS is able to do with this
> > device. PASID support is at least 50% an iommu property too.
> 
> We have PASID capability in both device side and iommu side.
> 
> VFIO is for the former and iommufd is for the latter.

iommu can do the device side too, we have a device info ioctl after
all.

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index d94d61b92c1a..ca64e461d435 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1495,6 +1495,54 @@  static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
 	return 0;
 }
 
+static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
+				       struct vfio_device_feature_pasid __user *arg,
+				       size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	struct vfio_device_feature_pasid pasid = { 0 };
+	struct pci_dev *pdev = vdev->pdev;
+	u32 capabilities = 0;
+	u16 ctrl = 0;
+	int ret;
+
+	/*
+	 * Due to no PASID capability per VF, to be consistent, we do not
+	 * support SET of the PASID capability for both PF and VF.
+	 */
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(pasid));
+	if (ret != 1)
+		return ret;
+
+	/* VF shares the PASID capability of its PF */
+	if (pdev->is_virtfn)
+		pdev = pci_physfn(pdev);
+
+	if (!pdev->pasid_enabled)
+		goto out;
+
+#ifdef CONFIG_PCI_PASID
+	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
+			      &capabilities);
+	pci_read_config_word(pdev, pdev->pasid_cap + PCI_PASID_CTRL,
+			     &ctrl);
+#endif
+
+	pasid.width = (capabilities >> 8) & 0x1f;
+
+	if (ctrl & PCI_PASID_CTRL_EXEC)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
+	if (ctrl & PCI_PASID_CTRL_PRIV)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
+
+out:
+	if (copy_to_user(arg, &pasid, sizeof(pasid)))
+		return -EFAULT;
+	return 0;
+}
+
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz)
 {
@@ -1508,6 +1556,8 @@  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_PASID:
+		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9591dc24b75c..e50e55c67ab4 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1513,6 +1513,20 @@  struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
+ * Zero width means no support for PASID.
+ */
+struct vfio_device_feature_pasid {
+	__u16 capabilities;
+#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
+#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
+	__u8 width;
+	__u8 __reserved;
+};
+
+#define VFIO_DEVICE_FEATURE_PASID 11
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**