diff mbox series

[v2,10/11] vfio: Make vfio_container optionally compiled

Message ID 10-v2-65016290f146+33e-vfio_iommufd_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Connect VFIO to IOMMUFD | expand

Commit Message

Jason Gunthorpe Nov. 8, 2022, 12:52 a.m. UTC
Add a kconfig CONFIG_VFIO_CONTAINER that controls compiling the container
code. If 'n' then only iommufd will provide the container service. All the
support for vfio iommu drivers, including type1, will not be built.

This allows a compilation check that no inappropriate dependencies between
the device/group and container have been created.

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/vfio/Kconfig  | 35 +++++++++++++++--------
 drivers/vfio/Makefile |  4 +--
 drivers/vfio/vfio.h   | 65 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 13 deletions(-)

Comments

Alex Williamson Nov. 8, 2022, 10:28 p.m. UTC | #1
On Mon,  7 Nov 2022 20:52:54 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> Add a kconfig CONFIG_VFIO_CONTAINER that controls compiling the container
> code. If 'n' then only iommufd will provide the container service. All the
> support for vfio iommu drivers, including type1, will not be built.
> 
> This allows a compilation check that no inappropriate dependencies between
> the device/group and container have been created.
> 
> Tested-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/vfio/Kconfig  | 35 +++++++++++++++--------
>  drivers/vfio/Makefile |  4 +--
>  drivers/vfio/vfio.h   | 65 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 91 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 1118d322eec97d..286c1663bd7564 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,8 +3,8 @@ menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	select IOMMU_API
>  	depends on IOMMUFD || !IOMMUFD
> -	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
>  	select INTERVAL_TREE
> +	select VFIO_CONTAINER if IOMMUFD=n
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/driver-api/vfio.rst for more details.
> @@ -12,6 +12,18 @@ menuconfig VFIO
>  	  If you don't know what to do here, say N.
>  
>  if VFIO
> +config VFIO_CONTAINER
> +	bool "Support for the VFIO container /dev/vfio/vfio"
> +	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
> +	default y
> +	help
> +	  The VFIO container is the classic interface to VFIO for establishing
> +	  IOMMU mappings. If N is selected here then IOMMUFD must be used to
> +	  manage the mappings.
> +
> +	  Unless testing IOMMUFD say Y here.
> +
> +if VFIO_CONTAINER
>  config VFIO_IOMMU_TYPE1
>  	tristate
>  	default n
> @@ -21,16 +33,6 @@ config VFIO_IOMMU_SPAPR_TCE
>  	depends on SPAPR_TCE_IOMMU
>  	default VFIO
>  
> -config VFIO_SPAPR_EEH
> -	tristate
> -	depends on EEH && VFIO_IOMMU_SPAPR_TCE
> -	default VFIO
> -
> -config VFIO_VIRQFD
> -	tristate
> -	select EVENTFD
> -	default n
> -
>  config VFIO_NOIOMMU
>  	bool "VFIO No-IOMMU support"
>  	help


Perhaps this should have been obvious, but I'm realizing that
vfio-noiommu mode is completely missing without VFIO_CONTAINER, which
seems a barrier to deprecating VFIO_CONTAINER and perhaps makes it a
question whether IOMMUFD should really be taking over /dev/vfio/vfio.
No-iommu mode has users.  Thanks,

Alex
Jason Gunthorpe Nov. 9, 2022, 12:54 a.m. UTC | #2
On Tue, Nov 08, 2022 at 03:28:31PM -0700, Alex Williamson wrote:

> Perhaps this should have been obvious, but I'm realizing that
> vfio-noiommu mode is completely missing without VFIO_CONTAINER, which
> seems a barrier to deprecating VFIO_CONTAINER and perhaps makes it a

Yes, it is the same as the allow_unsafe_interrupts - it is something
that currently goes missing if you turn off VFIO_CONTAINER.

This seems straightforward enough to resolve in a followup, we mostly
just need someone with an existing no-iommu application to test
compatability against. Keeping it working with the device cdev will
also be a bit interesting. If you have or know about some application
I can try to make a patch.

> question whether IOMMUFD should really be taking over /dev/vfio/vfio.
> No-iommu mode has users.  

I view VFIO_CONTAINER=n as a process. An aspiration we can work
toward.

At this point there are few places that might want to use it. Android
perhaps, for example. It is also useful for testing. One of the main
values is you can switch the options and feed the kernel into an
existing test environment and see what happens. This is how we are
able to quickly get s390 mdev testing, for instance.

We are not going to get to a widely useful VFIO_CONTAINER=n if we
don't have a target that people can test against and evaluate what
compatability gaps may exist.

So, everytime we find something like this - let's think about how can
we make iommufd compatibility handle it and not jump straight to
giving up :)

I'm kind of thinking v6.4 might be a reasonable kernel target when we
might have closed off enough things.

Jason
Alex Williamson Nov. 9, 2022, 5:18 p.m. UTC | #3
On Tue, 8 Nov 2022 20:54:58 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Nov 08, 2022 at 03:28:31PM -0700, Alex Williamson wrote:
> 
> > Perhaps this should have been obvious, but I'm realizing that
> > vfio-noiommu mode is completely missing without VFIO_CONTAINER, which
> > seems a barrier to deprecating VFIO_CONTAINER and perhaps makes it a  
> 
> Yes, it is the same as the allow_unsafe_interrupts - it is something
> that currently goes missing if you turn off VFIO_CONTAINER.
> 
> This seems straightforward enough to resolve in a followup, we mostly
> just need someone with an existing no-iommu application to test
> compatability against. Keeping it working with the device cdev will
> also be a bit interesting. If you have or know about some application
> I can try to make a patch.

DPDK supports no-iommu mode.

> > question whether IOMMUFD should really be taking over /dev/vfio/vfio.
> > No-iommu mode has users.    
> 
> I view VFIO_CONTAINER=n as a process. An aspiration we can work
> toward.
> 
> At this point there are few places that might want to use it. Android
> perhaps, for example. It is also useful for testing. One of the main
> values is you can switch the options and feed the kernel into an
> existing test environment and see what happens. This is how we are
> able to quickly get s390 mdev testing, for instance.
> 
> We are not going to get to a widely useful VFIO_CONTAINER=n if we
> don't have a target that people can test against and evaluate what
> compatability gaps may exist.
> 
> So, everytime we find something like this - let's think about how can
> we make iommufd compatibility handle it and not jump straight to
> giving up :)
> 
> I'm kind of thinking v6.4 might be a reasonable kernel target when we
> might have closed off enough things.

I agree that it's very useful for testing, I'm certainly not suggesting
to give up, but I'm not sure where no-iommu lives when iommufd owns
/dev/vfio/vfio.  Given the unsafe interrupts discussion, it doesn't
seem like the type of thing that would be a priority for iommufd.

We're on a path where vfio accepts an iommufd as a container, and
ultimately iommufd becomes the container provider, supplanting the
IOMMU driver registration aspect of vfio.  I absolutely want type1 and
spapr backends to get replaced by iommufd, but reluctance to support
aspects of vfio "legacy" behavior doesn't give me warm fuzzies about a
wholesale hand-off of the container to a different subsystem, for
example vs an iommufd shim spoofing type1 support.

Unfortunately we no longer have a CONFIG_EXPERIMENTAL option to hide
behind for disabling VFIO_CONTAINER, so regardless of our intentions
that a transition is some time off, it may become an issue sooner than
we expect.  Thanks,

Alex
Jason Gunthorpe Nov. 9, 2022, 7:52 p.m. UTC | #4
On Wed, Nov 09, 2022 at 10:18:09AM -0700, Alex Williamson wrote:

> DPDK supports no-iommu mode.

Er? Huh? How? I thought no-iommu was for applications that didn't do
DMA? How is DPDK getting packets in/out without DMA? I guess it snoops
in /proc/ or something to learn PFNs of mlock'd memory? <shudder>

> I agree that it's very useful for testing, I'm certainly not suggesting
> to give up, but I'm not sure where no-iommu lives when iommufd owns
> /dev/vfio/vfio.  Given the unsafe interrupts discussion, it doesn't
> seem like the type of thing that would be a priority for iommufd.

Ah, I think the bit below will do the job, I'm not sure without doing
some testing (and I don't think I have the necessary Intel NIC for
DPDK). The basic point is no-iommu literally means 'do not use iommufd
at all, do not create an iommufd_device or an iommufd_access'. VFIO
can easially do that on its own.

The only slightly messy bit is that uAPI requires the SET_CONTAINER
which we can just NOP in vfio_compat. With more checks it could have
higher fidelity of error cases, but not sure it is worth it.

When we talk about the device cdev flow then I suggest that no-iommu
simply requires -1 for the iommufd during bind - ie no iommufd is
used or accepted and that is how the userspace knows/confirms it is in
no-iommu mode.

> We're on a path where vfio accepts an iommufd as a container, and
> ultimately iommufd becomes the container provider, supplanting the
> IOMMU driver registration aspect of vfio.  I absolutely want type1 and
> spapr backends to get replaced by iommufd, but reluctance to support
> aspects of vfio "legacy" behavior doesn't give me warm fuzzies about a
> wholesale hand-off of the container to a different subsystem, for
> example vs an iommufd shim spoofing type1 support.

Well, I will agree to do everything required, just let's go through the
process to understand the security situation and ensure we are still
doing things the right way.

> Unfortunately we no longer have a CONFIG_EXPERIMENTAL option to hide
> behind for disabling VFIO_CONTAINER, so regardless of our intentions
> that a transition is some time off, it may become an issue sooner than
> we expect.

We can add kconfig text discouraging that?

diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index dbef3274803336..81f7594cfff8e0 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -259,6 +259,14 @@ static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
 	struct iommufd_ioas *ioas = NULL;
 	int rc = 0;
 
+	/*
+	 * Emulation for NOIMMU is imperfect in that VFIO blocks almost all
+	 * other ioctls. We let them keep working but they mostly fail since no
+	 * IOAS should exist.
+	 */
+	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU)
+		return 0;
+
 	if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
 		return -EINVAL;
 
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 595c7b2146f88c..64a336ebc99b9f 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -18,6 +18,10 @@ int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx)
 
 	lockdep_assert_held(&vdev->dev_set->lock);
 
+	if (IS_ENABLED(CONFIG_VFIO_NO_IOMMU) &&
+	    vdev->group->type == VFIO_NO_IOMMU)
+		return 0;
+
 	/*
 	 * If the driver doesn't provide this op then it means the device does
 	 * not do DMA at all. So nothing to do.
@@ -53,6 +57,10 @@ void vfio_iommufd_unbind(struct vfio_device *vdev)
 {
 	lockdep_assert_held(&vdev->dev_set->lock);
 
+	if (IS_ENABLED(CONFIG_VFIO_NO_IOMMU) &&
+	    vdev->group->type == VFIO_NO_IOMMU)
+		return;
+
 	if (vdev->ops->unbind_iommufd)
 		vdev->ops->unbind_iommufd(vdev);
 }
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index f3c48b8c45627d..08c5b05a129c2c 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -749,10 +749,13 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
 	if (!IS_ERR(iommufd)) {
 		u32 ioas_id;
 
-		ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id);
-		if (ret) {
-			iommufd_ctx_put(group->iommufd);
-			goto out_unlock;
+		if (!IS_ENABLED(CONFIG_VFIO_NO_IOMMU) ||
+		    group->type != VFIO_NO_IOMMU) {
+			ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id);
+			if (ret) {
+				iommufd_ctx_put(group->iommufd);
+				goto out_unlock;
+			}
 		}
 
 		group->iommufd = iommufd;
Tian, Kevin Nov. 10, 2022, 6:57 a.m. UTC | #5
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, November 10, 2022 3:53 AM
> 
> On Wed, Nov 09, 2022 at 10:18:09AM -0700, Alex Williamson wrote:
> 
> > DPDK supports no-iommu mode.
> 
> Er? Huh? How? I thought no-iommu was for applications that didn't do
> DMA? How is DPDK getting packets in/out without DMA? I guess it snoops
> in /proc/ or something to learn PFNs of mlock'd memory? <shudder>

iirc dpdk started with UIO plus various tricks (root privilege, hugepage, etc.)
to lock and learn PFN's from pagemap. Then when migrating it to vfio the
no-iommu option was introduced to provide UIO compatibility.

> 
> > I agree that it's very useful for testing, I'm certainly not suggesting
> > to give up, but I'm not sure where no-iommu lives when iommufd owns
> > /dev/vfio/vfio.  Given the unsafe interrupts discussion, it doesn't
> > seem like the type of thing that would be a priority for iommufd.
> 
> Ah, I think the bit below will do the job, I'm not sure without doing
> some testing (and I don't think I have the necessary Intel NIC for
> DPDK). The basic point is no-iommu literally means 'do not use iommufd
> at all, do not create an iommufd_device or an iommufd_access'. VFIO
> can easially do that on its own.
> 
> The only slightly messy bit is that uAPI requires the SET_CONTAINER
> which we can just NOP in vfio_compat. With more checks it could have
> higher fidelity of error cases, but not sure it is worth it.
> 
> When we talk about the device cdev flow then I suggest that no-iommu
> simply requires -1 for the iommufd during bind - ie no iommufd is
> used or accepted and that is how the userspace knows/confirms it is in
> no-iommu mode.
> 
> > We're on a path where vfio accepts an iommufd as a container, and
> > ultimately iommufd becomes the container provider, supplanting the
> > IOMMU driver registration aspect of vfio.  I absolutely want type1 and
> > spapr backends to get replaced by iommufd, but reluctance to support
> > aspects of vfio "legacy" behavior doesn't give me warm fuzzies about a
> > wholesale hand-off of the container to a different subsystem, for
> > example vs an iommufd shim spoofing type1 support.
> 
> Well, I will agree to do everything required, just let's go through the
> process to understand the security situation and ensure we are still
> doing things the right way.
> 
> > Unfortunately we no longer have a CONFIG_EXPERIMENTAL option to hide
> > behind for disabling VFIO_CONTAINER, so regardless of our intentions
> > that a transition is some time off, it may become an issue sooner than
> > we expect.
> 
> We can add kconfig text discouraging that?
> 
> diff --git a/drivers/iommu/iommufd/vfio_compat.c
> b/drivers/iommu/iommufd/vfio_compat.c
> index dbef3274803336..81f7594cfff8e0 100644
> --- a/drivers/iommu/iommufd/vfio_compat.c
> +++ b/drivers/iommu/iommufd/vfio_compat.c
> @@ -259,6 +259,14 @@ static int iommufd_vfio_set_iommu(struct
> iommufd_ctx *ictx, unsigned long type)
>  	struct iommufd_ioas *ioas = NULL;
>  	int rc = 0;
> 
> +	/*
> +	 * Emulation for NOIMMU is imperfect in that VFIO blocks almost all
> +	 * other ioctls. We let them keep working but they mostly fail since no
> +	 * IOAS should exist.
> +	 */
> +	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type ==
> VFIO_NOIOMMU_IOMMU)
> +		return 0;
> +
>  	if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
>  		return -EINVAL;
> 

also need a check in iommufd_vfio_check_extension() so only
VFIO_NOIOMMU_IOMMU is supported in no-iommu mode.

> diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> index 595c7b2146f88c..64a336ebc99b9f 100644
> --- a/drivers/vfio/iommufd.c
> +++ b/drivers/vfio/iommufd.c
> @@ -18,6 +18,10 @@ int vfio_iommufd_bind(struct vfio_device *vdev,
> struct iommufd_ctx *ictx)
> 
>  	lockdep_assert_held(&vdev->dev_set->lock);
> 
> +	if (IS_ENABLED(CONFIG_VFIO_NO_IOMMU) &&
> +	    vdev->group->type == VFIO_NO_IOMMU)
> +		return 0;
> +
>  	/*
>  	 * If the driver doesn't provide this op then it means the device does
>  	 * not do DMA at all. So nothing to do.
> @@ -53,6 +57,10 @@ void vfio_iommufd_unbind(struct vfio_device *vdev)
>  {
>  	lockdep_assert_held(&vdev->dev_set->lock);
> 
> +	if (IS_ENABLED(CONFIG_VFIO_NO_IOMMU) &&
> +	    vdev->group->type == VFIO_NO_IOMMU)
> +		return;
> +
>  	if (vdev->ops->unbind_iommufd)
>  		vdev->ops->unbind_iommufd(vdev);
>  }
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index f3c48b8c45627d..08c5b05a129c2c 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -749,10 +749,13 @@ static int vfio_group_ioctl_set_container(struct
> vfio_group *group,
>  	if (!IS_ERR(iommufd)) {
>  		u32 ioas_id;
> 
> -		ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id);
> -		if (ret) {
> -			iommufd_ctx_put(group->iommufd);
> -			goto out_unlock;
> +		if (!IS_ENABLED(CONFIG_VFIO_NO_IOMMU) ||
> +		    group->type != VFIO_NO_IOMMU) {
> +			ret = iommufd_vfio_compat_ioas_id(iommufd,
> &ioas_id);
> +			if (ret) {
> +				iommufd_ctx_put(group->iommufd);
> +				goto out_unlock;
> +			}
>  		}

with above I suppose other ioctls (map/unmap/etc.) are implicitly blocked
given get_compat_ioas() will fail in those paths. this is good.

btw vfio container requires exact match between group->type and
container->noiommu, i.e. noiommu group can be only attached to noiommu
container. this is another thing to be paired up.
Alex Williamson Nov. 10, 2022, 5:10 p.m. UTC | #6
On Thu, 10 Nov 2022 06:57:57 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Thursday, November 10, 2022 3:53 AM
> > 
> > On Wed, Nov 09, 2022 at 10:18:09AM -0700, Alex Williamson wrote:
> >   
> > > DPDK supports no-iommu mode.  
> > 
> > Er? Huh? How? I thought no-iommu was for applications that didn't do
> > DMA? How is DPDK getting packets in/out without DMA? I guess it snoops
> > in /proc/ or something to learn PFNs of mlock'd memory? <shudder>  
> 
> iirc dpdk started with UIO plus various tricks (root privilege, hugepage, etc.)
> to lock and learn PFN's from pagemap. Then when migrating it to vfio the
> no-iommu option was introduced to provide UIO compatibility.

IIRC, we essentially introduced no-iommu mode vfio because DPDK started
pushing for extending interrupt support in uio-pci-generic.  The UIO
driver is also only meant for devices that don't do DMA, but obviously
DPDK didn't care about that.  Rather than extend UIO, we offered this
no-iommu mode in vfio since we already had more extensive MSI support,
were better able to impose restrictions on access to the device, and
using the same device access makes the transition to proper IOMMU
backed configurations more seamless.  Thanks,

Alex
Jason Gunthorpe Nov. 10, 2022, 5:52 p.m. UTC | #7
On Thu, Nov 10, 2022 at 06:57:57AM +0000, Tian, Kevin wrote:

> > +	/*
> > +	 * Emulation for NOIMMU is imperfect in that VFIO blocks almost all
> > +	 * other ioctls. We let them keep working but they mostly fail since no
> > +	 * IOAS should exist.
> > +	 */
> > +	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type ==
> > VFIO_NOIOMMU_IOMMU)
> > +		return 0;
> > +
> >  	if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
> >  		return -EINVAL;
> 
> also need a check in iommufd_vfio_check_extension() so only
> VFIO_NOIOMMU_IOMMU is supported in no-iommu mode.

Mm, and some permission checks too

> > +		if (!IS_ENABLED(CONFIG_VFIO_NO_IOMMU) ||
> > +		    group->type != VFIO_NO_IOMMU) {
> > +			ret = iommufd_vfio_compat_ioas_id(iommufd,
> > &ioas_id);
> > +			if (ret) {
> > +				iommufd_ctx_put(group->iommufd);
> > +				goto out_unlock;
> > +			}
> >  		}
> 
> with above I suppose other ioctls (map/unmap/etc.) are implicitly blocked
> given get_compat_ioas() will fail in those paths. this is good.
> 
> btw vfio container requires exact match between group->type and
> container->noiommu, i.e. noiommu group can be only attached to noiommu
> container. this is another thing to be paired up.

Sure, as below

So, the missing ingredient here is somone who has the necessary device
to test dpdk? I wonder if qemu e1000 is able to do this path?

diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index dbef3274803336..c20e55ddc9aa81 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -26,16 +26,35 @@ static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
 }
 
 /**
- * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use
+ * iommufd_vfio_compat_ioas_get_id - Ensure a comat IOAS exists
+ * @ictx: Context to operate on
+ *
+ * Return the ID of the current compatability ioas. The ID can be passed into
+ * other functions that take an ioas_id.
+ */
+int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
+{
+	struct iommufd_ioas *ioas;
+
+	ioas = get_compat_ioas(ictx);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+	*out_ioas_id = ioas->obj.id;
+	iommufd_put_object(&ioas->obj);
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);
+
+/**
+ * iommufd_vfio_compat_ioas_create_id - Return the IOAS ID that vfio should use
  * @ictx: Context to operate on
- * @out_ioas_id: The ioas_id the caller should use
  *
  * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
  * on since they do not have an IOAS ID input in their ABI. Only attaching a
- * group should cause a default creation of the internal ioas, this returns the
- * existing ioas if it has already been assigned somehow.
+ * group should cause a default creation of the internal ioas, this does nothing
+ * if an existing ioas has already been assigned somehow.
  */
-int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
+int iommufd_vfio_compat_ioas_create_id(struct iommufd_ctx *ictx)
 {
 	struct iommufd_ioas *ioas = NULL;
 	struct iommufd_ioas *out_ioas;
@@ -53,7 +72,6 @@ int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
 	}
 	xa_unlock(&ictx->objects);
 
-	*out_ioas_id = out_ioas->obj.id;
 	if (out_ioas != ioas) {
 		iommufd_put_object(&out_ioas->obj);
 		iommufd_object_abort(ictx, &ioas->obj);
@@ -68,7 +86,7 @@ int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
 	iommufd_object_finalize(ictx, &ioas->obj);
 	return 0;
 }
-EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO);
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create_id, IOMMUFD_VFIO);
 
 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
 {
@@ -230,6 +248,9 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
 	case VFIO_UNMAP_ALL:
 		return 1;
 
+	case VFIO_NOIOMMU_IOMMU:
+	return IS_ENABLED(CONFIG_VFIO_NOIOMMU);
+
 	case VFIO_DMA_CC_IOMMU:
 		return iommufd_vfio_cc_iommu(ictx);
 
@@ -259,6 +280,17 @@ static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
 	struct iommufd_ioas *ioas = NULL;
 	int rc = 0;
 
+	/*
+	 * Emulation for NOIMMU is imperfect in that VFIO blocks almost all
+	 * other ioctls. We let them keep working but they mostly fail since no
+	 * IOAS should exist.
+	 */
+	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU) {
+		if (!capable(CAP_SYS_RAWIO))
+			return -EPERM;
+		return 0;
+	}
+
 	if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
 		return -EINVAL;
 
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 595c7b2146f88c..daa8039da7a8fa 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -18,6 +18,21 @@ int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx)
 
 	lockdep_assert_held(&vdev->dev_set->lock);
 
+	if (IS_ENABLED(CONFIG_VFIO_NO_IOMMU) &&
+	    vdev->group->type == VFIO_NO_IOMMU) {
+		if (!capable(CAP_SYS_RAWIO))
+			return -EPERM;
+
+		/*
+		 * Require no compat ioas to be assigned to proceed. The basic
+		 * statement is that the user cannot have done something that
+		 * implies they expected translation to exist
+		 */
+		if (!iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id))
+			return -EPERM;
+		return 0;
+	}
+
 	/*
 	 * If the driver doesn't provide this op then it means the device does
 	 * not do DMA at all. So nothing to do.
@@ -29,7 +44,7 @@ int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx)
 	if (ret)
 		return ret;
 
-	ret = iommufd_vfio_compat_ioas_id(ictx, &ioas_id);
+	ret = iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id);
 	if (ret)
 		goto err_unbind;
 	ret = vdev->ops->attach_ioas(vdev, &ioas_id);
@@ -53,6 +68,10 @@ void vfio_iommufd_unbind(struct vfio_device *vdev)
 {
 	lockdep_assert_held(&vdev->dev_set->lock);
 
+	if (IS_ENABLED(CONFIG_VFIO_NO_IOMMU) &&
+	    vdev->group->type == VFIO_NO_IOMMU)
+		return;
+
 	if (vdev->ops->unbind_iommufd)
 		vdev->ops->unbind_iommufd(vdev);
 }
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index f3c48b8c45627d..b59eff30968a1e 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -747,12 +747,13 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
 
 	iommufd = iommufd_ctx_from_file(f.file);
 	if (!IS_ERR(iommufd)) {
-		u32 ioas_id;
-
-		ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id);
-		if (ret) {
-			iommufd_ctx_put(group->iommufd);
-			goto out_unlock;
+		if (!IS_ENABLED(CONFIG_VFIO_NO_IOMMU) ||
+		    group->type != VFIO_NO_IOMMU) {
+			ret = iommufd_vfio_compat_ioas_create_id(iommufd);
+			if (ret) {
+				iommufd_ctx_put(group->iommufd);
+				goto out_unlock;
+			}
 		}
 
 		group->iommufd = iommufd;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 7a5d64a1dae482..bf2b3ea5f90fd2 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -61,7 +61,8 @@ void iommufd_access_unpin_pages(struct iommufd_access *access,
 				unsigned long iova, unsigned long length);
 int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
 		      void *data, size_t len, unsigned int flags);
-int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
+int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
+int iommufd_vfio_compat_ioas_create_id(struct iommufd_ctx *ictx);
 #else /* !CONFIG_IOMMUFD */
 static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
 {
@@ -93,8 +94,7 @@ static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long
 	return -EOPNOTSUPP;
 }
 
-static inline int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx,
-					      u32 *out_ioas_id)
+static inline int iommufd_vfio_compat_ioas_create_id(struct iommufd_ctx *ictx)
 {
 	return -EOPNOTSUPP;
 }
diff mbox series

Patch

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 1118d322eec97d..286c1663bd7564 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,8 +3,8 @@  menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	select IOMMU_API
 	depends on IOMMUFD || !IOMMUFD
-	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
 	select INTERVAL_TREE
+	select VFIO_CONTAINER if IOMMUFD=n
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/driver-api/vfio.rst for more details.
@@ -12,6 +12,18 @@  menuconfig VFIO
 	  If you don't know what to do here, say N.
 
 if VFIO
+config VFIO_CONTAINER
+	bool "Support for the VFIO container /dev/vfio/vfio"
+	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
+	default y
+	help
+	  The VFIO container is the classic interface to VFIO for establishing
+	  IOMMU mappings. If N is selected here then IOMMUFD must be used to
+	  manage the mappings.
+
+	  Unless testing IOMMUFD say Y here.
+
+if VFIO_CONTAINER
 config VFIO_IOMMU_TYPE1
 	tristate
 	default n
@@ -21,16 +33,6 @@  config VFIO_IOMMU_SPAPR_TCE
 	depends on SPAPR_TCE_IOMMU
 	default VFIO
 
-config VFIO_SPAPR_EEH
-	tristate
-	depends on EEH && VFIO_IOMMU_SPAPR_TCE
-	default VFIO
-
-config VFIO_VIRQFD
-	tristate
-	select EVENTFD
-	default n
-
 config VFIO_NOIOMMU
 	bool "VFIO No-IOMMU support"
 	help
@@ -44,6 +46,17 @@  config VFIO_NOIOMMU
 	  this mode since there is no IOMMU to provide DMA translation.
 
 	  If you don't know what to do here, say N.
+endif
+
+config VFIO_SPAPR_EEH
+	tristate
+	depends on EEH && VFIO_IOMMU_SPAPR_TCE
+	default VFIO
+
+config VFIO_VIRQFD
+	tristate
+	select EVENTFD
+	default n
 
 source "drivers/vfio/pci/Kconfig"
 source "drivers/vfio/platform/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 3863922529ef20..b953517dc70f99 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -4,9 +4,9 @@  vfio_virqfd-y := virqfd.o
 obj-$(CONFIG_VFIO) += vfio.o
 
 vfio-y += vfio_main.o \
-	  iova_bitmap.o \
-	  container.o
+	  iova_bitmap.o
 vfio-$(CONFIG_IOMMUFD) += iommufd.o
+vfio-$(CONFIG_VFIO_CONTAINER) += container.o
 
 obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index d57a08afb5cf5c..3378714a746274 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -55,7 +55,9 @@  struct vfio_group {
 	struct list_head		device_list;
 	struct mutex			device_lock;
 	struct list_head		vfio_next;
+#if IS_ENABLED(CONFIG_VFIO_CONTAINER)
 	struct list_head		container_next;
+#endif
 	enum vfio_group_type		type;
 	struct mutex			group_lock;
 	struct kvm			*kvm;
@@ -64,6 +66,7 @@  struct vfio_group {
 	struct iommufd_ctx		*iommufd;
 };
 
+#if IS_ENABLED(CONFIG_VFIO_CONTAINER)
 /* events for the backend driver notify callback */
 enum vfio_iommu_notify_type {
 	VFIO_IOMMU_CONTAINER_CLOSE = 0,
@@ -129,6 +132,68 @@  int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova,
 
 int __init vfio_container_init(void);
 void vfio_container_cleanup(void);
+#else
+static inline struct vfio_container *
+vfio_container_from_file(struct file *filep)
+{
+	return NULL;
+}
+
+static inline int vfio_group_use_container(struct vfio_group *group)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_group_unuse_container(struct vfio_group *group)
+{
+}
+
+static inline int vfio_container_attach_group(struct vfio_container *container,
+					      struct vfio_group *group)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_group_detach_container(struct vfio_group *group)
+{
+}
+
+static inline void vfio_device_container_register(struct vfio_device *device)
+{
+}
+
+static inline void vfio_device_container_unregister(struct vfio_device *device)
+{
+}
+
+static inline int vfio_container_pin_pages(struct vfio_container *container,
+					   struct iommu_group *iommu_group,
+					   dma_addr_t iova, int npage, int prot,
+					   struct page **pages)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void vfio_container_unpin_pages(struct vfio_container *container,
+					      dma_addr_t iova, int npage)
+{
+}
+
+static inline int vfio_container_dma_rw(struct vfio_container *container,
+					dma_addr_t iova, void *data, size_t len,
+					bool write)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int vfio_container_init(void)
+{
+	return 0;
+}
+static inline void vfio_container_cleanup(void)
+{
+}
+#endif
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
 int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx *ictx);