diff mbox series

[v4,02/13] driver core: Set DMA ownership during driver bind/unbind

Message ID 20211217063708.1740334-3-baolu.lu@linux.intel.com (mailing list archive)
State Superseded
Delegated to: Bjorn Helgaas
Headers show
Series Fix BUG_ON in vfio_iommu_group_notifier() | expand

Commit Message

Baolu Lu Dec. 17, 2021, 6:36 a.m. UTC
This extends really_probe() to allow checking for dma ownership conflict
during the driver binding process. By default, the DMA_OWNER_DMA_API is
claimed for the bound driver before calling its .probe() callback. If this
operation fails (e.g. the iommu group of the target device already has the
DMA_OWNER_USER set), the binding process is aborted to avoid breaking the
security contract for devices in the iommu group.

Without this change, the vfio driver has to listen to a bus BOUND_DRIVER
event and then BUG_ON() in case of dma ownership conflict. This leads to
bad user experience since careless driver binding operation may crash the
system if the admin overlooks the group restriction. Aside from bad design,
this leads to a security problem as a root user can force the kernel to
BUG() even with lockdown=integrity.

Driver may set a new flag (suppress_auto_claim_dma_owner) to disable auto
claim in the binding process. Examples include kernel drivers (pci_stub,
PCI bridge drivers, etc.) which don't trigger DMA at all thus can be safely
exempted in DMA ownership check and userspace framework drivers (vfio/vdpa
etc.) which need to manually claim DMA_OWNER_USER when assigning a device
to userspace.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/linux-iommu/20210922123931.GI327412@nvidia.com/
Link: https://lore.kernel.org/linux-iommu/20210928115751.GK964074@nvidia.com/
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/device/driver.h |  2 ++
 drivers/base/dd.c             | 37 ++++++++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 5 deletions(-)

Comments

Greg Kroah-Hartman Dec. 22, 2021, 12:47 p.m. UTC | #1
On Fri, Dec 17, 2021 at 02:36:57PM +0800, Lu Baolu wrote:
> This extends really_probe() to allow checking for dma ownership conflict
> during the driver binding process. By default, the DMA_OWNER_DMA_API is
> claimed for the bound driver before calling its .probe() callback. If this
> operation fails (e.g. the iommu group of the target device already has the
> DMA_OWNER_USER set), the binding process is aborted to avoid breaking the
> security contract for devices in the iommu group.
> 
> Without this change, the vfio driver has to listen to a bus BOUND_DRIVER
> event and then BUG_ON() in case of dma ownership conflict. This leads to
> bad user experience since careless driver binding operation may crash the
> system if the admin overlooks the group restriction. Aside from bad design,
> this leads to a security problem as a root user can force the kernel to
> BUG() even with lockdown=integrity.
> 
> Driver may set a new flag (suppress_auto_claim_dma_owner) to disable auto
> claim in the binding process. Examples include kernel drivers (pci_stub,
> PCI bridge drivers, etc.) which don't trigger DMA at all thus can be safely
> exempted in DMA ownership check and userspace framework drivers (vfio/vdpa
> etc.) which need to manually claim DMA_OWNER_USER when assigning a device
> to userspace.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Link: https://lore.kernel.org/linux-iommu/20210922123931.GI327412@nvidia.com/
> Link: https://lore.kernel.org/linux-iommu/20210928115751.GK964074@nvidia.com/
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>  include/linux/device/driver.h |  2 ++
>  drivers/base/dd.c             | 37 ++++++++++++++++++++++++++++++-----
>  2 files changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
> index a498ebcf4993..f5bf7030c416 100644
> --- a/include/linux/device/driver.h
> +++ b/include/linux/device/driver.h
> @@ -54,6 +54,7 @@ enum probe_type {
>   * @owner:	The module owner.
>   * @mod_name:	Used for built-in modules.
>   * @suppress_bind_attrs: Disables bind/unbind via sysfs.
> + * @suppress_auto_claim_dma_owner: Disable kernel dma auto-claim.
>   * @probe_type:	Type of the probe (synchronous or asynchronous) to use.
>   * @of_match_table: The open firmware table.
>   * @acpi_match_table: The ACPI match table.
> @@ -100,6 +101,7 @@ struct device_driver {
>  	const char		*mod_name;	/* used for built-in modules */
>  
>  	bool suppress_bind_attrs;	/* disables bind/unbind via sysfs */
> +	bool suppress_auto_claim_dma_owner;
>  	enum probe_type probe_type;
>  
>  	const struct of_device_id	*of_match_table;
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 68ea1f949daa..b04eec5dcefa 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -28,6 +28,7 @@
>  #include <linux/pm_runtime.h>
>  #include <linux/pinctrl/devinfo.h>
>  #include <linux/slab.h>
> +#include <linux/iommu.h>
>  
>  #include "base.h"
>  #include "power/power.h"
> @@ -538,6 +539,32 @@ static int call_driver_probe(struct device *dev, struct device_driver *drv)
>  	return ret;
>  }
>  
> +static int device_dma_configure(struct device *dev, struct device_driver *drv)
> +{
> +	int ret;
> +
> +	if (!dev->bus->dma_configure)
> +		return 0;
> +
> +	ret = dev->bus->dma_configure(dev);
> +	if (ret)
> +		return ret;
> +
> +	if (!drv->suppress_auto_claim_dma_owner)
> +		ret = iommu_device_set_dma_owner(dev, DMA_OWNER_DMA_API, NULL);

Wait, the busses that wanted to configure the device, just did so in
their dma_configure callback, so why not do this type of
iommu_device_set_dma_owner() in the few busses that will want this to
happen?

Right now we only have 4 different "busses" that care about this.  Out
of the following callbacks:
	fsl_mc_dma_configure
	host1x_dma_configure
	pci_dma_configure
	platform_dma_configure

Which one will actually care about the iommu_device_set_dma_owner()
call?  All of them?  None of them?  Some of them?

Again, why can't this just happen in the (very few) bus callbacks that
care about this?  In following patches in this series, you turn off this
for the pci_dma_configure users, so what is left?  3 odd bus types that
are not used often.  How well did you test devices of those types with
this patchset?

It's fine to have "suppress" fields when they are the minority, but here
it's a _very_ tiny tiny number of actual devices in a system that will
ever get the chance to have this check happen for them and trigger,
right?

I know others told you to put this in the driver core, but I fail to see
how adding this call to the 3 busses that care about it is a lot more
work than this driver core functionality that we all will have to
maintain for forever?

> +
> +	return ret;
> +}
> +
> +static void device_dma_cleanup(struct device *dev, struct device_driver *drv)
> +{
> +	if (!dev->bus->dma_configure)
> +		return;
> +
> +	if (!drv->suppress_auto_claim_dma_owner)
> +		iommu_device_release_dma_owner(dev, DMA_OWNER_DMA_API);
> +}
> +
>  static int really_probe(struct device *dev, struct device_driver *drv)
>  {
>  	bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
> @@ -574,11 +601,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
>  	if (ret)
>  		goto pinctrl_bind_failed;
>  
> -	if (dev->bus->dma_configure) {
> -		ret = dev->bus->dma_configure(dev);
> -		if (ret)
> -			goto probe_failed;
> -	}
> +	if (device_dma_configure(dev, drv))
> +		goto pinctrl_bind_failed;

Are you sure you are jumping to the proper error path here?  It is not
obvious why you changed this.

thanks,

greg k-h
Jason Gunthorpe Dec. 22, 2021, 5:52 p.m. UTC | #2
On Wed, Dec 22, 2021 at 01:47:34PM +0100, Greg Kroah-Hartman wrote:

> Right now we only have 4 different "busses" that care about this.  Out
> of the following callbacks:
> 	fsl_mc_dma_configure
> 	host1x_dma_configure
> 	pci_dma_configure
> 	platform_dma_configure
> 
> Which one will actually care about the iommu_device_set_dma_owner()
> call?  All of them?  None of them?  Some of them?

You asked this already, and it was answered - all but host1x require
it, and it is harmless for host1x to do it.

> Again, why can't this just happen in the (very few) bus callbacks that
> care about this?  

Because it is not 'very few', it is all but one. This is why HCH and I
both prefer this arrangement.

Especially since host1x is pretty odd. I wasn't able to find where a
host1x driver is doing DMA using the host1x device.. The places I
looked at already doing DMA used a platform device. So I'm not sure
what its host1x_dma_configure is for, or why host1x calls
of_dma_configure() twice..

> In following patches in this series, you turn off this
> for the pci_dma_configure users, so what is left?  

??? Where do you see this?

> I know others told you to put this in the driver core, but I fail to see
> how adding this call to the 3 busses that care about it is a lot more
> work than this driver core functionality that we all will have to
> maintain for forever?

It is 4, you forgot AMBA's re-use of platform_dma_configure.

Why are you asking to duplicate code that has no reason to be
different based on bus type? That seems like bad practice.

No matter where we put this we have to maintain it "forever" not sure
what you are trying to say.

Jason
Baolu Lu Dec. 23, 2021, 2:08 a.m. UTC | #3
Hi Greg,

On 12/22/21 8:47 PM, Greg Kroah-Hartman wrote:
> Which one will actually care about the iommu_device_set_dma_owner()
> call?  All of them?  None of them?  Some of them?
> 
> Again, why can't this just happen in the (very few) bus callbacks that
> care about this?  In following patches in this series, you turn off this
> for the pci_dma_configure users, so what is left?  3 odd bus types that
> are not used often.  How well did you test devices of those types with
> this patchset?
> 
> It's fine to have "suppress" fields when they are the minority, but here
> it's a_very_  tiny tiny number of actual devices in a system that will
> ever get the chance to have this check happen for them and trigger,
> right?

Thank you for your comments. Current VFIO implementation supports
devices on pci/platform/amba/fls-mc buses for user-space DMA. So only
those buses need to call iommu_device_set/release_dma_owner() in their
dma_configure/cleanup() callbacks.

The "suppress" field is only for a few device drivers (not devices), for
example,

- vfio-pci, a PCI device driver used to bind to a PCI device so that it
   could be assigned for user-space DMA.

Other similar drivers in drivers/vfio are vfio-fsl-mc, vfio-amba and
vfio-platform. These drivers will call
iommu_device_set/release_dma_owner(DMA_OWNER_USER) explicitly when the
device is assigned to user.

The logic is that on the affected buses (pci/platform/amba/fls-mc),

- for non-vfio drivers, bus dma_configure/cleanup() will automatically
   call iommu_device_set_dma_owner(KERNEL) for the device; [This is the
   majority cases.]

- for vfio drivers, the auto-call will be suppressed, and the vfio
   drivers are supposed to call iommu_device_set_dma_owner(USER) before
   device is assigned to the userspace. [This is the rare case.]

The KERNEL and USER conflict will be detected in
iommu_device_set_dma_owner() with a -EBUSY return value. In that case,
the driver binding or device assignment should be aborted.

Best regards,
baolu
Baolu Lu Dec. 23, 2021, 3:02 a.m. UTC | #4
Hi Greg,

On 12/22/21 8:47 PM, Greg Kroah-Hartman wrote:
>> +
>> +	return ret;
>> +}
>> +
>> +static void device_dma_cleanup(struct device *dev, struct device_driver *drv)
>> +{
>> +	if (!dev->bus->dma_configure)
>> +		return;
>> +
>> +	if (!drv->suppress_auto_claim_dma_owner)
>> +		iommu_device_release_dma_owner(dev, DMA_OWNER_DMA_API);
>> +}
>> +
>>   static int really_probe(struct device *dev, struct device_driver *drv)
>>   {
>>   	bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
>> @@ -574,11 +601,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
>>   	if (ret)
>>   		goto pinctrl_bind_failed;
>>   
>> -	if (dev->bus->dma_configure) {
>> -		ret = dev->bus->dma_configure(dev);
>> -		if (ret)
>> -			goto probe_failed;
>> -	}
>> +	if (device_dma_configure(dev, drv))
>> +		goto pinctrl_bind_failed;
> Are you sure you are jumping to the proper error path here?  It is not
> obvious why you changed this.

The error handling path in really_probe() seems a bit wrong. For
example,

  572         /* If using pinctrl, bind pins now before probing */
  573         ret = pinctrl_bind_pins(dev);
  574         if (ret)
  575                 goto pinctrl_bind_failed;

[...]

  663 pinctrl_bind_failed:
  664         device_links_no_driver(dev);
  665         devres_release_all(dev);
  666         arch_teardown_dma_ops(dev);
  667         kfree(dev->dma_range_map);
  668         dev->dma_range_map = NULL;
  669         driver_sysfs_remove(dev);
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  670         dev->driver = NULL;
  671         dev_set_drvdata(dev, NULL);
  672         if (dev->pm_domain && dev->pm_domain->dismiss)
  673                 dev->pm_domain->dismiss(dev);
  674         pm_runtime_reinit(dev);
  675         dev_pm_set_driver_flags(dev, 0);
  676 done:
  677         return ret;

The driver_sysfs_remove() will be called even driver_sysfs_add() hasn't
been called yet. I can fix this in a separated patch if I didn't miss
anything.

Best regards,
baolu
Greg Kroah-Hartman Dec. 23, 2021, 7:13 a.m. UTC | #5
On Thu, Dec 23, 2021 at 11:02:54AM +0800, Lu Baolu wrote:
> Hi Greg,
> 
> On 12/22/21 8:47 PM, Greg Kroah-Hartman wrote:
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static void device_dma_cleanup(struct device *dev, struct device_driver *drv)
> > > +{
> > > +	if (!dev->bus->dma_configure)
> > > +		return;
> > > +
> > > +	if (!drv->suppress_auto_claim_dma_owner)
> > > +		iommu_device_release_dma_owner(dev, DMA_OWNER_DMA_API);
> > > +}
> > > +
> > >   static int really_probe(struct device *dev, struct device_driver *drv)
> > >   {
> > >   	bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
> > > @@ -574,11 +601,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
> > >   	if (ret)
> > >   		goto pinctrl_bind_failed;
> > > -	if (dev->bus->dma_configure) {
> > > -		ret = dev->bus->dma_configure(dev);
> > > -		if (ret)
> > > -			goto probe_failed;
> > > -	}
> > > +	if (device_dma_configure(dev, drv))
> > > +		goto pinctrl_bind_failed;
> > Are you sure you are jumping to the proper error path here?  It is not
> > obvious why you changed this.
> 
> The error handling path in really_probe() seems a bit wrong. For
> example,
> 
>  572         /* If using pinctrl, bind pins now before probing */
>  573         ret = pinctrl_bind_pins(dev);
>  574         if (ret)
>  575                 goto pinctrl_bind_failed;
> 
> [...]
> 
>  663 pinctrl_bind_failed:
>  664         device_links_no_driver(dev);
>  665         devres_release_all(dev);
>  666         arch_teardown_dma_ops(dev);
>  667         kfree(dev->dma_range_map);
>  668         dev->dma_range_map = NULL;
>  669         driver_sysfs_remove(dev);
>              ^^^^^^^^^^^^^^^^^^^^^^^^^
>  670         dev->driver = NULL;
>  671         dev_set_drvdata(dev, NULL);
>  672         if (dev->pm_domain && dev->pm_domain->dismiss)
>  673                 dev->pm_domain->dismiss(dev);
>  674         pm_runtime_reinit(dev);
>  675         dev_pm_set_driver_flags(dev, 0);
>  676 done:
>  677         return ret;
> 
> The driver_sysfs_remove() will be called even driver_sysfs_add() hasn't
> been called yet. I can fix this in a separated patch if I didn't miss
> anything.

If this is a bug in the existing kernel, please submit it as a separate
patch so that it can be properly backported to all affected kernels.
Never bury it in an unrelated change that will never get sent to older
kernels.

greg k-h
Baolu Lu Dec. 23, 2021, 7:23 a.m. UTC | #6
On 12/23/21 3:13 PM, Greg Kroah-Hartman wrote:
> On Thu, Dec 23, 2021 at 11:02:54AM +0800, Lu Baolu wrote:
>> Hi Greg,
>>
>> On 12/22/21 8:47 PM, Greg Kroah-Hartman wrote:
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static void device_dma_cleanup(struct device *dev, struct device_driver *drv)
>>>> +{
>>>> +	if (!dev->bus->dma_configure)
>>>> +		return;
>>>> +
>>>> +	if (!drv->suppress_auto_claim_dma_owner)
>>>> +		iommu_device_release_dma_owner(dev, DMA_OWNER_DMA_API);
>>>> +}
>>>> +
>>>>    static int really_probe(struct device *dev, struct device_driver *drv)
>>>>    {
>>>>    	bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
>>>> @@ -574,11 +601,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
>>>>    	if (ret)
>>>>    		goto pinctrl_bind_failed;
>>>> -	if (dev->bus->dma_configure) {
>>>> -		ret = dev->bus->dma_configure(dev);
>>>> -		if (ret)
>>>> -			goto probe_failed;
>>>> -	}
>>>> +	if (device_dma_configure(dev, drv))
>>>> +		goto pinctrl_bind_failed;
>>> Are you sure you are jumping to the proper error path here?  It is not
>>> obvious why you changed this.
>> The error handling path in really_probe() seems a bit wrong. For
>> example,
>>
>>   572         /* If using pinctrl, bind pins now before probing */
>>   573         ret = pinctrl_bind_pins(dev);
>>   574         if (ret)
>>   575                 goto pinctrl_bind_failed;
>>
>> [...]
>>
>>   663 pinctrl_bind_failed:
>>   664         device_links_no_driver(dev);
>>   665         devres_release_all(dev);
>>   666         arch_teardown_dma_ops(dev);
>>   667         kfree(dev->dma_range_map);
>>   668         dev->dma_range_map = NULL;
>>   669         driver_sysfs_remove(dev);
>>               ^^^^^^^^^^^^^^^^^^^^^^^^^
>>   670         dev->driver = NULL;
>>   671         dev_set_drvdata(dev, NULL);
>>   672         if (dev->pm_domain && dev->pm_domain->dismiss)
>>   673                 dev->pm_domain->dismiss(dev);
>>   674         pm_runtime_reinit(dev);
>>   675         dev_pm_set_driver_flags(dev, 0);
>>   676 done:
>>   677         return ret;
>>
>> The driver_sysfs_remove() will be called even driver_sysfs_add() hasn't
>> been called yet. I can fix this in a separated patch if I didn't miss
>> anything.
> If this is a bug in the existing kernel, please submit it as a separate
> patch so that it can be properly backported to all affected kernels.
> Never bury it in an unrelated change that will never get sent to older
> kernels.

Sure! I will. Thank you!

Best regards,
baolu
Jason Gunthorpe Dec. 31, 2021, 12:36 a.m. UTC | #7
On Thu, Dec 23, 2021 at 03:23:54PM +0800, Lu Baolu wrote:

> > If this is a bug in the existing kernel, please submit it as a separate
> > patch so that it can be properly backported to all affected kernels.
> > Never bury it in an unrelated change that will never get sent to older
> > kernels.
> 
> Sure! I will. Thank you!

I recall looking at this some time ago, and yes the ordering is not in
the strict pairwise error unwind one would expect, the extra calls
turned out to be harmless. Do check carefully..

Jason
diff mbox series

Patch

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index a498ebcf4993..f5bf7030c416 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -54,6 +54,7 @@  enum probe_type {
  * @owner:	The module owner.
  * @mod_name:	Used for built-in modules.
  * @suppress_bind_attrs: Disables bind/unbind via sysfs.
+ * @suppress_auto_claim_dma_owner: Disable kernel dma auto-claim.
  * @probe_type:	Type of the probe (synchronous or asynchronous) to use.
  * @of_match_table: The open firmware table.
  * @acpi_match_table: The ACPI match table.
@@ -100,6 +101,7 @@  struct device_driver {
 	const char		*mod_name;	/* used for built-in modules */
 
 	bool suppress_bind_attrs;	/* disables bind/unbind via sysfs */
+	bool suppress_auto_claim_dma_owner;
 	enum probe_type probe_type;
 
 	const struct of_device_id	*of_match_table;
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 68ea1f949daa..b04eec5dcefa 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -28,6 +28,7 @@ 
 #include <linux/pm_runtime.h>
 #include <linux/pinctrl/devinfo.h>
 #include <linux/slab.h>
+#include <linux/iommu.h>
 
 #include "base.h"
 #include "power/power.h"
@@ -538,6 +539,32 @@  static int call_driver_probe(struct device *dev, struct device_driver *drv)
 	return ret;
 }
 
+static int device_dma_configure(struct device *dev, struct device_driver *drv)
+{
+	int ret;
+
+	if (!dev->bus->dma_configure)
+		return 0;
+
+	ret = dev->bus->dma_configure(dev);
+	if (ret)
+		return ret;
+
+	if (!drv->suppress_auto_claim_dma_owner)
+		ret = iommu_device_set_dma_owner(dev, DMA_OWNER_DMA_API, NULL);
+
+	return ret;
+}
+
+static void device_dma_cleanup(struct device *dev, struct device_driver *drv)
+{
+	if (!dev->bus->dma_configure)
+		return;
+
+	if (!drv->suppress_auto_claim_dma_owner)
+		iommu_device_release_dma_owner(dev, DMA_OWNER_DMA_API);
+}
+
 static int really_probe(struct device *dev, struct device_driver *drv)
 {
 	bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
@@ -574,11 +601,8 @@  static int really_probe(struct device *dev, struct device_driver *drv)
 	if (ret)
 		goto pinctrl_bind_failed;
 
-	if (dev->bus->dma_configure) {
-		ret = dev->bus->dma_configure(dev);
-		if (ret)
-			goto probe_failed;
-	}
+	if (device_dma_configure(dev, drv))
+		goto pinctrl_bind_failed;
 
 	ret = driver_sysfs_add(dev);
 	if (ret) {
@@ -660,6 +684,8 @@  static int really_probe(struct device *dev, struct device_driver *drv)
 	if (dev->bus)
 		blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
 					     BUS_NOTIFY_DRIVER_NOT_BOUND, dev);
+
+	device_dma_cleanup(dev, drv);
 pinctrl_bind_failed:
 	device_links_no_driver(dev);
 	devres_release_all(dev);
@@ -1204,6 +1230,7 @@  static void __device_release_driver(struct device *dev, struct device *parent)
 		else if (drv->remove)
 			drv->remove(dev);
 
+		device_dma_cleanup(dev, drv);
 		device_links_driver_cleanup(dev);
 
 		devres_release_all(dev);