[v7,01/11] iommu: Add DMA ownership management interfaces

Message ID 20220228005056.599595-2-baolu.lu@linux.intel.com (mailing list archive)
State New, archived
Series Fix BUG_ON in vfio_iommu_group_notifier()

Commit Message

Baolu Lu Feb. 28, 2022, 12:50 a.m. UTC
Multiple devices may be placed in the same IOMMU group because they
cannot be isolated from each other. These devices must either be
entirely under kernel control or userspace control, never a mixture.

This adds DMA ownership management to the IOMMU core and exposes
several interfaces to device drivers and to the userspace device
assignment framework (i.e. VFIO), so that any conflict between user-
and kernel-controlled DMA can be detected up front.

The device-driver-oriented interfaces are:

	int iommu_device_use_default_domain(struct device *dev);
	void iommu_device_unuse_default_domain(struct device *dev);

By calling iommu_device_use_default_domain(), a device driver tells
the IOMMU layer that the device's DMA is handled through the kernel
DMA API. The IOMMU layer will then manage the IOVA space and use the
default domain for DMA address translation.
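
For illustration only (the bus-level wiring is done in later patches
of this series), a minimal sketch of how a bus layer's DMA configure
and cleanup paths might use these calls around driver binding; the
example_bus_* names are hypothetical:

	#include <linux/device.h>
	#include <linux/iommu.h>

	static int example_bus_dma_configure(struct device *dev)
	{
		int ret;

		/* Record that this device's DMA stays under kernel control. */
		ret = iommu_device_use_default_domain(dev);
		if (ret)
			dev_warn(dev, "device DMA is claimed by another owner\n");

		return ret;
	}

	static void example_bus_dma_cleanup(struct device *dev)
	{
		/* Balance the use_default_domain() call at driver unbind. */
		iommu_device_unuse_default_domain(dev);
	}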

The interfaces oriented to the userspace device assignment framework are:

	int iommu_group_claim_dma_owner(struct iommu_group *group,
					void *owner);
	void iommu_group_release_dma_owner(struct iommu_group *group);
	bool iommu_group_dma_owner_claimed(struct iommu_group *group);

Userspace device assignment must be disallowed if the DMA ownership
claiming interface returns failure.
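
For illustration only (the actual VFIO conversion happens later in
this series), a rough sketch of how a group-level user might claim and
release ownership; the vfio_example_* names are hypothetical:

	#include <linux/iommu.h>

	static int vfio_example_group_open(struct iommu_group *group,
					   void *owner_cookie)
	{
		int ret;

		/* Refuse user access while a kernel driver owns the DMA. */
		ret = iommu_group_claim_dma_owner(group, owner_cookie);
		if (ret)
			return ret;

		/* ... expose the group to userspace ... */
		return 0;
	}

	static void vfio_example_group_close(struct iommu_group *group)
	{
		/* Re-attaches the default domain if one exists. */
		iommu_group_release_dma_owner(group);
	}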

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/iommu.h |  31 +++++++++
 drivers/iommu/iommu.c | 153 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 181 insertions(+), 3 deletions(-)

Comments

Eric Auger March 4, 2022, 10:34 a.m. UTC | #1
Hi Lu,

On 2/28/22 1:50 AM, Lu Baolu wrote:
> Multiple devices may be placed in the same IOMMU group because they
> cannot be isolated from each other. These devices must either be
> entirely under kernel control or userspace control, never a mixture.
>
> This adds dma ownership management in iommu core and exposes several
> interfaces for the device drivers and the device userspace assignment
> framework (i.e. VFIO), so that any conflict between user and kernel
> controlled dma could be detected at the beginning.
>
> The device driver oriented interfaces are,
>
> 	int iommu_device_use_default_domain(struct device *dev);
> 	void iommu_device_unuse_default_domain(struct device *dev);
>
> By calling iommu_device_use_default_domain(), the device driver tells
> the iommu layer that the device dma is handled through the kernel DMA
> APIs. The iommu layer will manage the IOVA and use the default domain
> for DMA address translation.
>
> The device user-space assignment framework oriented interfaces are,
>
> 	int iommu_group_claim_dma_owner(struct iommu_group *group,
> 					void *owner);
> 	void iommu_group_release_dma_owner(struct iommu_group *group);
> 	bool iommu_group_dma_owner_claimed(struct iommu_group *group);
>
> The device userspace assignment must be disallowed if the DMA owner
> claiming interface returns failure.
>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>  include/linux/iommu.h |  31 +++++++++
>  drivers/iommu/iommu.c | 153 +++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 181 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 9208eca4b0d1..77972ef978b5 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -675,6 +675,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
>  void iommu_sva_unbind_device(struct iommu_sva *handle);
>  u32 iommu_sva_get_pasid(struct iommu_sva *handle);
>  
> +int iommu_device_use_default_domain(struct device *dev);
> +void iommu_device_unuse_default_domain(struct device *dev);
> +
> +int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner);
> +void iommu_group_release_dma_owner(struct iommu_group *group);
> +bool iommu_group_dma_owner_claimed(struct iommu_group *group);
> +
>  #else /* CONFIG_IOMMU_API */
>  
>  struct iommu_ops {};
> @@ -1031,6 +1038,30 @@ static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
>  {
>  	return NULL;
>  }
> +
> +static inline int iommu_device_use_default_domain(struct device *dev)
> +{
> +	return 0;
> +}
> +
> +static inline void iommu_device_unuse_default_domain(struct device *dev)
> +{
> +}
> +
> +static inline int
> +iommu_group_claim_dma_owner(struct iommu_group *group, void *owner)
> +{
> +	return -ENODEV;
> +}
> +
> +static inline void iommu_group_release_dma_owner(struct iommu_group *group)
> +{
> +}
> +
> +static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group)
> +{
> +	return false;
> +}
>  #endif /* CONFIG_IOMMU_API */
>  
>  /**
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index f2c45b85b9fc..eba8e8ccf19d 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -48,6 +48,8 @@ struct iommu_group {
>  	struct iommu_domain *default_domain;
>  	struct iommu_domain *domain;
>  	struct list_head entry;
> +	unsigned int owner_cnt;
> +	void *owner;
>  };
>  
>  struct group_device {
> @@ -294,7 +296,11 @@ int iommu_probe_device(struct device *dev)
>  	mutex_lock(&group->mutex);
>  	iommu_alloc_default_domain(group, dev);
>  
> -	if (group->default_domain) {
> +	/*
> +	 * If device joined an existing group which has been claimed, don't
> +	 * attach the default domain.
> +	 */
> +	if (group->default_domain && !group->owner) {
>  		ret = __iommu_attach_device(group->default_domain, dev);
>  		if (ret) {
>  			mutex_unlock(&group->mutex);
> @@ -2109,7 +2115,7 @@ static int __iommu_attach_group(struct iommu_domain *domain,
>  {
>  	int ret;
>  
> -	if (group->default_domain && group->domain != group->default_domain)
> +	if (group->domain && group->domain != group->default_domain)
>  		return -EBUSY;
>  
>  	ret = __iommu_group_for_each_dev(group, domain,
> @@ -2146,7 +2152,11 @@ static void __iommu_detach_group(struct iommu_domain *domain,
>  {
>  	int ret;
>  
> -	if (!group->default_domain) {
> +	/*
> +	 * If the group has been claimed already, do not re-attach the default
> +	 * domain.
> +	 */
> +	if (!group->default_domain || group->owner) {
>  		__iommu_group_for_each_dev(group, domain,
>  					   iommu_group_do_detach_device);
>  		group->domain = NULL;
> @@ -3095,3 +3105,140 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
>  
>  	return ret;
>  }
> +
> +/**
> + * iommu_device_use_default_domain() - Device driver wants to handle device
> + *                                     DMA through the kernel DMA API.
> + * @dev: The device.
> + *
> + * The device driver about to bind @dev wants to do DMA through the kernel
> + * DMA API. Return 0 if it is allowed, otherwise an error.
> + */
> +int iommu_device_use_default_domain(struct device *dev)
> +{
> +	struct iommu_group *group = iommu_group_get(dev);
> +	int ret = 0;
> +
> +	if (!group)
> +		return 0;
I hit a WARN_ON() when unbinding an e1000e driver just after boot:

sudo modprobe -v vfio-pci
echo vfio-pci | sudo tee -a
/sys/bus/pci/devices/0004:01:00.0/driver_override
vfio-pci
echo 0004:01:00.0 | sudo tee -a  /sys/bus/pci/drivers/e1000e/unbind


[  390.042811] ------------[ cut here ]------------
[  390.046468] WARNING: CPU: 42 PID: 5589 at drivers/iommu/iommu.c:3123
iommu_device_unuse_default_domain+0x68/0x100
[  390.056710] Modules linked in: vfio_pci vfio_pci_core vfio_virqfd
vfio_iommu_type1 vfio xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT
nf_reject_ipv4 nft_compat nft_chain_nat nf_nat nf_conntrack
nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink bridge stp llc rfkill
sunrpc vfat fat mlx5_ib ib_uverbs ib_core acpi_ipmi ipmi_ssif
ipmi_devintf ipmi_msghandler cppc_cpufreq drm xfs libcrc32c mlx5_core sg
mlxfw crct10dif_ce tls ghash_ce sha2_ce sha256_arm64 sha1_ce sbsa_gwdt
e1000e psample sdhci_acpi ahci_platform sdhci libahci_platform qcom_emac
mmc_core hdma hdma_mgmt dm_mirror dm_region_hash dm_log dm_mod fuse
[  390.110618] CPU: 42 PID: 5589 Comm: tee Kdump: loaded Not tainted
5.17.0-rc4-lu-v7-official+ #24
[  390.119384] Hardware name: WIWYNN QDF2400 Reference Evaluation
Platform CV90-LA115-P120/QDF2400 Customer Reference Board, BIOS 0ACJA570
11/05/2018
[  390.132492] pstate: a0400005 (NzCv daif +PAN -UAO -TCO -DIT -SSBS
BTYPE=--)
[  390.139436] pc : iommu_device_unuse_default_domain+0x68/0x100
[  390.145165] lr : iommu_device_unuse_default_domain+0x38/0x100
[  390.150894] sp : ffff80000fbb3bc0
[  390.154193] x29: ffff80000fbb3bc0 x28: ffff03c0cf6b2400 x27:
0000000000000000
[  390.161311] x26: 0000000000000000 x25: 0000000000000000 x24:
ffff03c0c7cc5720
[  390.168429] x23: ffff03c0c2b9d150 x22: ffffb4e61df223f8 x21:
ffffb4e61df223f8
[  390.175547] x20: ffff03c7c03c3758 x19: ffff03c7c03c3700 x18:
0000000000000000
[  390.182665] x17: 0000000000000000 x16: 0000000000000000 x15:
0000000000000000
[  390.189783] x14: 0000000000000000 x13: 0000000000000030 x12:
ffff03c0d519cd80
[  390.196901] x11: 7f7f7f7f7f7f7f7f x10: 0000000000000dc0 x9 :
ffffb4e620b54f8c
[  390.204019] x8 : ffff03c0cf6b3220 x7 : ffff4ef132bba000 x6 :
00000000000000ff
[  390.211137] x5 : ffff03c0c2b9f108 x4 : ffff03c0d51f6438 x3 :
0000000000000000
[  390.218255] x2 : ffff03c0cf6b2400 x1 : 0000000000000000 x0 :
0000000000000000
[  390.225374] Call trace:
[  390.227804]  iommu_device_unuse_default_domain+0x68/0x100
[  390.233187]  pci_dma_cleanup+0x38/0x44
[  390.236919]  __device_release_driver+0x1a8/0x260
[  390.241519]  device_driver_detach+0x50/0xd0
[  390.245686]  unbind_store+0xf8/0x120
[  390.249245]  drv_attr_store+0x30/0x44
[  390.252891]  sysfs_kf_write+0x50/0x60
[  390.256537]  kernfs_fop_write_iter+0x134/0x1cc
[  390.260964]  new_sync_write+0xf0/0x18c
[  390.264696]  vfs_write+0x230/0x2d0
[  390.268082]  ksys_write+0x74/0x100
[  390.271467]  __arm64_sys_write+0x28/0x3c
[  390.275373]  invoke_syscall.constprop.0+0x58/0xf0
[  390.280061]  el0_svc_common.constprop.0+0x160/0x164
[  390.284922]  do_el0_svc+0x34/0xcc
[  390.288221]  el0_svc+0x30/0x140
[  390.291346]  el0t_64_sync_handler+0xa4/0x130
[  390.295599]  el0t_64_sync+0x1a0/0x1a4
[  390.299245] ---[ end trace 0000000000000000 ]---


I put some traces in the code and I can see that iommu_device_use_default_domain() is indeed called for the 0004:01:00.0 e1000e device from pci_dma_configure(), but at that time the iommu group is NULL:
[   10.569427] e1000e 0004:01:00.0: ------ ENTRY pci_dma_configure driver_managed_area=0
[   10.569431] e1000e 0004:01:00.0: **** iommu_device_use_default_domain ENTRY
[   10.569433] e1000e 0004:01:00.0: **** iommu_device_use_default_domain no group
[   10.569435] e1000e 0004:01:00.0: pci_dma_configure iommu_device_use_default_domain returned 0
[   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3

^^^ the group is added after the iommu_device_use_default_domain() call,
so the group->owner_cnt is not incremented as expected.

Thanks

Eric

> [...]
Baolu Lu March 4, 2022, 10:43 a.m. UTC | #2
Hi Eric,

On 2022/3/4 18:34, Eric Auger wrote:
> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
> 
> sudo modprobe -v vfio-pci
> echo vfio-pci | sudo tee -a
> /sys/bus/pci/devices/0004:01:00.0/driver_override
> vfio-pci
> echo 0004:01:00.0 | sudo tee -a  /sys/bus/pci/drivers/e1000e/unbind
>
> [...]
>
> I put some traces in the code and I can see that iommu_device_use_default_domain() effectively is called on 0004:01:00.0 e1000e device on pci_dma_configure() but at that time the iommu group is NULL:
> [   10.569427] e1000e 0004:01:00.0: ------ ENTRY pci_dma_configure driver_managed_area=0
> [   10.569431] e1000e 0004:01:00.0: **** iommu_device_use_default_domain ENTRY
> [   10.569433] e1000e 0004:01:00.0: **** iommu_device_use_default_domain no group
> [   10.569435] e1000e 0004:01:00.0: pci_dma_configure iommu_device_use_default_domain returned 0
> [   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3
> 
> ^^^the group is added after the
> iommu_device_use_default_domain() call
> So the group->owner_cnt is not incremented as expected.

Thank you for reporting this. Do you have any idea why the driver is
loaded before iommu_probe_device()?

Best regards,
baolu
Robin Murphy March 4, 2022, 12:22 p.m. UTC | #3
On 2022-03-04 10:43, Lu Baolu wrote:
> Hi Eric,
> 
> On 2022/3/4 18:34, Eric Auger wrote:
>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>
>> [...]
>>
>> [   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3
>>
>> ^^^the group is added after the
>> iommu_device_use_default_domain() call
>> So the group->owner_cnt is not incremented as expected.
> 
> Thank you for reporting this. Do you have any idea why the driver is
> loaded before iommu_probe_device()?

Urgh, this is the horrible firmware-data-ordering thing again. The stuff 
I've been saying about having to rework the whole .dma_configure 
mechanism in the near future is to fix this properly.

The summary is that in patch #4, calling 
iommu_device_use_default_domain() *before* {of,acpi}_dma_configure is 
currently a problem. As things stand, the IOMMU driver ignored the 
initial iommu_probe_device() call when the device was added, since at 
that point it had no fwspec yet. In this situation, 
{of,acpi}_iommu_configure() are retriggering iommu_probe_device() after 
the IOMMU driver has seen the firmware data via .of_xlate to learn that 
it is actually responsible for the given device.

Robin.
Eric Auger March 4, 2022, 1:55 p.m. UTC | #4
Hi Robin,

On 3/4/22 1:22 PM, Robin Murphy wrote:
> On 2022-03-04 10:43, Lu Baolu wrote:
>> Hi Eric,
>>
>> On 2022/3/4 18:34, Eric Auger wrote:
>>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>>
>>> [...]
>>>
>>> [   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3
>>>
>>> ^^^the group is added after the
>>> iommu_device_use_default_domain() call
>>> So the group->owner_cnt is not incremented as expected.
>>
>> Thank you for reporting this. Do you have any idea why the driver is
>> loaded before iommu_probe_device()?
>
> Urgh, this is the horrible firmware-data-ordering thing again. The
> stuff I've been saying about having to rework the whole .dma_configure
> mechanism in the near future is to fix this properly.
>
> The summary is that in patch #4, calling
> iommu_device_use_default_domain() *before* {of,acpi}_dma_configure is
> currently a problem. As things stand, the IOMMU driver ignored the
> initial iommu_probe_device() call when the device was added, since at
> that point it had no fwspec yet. In this situation,
> {of,acpi}_iommu_configure() are retriggering iommu_probe_device()
> after the IOMMU driver has seen the firmware data via .of_xlate to
> learn that it is actually responsible for the given device.

thank you for providing the info. Hope this is something Lu can work around.

Thanks!

Eric
>
> Robin.
>
Robin Murphy March 4, 2022, 2:10 p.m. UTC | #5
On 2022-03-04 13:55, Eric Auger wrote:
> Hi Robin,
> 
> On 3/4/22 1:22 PM, Robin Murphy wrote:
>> On 2022-03-04 10:43, Lu Baolu wrote:
>>> Hi Eric,
>>>
>>> On 2022/3/4 18:34, Eric Auger wrote:
>>>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>>>
>>>> [...]
>>>>
>>>> [   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3
>>>>
>>>> ^^^the group is added after the
>>>> iommu_device_use_default_domain() call
>>>> So the group->owner_cnt is not incremented as expected.
>>>
>>> Thank you for reporting this. Do you have any idea why the driver is
>>> loaded before iommu_probe_device()?
>>
>> Urgh, this is the horrible firmware-data-ordering thing again. The
>> stuff I've been saying about having to rework the whole .dma_configure
>> mechanism in the near future is to fix this properly.
>>
>> The summary is that in patch #4, calling
>> iommu_device_use_default_domain() *before* {of,acpi}_dma_configure is
>> currently a problem. As things stand, the IOMMU driver ignored the
>> initial iommu_probe_device() call when the device was added, since at
>> that point it had no fwspec yet. In this situation,
>> {of,acpi}_iommu_configure() are retriggering iommu_probe_device()
>> after the IOMMU driver has seen the firmware data via .of_xlate to
>> learn that it is actually responsible for the given device.
> 
> thank you for providing the info. Hope this is something Lu can work around.

Hopefully it's just a case of flipping the calls around, so that 
iommu_use_default_domain() goes at the end, and calls 
arch_teardown_dma_ops() if it fails. From a quick skim I *think* that 
should still work out to the desired behaviour (or at least close enough 
that we can move forward without a circular dependency between fixes...)

Robin.
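
For reference, a rough sketch of the reordering suggested above
(illustrative only; the actual change lives in patch #4 of the v8
series and may differ in detail): parse the firmware data first, so
that iommu_probe_device() has run and the group exists before
kernel-DMA ownership is recorded, and tear the DMA ops back down if
the claim fails.

	static int pci_dma_configure(struct device *dev)
	{
		struct device *bridge = pci_get_host_bridge_device(to_pci_dev(dev));
		int ret = 0;

		/* Parse firmware data first; this may retrigger iommu_probe_device(). */
		if (IS_ENABLED(CONFIG_OF) && bridge->parent &&
		    bridge->parent->of_node) {
			ret = of_dma_configure(dev, bridge->parent->of_node, true);
		} else if (has_acpi_companion(bridge)) {
			struct acpi_device *adev = to_acpi_device_node(bridge->fwnode);

			ret = acpi_dma_configure(dev, acpi_get_dma_attr(adev));
		}
		pci_put_host_bridge_device(bridge);

		/*
		 * Only now record kernel-DMA ownership; the group exists by this
		 * point. (The series additionally skips this for drivers that
		 * manage device DMA themselves.)
		 */
		if (!ret) {
			ret = iommu_device_use_default_domain(dev);
			if (ret)
				arch_teardown_dma_ops(dev);
		}

		return ret;
	}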
Baolu Lu March 7, 2022, 3:27 a.m. UTC | #6
Hi Robin,

On 3/4/22 10:10 PM, Robin Murphy wrote:
> On 2022-03-04 13:55, Eric Auger wrote:
>> Hi Robin,
>>
>> On 3/4/22 1:22 PM, Robin Murphy wrote:
>>> On 2022-03-04 10:43, Lu Baolu wrote:
>>>> Hi Eric,
>>>>
>>>> On 2022/3/4 18:34, Eric Auger wrote:
>>>>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>>>>
>>>>> [...]
>>>>>
>>>>> [   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3
>>>>>
>>>>> ^^^the group is added after the
>>>>> iommu_device_use_default_domain() call
>>>>> So the group->owner_cnt is not incremented as expected.
>>>>
>>>> Thank you for reporting this. Do you have any idea why the driver is
>>>> loaded before iommu_probe_device()?
>>>
>>> Urgh, this is the horrible firmware-data-ordering thing again. The
>>> stuff I've been saying about having to rework the whole .dma_configure
>>> mechanism in the near future is to fix this properly.
>>>
>>> The summary is that in patch #4, calling
>>> iommu_device_use_default_domain() *before* {of,acpi}_dma_configure is
>>> currently a problem. As things stand, the IOMMU driver ignored the
>>> initial iommu_probe_device() call when the device was added, since at
>>> that point it had no fwspec yet. In this situation,
>>> {of,acpi}_iommu_configure() are retriggering iommu_probe_device()
>>> after the IOMMU driver has seen the firmware data via .of_xlate to
>>> learn that it is actually responsible for the given device.
>>
>> thank you for providing the info. Hope this is something Lu can work 
>> around.
> 
> Hopefully it's just a case of flipping the calls around, so that 
> iommu_use_default_domain() goes at the end, and calls 
> arch_teardown_dma_ops() if it fails. From a quick skim I *think* that 
> should still work out to the desired behaviour (or at least close enough 
> that we can move forward without a circular dependency between fixes...)

This is a reasonable solution to me. Thank you for the information and
suggestion.

Eric, I have updated the patch #4 and uploaded a new version here:

https://github.com/LuBaolu/intel-iommu/commits/iommu-dma-ownership-v8

Can you please give it a try?

Best regards,
baolu
Eric Auger March 7, 2022, 12:42 p.m. UTC | #7
Hi Lu,

On 3/7/22 4:27 AM, Lu Baolu wrote:
> Hi Robin,
>
> On 3/4/22 10:10 PM, Robin Murphy wrote:
>> On 2022-03-04 13:55, Eric Auger wrote:
>>> Hi Robin,
>>>
>>> On 3/4/22 1:22 PM, Robin Murphy wrote:
>>>> On 2022-03-04 10:43, Lu Baolu wrote:
>>>>> Hi Eric,
>>>>>
>>>>> On 2022/3/4 18:34, Eric Auger wrote:
>>>>>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>>>>>
>>>>>> [...]
>>>>>>
>>>>>> [   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3
>>>>>>
>>>>>> ^^^the group is added after the
>>>>>> iommu_device_use_default_domain() call
>>>>>> So the group->owner_cnt is not incremented as expected.
>>>>>
>>>>> Thank you for reporting this. Do you have any idea why the driver is
>>>>> loaded before iommu_probe_device()?
>>>>
>>>> Urgh, this is the horrible firmware-data-ordering thing again. The
>>>> stuff I've been saying about having to rework the whole .dma_configure
>>>> mechanism in the near future is to fix this properly.
>>>>
>>>> The summary is that in patch #4, calling
>>>> iommu_device_use_default_domain() *before* {of,acpi}_dma_configure is
>>>> currently a problem. As things stand, the IOMMU driver ignored the
>>>> initial iommu_probe_device() call when the device was added, since at
>>>> that point it had no fwspec yet. In this situation,
>>>> {of,acpi}_iommu_configure() are retriggering iommu_probe_device()
>>>> after the IOMMU driver has seen the firmware data via .of_xlate to
>>>> learn that it is actually responsible for the given device.
>>>
>>> thank you for providing the info. Hope this is something Lu can work
>>> around.
>>
>> Hopefully it's just a case of flipping the calls around, so that
>> iommu_use_default_domain() goes at the end, and calls
>> arch_teardown_dma_ops() if it fails. From a quick skim I *think* that
>> should still work out to the desired behaviour (or at least close
>> enough that we can move forward without a circular dependency between
>> fixes...)
>
> This is a reasonable solution to me. Thank you for the information and
> suggestion.
>
> Eric, I have updated the patch #4 and uploaded a new version here:
>
> https://github.com/LuBaolu/intel-iommu/commits/iommu-dma-ownership-v8

with v8 I do not hit the warning anymore and the owner accounting seems
to work as expected.

Thanks

Eric
>
> Can you please give it a try?
>
> Best regards,
> baolu
>
Baolu Lu March 8, 2022, 12:31 a.m. UTC | #8
On 2022/3/7 20:42, Eric Auger wrote:
> Hi Lu,
> 
> On 3/7/22 4:27 AM, Lu Baolu wrote:
>> Hi Robin,
>>
>> On 3/4/22 10:10 PM, Robin Murphy wrote:
>>> On 2022-03-04 13:55, Eric Auger wrote:
>>>> Hi Robin,
>>>>
>>>> On 3/4/22 1:22 PM, Robin Murphy wrote:
>>>>> On 2022-03-04 10:43, Lu Baolu wrote:
>>>>>> Hi Eric,
>>>>>>
>>>>>> On 2022/3/4 18:34, Eric Auger wrote:
>>>>>>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>>>>>>
>>>>>>> [...]
>>>>>>>
>>>>>>> [   10.569492] e1000e 0004:01:00.0: Adding to iommu group 3
>>>>>>>
>>>>>>> ^^^the group is added after the
>>>>>>> iommu_device_use_default_domain() call
>>>>>>> So the group->owner_cnt is not incremented as expected.
>>>>>>
>>>>>> Thank you for reporting this. Do you have any idea why the driver is
>>>>>> loaded before iommu_probe_device()?
>>>>>
>>>>> Urgh, this is the horrible firmware-data-ordering thing again. The
>>>>> stuff I've been saying about having to rework the whole .dma_configure
>>>>> mechanism in the near future is to fix this properly.
>>>>>
>>>>> The summary is that in patch #4, calling
>>>>> iommu_device_use_default_domain() *before* {of,acpi}_dma_configure is
>>>>> currently a problem. As things stand, the IOMMU driver ignored the
>>>>> initial iommu_probe_device() call when the device was added, since at
>>>>> that point it had no fwspec yet. In this situation,
>>>>> {of,acpi}_iommu_configure() are retriggering iommu_probe_device()
>>>>> after the IOMMU driver has seen the firmware data via .of_xlate to
>>>>> learn that it is actually responsible for the given device.
>>>>
>>>> thank you for providing the info. Hope this is something Lu can work
>>>> around.
>>>
>>> Hopefully it's just a case of flipping the calls around, so that
>>> iommu_use_default_domain() goes at the end, and calls
>>> arch_teardown_dma_ops() if it fails. From a quick skim I *think* that
>>> should still work out to the desired behaviour (or at least close
>>> enough that we can move forward without a circular dependency between
>>> fixes...)
>>
>> This is a reasonable solution to me. Thank you for the information and
>> suggestion.
>>
>> Eric, I have updated the patch #4 and uploaded a new version here:
>>
>> https://github.com/LuBaolu/intel-iommu/commits/iommu-dma-ownership-v8
> 
> with v8 I do not hit the warning anymore and the owner accounting seems
> to work as expected.

Thank you, Eric! I will post the v8 soon.

Best regards,
baolu

Patch

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9208eca4b0d1..77972ef978b5 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -675,6 +675,13 @@  struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 void iommu_sva_unbind_device(struct iommu_sva *handle);
 u32 iommu_sva_get_pasid(struct iommu_sva *handle);
 
+int iommu_device_use_default_domain(struct device *dev);
+void iommu_device_unuse_default_domain(struct device *dev);
+
+int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner);
+void iommu_group_release_dma_owner(struct iommu_group *group);
+bool iommu_group_dma_owner_claimed(struct iommu_group *group);
+
 #else /* CONFIG_IOMMU_API */
 
 struct iommu_ops {};
@@ -1031,6 +1038,30 @@  static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
 {
 	return NULL;
 }
+
+static inline int iommu_device_use_default_domain(struct device *dev)
+{
+	return 0;
+}
+
+static inline void iommu_device_unuse_default_domain(struct device *dev)
+{
+}
+
+static inline int
+iommu_group_claim_dma_owner(struct iommu_group *group, void *owner)
+{
+	return -ENODEV;
+}
+
+static inline void iommu_group_release_dma_owner(struct iommu_group *group)
+{
+}
+
+static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group)
+{
+	return false;
+}
 #endif /* CONFIG_IOMMU_API */
 
 /**
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index f2c45b85b9fc..eba8e8ccf19d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -48,6 +48,8 @@  struct iommu_group {
 	struct iommu_domain *default_domain;
 	struct iommu_domain *domain;
 	struct list_head entry;
+	unsigned int owner_cnt;
+	void *owner;
 };
 
 struct group_device {
@@ -294,7 +296,11 @@  int iommu_probe_device(struct device *dev)
 	mutex_lock(&group->mutex);
 	iommu_alloc_default_domain(group, dev);
 
-	if (group->default_domain) {
+	/*
+	 * If device joined an existing group which has been claimed, don't
+	 * attach the default domain.
+	 */
+	if (group->default_domain && !group->owner) {
 		ret = __iommu_attach_device(group->default_domain, dev);
 		if (ret) {
 			mutex_unlock(&group->mutex);
@@ -2109,7 +2115,7 @@  static int __iommu_attach_group(struct iommu_domain *domain,
 {
 	int ret;
 
-	if (group->default_domain && group->domain != group->default_domain)
+	if (group->domain && group->domain != group->default_domain)
 		return -EBUSY;
 
 	ret = __iommu_group_for_each_dev(group, domain,
@@ -2146,7 +2152,11 @@  static void __iommu_detach_group(struct iommu_domain *domain,
 {
 	int ret;
 
-	if (!group->default_domain) {
+	/*
+	 * If the group has been claimed already, do not re-attach the default
+	 * domain.
+	 */
+	if (!group->default_domain || group->owner) {
 		__iommu_group_for_each_dev(group, domain,
 					   iommu_group_do_detach_device);
 		group->domain = NULL;
@@ -3095,3 +3105,140 @@  static ssize_t iommu_group_store_type(struct iommu_group *group,
 
 	return ret;
 }
+
+/**
+ * iommu_device_use_default_domain() - Device driver wants to handle device
+ *                                     DMA through the kernel DMA API.
+ * @dev: The device.
+ *
+ * The device driver about to bind @dev wants to do DMA through the kernel
+ * DMA API. Return 0 if it is allowed, otherwise an error.
+ */
+int iommu_device_use_default_domain(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get(dev);
+	int ret = 0;
+
+	if (!group)
+		return 0;
+
+	mutex_lock(&group->mutex);
+	if (group->owner_cnt) {
+		if (group->domain != group->default_domain ||
+		    group->owner) {
+			ret = -EBUSY;
+			goto unlock_out;
+		}
+	}
+
+	group->owner_cnt++;
+
+unlock_out:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
+
+/**
+ * iommu_device_unuse_default_domain() - Device driver stops handling device
+ *                                       DMA through the kernel DMA API.
+ * @dev: The device.
+ *
+ * The device driver doesn't want to do DMA through kernel DMA API anymore.
+ * It must be called after iommu_device_use_default_domain().
+ */
+void iommu_device_unuse_default_domain(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get(dev);
+
+	if (!group)
+		return;
+
+	mutex_lock(&group->mutex);
+	if (!WARN_ON(!group->owner_cnt))
+		group->owner_cnt--;
+
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+}
+
+/**
+ * iommu_group_claim_dma_owner() - Set DMA ownership of a group
+ * @group: The group.
+ * @owner: Caller specified pointer. Used for exclusive ownership.
+ *
+ * This is to support backward compatibility for vfio which manages
+ * the dma ownership in iommu_group level. New invocations on this
+ * interface should be prohibited.
+ */
+int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner)
+{
+	int ret = 0;
+
+	mutex_lock(&group->mutex);
+	if (group->owner_cnt) {
+		ret = -EPERM;
+		goto unlock_out;
+	} else {
+		if (group->domain && group->domain != group->default_domain) {
+			ret = -EBUSY;
+			goto unlock_out;
+		}
+
+		group->owner = owner;
+		if (group->domain)
+			__iommu_detach_group(group->domain, group);
+	}
+
+	group->owner_cnt++;
+unlock_out:
+	mutex_unlock(&group->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner);
+
+/**
+ * iommu_group_release_dma_owner() - Release DMA ownership of a group
+ * @group: The group.
+ *
+ * Release the DMA ownership claimed by iommu_group_claim_dma_owner().
+ */
+void iommu_group_release_dma_owner(struct iommu_group *group)
+{
+	mutex_lock(&group->mutex);
+	if (WARN_ON(!group->owner_cnt || !group->owner))
+		goto unlock_out;
+
+	group->owner_cnt = 0;
+	/*
+	 * The UNMANAGED domain should be detached before all USER
+	 * owners have been released.
+	 */
+	if (!WARN_ON(group->domain) && group->default_domain)
+		__iommu_attach_group(group->default_domain, group);
+	group->owner = NULL;
+unlock_out:
+	mutex_unlock(&group->mutex);
+}
+EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner);
+
+/**
+ * iommu_group_dma_owner_claimed() - Query group dma ownership status
+ * @group: The group.
+ *
+ * This provides status query on a given group. It is racy and only for
+ * non-binding status reporting.
+ */
+bool iommu_group_dma_owner_claimed(struct iommu_group *group)
+{
+	unsigned int user;
+
+	mutex_lock(&group->mutex);
+	user = group->owner_cnt;
+	mutex_unlock(&group->mutex);
+
+	return user;
+}
+EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);