diff mbox series

[v2,03/14] iommu: Move bus setup to IOMMU device registration

Message ID 1faba5b5c094379df3d99b8fec924ab50ad75482.1650890638.git.robin.murphy@arm.com (mailing list archive)
State New, archived
Headers show
Series iommu: Retire bus_set_iommu() | expand

Commit Message

Robin Murphy April 28, 2022, 1:18 p.m. UTC
Move the bus setup to iommu_device_register(). This should allow
bus_iommu_probe() to be correctly replayed for multiple IOMMU instances,
and leaves bus_set_iommu() as a glorified no-op to be cleaned up next.

At this point we can also handle cleanup better than just rolling back
the most-recently-touched bus upon failure - which may release devices
owned by other already-registered instances, and still leave devices on
other buses with dangling pointers to the failed instance. Now it's easy
to clean up the exact footprint of a given instance, no more, no less.

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 drivers/iommu/iommu.c | 51 +++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

Comments

Baolu Lu April 29, 2022, 6:57 a.m. UTC | #1
Hi Robin,

On 2022/4/28 21:18, Robin Murphy wrote:
> Move the bus setup to iommu_device_register(). This should allow
> bus_iommu_probe() to be correctly replayed for multiple IOMMU instances,
> and leaves bus_set_iommu() as a glorified no-op to be cleaned up next.

I re-fetched the latest patches on

https://gitlab.arm.com/linux-arm/linux-rm/-/commits/iommu/bus

and rolled back the head to "iommu: Cleanup bus_set_iommu".

The test machine still hangs during boot.

I went through the code. It seems that the .probe_device for Intel IOMMU
driver can't handle the probe replay well. It always assumes that the
device has never been probed.

Best regards,
baolu

> 
> At this point we can also handle cleanup better than just rolling back
> the most-recently-touched bus upon failure - which may release devices
> owned by other already-registered instances, and still leave devices on
> other buses with dangling pointers to the failed instance. Now it's easy
> to clean up the exact footprint of a given instance, no more, no less.
> 
> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>   drivers/iommu/iommu.c | 51 +++++++++++++++++++++++--------------------
>   1 file changed, 27 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 6c4621afc8cf..c89af4dc54c2 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -175,6 +175,14 @@ static int __init iommu_subsys_init(void)
>   }
>   subsys_initcall(iommu_subsys_init);
>   
> +static int remove_iommu_group(struct device *dev, void *data)
> +{
> +	if (dev->iommu && dev->iommu->iommu_dev == data)
> +		iommu_release_device(dev);
> +
> +	return 0;
> +}
> +
>   /**
>    * iommu_device_register() - Register an IOMMU hardware instance
>    * @iommu: IOMMU handle for the instance
> @@ -197,12 +205,29 @@ int iommu_device_register(struct iommu_device *iommu,
>   	spin_lock(&iommu_device_lock);
>   	list_add_tail(&iommu->list, &iommu_device_list);
>   	spin_unlock(&iommu_device_lock);
> +
> +	for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
> +		struct bus_type *bus = iommu_buses[i];
> +		int err;
> +
> +		WARN_ON(bus->iommu_ops && bus->iommu_ops != ops);
> +		bus->iommu_ops = ops;
> +		err = bus_iommu_probe(bus);
> +		if (err) {
> +			iommu_device_unregister(iommu);
> +			return err;
> +		}
> +	}
> +
>   	return 0;
>   }
>   EXPORT_SYMBOL_GPL(iommu_device_register);
>   
>   void iommu_device_unregister(struct iommu_device *iommu)
>   {
> +	for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++)
> +		bus_for_each_dev(iommu_buses[i], NULL, iommu, remove_iommu_group);
> +
>   	spin_lock(&iommu_device_lock);
>   	list_del(&iommu->list);
>   	spin_unlock(&iommu_device_lock);
> @@ -1655,13 +1680,6 @@ static int probe_iommu_group(struct device *dev, void *data)
>   	return ret;
>   }
>   
> -static int remove_iommu_group(struct device *dev, void *data)
> -{
> -	iommu_release_device(dev);
> -
> -	return 0;
> -}
> -
>   static int iommu_bus_notifier(struct notifier_block *nb,
>   			      unsigned long action, void *data)
>   {
> @@ -1884,27 +1902,12 @@ static int iommu_bus_init(struct bus_type *bus)
>    */
>   int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops)
>   {
> -	int err;
> -
> -	if (ops == NULL) {
> -		bus->iommu_ops = NULL;
> -		return 0;
> -	}
> -
> -	if (bus->iommu_ops != NULL)
> +	if (bus->iommu_ops && ops && bus->iommu_ops != ops)
>   		return -EBUSY;
>   
>   	bus->iommu_ops = ops;
>   
> -	/* Do IOMMU specific setup for this bus-type */
> -	err = bus_iommu_probe(bus);
> -	if (err) {
> -		/* Clean up */
> -		bus_for_each_dev(bus, NULL, NULL, remove_iommu_group);
> -		bus->iommu_ops = NULL;
> -	}
> -
> -	return err;
> +	return 0;
>   }
>   EXPORT_SYMBOL_GPL(bus_set_iommu);
>
Robin Murphy April 29, 2022, 8:50 a.m. UTC | #2
On 2022-04-29 07:57, Baolu Lu wrote:
> Hi Robin,
> 
> On 2022/4/28 21:18, Robin Murphy wrote:
>> Move the bus setup to iommu_device_register(). This should allow
>> bus_iommu_probe() to be correctly replayed for multiple IOMMU instances,
>> and leaves bus_set_iommu() as a glorified no-op to be cleaned up next.
> 
> I re-fetched the latest patches on
> 
> https://gitlab.arm.com/linux-arm/linux-rm/-/commits/iommu/bus
> 
> and rolled back the head to "iommu: Cleanup bus_set_iommu".
> 
> The test machine still hangs during boot.
> 
> I went through the code. It seems that the .probe_device for Intel IOMMU
> driver can't handle the probe replay well. It always assumes that the
> device has never been probed.

Hmm, but probe_iommu_group() is supposed to prevent the 
__iommu_probe_device() call even happening if the device *has* already 
been probed before :/

I've still got an old Intel box spare in the office so I'll rig that up 
and see if I can see what might be going on here...

Thanks,
Robin.

> 
> Best regards,
> baolu
> 
>>
>> At this point we can also handle cleanup better than just rolling back
>> the most-recently-touched bus upon failure - which may release devices
>> owned by other already-registered instances, and still leave devices on
>> other buses with dangling pointers to the failed instance. Now it's easy
>> to clean up the exact footprint of a given instance, no more, no less.
>>
>> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>   drivers/iommu/iommu.c | 51 +++++++++++++++++++++++--------------------
>>   1 file changed, 27 insertions(+), 24 deletions(-)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 6c4621afc8cf..c89af4dc54c2 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -175,6 +175,14 @@ static int __init iommu_subsys_init(void)
>>   }
>>   subsys_initcall(iommu_subsys_init);
>> +static int remove_iommu_group(struct device *dev, void *data)
>> +{
>> +    if (dev->iommu && dev->iommu->iommu_dev == data)
>> +        iommu_release_device(dev);
>> +
>> +    return 0;
>> +}
>> +
>>   /**
>>    * iommu_device_register() - Register an IOMMU hardware instance
>>    * @iommu: IOMMU handle for the instance
>> @@ -197,12 +205,29 @@ int iommu_device_register(struct iommu_device 
>> *iommu,
>>       spin_lock(&iommu_device_lock);
>>       list_add_tail(&iommu->list, &iommu_device_list);
>>       spin_unlock(&iommu_device_lock);
>> +
>> +    for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
>> +        struct bus_type *bus = iommu_buses[i];
>> +        int err;
>> +
>> +        WARN_ON(bus->iommu_ops && bus->iommu_ops != ops);
>> +        bus->iommu_ops = ops;
>> +        err = bus_iommu_probe(bus);
>> +        if (err) {
>> +            iommu_device_unregister(iommu);
>> +            return err;
>> +        }
>> +    }
>> +
>>       return 0;
>>   }
>>   EXPORT_SYMBOL_GPL(iommu_device_register);
>>   void iommu_device_unregister(struct iommu_device *iommu)
>>   {
>> +    for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++)
>> +        bus_for_each_dev(iommu_buses[i], NULL, iommu, 
>> remove_iommu_group);
>> +
>>       spin_lock(&iommu_device_lock);
>>       list_del(&iommu->list);
>>       spin_unlock(&iommu_device_lock);
>> @@ -1655,13 +1680,6 @@ static int probe_iommu_group(struct device 
>> *dev, void *data)
>>       return ret;
>>   }
>> -static int remove_iommu_group(struct device *dev, void *data)
>> -{
>> -    iommu_release_device(dev);
>> -
>> -    return 0;
>> -}
>> -
>>   static int iommu_bus_notifier(struct notifier_block *nb,
>>                     unsigned long action, void *data)
>>   {
>> @@ -1884,27 +1902,12 @@ static int iommu_bus_init(struct bus_type *bus)
>>    */
>>   int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops)
>>   {
>> -    int err;
>> -
>> -    if (ops == NULL) {
>> -        bus->iommu_ops = NULL;
>> -        return 0;
>> -    }
>> -
>> -    if (bus->iommu_ops != NULL)
>> +    if (bus->iommu_ops && ops && bus->iommu_ops != ops)
>>           return -EBUSY;
>>       bus->iommu_ops = ops;
>> -    /* Do IOMMU specific setup for this bus-type */
>> -    err = bus_iommu_probe(bus);
>> -    if (err) {
>> -        /* Clean up */
>> -        bus_for_each_dev(bus, NULL, NULL, remove_iommu_group);
>> -        bus->iommu_ops = NULL;
>> -    }
>> -
>> -    return err;
>> +    return 0;
>>   }
>>   EXPORT_SYMBOL_GPL(bus_set_iommu);
>
Robin Murphy April 29, 2022, 6:06 p.m. UTC | #3
On 29/04/2022 9:50 am, Robin Murphy wrote:
> On 2022-04-29 07:57, Baolu Lu wrote:
>> Hi Robin,
>>
>> On 2022/4/28 21:18, Robin Murphy wrote:
>>> Move the bus setup to iommu_device_register(). This should allow
>>> bus_iommu_probe() to be correctly replayed for multiple IOMMU instances,
>>> and leaves bus_set_iommu() as a glorified no-op to be cleaned up next.
>>
>> I re-fetched the latest patches on
>>
>> https://gitlab.arm.com/linux-arm/linux-rm/-/commits/iommu/bus
>>
>> and rolled back the head to "iommu: Cleanup bus_set_iommu".
>>
>> The test machine still hangs during boot.
>>
>> I went through the code. It seems that the .probe_device for Intel IOMMU
>> driver can't handle the probe replay well. It always assumes that the
>> device has never been probed.
> 
> Hmm, but probe_iommu_group() is supposed to prevent the 
> __iommu_probe_device() call even happening if the device *has* already 
> been probed before :/
> 
> I've still got an old Intel box spare in the office so I'll rig that up 
> and see if I can see what might be going on here...

OK, on a Xeon with two DMAR units, this seems to boot OK with or without 
patch #1, so it doesn't seem to be a general problem with replaying in 
iommu_device_register(), or with platform devices. Not sure where to go 
from here... :/

Cheers,
Robin.
Baolu Lu July 5, 2022, 4:51 a.m. UTC | #4
Hi Robin,

On 2022/4/30 02:06, Robin Murphy wrote:
> On 29/04/2022 9:50 am, Robin Murphy wrote:
>> On 2022-04-29 07:57, Baolu Lu wrote:
>>> Hi Robin,
>>>
>>> On 2022/4/28 21:18, Robin Murphy wrote:
>>>> Move the bus setup to iommu_device_register(). This should allow
>>>> bus_iommu_probe() to be correctly replayed for multiple IOMMU 
>>>> instances,
>>>> and leaves bus_set_iommu() as a glorified no-op to be cleaned up next.
>>>
>>> I re-fetched the latest patches on
>>>
>>> https://gitlab.arm.com/linux-arm/linux-rm/-/commits/iommu/bus
>>>
>>> and rolled back the head to "iommu: Cleanup bus_set_iommu".
>>>
>>> The test machine still hangs during boot.
>>>
>>> I went through the code. It seems that the .probe_device for Intel IOMMU
>>> driver can't handle the probe replay well. It always assumes that the
>>> device has never been probed.
>>
>> Hmm, but probe_iommu_group() is supposed to prevent the 
>> __iommu_probe_device() call even happening if the device *has* already 
>> been probed before :/
>>
>> I've still got an old Intel box spare in the office so I'll rig that 
>> up and see if I can see what might be going on here...
> 
> OK, on a Xeon with two DMAR units, this seems to boot OK with or without 
> patch #1, so it doesn't seem to be a general problem with replaying in 
> iommu_device_register(), or with platform devices. Not sure where to go 
> from here... :/

The kernel boot panic message:

[    6.639432] BUG: kernel NULL pointer dereference, address: 
0000000000000028
[    6.743829] #PF: supervisor read access in kernel mode
[    6.743829] #PF: error_code(0x0000) - not-present page
[    6.743829] PGD 0
[    6.743829] Oops: 0000 [#1] PREEMPT SMP NOPTI
[    6.743829] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G          I 
  5.19.0-rc3+ #182
[    6.743829] RIP: 0010:__iommu_probe_device+0x115/0x200
[    6.743829] Code: 89 ff e8 3e e1 ff ff 48 85 c0 0f 85 47 ff ff ff 41 
bd f4 ff ff ff eb 83 48 8b 83 d8 02 00 00 48 89 df 48 8b 40 38 48 8b 40 
10 <48> 8b 40 28 ff d0 0f 1f 00 48 89 c1 48 85 c0 0f 84 b7 00 00 00 48
[    6.743829] RSP: 0000:ff30605c00063d40 EFLAGS: 00010246
[    6.743829] RAX: 0000000000000000 RBX: ff128b9c5fdc90d0 RCX: 
0000000000000000
[    6.743829] RDX: 0000000080000001 RSI: 0000000000000246 RDI: 
ff128b9c5fdc90d0
[    6.743829] RBP: ffffffffb60c34e0 R08: ffffffffb68664d0 R09: 
ff128b9501d4ce40
[    6.743829] R10: ffffffffb6267096 R11: ff128b950014c267 R12: 
ff30605c00063de0
[    6.743829] R13: 00000000001b9d28 R14: ff128b95001b9d28 R15: 
ff128b9c5fdc93a8
[    6.743829] FS:  0000000000000000(0000) GS:ff128b9c5fc00000(0000) 
knlGS:0000000000000000
[    6.743829] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    6.743829] CR2: 0000000000000028 CR3: 0000000149210001 CR4: 
0000000000771ef0
[    6.743829] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[    6.743829] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 
0000000000000400
[    6.743829] PKRU: 55555554
[    6.743829] Call Trace:
[    6.743829]  <TASK>
[    6.743829]  ? _raw_spin_lock_irqsave+0x17/0x40
[    6.743829]  ? __iommu_probe_device+0x200/0x200
[    6.743829]  probe_iommu_group+0x2d/0x40
[    6.743829]  bus_for_each_dev+0x74/0xc0
[    6.743829]  bus_iommu_probe+0x48/0x2d0
[    6.743829]  iommu_device_register+0xde/0x120
[    6.743829]  intel_iommu_init+0x35f/0x5f2
[    6.743829]  ? iommu_setup+0x27d/0x27d
[    6.743829]  ? rdinit_setup+0x2c/0x2c
[    6.743829]  pci_iommu_init+0xe/0x32
[    6.743829]  do_one_initcall+0x41/0x200
[    6.743829]  kernel_init_freeable+0x1de/0x228
[    6.743829]  ? rest_init+0xc0/0xc0
[    6.743829]  kernel_init+0x16/0x120
[    6.743829]  ret_from_fork+0x1f/0x30
[    6.743829]  </TASK>
[    6.743829] Modules linked in:
[    6.743829] CR2: 0000000000000028
[    6.743829] ---[ end trace 0000000000000000 ]---
[    6.743829] RIP: 0010:__iommu_probe_device+0x115/0x200
[    6.743829] Code: 89 ff e8 3e e1 ff ff 48 85 c0 0f 85 47 ff ff ff 41 
bd f4 ff ff ff eb 83 48 8b 83 d8 02 00 00 48 89 df 48 8b 40 38 48 8b 40 
10 <48> 8b 40 28 ff d0 0f 1f 00 48 89 c1 48 85 c0 0f 84 b7 00 00 00 48
[    6.743829] RSP: 0000:ff30605c00063d40 EFLAGS: 00010246
[    6.743829] RAX: 0000000000000000 RBX: ff128b9c5fdc90d0 RCX: 
0000000000000000
[    6.743829] RDX: 0000000080000001 RSI: 0000000000000246 RDI: 
ff128b9c5fdc90d0
[    6.743829] RBP: ffffffffb60c34e0 R08: ffffffffb68664d0 R09: 
ff128b9501d4ce40
[    6.743829] R10: ffffffffb6267096 R11: ff128b950014c267 R12: 
ff30605c00063de0
[    6.743829] R13: 00000000001b9d28 R14: ff128b95001b9d28 R15: 
ff128b9c5fdc93a8
[    6.743829] FS:  0000000000000000(0000) GS:ff128b9c5fc00000(0000) 
knlGS:0000000000000000
[    6.743829] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    6.743829] CR2: 0000000000000028 CR3: 0000000149210001 CR4: 
0000000000771ef0
[    6.743829] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[    6.743829] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 
0000000000000400
[    6.743829] PKRU: 55555554
[    6.743829] Kernel panic - not syncing: Fatal exception
[    6.743829] ---[ end Kernel panic - not syncing: Fatal exception ]---

The "NULL pointer dereference" happens at line 1620 of below code.

1610 static struct iommu_group *iommu_group_get_for_dev(struct device *dev)
1611 {
1612         const struct iommu_ops *ops = dev_iommu_ops(dev);
1613         struct iommu_group *group;
1614         int ret;
1615
1616         group = iommu_group_get(dev);
1617         if (group)
1618                 return group;
1619
1620         group = ops->device_group(dev);
1621         if (WARN_ON_ONCE(group == NULL))
1622                 return ERR_PTR(-EINVAL);
1623
1624         if (IS_ERR(group))
1625                 return group;

This platform has multiple IOMMU units, each serving different PCI
devices. The ops field of each iommu_device is initialized in
iommu_device_register(), hence when the first IOMMU device gets
registered, ops fields of other iommu_devices are NULL.

Unfortunately bus_iommu_probe() called in iommu_device_register() probes
*all* PCI devices. This probably leads to above NULL pointer dereference
issue.

Please correct me if I overlooked anything.

Best regards,
baolu
Robin Murphy July 5, 2022, 9:06 a.m. UTC | #5
On 2022-07-05 05:51, Baolu Lu wrote:
> Hi Robin,
> 
> On 2022/4/30 02:06, Robin Murphy wrote:
>> On 29/04/2022 9:50 am, Robin Murphy wrote:
>>> On 2022-04-29 07:57, Baolu Lu wrote:
>>>> Hi Robin,
>>>>
>>>> On 2022/4/28 21:18, Robin Murphy wrote:
>>>>> Move the bus setup to iommu_device_register(). This should allow
>>>>> bus_iommu_probe() to be correctly replayed for multiple IOMMU 
>>>>> instances,
>>>>> and leaves bus_set_iommu() as a glorified no-op to be cleaned up next.
>>>>
>>>> I re-fetched the latest patches on
>>>>
>>>> https://gitlab.arm.com/linux-arm/linux-rm/-/commits/iommu/bus
>>>>
>>>> and rolled back the head to "iommu: Cleanup bus_set_iommu".
>>>>
>>>> The test machine still hangs during boot.
>>>>
>>>> I went through the code. It seems that the .probe_device for Intel 
>>>> IOMMU
>>>> driver can't handle the probe replay well. It always assumes that the
>>>> device has never been probed.
>>>
>>> Hmm, but probe_iommu_group() is supposed to prevent the 
>>> __iommu_probe_device() call even happening if the device *has* 
>>> already been probed before :/
>>>
>>> I've still got an old Intel box spare in the office so I'll rig that 
>>> up and see if I can see what might be going on here...
>>
>> OK, on a Xeon with two DMAR units, this seems to boot OK with or 
>> without patch #1, so it doesn't seem to be a general problem with 
>> replaying in iommu_device_register(), or with platform devices. Not 
>> sure where to go from here... :/
> 
> The kernel boot panic message:
> 
> [    6.639432] BUG: kernel NULL pointer dereference, address: 
> 0000000000000028
> [    6.743829] #PF: supervisor read access in kernel mode
> [    6.743829] #PF: error_code(0x0000) - not-present page
> [    6.743829] PGD 0
> [    6.743829] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [    6.743829] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G          I 
>   5.19.0-rc3+ #182
> [    6.743829] RIP: 0010:__iommu_probe_device+0x115/0x200
> [    6.743829] Code: 89 ff e8 3e e1 ff ff 48 85 c0 0f 85 47 ff ff ff 41 
> bd f4 ff ff ff eb 83 48 8b 83 d8 02 00 00 48 89 df 48 8b 40 38 48 8b 40 
> 10 <48> 8b 40 28 ff d0 0f 1f 00 48 89 c1 48 85 c0 0f 84 b7 00 00 00 48
> [    6.743829] RSP: 0000:ff30605c00063d40 EFLAGS: 00010246
> [    6.743829] RAX: 0000000000000000 RBX: ff128b9c5fdc90d0 RCX: 
> 0000000000000000
> [    6.743829] RDX: 0000000080000001 RSI: 0000000000000246 RDI: 
> ff128b9c5fdc90d0
> [    6.743829] RBP: ffffffffb60c34e0 R08: ffffffffb68664d0 R09: 
> ff128b9501d4ce40
> [    6.743829] R10: ffffffffb6267096 R11: ff128b950014c267 R12: 
> ff30605c00063de0
> [    6.743829] R13: 00000000001b9d28 R14: ff128b95001b9d28 R15: 
> ff128b9c5fdc93a8
> [    6.743829] FS:  0000000000000000(0000) GS:ff128b9c5fc00000(0000) 
> knlGS:0000000000000000
> [    6.743829] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    6.743829] CR2: 0000000000000028 CR3: 0000000149210001 CR4: 
> 0000000000771ef0
> [    6.743829] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [    6.743829] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 
> 0000000000000400
> [    6.743829] PKRU: 55555554
> [    6.743829] Call Trace:
> [    6.743829]  <TASK>
> [    6.743829]  ? _raw_spin_lock_irqsave+0x17/0x40
> [    6.743829]  ? __iommu_probe_device+0x200/0x200
> [    6.743829]  probe_iommu_group+0x2d/0x40
> [    6.743829]  bus_for_each_dev+0x74/0xc0
> [    6.743829]  bus_iommu_probe+0x48/0x2d0
> [    6.743829]  iommu_device_register+0xde/0x120
> [    6.743829]  intel_iommu_init+0x35f/0x5f2
> [    6.743829]  ? iommu_setup+0x27d/0x27d
> [    6.743829]  ? rdinit_setup+0x2c/0x2c
> [    6.743829]  pci_iommu_init+0xe/0x32
> [    6.743829]  do_one_initcall+0x41/0x200
> [    6.743829]  kernel_init_freeable+0x1de/0x228
> [    6.743829]  ? rest_init+0xc0/0xc0
> [    6.743829]  kernel_init+0x16/0x120
> [    6.743829]  ret_from_fork+0x1f/0x30
> [    6.743829]  </TASK>
> [    6.743829] Modules linked in:
> [    6.743829] CR2: 0000000000000028
> [    6.743829] ---[ end trace 0000000000000000 ]---
> [    6.743829] RIP: 0010:__iommu_probe_device+0x115/0x200
> [    6.743829] Code: 89 ff e8 3e e1 ff ff 48 85 c0 0f 85 47 ff ff ff 41 
> bd f4 ff ff ff eb 83 48 8b 83 d8 02 00 00 48 89 df 48 8b 40 38 48 8b 40 
> 10 <48> 8b 40 28 ff d0 0f 1f 00 48 89 c1 48 85 c0 0f 84 b7 00 00 00 48
> [    6.743829] RSP: 0000:ff30605c00063d40 EFLAGS: 00010246
> [    6.743829] RAX: 0000000000000000 RBX: ff128b9c5fdc90d0 RCX: 
> 0000000000000000
> [    6.743829] RDX: 0000000080000001 RSI: 0000000000000246 RDI: 
> ff128b9c5fdc90d0
> [    6.743829] RBP: ffffffffb60c34e0 R08: ffffffffb68664d0 R09: 
> ff128b9501d4ce40
> [    6.743829] R10: ffffffffb6267096 R11: ff128b950014c267 R12: 
> ff30605c00063de0
> [    6.743829] R13: 00000000001b9d28 R14: ff128b95001b9d28 R15: 
> ff128b9c5fdc93a8
> [    6.743829] FS:  0000000000000000(0000) GS:ff128b9c5fc00000(0000) 
> knlGS:0000000000000000
> [    6.743829] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    6.743829] CR2: 0000000000000028 CR3: 0000000149210001 CR4: 
> 0000000000771ef0
> [    6.743829] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [    6.743829] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 
> 0000000000000400
> [    6.743829] PKRU: 55555554
> [    6.743829] Kernel panic - not syncing: Fatal exception
> [    6.743829] ---[ end Kernel panic - not syncing: Fatal exception ]---
> 
> The "NULL pointer dereference" happens at line 1620 of below code.
> 
> 1610 static struct iommu_group *iommu_group_get_for_dev(struct device *dev)
> 1611 {
> 1612         const struct iommu_ops *ops = dev_iommu_ops(dev);
> 1613         struct iommu_group *group;
> 1614         int ret;
> 1615
> 1616         group = iommu_group_get(dev);
> 1617         if (group)
> 1618                 return group;
> 1619
> 1620         group = ops->device_group(dev);
> 1621         if (WARN_ON_ONCE(group == NULL))
> 1622                 return ERR_PTR(-EINVAL);
> 1623
> 1624         if (IS_ERR(group))
> 1625                 return group;
> 
> This platform has multiple IOMMU units, each serving different PCI
> devices. The ops field of each iommu_device is initialized in
> iommu_device_register(), hence when the first IOMMU device gets
> registered, ops fields of other iommu_devices are NULL.
> 
> Unfortunately bus_iommu_probe() called in iommu_device_register() probes
> *all* PCI devices. This probably leads to above NULL pointer dereference
> issue.
> 
> Please correct me if I overlooked anything.

Ha, as it happens I also just figured this out yesterday (after 
remembering the important detail of "intel_iommu=on", trying the lockdep 
test), so it's good to have confirmation, thanks! I'll test a fix today 
and send v3 soon.

Cheers,
Robin.
diff mbox series

Patch

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 6c4621afc8cf..c89af4dc54c2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -175,6 +175,14 @@  static int __init iommu_subsys_init(void)
 }
 subsys_initcall(iommu_subsys_init);
 
+static int remove_iommu_group(struct device *dev, void *data)
+{
+	if (dev->iommu && dev->iommu->iommu_dev == data)
+		iommu_release_device(dev);
+
+	return 0;
+}
+
 /**
  * iommu_device_register() - Register an IOMMU hardware instance
  * @iommu: IOMMU handle for the instance
@@ -197,12 +205,29 @@  int iommu_device_register(struct iommu_device *iommu,
 	spin_lock(&iommu_device_lock);
 	list_add_tail(&iommu->list, &iommu_device_list);
 	spin_unlock(&iommu_device_lock);
+
+	for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
+		struct bus_type *bus = iommu_buses[i];
+		int err;
+
+		WARN_ON(bus->iommu_ops && bus->iommu_ops != ops);
+		bus->iommu_ops = ops;
+		err = bus_iommu_probe(bus);
+		if (err) {
+			iommu_device_unregister(iommu);
+			return err;
+		}
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_device_register);
 
 void iommu_device_unregister(struct iommu_device *iommu)
 {
+	for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++)
+		bus_for_each_dev(iommu_buses[i], NULL, iommu, remove_iommu_group);
+
 	spin_lock(&iommu_device_lock);
 	list_del(&iommu->list);
 	spin_unlock(&iommu_device_lock);
@@ -1655,13 +1680,6 @@  static int probe_iommu_group(struct device *dev, void *data)
 	return ret;
 }
 
-static int remove_iommu_group(struct device *dev, void *data)
-{
-	iommu_release_device(dev);
-
-	return 0;
-}
-
 static int iommu_bus_notifier(struct notifier_block *nb,
 			      unsigned long action, void *data)
 {
@@ -1884,27 +1902,12 @@  static int iommu_bus_init(struct bus_type *bus)
  */
 int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops)
 {
-	int err;
-
-	if (ops == NULL) {
-		bus->iommu_ops = NULL;
-		return 0;
-	}
-
-	if (bus->iommu_ops != NULL)
+	if (bus->iommu_ops && ops && bus->iommu_ops != ops)
 		return -EBUSY;
 
 	bus->iommu_ops = ops;
 
-	/* Do IOMMU specific setup for this bus-type */
-	err = bus_iommu_probe(bus);
-	if (err) {
-		/* Clean up */
-		bus_for_each_dev(bus, NULL, NULL, remove_iommu_group);
-		bus->iommu_ops = NULL;
-	}
-
-	return err;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(bus_set_iommu);