diff mbox series

scsi: alua: fix the race between alua_bus_detach and alua_rtpg

Message ID 1600167537-12509-1-git-send-email-jitendra.khasdev@oracle.com
State Superseded
Headers show
Series scsi: alua: fix the race between alua_bus_detach and alua_rtpg | expand

Commit Message

Jitendra Khasdev Sept. 15, 2020, 10:58 a.m. UTC
This is patch to fix the race occurs between bus detach and alua_rtpg.

It fluses the all pending workqueue in bus detach handler, so it can avoid
race between alua_bus_detach and alua_rtpg.

Here is call trace where race got detected.

multipathd call stack:
[exception RIP: native_queued_spin_lock_slowpath+100]
--- <NMI exception stack> ---
native_queued_spin_lock_slowpath at ffffffff89307f54
queued_spin_lock_slowpath at ffffffff89307c18
_raw_spin_lock_irq at ffffffff89bd797b
alua_bus_detach at ffffffff8984dcc8
scsi_dh_release_device at ffffffff8984b6f2
scsi_device_dev_release_usercontext at ffffffff89846edf
execute_in_process_context at ffffffff892c3e60
scsi_device_dev_release at ffffffff8984637c
device_release at ffffffff89800fbc
kobject_cleanup at ffffffff89bb1196
kobject_put at ffffffff89bb12ea
put_device at ffffffff89801283
scsi_device_put at ffffffff89838d5b
scsi_disk_put at ffffffffc051f650 [sd_mod]
sd_release at ffffffffc051f8a2 [sd_mod]
__blkdev_put at ffffffff8952c79e
blkdev_put at ffffffff8952c80c
blkdev_close at ffffffff8952c8b5
__fput at ffffffff894e55e6
____fput at ffffffff894e57ee
task_work_run at ffffffff892c94dc
exit_to_usermode_loop at ffffffff89204b12
do_syscall_64 at ffffffff892044da
entry_SYSCALL_64_after_hwframe at ffffffff89c001b8

kworker:
[exception RIP: alua_rtpg+2003]
account_entity_dequeue at ffffffff892e42c1
alua_rtpg_work at ffffffff8984f097
process_one_work at ffffffff892c4c29
worker_thread at ffffffff892c5a4f
kthread at ffffffff892cb135
ret_from_fork at ffffffff89c00354

Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
---
 drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
 1 file changed, 3 insertions(+)

Comments

Brian Bunker Sept. 15, 2020, 9:08 p.m. UTC | #1
Hello Jitendra,

It seems that we are in the same place trying to fix the same thing for what is likely our same shared customer. Do you want to try to incorporate anything from our fix from PURE? Like maybe remove the BUG_ON lines from alua_rtpg if you are sure that the race is eliminated with your patch?

See:
https://marc.info/?l=linux-scsi&m=159984129611701&w=2
https://marc.info/?l=linux-scsi&m=159983931810954&w=2
https://marc.info/?l=linux-scsi&m=159971849210795&w=2

Thanks,
Brian

Brian Bunker
SW Eng
brian@purestorage.com



> On Sep 15, 2020, at 3:58 AM, Jitendra Khasdev <jitendra.khasdev@oracle.com> wrote:
> 
> This is patch to fix the race occurs between bus detach and alua_rtpg.
> 
> It fluses the all pending workqueue in bus detach handler, so it can avoid
> race between alua_bus_detach and alua_rtpg.
> 
> Here is call trace where race got detected.
> 
> multipathd call stack:
> [exception RIP: native_queued_spin_lock_slowpath+100]
> --- <NMI exception stack> ---
> native_queued_spin_lock_slowpath at ffffffff89307f54
> queued_spin_lock_slowpath at ffffffff89307c18
> _raw_spin_lock_irq at ffffffff89bd797b
> alua_bus_detach at ffffffff8984dcc8
> scsi_dh_release_device at ffffffff8984b6f2
> scsi_device_dev_release_usercontext at ffffffff89846edf
> execute_in_process_context at ffffffff892c3e60
> scsi_device_dev_release at ffffffff8984637c
> device_release at ffffffff89800fbc
> kobject_cleanup at ffffffff89bb1196
> kobject_put at ffffffff89bb12ea
> put_device at ffffffff89801283
> scsi_device_put at ffffffff89838d5b
> scsi_disk_put at ffffffffc051f650 [sd_mod]
> sd_release at ffffffffc051f8a2 [sd_mod]
> __blkdev_put at ffffffff8952c79e
> blkdev_put at ffffffff8952c80c
> blkdev_close at ffffffff8952c8b5
> __fput at ffffffff894e55e6
> ____fput at ffffffff894e57ee
> task_work_run at ffffffff892c94dc
> exit_to_usermode_loop at ffffffff89204b12
> do_syscall_64 at ffffffff892044da
> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
> 
> kworker:
> [exception RIP: alua_rtpg+2003]
> account_entity_dequeue at ffffffff892e42c1
> alua_rtpg_work at ffffffff8984f097
> process_one_work at ffffffff892c4c29
> worker_thread at ffffffff892c5a4f
> kthread at ffffffff892cb135
> ret_from_fork at ffffffff89c00354
> 
> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
> ---
> drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
> 1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
> index f32da0c..024a752 100644
> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
> 	struct alua_dh_data *h = sdev->handler_data;
> 	struct alua_port_group *pg;
> 
> +	sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
> +	flush_workqueue(kaluad_wq);
> +
> 	spin_lock(&h->pg_lock);
> 	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
> 	rcu_assign_pointer(h->pg, NULL);
> -- 
> 1.8.3.1
>
Jitendra Khasdev Sept. 17, 2020, 9:43 a.m. UTC | #2
Hi Brian,

On 9/16/20 2:38 AM, Brian Bunker wrote:
> Hello Jitendra,
> 
> It seems that we are in the same place trying to fix the same thing for what is likely our same shared customer. Do you want to try to incorporate anything from our fix from PURE? Like maybe remove the BUG_ON lines from alua_rtpg if you are sure that the race is eliminated with your patch?
> 
> See:
> https://urldefense.com/v3/__https://marc.info/?l=linux-scsi&m=159984129611701&w=2__;!!GqivPVa7Brio!I4yOcJ5ukf2JyZxomXPkZdfh8vQTLSzBjHiZhWwhsSXyBgPCMqrS0xp0i3fa-5GELI3NNw$ 
> https://urldefense.com/v3/__https://marc.info/?l=linux-scsi&m=159983931810954&w=2__;!!GqivPVa7Brio!I4yOcJ5ukf2JyZxomXPkZdfh8vQTLSzBjHiZhWwhsSXyBgPCMqrS0xp0i3fa-5EMk1YDDA$ 
> https://urldefense.com/v3/__https://marc.info/?l=linux-scsi&m=159971849210795&w=2__;!!GqivPVa7Brio!I4yOcJ5ukf2JyZxomXPkZdfh8vQTLSzBjHiZhWwhsSXyBgPCMqrS0xp0i3fa-5H7zlg1Nw$ 
> 
> Thanks,
> Brian
> 
> Brian Bunker
> SW Eng
> brian@purestorage.com
> 
> 
> 
>> On Sep 15, 2020, at 3:58 AM, Jitendra Khasdev <jitendra.khasdev@oracle.com> wrote:
>>
>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>
>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>> race between alua_bus_detach and alua_rtpg.
>>
>> Here is call trace where race got detected.
>>
>> multipathd call stack:
>> [exception RIP: native_queued_spin_lock_slowpath+100]
>> --- <NMI exception stack> ---
>> native_queued_spin_lock_slowpath at ffffffff89307f54
>> queued_spin_lock_slowpath at ffffffff89307c18
>> _raw_spin_lock_irq at ffffffff89bd797b
>> alua_bus_detach at ffffffff8984dcc8
>> scsi_dh_release_device at ffffffff8984b6f2
>> scsi_device_dev_release_usercontext at ffffffff89846edf
>> execute_in_process_context at ffffffff892c3e60
>> scsi_device_dev_release at ffffffff8984637c
>> device_release at ffffffff89800fbc
>> kobject_cleanup at ffffffff89bb1196
>> kobject_put at ffffffff89bb12ea
>> put_device at ffffffff89801283
>> scsi_device_put at ffffffff89838d5b
>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>> sd_release at ffffffffc051f8a2 [sd_mod]
>> __blkdev_put at ffffffff8952c79e
>> blkdev_put at ffffffff8952c80c
>> blkdev_close at ffffffff8952c8b5
>> __fput at ffffffff894e55e6
>> ____fput at ffffffff894e57ee
>> task_work_run at ffffffff892c94dc
>> exit_to_usermode_loop at ffffffff89204b12
>> do_syscall_64 at ffffffff892044da
>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>
>> kworker:
>> [exception RIP: alua_rtpg+2003]
>> account_entity_dequeue at ffffffff892e42c1
>> alua_rtpg_work at ffffffff8984f097
>> process_one_work at ffffffff892c4c29
>> worker_thread at ffffffff892c5a4f
>> kthread at ffffffff892cb135
>> ret_from_fork at ffffffff89c00354
>>
>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>> ---
>> drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>> 1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>> index f32da0c..024a752 100644
>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>> 	struct alua_dh_data *h = sdev->handler_data;
>> 	struct alua_port_group *pg;
>>
>> +	sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>> +	flush_workqueue(kaluad_wq);
>> +
>> 	spin_lock(&h->pg_lock);
>> 	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>> 	rcu_assign_pointer(h->pg, NULL);
>> -- 
>> 1.8.3.1
>>
> 


Yes, looks we are fixing same problem. I looked into your patch, and thought removing BUG_ON could be last resort. Would you mind trying out my patch since it is reproducing at your site, because it looks me more cleaner way of doing it.

---
Jitendra
Ewan D. Milne Sept. 17, 2020, 5:30 p.m. UTC | #3
On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
> This is patch to fix the race occurs between bus detach and alua_rtpg.
> 
> It fluses the all pending workqueue in bus detach handler, so it can avoid
> race between alua_bus_detach and alua_rtpg.
> 
> Here is call trace where race got detected.
> 
> multipathd call stack:
> [exception RIP: native_queued_spin_lock_slowpath+100]
> --- <NMI exception stack> ---
> native_queued_spin_lock_slowpath at ffffffff89307f54
> queued_spin_lock_slowpath at ffffffff89307c18
> _raw_spin_lock_irq at ffffffff89bd797b
> alua_bus_detach at ffffffff8984dcc8
> scsi_dh_release_device at ffffffff8984b6f2
> scsi_device_dev_release_usercontext at ffffffff89846edf
> execute_in_process_context at ffffffff892c3e60
> scsi_device_dev_release at ffffffff8984637c
> device_release at ffffffff89800fbc
> kobject_cleanup at ffffffff89bb1196
> kobject_put at ffffffff89bb12ea
> put_device at ffffffff89801283
> scsi_device_put at ffffffff89838d5b
> scsi_disk_put at ffffffffc051f650 [sd_mod]
> sd_release at ffffffffc051f8a2 [sd_mod]
> __blkdev_put at ffffffff8952c79e
> blkdev_put at ffffffff8952c80c
> blkdev_close at ffffffff8952c8b5
> __fput at ffffffff894e55e6
> ____fput at ffffffff894e57ee
> task_work_run at ffffffff892c94dc
> exit_to_usermode_loop at ffffffff89204b12
> do_syscall_64 at ffffffff892044da
> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
> 
> kworker:
> [exception RIP: alua_rtpg+2003]
> account_entity_dequeue at ffffffff892e42c1
> alua_rtpg_work at ffffffff8984f097
> process_one_work at ffffffff892c4c29
> worker_thread at ffffffff892c5a4f
> kthread at ffffffff892cb135
> ret_from_fork at ffffffff89c00354
> 
> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
> ---
>  drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
> index f32da0c..024a752 100644
> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>  	struct alua_dh_data *h = sdev->handler_data;
>  	struct alua_port_group *pg;
>  
> +	sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
> +	flush_workqueue(kaluad_wq);
> +
>  	spin_lock(&h->pg_lock);
>  	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>  	rcu_assign_pointer(h->pg, NULL);

I'm not sure this is the best solution.  The current code
references h->sdev when the dh_list is traversed.  So it needs
to remain valid.  Fixing it by flushing the workqueue to avoid
the list traversal code running leaves open the possibility that
future code alterations may expose this problem again.

-Ewan
Jitendra Khasdev Sept. 18, 2020, 3:49 a.m. UTC | #4
On 9/17/20 11:00 PM, Ewan D. Milne wrote:
> On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>
>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>> race between alua_bus_detach and alua_rtpg.
>>
>> Here is call trace where race got detected.
>>
>> multipathd call stack:
>> [exception RIP: native_queued_spin_lock_slowpath+100]
>> --- <NMI exception stack> ---
>> native_queued_spin_lock_slowpath at ffffffff89307f54
>> queued_spin_lock_slowpath at ffffffff89307c18
>> _raw_spin_lock_irq at ffffffff89bd797b
>> alua_bus_detach at ffffffff8984dcc8
>> scsi_dh_release_device at ffffffff8984b6f2
>> scsi_device_dev_release_usercontext at ffffffff89846edf
>> execute_in_process_context at ffffffff892c3e60
>> scsi_device_dev_release at ffffffff8984637c
>> device_release at ffffffff89800fbc
>> kobject_cleanup at ffffffff89bb1196
>> kobject_put at ffffffff89bb12ea
>> put_device at ffffffff89801283
>> scsi_device_put at ffffffff89838d5b
>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>> sd_release at ffffffffc051f8a2 [sd_mod]
>> __blkdev_put at ffffffff8952c79e
>> blkdev_put at ffffffff8952c80c
>> blkdev_close at ffffffff8952c8b5
>> __fput at ffffffff894e55e6
>> ____fput at ffffffff894e57ee
>> task_work_run at ffffffff892c94dc
>> exit_to_usermode_loop at ffffffff89204b12
>> do_syscall_64 at ffffffff892044da
>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>
>> kworker:
>> [exception RIP: alua_rtpg+2003]
>> account_entity_dequeue at ffffffff892e42c1
>> alua_rtpg_work at ffffffff8984f097
>> process_one_work at ffffffff892c4c29
>> worker_thread at ffffffff892c5a4f
>> kthread at ffffffff892cb135
>> ret_from_fork at ffffffff89c00354
>>
>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>> ---
>>  drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>> index f32da0c..024a752 100644
>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>  	struct alua_dh_data *h = sdev->handler_data;
>>  	struct alua_port_group *pg;
>>  
>> +	sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>> +	flush_workqueue(kaluad_wq);
>> +
>>  	spin_lock(&h->pg_lock);
>>  	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>  	rcu_assign_pointer(h->pg, NULL);
> 
> I'm not sure this is the best solution.  The current code
> references h->sdev when the dh_list is traversed.  So it needs
> to remain valid.  Fixing it by flushing the workqueue to avoid
> the list traversal code running leaves open the possibility that
> future code alterations may expose this problem again.
> 
> -Ewan
> 
> 

I see your point, but as we are in detach handler and this code path only execute when device is being detached. So, before detaching, flush work-queue will take care of any current code references h->sdev where dh_list is being traversed.

IMO, I do not think it would create any problem for future code alterations. Or may be I am missing something over here, what could be possible scenario for that?

---
Jitendra
Hannes Reinecke Sept. 23, 2020, 8:17 a.m. UTC | #5
On 9/18/20 5:49 AM, jitendra.khasdev@oracle.com wrote:
> 
> 
> On 9/17/20 11:00 PM, Ewan D. Milne wrote:
>> On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
>>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>>
>>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>>> race between alua_bus_detach and alua_rtpg.
>>>
>>> Here is call trace where race got detected.
>>>
>>> multipathd call stack:
>>> [exception RIP: native_queued_spin_lock_slowpath+100]
>>> --- <NMI exception stack> ---
>>> native_queued_spin_lock_slowpath at ffffffff89307f54
>>> queued_spin_lock_slowpath at ffffffff89307c18
>>> _raw_spin_lock_irq at ffffffff89bd797b
>>> alua_bus_detach at ffffffff8984dcc8
>>> scsi_dh_release_device at ffffffff8984b6f2
>>> scsi_device_dev_release_usercontext at ffffffff89846edf
>>> execute_in_process_context at ffffffff892c3e60
>>> scsi_device_dev_release at ffffffff8984637c
>>> device_release at ffffffff89800fbc
>>> kobject_cleanup at ffffffff89bb1196
>>> kobject_put at ffffffff89bb12ea
>>> put_device at ffffffff89801283
>>> scsi_device_put at ffffffff89838d5b
>>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>>> sd_release at ffffffffc051f8a2 [sd_mod]
>>> __blkdev_put at ffffffff8952c79e
>>> blkdev_put at ffffffff8952c80c
>>> blkdev_close at ffffffff8952c8b5
>>> __fput at ffffffff894e55e6
>>> ____fput at ffffffff894e57ee
>>> task_work_run at ffffffff892c94dc
>>> exit_to_usermode_loop at ffffffff89204b12
>>> do_syscall_64 at ffffffff892044da
>>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>>
>>> kworker:
>>> [exception RIP: alua_rtpg+2003]
>>> account_entity_dequeue at ffffffff892e42c1
>>> alua_rtpg_work at ffffffff8984f097
>>> process_one_work at ffffffff892c4c29
>>> worker_thread at ffffffff892c5a4f
>>> kthread at ffffffff892cb135
>>> ret_from_fork at ffffffff89c00354
>>>
>>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>>> ---
>>>   drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>>>   1 file changed, 3 insertions(+)
>>>
>>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>>> index f32da0c..024a752 100644
>>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>>   	struct alua_dh_data *h = sdev->handler_data;
>>>   	struct alua_port_group *pg;
>>>   
>>> +	sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>>> +	flush_workqueue(kaluad_wq);
>>> +
>>>   	spin_lock(&h->pg_lock);
>>>   	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>>   	rcu_assign_pointer(h->pg, NULL);
>>
>> I'm not sure this is the best solution.  The current code
>> references h->sdev when the dh_list is traversed.  So it needs
>> to remain valid.  Fixing it by flushing the workqueue to avoid
>> the list traversal code running leaves open the possibility that
>> future code alterations may expose this problem again.
>>
>> -Ewan
>>
>>
> 
> I see your point, but as we are in detach handler and this code path
> only execute when device is being detached. So, before detaching, flush
> work-queue will take care of any current code references h->sdev where
> dh_list is being traversed.
> 
Flushing the workqueue is a bit of an overkill, seeing that we know 
exactly which workqueue element we're waiting for.

> IMO, I do not think it would create any problem for future code
> alterations. Or may be I am missing something over here, what could
> be possible scenario for that?
> 
Problem is more that I'd like to understand where exactly the race 
condition is. Can you figure out which spinlock is triggering in your 
stack trace?

Cheers,

Hannes
Jitendra Khasdev Sept. 23, 2020, 5:50 p.m. UTC | #6
On 9/23/20 1:47 PM, Hannes Reinecke wrote:
> On 9/18/20 5:49 AM, jitendra.khasdev@oracle.com wrote:
>>
>>
>> On 9/17/20 11:00 PM, Ewan D. Milne wrote:
>>> On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
>>>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>>>
>>>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>>>> race between alua_bus_detach and alua_rtpg.
>>>>
>>>> Here is call trace where race got detected.
>>>>
>>>> multipathd call stack:
>>>> [exception RIP: native_queued_spin_lock_slowpath+100]
>>>> --- <NMI exception stack> ---
>>>> native_queued_spin_lock_slowpath at ffffffff89307f54
>>>> queued_spin_lock_slowpath at ffffffff89307c18
>>>> _raw_spin_lock_irq at ffffffff89bd797b
>>>> alua_bus_detach at ffffffff8984dcc8
>>>> scsi_dh_release_device at ffffffff8984b6f2
>>>> scsi_device_dev_release_usercontext at ffffffff89846edf
>>>> execute_in_process_context at ffffffff892c3e60
>>>> scsi_device_dev_release at ffffffff8984637c
>>>> device_release at ffffffff89800fbc
>>>> kobject_cleanup at ffffffff89bb1196
>>>> kobject_put at ffffffff89bb12ea
>>>> put_device at ffffffff89801283
>>>> scsi_device_put at ffffffff89838d5b
>>>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>>>> sd_release at ffffffffc051f8a2 [sd_mod]
>>>> __blkdev_put at ffffffff8952c79e
>>>> blkdev_put at ffffffff8952c80c
>>>> blkdev_close at ffffffff8952c8b5
>>>> __fput at ffffffff894e55e6
>>>> ____fput at ffffffff894e57ee
>>>> task_work_run at ffffffff892c94dc
>>>> exit_to_usermode_loop at ffffffff89204b12
>>>> do_syscall_64 at ffffffff892044da
>>>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>>>
>>>> kworker:
>>>> [exception RIP: alua_rtpg+2003]
>>>> account_entity_dequeue at ffffffff892e42c1
>>>> alua_rtpg_work at ffffffff8984f097
>>>> process_one_work at ffffffff892c4c29
>>>> worker_thread at ffffffff892c5a4f
>>>> kthread at ffffffff892cb135
>>>> ret_from_fork at ffffffff89c00354
>>>>
>>>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>>>> ---
>>>>   drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>>>>   1 file changed, 3 insertions(+)
>>>>
>>>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>> index f32da0c..024a752 100644
>>>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>>>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>>>       struct alua_dh_data *h = sdev->handler_data;
>>>>       struct alua_port_group *pg;
>>>>   +    sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>>>> +    flush_workqueue(kaluad_wq);
>>>> +
>>>>       spin_lock(&h->pg_lock);
>>>>       pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>>>       rcu_assign_pointer(h->pg, NULL);
>>>
>>> I'm not sure this is the best solution.  The current code
>>> references h->sdev when the dh_list is traversed.  So it needs
>>> to remain valid.  Fixing it by flushing the workqueue to avoid
>>> the list traversal code running leaves open the possibility that
>>> future code alterations may expose this problem again.
>>>
>>> -Ewan
>>>
>>>
>>
>> I see your point, but as we are in detach handler and this code path
>> only execute when device is being detached. So, before detaching, flush
>> work-queue will take care of any current code references h->sdev where
>> dh_list is being traversed.
>>
> Flushing the workqueue is a bit of an overkill, seeing that we know exactly which workqueue element we're waiting for.
> 
>> IMO, I do not think it would create any problem for future code
>> alterations. Or may be I am missing something over here, what could
>> be possible scenario for that?
>>
> Problem is more that I'd like to understand where exactly the race condition is. Can you figure out which spinlock is triggering in your stack trace?
> 
> Cheers,
> 
> Hannes

Hannes,

Race is between "alua_bus_detach" and "alua_rtpg_work". 

Whenever we perform fail-over or turn off the switch, the path goes down, which eventually triggers
blkdev_put -> .. -> scsi_device_dev_release -> .. ->  alua_bus_detach meanwhile another thread of alua_rtpg_work also running in parallel. Both threads are using sdev.

In alua_bus_detach, we are setting null to sdev. From above call trace (multipathd) we can see alua_bus_deatch ran first and set sdev to null. It keeps its execution continue and it does not have any problem.  

1138 /*                                                                              
1139  * alua_bus_detach - Detach device handler                                      
1140  * @sdev: device to be detached from                                            
1141  */                                                                             
1142 static void alua_bus_detach(struct scsi_device *sdev)                           
1143 {                                                                               
1144         struct alua_dh_data *h = sdev->handler_data;                            
1145         struct alua_port_group *pg;                                             
1146                                                                                 
1147         spin_lock(&h->pg_lock);                                                 
1148         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));    
1149         rcu_assign_pointer(h->pg, NULL);                                        
*1150*         h->sdev = NULL;  << Looks detach handler won the race and set sdev to null                                                         
1151         spin_unlock(&h->pg_lock);                                               
1152         if (pg) {                                                               
1153                 spin_lock_irq(&pg->lock); <<< from the call trace we can see that we just acquired the lock and got NMI 
exception because we encountered a BUG_ON from different thread.                                        
1154                 list_del_rcu(&h->node);        


Meanwhile alua_rtpg try to check for BUG_ON(!h->sdev); 

alua_rtpg_work -> alua_rtpg
----
 505 static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
 506 {
 .
 .
 .
 659                                         list_for_each_entry_rcu(h,              
 660                                                 &tmp_pg->dh_list, node) {       
 661                                                 /* h->sdev should always be valid */
 *662*                                                 BUG_ON(!h->sdev); <<<< 2nd call trace caused the panic due to this bug on.              
 663                                                 h->sdev->access_state = desc[0];
 664                                         }                                       
 665                                         rcu_read_unlock();                      
 666                                 }                                 
----

So it looks, alua_rtpg_work triggered to alua_rtpg. alua_rtpg function is in its initial execution but didn't reach to line number 662. Meanwhile alua_bus_detach thread comes in and executes line no. 1150 which set sdev to null. Now, if alua_rtpg reaches to line 662 then it would cause the BUG_ON which will result in panic.

---
Jitendra
Hannes Reinecke Sept. 24, 2020, 10:39 a.m. UTC | #7
On 9/23/20 7:50 PM, jitendra.khasdev@oracle.com wrote:
> 
> 
> On 9/23/20 1:47 PM, Hannes Reinecke wrote:
>> On 9/18/20 5:49 AM, jitendra.khasdev@oracle.com wrote:
>>>
>>>
>>> On 9/17/20 11:00 PM, Ewan D. Milne wrote:
>>>> On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
>>>>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>>>>
>>>>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>>>>> race between alua_bus_detach and alua_rtpg.
>>>>>
>>>>> Here is call trace where race got detected.
>>>>>
>>>>> multipathd call stack:
>>>>> [exception RIP: native_queued_spin_lock_slowpath+100]
>>>>> --- <NMI exception stack> ---
>>>>> native_queued_spin_lock_slowpath at ffffffff89307f54
>>>>> queued_spin_lock_slowpath at ffffffff89307c18
>>>>> _raw_spin_lock_irq at ffffffff89bd797b
>>>>> alua_bus_detach at ffffffff8984dcc8
>>>>> scsi_dh_release_device at ffffffff8984b6f2
>>>>> scsi_device_dev_release_usercontext at ffffffff89846edf
>>>>> execute_in_process_context at ffffffff892c3e60
>>>>> scsi_device_dev_release at ffffffff8984637c
>>>>> device_release at ffffffff89800fbc
>>>>> kobject_cleanup at ffffffff89bb1196
>>>>> kobject_put at ffffffff89bb12ea
>>>>> put_device at ffffffff89801283
>>>>> scsi_device_put at ffffffff89838d5b
>>>>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>>>>> sd_release at ffffffffc051f8a2 [sd_mod]
>>>>> __blkdev_put at ffffffff8952c79e
>>>>> blkdev_put at ffffffff8952c80c
>>>>> blkdev_close at ffffffff8952c8b5
>>>>> __fput at ffffffff894e55e6
>>>>> ____fput at ffffffff894e57ee
>>>>> task_work_run at ffffffff892c94dc
>>>>> exit_to_usermode_loop at ffffffff89204b12
>>>>> do_syscall_64 at ffffffff892044da
>>>>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>>>>
>>>>> kworker:
>>>>> [exception RIP: alua_rtpg+2003]
>>>>> account_entity_dequeue at ffffffff892e42c1
>>>>> alua_rtpg_work at ffffffff8984f097
>>>>> process_one_work at ffffffff892c4c29
>>>>> worker_thread at ffffffff892c5a4f
>>>>> kthread at ffffffff892cb135
>>>>> ret_from_fork at ffffffff89c00354
>>>>>
>>>>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>>>>> ---
>>>>>    drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>>>>>    1 file changed, 3 insertions(+)
>>>>>
>>>>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>> index f32da0c..024a752 100644
>>>>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>>>>        struct alua_dh_data *h = sdev->handler_data;
>>>>>        struct alua_port_group *pg;
>>>>>    +    sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>>>>> +    flush_workqueue(kaluad_wq);
>>>>> +
>>>>>        spin_lock(&h->pg_lock);
>>>>>        pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>>>>        rcu_assign_pointer(h->pg, NULL);
>>>>
>>>> I'm not sure this is the best solution.  The current code
>>>> references h->sdev when the dh_list is traversed.  So it needs
>>>> to remain valid.  Fixing it by flushing the workqueue to avoid
>>>> the list traversal code running leaves open the possibility that
>>>> future code alterations may expose this problem again.
>>>>
>>>> -Ewan
>>>>
>>>>
>>>
>>> I see your point, but as we are in detach handler and this code path
>>> only execute when device is being detached. So, before detaching, flush
>>> work-queue will take care of any current code references h->sdev where
>>> dh_list is being traversed.
>>>
>> Flushing the workqueue is a bit of an overkill, seeing that we know exactly which workqueue element we're waiting for.
>>
>>> IMO, I do not think it would create any problem for future code
>>> alterations. Or may be I am missing something over here, what could
>>> be possible scenario for that?
>>>
>> Problem is more that I'd like to understand where exactly the race condition is. Can you figure out which spinlock is triggering in your stack trace?
>>
>> Cheers,
>>
>> Hannes
> 
> Hannes,
> 
> Race is between "alua_bus_detach" and "alua_rtpg_work".
> 
> Whenever we perform fail-over or turn off the switch, the path goes down, which eventually triggers
> blkdev_put -> .. -> scsi_device_dev_release -> .. ->  alua_bus_detach meanwhile another thread of alua_rtpg_work also running in parallel. Both threads are using sdev.
> 
> In alua_bus_detach, we are setting null to sdev. From above call trace (multipathd) we can see alua_bus_deatch ran first and set sdev to null. It keeps its execution continue and it does not have any problem.
> 
> 1138 /*
> 1139  * alua_bus_detach - Detach device handler
> 1140  * @sdev: device to be detached from
> 1141  */
> 1142 static void alua_bus_detach(struct scsi_device *sdev)
> 1143 {
> 1144         struct alua_dh_data *h = sdev->handler_data;
> 1145         struct alua_port_group *pg;
> 1146
> 1147         spin_lock(&h->pg_lock);
> 1148         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
> 1149         rcu_assign_pointer(h->pg, NULL);
> *1150*         h->sdev = NULL;  << Looks detach handler won the race and set sdev to null
> 1151         spin_unlock(&h->pg_lock);
> 1152         if (pg) {
> 1153                 spin_lock_irq(&pg->lock); <<< from the call trace we can see that we just acquired the lock and got NMI
> exception because we encountered a BUG_ON from different thread.
> 1154                 list_del_rcu(&h->node);
> 
> 
> Meanwhile alua_rtpg try to check for BUG_ON(!h->sdev);
> 
> alua_rtpg_work -> alua_rtpg
> ----
>   505 static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
>   506 {
>   .
>   .
>   .
>   659                                         list_for_each_entry_rcu(h,
>   660                                                 &tmp_pg->dh_list, node) {
>   661                                                 /* h->sdev should always be valid */
>   *662*                                                 BUG_ON(!h->sdev); <<<< 2nd call trace caused the panic due to this bug on.
>   663                                                 h->sdev->access_state = desc[0];
>   664                                         }
>   665                                         rcu_read_unlock();
>   666                                 }
> ----
> 
Ah, yes.

We would need to take 'h->lock' here before checking 'h->sdev'.
Alternatively, we should be able to fix it by not setting h->sdev to 
NULL, and issuing rcu_synchronize() before issuing kfree(h):

@@ -1147,7 +1148,6 @@ static void alua_bus_detach(struct scsi_device *sdev)
         spin_lock(&h->pg_lock);
         pg = rcu_dereference_protected(h->pg, 
lockdep_is_held(&h->pg_lock));
         rcu_assign_pointer(h->pg, NULL);
-       h->sdev = NULL;
         spin_unlock(&h->pg_lock);
         if (pg) {
                 spin_lock_irq(&pg->lock);
@@ -1156,6 +1156,7 @@ static void alua_bus_detach(struct scsi_device *sdev)
                 kref_put(&pg->kref, release_port_group);
         }
         sdev->handler_data = NULL;
+       rcu_synchronize();
         kfree(h);
  }

The 'rcu_synchronize()' will ensure that any concurrent thread has left 
the rcu-critical section (ie the loop mentioned above), and the issue 
will be avoided.
Additionally, we could replace the BUG_ON() with

if (!h->sdev)
     continue;

and the problem should be solved.

Cheers,

Hannes
Jitendra Khasdev Sept. 29, 2020, 5:44 p.m. UTC | #8
On 9/24/20 4:09 PM, Hannes Reinecke wrote:
> On 9/23/20 7:50 PM, jitendra.khasdev@oracle.com wrote:
>>
>>
>> On 9/23/20 1:47 PM, Hannes Reinecke wrote:
>>> On 9/18/20 5:49 AM, jitendra.khasdev@oracle.com wrote:
>>>>
>>>>
>>>> On 9/17/20 11:00 PM, Ewan D. Milne wrote:
>>>>> On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
>>>>>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>>>>>
>>>>>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>>>>>> race between alua_bus_detach and alua_rtpg.
>>>>>>
>>>>>> Here is call trace where race got detected.
>>>>>>
>>>>>> multipathd call stack:
>>>>>> [exception RIP: native_queued_spin_lock_slowpath+100]
>>>>>> --- <NMI exception stack> ---
>>>>>> native_queued_spin_lock_slowpath at ffffffff89307f54
>>>>>> queued_spin_lock_slowpath at ffffffff89307c18
>>>>>> _raw_spin_lock_irq at ffffffff89bd797b
>>>>>> alua_bus_detach at ffffffff8984dcc8
>>>>>> scsi_dh_release_device at ffffffff8984b6f2
>>>>>> scsi_device_dev_release_usercontext at ffffffff89846edf
>>>>>> execute_in_process_context at ffffffff892c3e60
>>>>>> scsi_device_dev_release at ffffffff8984637c
>>>>>> device_release at ffffffff89800fbc
>>>>>> kobject_cleanup at ffffffff89bb1196
>>>>>> kobject_put at ffffffff89bb12ea
>>>>>> put_device at ffffffff89801283
>>>>>> scsi_device_put at ffffffff89838d5b
>>>>>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>>>>>> sd_release at ffffffffc051f8a2 [sd_mod]
>>>>>> __blkdev_put at ffffffff8952c79e
>>>>>> blkdev_put at ffffffff8952c80c
>>>>>> blkdev_close at ffffffff8952c8b5
>>>>>> __fput at ffffffff894e55e6
>>>>>> ____fput at ffffffff894e57ee
>>>>>> task_work_run at ffffffff892c94dc
>>>>>> exit_to_usermode_loop at ffffffff89204b12
>>>>>> do_syscall_64 at ffffffff892044da
>>>>>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>>>>>
>>>>>> kworker:
>>>>>> [exception RIP: alua_rtpg+2003]
>>>>>> account_entity_dequeue at ffffffff892e42c1
>>>>>> alua_rtpg_work at ffffffff8984f097
>>>>>> process_one_work at ffffffff892c4c29
>>>>>> worker_thread at ffffffff892c5a4f
>>>>>> kthread at ffffffff892cb135
>>>>>> ret_from_fork at ffffffff89c00354
>>>>>>
>>>>>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>>>>>> ---
>>>>>>    drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>>>>>>    1 file changed, 3 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>> index f32da0c..024a752 100644
>>>>>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>>>>>        struct alua_dh_data *h = sdev->handler_data;
>>>>>>        struct alua_port_group *pg;
>>>>>>    +    sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>>>>>> +    flush_workqueue(kaluad_wq);
>>>>>> +
>>>>>>        spin_lock(&h->pg_lock);
>>>>>>        pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>>>>>        rcu_assign_pointer(h->pg, NULL);
>>>>>
>>>>> I'm not sure this is the best solution.  The current code
>>>>> references h->sdev when the dh_list is traversed.  So it needs
>>>>> to remain valid.  Fixing it by flushing the workqueue to avoid
>>>>> the list traversal code running leaves open the possibility that
>>>>> future code alterations may expose this problem again.
>>>>>
>>>>> -Ewan
>>>>>
>>>>>
>>>>
>>>> I see your point, but as we are in detach handler and this code path
>>>> only execute when device is being detached. So, before detaching, flush
>>>> work-queue will take care of any current code references h->sdev where
>>>> dh_list is being traversed.
>>>>
>>> Flushing the workqueue is a bit of an overkill, seeing that we know exactly which workqueue element we're waiting for.
>>>
>>>> IMO, I do not think it would create any problem for future code
>>>> alterations. Or may be I am missing something over here, what could
>>>> be possible scenario for that?
>>>>
>>> Problem is more that I'd like to understand where exactly the race condition is. Can you figure out which spinlock is triggering in your stack trace?
>>>
>>> Cheers,
>>>
>>> Hannes
>>
>> Hannes,
>>
>> Race is between "alua_bus_detach" and "alua_rtpg_work".
>>
>> Whenever we perform fail-over or turn off the switch, the path goes down, which eventually triggers
>> blkdev_put -> .. -> scsi_device_dev_release -> .. ->  alua_bus_detach meanwhile another thread of alua_rtpg_work also running in parallel. Both threads are using sdev.
>>
>> In alua_bus_detach, we are setting null to sdev. From above call trace (multipathd) we can see alua_bus_deatch ran first and set sdev to null. It keeps its execution continue and it does not have any problem.
>>
>> 1138 /*
>> 1139  * alua_bus_detach - Detach device handler
>> 1140  * @sdev: device to be detached from
>> 1141  */
>> 1142 static void alua_bus_detach(struct scsi_device *sdev)
>> 1143 {
>> 1144         struct alua_dh_data *h = sdev->handler_data;
>> 1145         struct alua_port_group *pg;
>> 1146
>> 1147         spin_lock(&h->pg_lock);
>> 1148         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>> 1149         rcu_assign_pointer(h->pg, NULL);
>> *1150*         h->sdev = NULL;  << Looks detach handler won the race and set sdev to null
>> 1151         spin_unlock(&h->pg_lock);
>> 1152         if (pg) {
>> 1153                 spin_lock_irq(&pg->lock); <<< from the call trace we can see that we just acquired the lock and got NMI
>> exception because we encountered a BUG_ON from different thread.
>> 1154                 list_del_rcu(&h->node);
>>
>>
>> Meanwhile alua_rtpg try to check for BUG_ON(!h->sdev);
>>
>> alua_rtpg_work -> alua_rtpg
>> ----
>>   505 static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
>>   506 {
>>   .
>>   .
>>   .
>>   659                                         list_for_each_entry_rcu(h,
>>   660                                                 &tmp_pg->dh_list, node) {
>>   661                                                 /* h->sdev should always be valid */
>>   *662*                                                 BUG_ON(!h->sdev); <<<< 2nd call trace caused the panic due to this bug on.
>>   663                                                 h->sdev->access_state = desc[0];
>>   664                                         }
>>   665                                         rcu_read_unlock();
>>   666                                 }
>> ----
>>
> Ah, yes.
> 
> We would need to take 'h->lock' here before checking 'h->sdev'.
> Alternatively, we should be able to fix it by not setting h->sdev to NULL, and issuing rcu_synchronize() before issuing kfree(h):
> 
> @@ -1147,7 +1148,6 @@ static void alua_bus_detach(struct scsi_device *sdev)
>         spin_lock(&h->pg_lock);
>         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>         rcu_assign_pointer(h->pg, NULL);
> -       h->sdev = NULL;
>         spin_unlock(&h->pg_lock);
>         if (pg) {
>                 spin_lock_irq(&pg->lock);
> @@ -1156,6 +1156,7 @@ static void alua_bus_detach(struct scsi_device *sdev)
>                 kref_put(&pg->kref, release_port_group);
>         }
>         sdev->handler_data = NULL;
> +       rcu_synchronize();
>         kfree(h);
>  }
> 
> The 'rcu_synchronize()' will ensure that any concurrent thread has left the rcu-critical section (ie the loop mentioned above), and the issue will be avoided.
> Additionally, we could replace the BUG_ON() with
> 
> if (!h->sdev)
>     continue;
> 
> and the problem should be solved.
> 
> Cheers,
> 
> Hannes

Thanks Hannes for patch, I am currently testing it.

---
Jitendra
Jitendra Khasdev Oct. 12, 2020, 8:52 a.m. UTC | #9
Hi Hannes,

On 9/24/20 4:09 PM, Hannes Reinecke wrote:
> On 9/23/20 7:50 PM, jitendra.khasdev@oracle.com wrote:
>>
>>
>> On 9/23/20 1:47 PM, Hannes Reinecke wrote:
>>> On 9/18/20 5:49 AM, jitendra.khasdev@oracle.com wrote:
>>>>
>>>>
>>>> On 9/17/20 11:00 PM, Ewan D. Milne wrote:
>>>>> On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
>>>>>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>>>>>
>>>>>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>>>>>> race between alua_bus_detach and alua_rtpg.
>>>>>>
>>>>>> Here is call trace where race got detected.
>>>>>>
>>>>>> multipathd call stack:
>>>>>> [exception RIP: native_queued_spin_lock_slowpath+100]
>>>>>> --- <NMI exception stack> ---
>>>>>> native_queued_spin_lock_slowpath at ffffffff89307f54
>>>>>> queued_spin_lock_slowpath at ffffffff89307c18
>>>>>> _raw_spin_lock_irq at ffffffff89bd797b
>>>>>> alua_bus_detach at ffffffff8984dcc8
>>>>>> scsi_dh_release_device at ffffffff8984b6f2
>>>>>> scsi_device_dev_release_usercontext at ffffffff89846edf
>>>>>> execute_in_process_context at ffffffff892c3e60
>>>>>> scsi_device_dev_release at ffffffff8984637c
>>>>>> device_release at ffffffff89800fbc
>>>>>> kobject_cleanup at ffffffff89bb1196
>>>>>> kobject_put at ffffffff89bb12ea
>>>>>> put_device at ffffffff89801283
>>>>>> scsi_device_put at ffffffff89838d5b
>>>>>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>>>>>> sd_release at ffffffffc051f8a2 [sd_mod]
>>>>>> __blkdev_put at ffffffff8952c79e
>>>>>> blkdev_put at ffffffff8952c80c
>>>>>> blkdev_close at ffffffff8952c8b5
>>>>>> __fput at ffffffff894e55e6
>>>>>> ____fput at ffffffff894e57ee
>>>>>> task_work_run at ffffffff892c94dc
>>>>>> exit_to_usermode_loop at ffffffff89204b12
>>>>>> do_syscall_64 at ffffffff892044da
>>>>>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>>>>>
>>>>>> kworker:
>>>>>> [exception RIP: alua_rtpg+2003]
>>>>>> account_entity_dequeue at ffffffff892e42c1
>>>>>> alua_rtpg_work at ffffffff8984f097
>>>>>> process_one_work at ffffffff892c4c29
>>>>>> worker_thread at ffffffff892c5a4f
>>>>>> kthread at ffffffff892cb135
>>>>>> ret_from_fork at ffffffff89c00354
>>>>>>
>>>>>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>>>>>> ---
>>>>>>    drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>>>>>>    1 file changed, 3 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>> index f32da0c..024a752 100644
>>>>>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>>>>>        struct alua_dh_data *h = sdev->handler_data;
>>>>>>        struct alua_port_group *pg;
>>>>>>    +    sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>>>>>> +    flush_workqueue(kaluad_wq);
>>>>>> +
>>>>>>        spin_lock(&h->pg_lock);
>>>>>>        pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>>>>>        rcu_assign_pointer(h->pg, NULL);
>>>>>
>>>>> I'm not sure this is the best solution.  The current code
>>>>> references h->sdev when the dh_list is traversed.  So it needs
>>>>> to remain valid.  Fixing it by flushing the workqueue to avoid
>>>>> the list traversal code running leaves open the possibility that
>>>>> future code alterations may expose this problem again.
>>>>>
>>>>> -Ewan
>>>>>
>>>>>
>>>>
>>>> I see your point, but as we are in detach handler and this code path
>>>> only execute when device is being detached. So, before detaching, flush
>>>> work-queue will take care of any current code references h->sdev where
>>>> dh_list is being traversed.
>>>>
>>> Flushing the workqueue is a bit of an overkill, seeing that we know exactly which workqueue element we're waiting for.
>>>
>>>> IMO, I do not think it would create any problem for future code
>>>> alterations. Or may be I am missing something over here, what could
>>>> be possible scenario for that?
>>>>
>>> Problem is more that I'd like to understand where exactly the race condition is. Can you figure out which spinlock is triggering in your stack trace?
>>>
>>> Cheers,
>>>
>>> Hannes
>>
>> Hannes,
>>
>> Race is between "alua_bus_detach" and "alua_rtpg_work".
>>
>> Whenever we perform fail-over or turn off the switch, the path goes down, which eventually triggers
>> blkdev_put -> .. -> scsi_device_dev_release -> .. ->  alua_bus_detach meanwhile another thread of alua_rtpg_work also running in parallel. Both threads are using sdev.
>>
>> In alua_bus_detach, we are setting null to sdev. From above call trace (multipathd) we can see alua_bus_deatch ran first and set sdev to null. It keeps its execution continue and it does not have any problem.
>>
>> 1138 /*
>> 1139  * alua_bus_detach - Detach device handler
>> 1140  * @sdev: device to be detached from
>> 1141  */
>> 1142 static void alua_bus_detach(struct scsi_device *sdev)
>> 1143 {
>> 1144         struct alua_dh_data *h = sdev->handler_data;
>> 1145         struct alua_port_group *pg;
>> 1146
>> 1147         spin_lock(&h->pg_lock);
>> 1148         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>> 1149         rcu_assign_pointer(h->pg, NULL);
>> *1150*         h->sdev = NULL;  << Looks detach handler won the race and set sdev to null
>> 1151         spin_unlock(&h->pg_lock);
>> 1152         if (pg) {
>> 1153                 spin_lock_irq(&pg->lock); <<< from the call trace we can see that we just acquired the lock and got NMI
>> exception because we encountered a BUG_ON from different thread.
>> 1154                 list_del_rcu(&h->node);
>>
>>
>> Meanwhile alua_rtpg try to check for BUG_ON(!h->sdev);
>>
>> alua_rtpg_work -> alua_rtpg
>> ----
>>   505 static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
>>   506 {
>>   .
>>   .
>>   .
>>   659                                         list_for_each_entry_rcu(h,
>>   660                                                 &tmp_pg->dh_list, node) {
>>   661                                                 /* h->sdev should always be valid */
>>   *662*                                                 BUG_ON(!h->sdev); <<<< 2nd call trace caused the panic due to this bug on.
>>   663                                                 h->sdev->access_state = desc[0];
>>   664                                         }
>>   665                                         rcu_read_unlock();
>>   666                                 }
>> ----
>>
> Ah, yes.
> 
> We would need to take 'h->lock' here before checking 'h->sdev'.
> Alternatively, we should be able to fix it by not setting h->sdev to NULL, and issuing rcu_synchronize() before issuing kfree(h):
> 
> @@ -1147,7 +1148,6 @@ static void alua_bus_detach(struct scsi_device *sdev)
>         spin_lock(&h->pg_lock);
>         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>         rcu_assign_pointer(h->pg, NULL);
> -       h->sdev = NULL;
>         spin_unlock(&h->pg_lock);
>         if (pg) {
>                 spin_lock_irq(&pg->lock);
> @@ -1156,6 +1156,7 @@ static void alua_bus_detach(struct scsi_device *sdev)
>                 kref_put(&pg->kref, release_port_group);
>         }
>         sdev->handler_data = NULL;
> +       rcu_synchronize();
>         kfree(h);
>  }
> 
> The 'rcu_synchronize()' will ensure that any concurrent thread has left the rcu-critical section (ie the loop mentioned above), and the issue will be avoided.
> Additionally, we could replace the BUG_ON() with
> 
> if (!h->sdev)
>     continue;
> 
> and the problem should be solved.
> 
> Cheers,
> 
> Hannes


This patch works and avoid crash during fail-over. It looks good to me in testing.

---
Jitendra
Jitendra Khasdev Oct. 20, 2020, 1:13 p.m. UTC | #10
Hi Hannes,

On 10/12/20 2:22 PM, jitendra.khasdev@oracle.com wrote:
> Hi Hannes,
> 
> On 9/24/20 4:09 PM, Hannes Reinecke wrote:
>> On 9/23/20 7:50 PM, jitendra.khasdev@oracle.com wrote:
>>>
>>>
>>> On 9/23/20 1:47 PM, Hannes Reinecke wrote:
>>>> On 9/18/20 5:49 AM, jitendra.khasdev@oracle.com wrote:
>>>>>
>>>>>
>>>>> On 9/17/20 11:00 PM, Ewan D. Milne wrote:
>>>>>> On Tue, 2020-09-15 at 16:28 +0530, Jitendra Khasdev wrote:
>>>>>>> This is patch to fix the race occurs between bus detach and alua_rtpg.
>>>>>>>
>>>>>>> It fluses the all pending workqueue in bus detach handler, so it can avoid
>>>>>>> race between alua_bus_detach and alua_rtpg.
>>>>>>>
>>>>>>> Here is call trace where race got detected.
>>>>>>>
>>>>>>> multipathd call stack:
>>>>>>> [exception RIP: native_queued_spin_lock_slowpath+100]
>>>>>>> --- <NMI exception stack> ---
>>>>>>> native_queued_spin_lock_slowpath at ffffffff89307f54
>>>>>>> queued_spin_lock_slowpath at ffffffff89307c18
>>>>>>> _raw_spin_lock_irq at ffffffff89bd797b
>>>>>>> alua_bus_detach at ffffffff8984dcc8
>>>>>>> scsi_dh_release_device at ffffffff8984b6f2
>>>>>>> scsi_device_dev_release_usercontext at ffffffff89846edf
>>>>>>> execute_in_process_context at ffffffff892c3e60
>>>>>>> scsi_device_dev_release at ffffffff8984637c
>>>>>>> device_release at ffffffff89800fbc
>>>>>>> kobject_cleanup at ffffffff89bb1196
>>>>>>> kobject_put at ffffffff89bb12ea
>>>>>>> put_device at ffffffff89801283
>>>>>>> scsi_device_put at ffffffff89838d5b
>>>>>>> scsi_disk_put at ffffffffc051f650 [sd_mod]
>>>>>>> sd_release at ffffffffc051f8a2 [sd_mod]
>>>>>>> __blkdev_put at ffffffff8952c79e
>>>>>>> blkdev_put at ffffffff8952c80c
>>>>>>> blkdev_close at ffffffff8952c8b5
>>>>>>> __fput at ffffffff894e55e6
>>>>>>> ____fput at ffffffff894e57ee
>>>>>>> task_work_run at ffffffff892c94dc
>>>>>>> exit_to_usermode_loop at ffffffff89204b12
>>>>>>> do_syscall_64 at ffffffff892044da
>>>>>>> entry_SYSCALL_64_after_hwframe at ffffffff89c001b8
>>>>>>>
>>>>>>> kworker:
>>>>>>> [exception RIP: alua_rtpg+2003]
>>>>>>> account_entity_dequeue at ffffffff892e42c1
>>>>>>> alua_rtpg_work at ffffffff8984f097
>>>>>>> process_one_work at ffffffff892c4c29
>>>>>>> worker_thread at ffffffff892c5a4f
>>>>>>> kthread at ffffffff892cb135
>>>>>>> ret_from_fork at ffffffff89c00354
>>>>>>>
>>>>>>> Signed-off-by: Jitendra Khasdev <jitendra.khasdev@oracle.com>
>>>>>>> ---
>>>>>>>    drivers/scsi/device_handler/scsi_dh_alua.c | 3 +++
>>>>>>>    1 file changed, 3 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>>> index f32da0c..024a752 100644
>>>>>>> --- a/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>>> +++ b/drivers/scsi/device_handler/scsi_dh_alua.c
>>>>>>> @@ -1144,6 +1144,9 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>>>>>>        struct alua_dh_data *h = sdev->handler_data;
>>>>>>>        struct alua_port_group *pg;
>>>>>>>    +    sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
>>>>>>> +    flush_workqueue(kaluad_wq);
>>>>>>> +
>>>>>>>        spin_lock(&h->pg_lock);
>>>>>>>        pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>>>>>>        rcu_assign_pointer(h->pg, NULL);
>>>>>>
>>>>>> I'm not sure this is the best solution.  The current code
>>>>>> references h->sdev when the dh_list is traversed.  So it needs
>>>>>> to remain valid.  Fixing it by flushing the workqueue to avoid
>>>>>> the list traversal code running leaves open the possibility that
>>>>>> future code alterations may expose this problem again.
>>>>>>
>>>>>> -Ewan
>>>>>>
>>>>>>
>>>>>
>>>>> I see your point, but as we are in detach handler and this code path
>>>>> only execute when device is being detached. So, before detaching, flush
>>>>> work-queue will take care of any current code references h->sdev where
>>>>> dh_list is being traversed.
>>>>>
>>>> Flushing the workqueue is a bit of an overkill, seeing that we know exactly which workqueue element we're waiting for.
>>>>
>>>>> IMO, I do not think it would create any problem for future code
>>>>> alterations. Or may be I am missing something over here, what could
>>>>> be possible scenario for that?
>>>>>
>>>> Problem is more that I'd like to understand where exactly the race condition is. Can you figure out which spinlock is triggering in your stack trace?
>>>>
>>>> Cheers,
>>>>
>>>> Hannes
>>>
>>> Hannes,
>>>
>>> Race is between "alua_bus_detach" and "alua_rtpg_work".
>>>
>>> Whenever we perform fail-over or turn off the switch, the path goes down, which eventually triggers
>>> blkdev_put -> .. -> scsi_device_dev_release -> .. ->  alua_bus_detach meanwhile another thread of alua_rtpg_work also running in parallel. Both threads are using sdev.
>>>
>>> In alua_bus_detach, we are setting null to sdev. From above call trace (multipathd) we can see alua_bus_deatch ran first and set sdev to null. It keeps its execution continue and it does not have any problem.
>>>
>>> 1138 /*
>>> 1139  * alua_bus_detach - Detach device handler
>>> 1140  * @sdev: device to be detached from
>>> 1141  */
>>> 1142 static void alua_bus_detach(struct scsi_device *sdev)
>>> 1143 {
>>> 1144         struct alua_dh_data *h = sdev->handler_data;
>>> 1145         struct alua_port_group *pg;
>>> 1146
>>> 1147         spin_lock(&h->pg_lock);
>>> 1148         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>> 1149         rcu_assign_pointer(h->pg, NULL);
>>> *1150*         h->sdev = NULL;  << Looks detach handler won the race and set sdev to null
>>> 1151         spin_unlock(&h->pg_lock);
>>> 1152         if (pg) {
>>> 1153                 spin_lock_irq(&pg->lock); <<< from the call trace we can see that we just acquired the lock and got NMI
>>> exception because we encountered a BUG_ON from different thread.
>>> 1154                 list_del_rcu(&h->node);
>>>
>>>
>>> Meanwhile alua_rtpg try to check for BUG_ON(!h->sdev);
>>>
>>> alua_rtpg_work -> alua_rtpg
>>> ----
>>>   505 static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
>>>   506 {
>>>   .
>>>   .
>>>   .
>>>   659                                         list_for_each_entry_rcu(h,
>>>   660                                                 &tmp_pg->dh_list, node) {
>>>   661                                                 /* h->sdev should always be valid */
>>>   *662*                                                 BUG_ON(!h->sdev); <<<< 2nd call trace caused the panic due to this bug on.
>>>   663                                                 h->sdev->access_state = desc[0];
>>>   664                                         }
>>>   665                                         rcu_read_unlock();
>>>   666                                 }
>>> ----
>>>
>> Ah, yes.
>>
>> We would need to take 'h->lock' here before checking 'h->sdev'.
>> Alternatively, we should be able to fix it by not setting h->sdev to NULL, and issuing rcu_synchronize() before issuing kfree(h):
>>
>> @@ -1147,7 +1148,6 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>         spin_lock(&h->pg_lock);
>>         pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
>>         rcu_assign_pointer(h->pg, NULL);
>> -       h->sdev = NULL;
>>         spin_unlock(&h->pg_lock);
>>         if (pg) {
>>                 spin_lock_irq(&pg->lock);
>> @@ -1156,6 +1156,7 @@ static void alua_bus_detach(struct scsi_device *sdev)
>>                 kref_put(&pg->kref, release_port_group);
>>         }
>>         sdev->handler_data = NULL;
>> +       rcu_synchronize();
>>         kfree(h);
>>  }
>>
>> The 'rcu_synchronize()' will ensure that any concurrent thread has left the rcu-critical section (ie the loop mentioned above), and the issue will be avoided.
>> Additionally, we could replace the BUG_ON() with
>>
>> if (!h->sdev)
>>     continue;
>>
>> and the problem should be solved.
>>
>> Cheers,
>>
>> Hannes
> 
> 
> This patch works and avoid crash during fail-over. It looks good to me in testing.
> 
> ---
> Jitendra
> 


Gentle reminder, I am wondering if we can proceed to integrate this patch to mainline. 

---
Jitendra
diff mbox series

Patch

diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index f32da0c..024a752 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -1144,6 +1144,9 @@  static void alua_bus_detach(struct scsi_device *sdev)
 	struct alua_dh_data *h = sdev->handler_data;
 	struct alua_port_group *pg;
 
+	sdev_printk(KERN_INFO, sdev, "%s: flushing workqueues\n", ALUA_DH_NAME);
+	flush_workqueue(kaluad_wq);
+
 	spin_lock(&h->pg_lock);
 	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
 	rcu_assign_pointer(h->pg, NULL);