diff mbox series

scsi: fix ata_port_wait_eh() hung caused by missing to wake up eh thread

Message ID 1553504726-40837-1-git-send-email-zhengbin13@huawei.com (mailing list archive)
State Changes Requested
Headers show
Series scsi: fix ata_port_wait_eh() hung caused by missing to wake up eh thread | expand

Commit Message

Zheng Bin March 25, 2019, 9:05 a.m. UTC
When I use fio test kernel in the following steps:
1.The sas controller mixes SAS/SATA disks
2.Use fio test all disks
3.Simultaneous enable/disable/link_reset/hard_reset PHY

it will hung in ata_port_wait_eh
Call trace:
 __switch_to+0xb4/0x1b8
 __schedule+0x1e8/0x718
 schedule+0x38/0x90
 ata_port_wait_eh+0x70/0xf8
 sas_ata_wait_eh+0x24/0x30 [libsas]
 transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
 phy_reset_work+0x20/0x30 [libsas]
 process_one_work+0x1e4/0x460
 worker_thread+0x40/0x450
 kthread+0x12c/0x130
 ret_from_fork+0x10/0x18

The key code process is like this:
scsi_dec_host_busy
	atomic_dec(&shost->host_busy);
	if (unlikely(scsi_host_in_recovery(shost))) {
		spin_lock_irqsave(shost->host_lock, flags);
		...
		scsi_eh_wakeup(shost)
		...
	}

scsi_schedule_eh
	spin_lock_irqsave(shost->host_lock, flags);
	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
		...
		scsi_eh_wakeup(shost);
	}

scsi_eh_wakeup
	if (scsi_host_busy(shost) == shost->host_failed)
		wake_up_process(shost->ehandler);

In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
function wakes up the SCSI error handler in the following timing:

CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
LOAD shost_state(!=recovery)
                                  scsi_host_set_state(SHOST_RECOVERY)
                                  scsi_eh_wakeup(host_busy != host_failed)
atomic_dec(&shost->host_busy);
if (scsi_host_in_recovery(shost))

Add a smp_mb between host_busy and shost_state.

Signed-off-by: zhengbin <zhengbin13@huawei.com>
---
 drivers/scsi/scsi_error.c | 7 +++++++
 drivers/scsi/scsi_lib.c   | 5 +++++
 2 files changed, 12 insertions(+)

--
2.7.4

Comments

Pavel Tikhomirov March 25, 2019, 10:02 a.m. UTC | #1
Likely we should do the same for host_eh_scheduled++ as we do for 
host_failed++ in scsi_eh_inc_host_failed. Just put it in call_rcu. These 
way rcu_read_lock would be enough: if we don't see RECOVERY state in 
scsi_dec_host_busy, that means that host_eh_scheduled++ and 
corresponding scsi_eh_wakeup not yet happened, and they will handle 
wakeup for us.

Bart did these rcu-way to make common path faster, so better we do it 
without slow mem-barrier.

On 3/25/19 12:05 PM, zhengbin wrote:
> When I use fio test kernel in the following steps:
> 1.The sas controller mixes SAS/SATA disks
> 2.Use fio test all disks
> 3.Simultaneous enable/disable/link_reset/hard_reset PHY
> 
> it will hung in ata_port_wait_eh
> Call trace:
>   __switch_to+0xb4/0x1b8
>   __schedule+0x1e8/0x718
>   schedule+0x38/0x90
>   ata_port_wait_eh+0x70/0xf8
>   sas_ata_wait_eh+0x24/0x30 [libsas]
>   transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
>   phy_reset_work+0x20/0x30 [libsas]
>   process_one_work+0x1e4/0x460
>   worker_thread+0x40/0x450
>   kthread+0x12c/0x130
>   ret_from_fork+0x10/0x18
> 
> The key code process is like this:
> scsi_dec_host_busy
> 	atomic_dec(&shost->host_busy);
> 	if (unlikely(scsi_host_in_recovery(shost))) {
> 		spin_lock_irqsave(shost->host_lock, flags);
> 		...
> 		scsi_eh_wakeup(shost)
> 		...
> 	}
> 
> scsi_schedule_eh
> 	spin_lock_irqsave(shost->host_lock, flags);
> 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
> 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
> 		...
> 		scsi_eh_wakeup(shost);
> 	}
> 
> scsi_eh_wakeup
> 	if (scsi_host_busy(shost) == shost->host_failed)
> 		wake_up_process(shost->ehandler);
> 
> In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
> function wakes up the SCSI error handler in the following timing:
> 
> CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
> LOAD shost_state(!=recovery)
>                                    scsi_host_set_state(SHOST_RECOVERY)
>                                    scsi_eh_wakeup(host_busy != host_failed)
> atomic_dec(&shost->host_busy);
> if (scsi_host_in_recovery(shost))
> 
> Add a smp_mb between host_busy and shost_state.
> 
> Signed-off-by: zhengbin <zhengbin13@huawei.com>
> ---
>   drivers/scsi/scsi_error.c | 7 +++++++
>   drivers/scsi/scsi_lib.c   | 5 +++++
>   2 files changed, 12 insertions(+)
> 
> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> index 1b8378f..c605a71 100644
> --- a/drivers/scsi/scsi_error.c
> +++ b/drivers/scsi/scsi_error.c
> @@ -88,6 +88,13 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
> 
>   	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>   	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
> +		/*
> +		 * We have to order shost_state store above and test of
> +		 * the host_busy(scsi_eh_wakeup will test it), because
> +		 * scsi_dec_host_busy accesses these variables without
> +		 * host_lock.
> +		 */
> +		smp_mb();
>   		shost->host_eh_scheduled++;
>   		scsi_eh_wakeup(shost);
>   	}
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 601b9f1..9094d20 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -337,6 +337,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost)
> 
>   	rcu_read_lock();
>   	atomic_dec(&shost->host_busy);
> +	/*
> +	 * We have to order host_busy dec above and test of the shost_state
> +	 * below outside the host_lock.
> +	 */
> +	smp_mb();
>   	if (unlikely(scsi_host_in_recovery(shost))) {
>   		spin_lock_irqsave(shost->host_lock, flags);
>   		if (shost->host_failed || shost->host_eh_scheduled)
> --
> 2.7.4
>
Zheng Bin March 25, 2019, 11:55 a.m. UTC | #2
On 2019/3/25 18:02, Pavel Tikhomirov wrote:
> Likely we should do the same for host_eh_scheduled++ as we do for 
> host_failed++ in scsi_eh_inc_host_failed. Just put it in call_rcu. These 
> way rcu_read_lock would be enough: if we don't see RECOVERY state in 
> scsi_dec_host_busy, that means that host_eh_scheduled++ and 
> corresponding scsi_eh_wakeup not yet happened, and they will handle 
> wakeup for us.
> 
> Bart did these rcu-way to make common path faster, so better we do it 
> without slow mem-barrier.
If we use rcu for host_eh_scheduled. we should add rcu_head in Scsi_Host,
replace scsi_schedule_eh(ap->scsi_host) with call_rcu in ata_std_sched_eh, sas_queue_reset??
This will trigger a kernel hang or oops because of double or more call_rcu() call.

Any more suggestions?
> 
> On 3/25/19 12:05 PM, zhengbin wrote:
>> When I use fio test kernel in the following steps:
>> 1.The sas controller mixes SAS/SATA disks
>> 2.Use fio test all disks
>> 3.Simultaneous enable/disable/link_reset/hard_reset PHY
>>
>> it will hung in ata_port_wait_eh
>> Call trace:
>>   __switch_to+0xb4/0x1b8
>>   __schedule+0x1e8/0x718
>>   schedule+0x38/0x90
>>   ata_port_wait_eh+0x70/0xf8
>>   sas_ata_wait_eh+0x24/0x30 [libsas]
>>   transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
>>   phy_reset_work+0x20/0x30 [libsas]
>>   process_one_work+0x1e4/0x460
>>   worker_thread+0x40/0x450
>>   kthread+0x12c/0x130
>>   ret_from_fork+0x10/0x18
>>
>> The key code process is like this:
>> scsi_dec_host_busy
>> 	atomic_dec(&shost->host_busy);
>> 	if (unlikely(scsi_host_in_recovery(shost))) {
>> 		spin_lock_irqsave(shost->host_lock, flags);
>> 		...
>> 		scsi_eh_wakeup(shost)
>> 		...
>> 	}
>>
>> scsi_schedule_eh
>> 	spin_lock_irqsave(shost->host_lock, flags);
>> 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>> 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>> 		...
>> 		scsi_eh_wakeup(shost);
>> 	}
>>
>> scsi_eh_wakeup
>> 	if (scsi_host_busy(shost) == shost->host_failed)
>> 		wake_up_process(shost->ehandler);
>>
>> In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
>> function wakes up the SCSI error handler in the following timing:
>>
>> CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
>> LOAD shost_state(!=recovery)
>>                                    scsi_host_set_state(SHOST_RECOVERY)
>>                                    scsi_eh_wakeup(host_busy != host_failed)
>> atomic_dec(&shost->host_busy);
>> if (scsi_host_in_recovery(shost))
>>
>> Add a smp_mb between host_busy and shost_state.
>>
>> Signed-off-by: zhengbin <zhengbin13@huawei.com>
>> ---
>>   drivers/scsi/scsi_error.c | 7 +++++++
>>   drivers/scsi/scsi_lib.c   | 5 +++++
>>   2 files changed, 12 insertions(+)
>>
>> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
>> index 1b8378f..c605a71 100644
>> --- a/drivers/scsi/scsi_error.c
>> +++ b/drivers/scsi/scsi_error.c
>> @@ -88,6 +88,13 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
>>
>>   	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>   	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>> +		/*
>> +		 * We have to order shost_state store above and test of
>> +		 * the host_busy(scsi_eh_wakeup will test it), because
>> +		 * scsi_dec_host_busy accesses these variables without
>> +		 * host_lock.
>> +		 */
>> +		smp_mb();
>>   		shost->host_eh_scheduled++;
>>   		scsi_eh_wakeup(shost);
>>   	}
>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>> index 601b9f1..9094d20 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -337,6 +337,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost)
>>
>>   	rcu_read_lock();
>>   	atomic_dec(&shost->host_busy);
>> +	/*
>> +	 * We have to order host_busy dec above and test of the shost_state
>> +	 * below outside the host_lock.
>> +	 */
>> +	smp_mb();
>>   	if (unlikely(scsi_host_in_recovery(shost))) {
>>   		spin_lock_irqsave(shost->host_lock, flags);
>>   		if (shost->host_failed || shost->host_eh_scheduled)
>> --
>> 2.7.4
>>
>
John Garry March 25, 2019, 12:20 p.m. UTC | #3
On 25/03/2019 11:55, zhengbin (A) wrote:
> On 2019/3/25 18:02, Pavel Tikhomirov wrote:
>> Likely we should do the same for host_eh_scheduled++ as we do for
>> host_failed++ in scsi_eh_inc_host_failed. Just put it in call_rcu. These
>> way rcu_read_lock would be enough: if we don't see RECOVERY state in
>> scsi_dec_host_busy, that means that host_eh_scheduled++ and
>> corresponding scsi_eh_wakeup not yet happened, and they will handle
>> wakeup for us.
>>
>> Bart did these rcu-way to make common path faster, so better we do it
>> without slow mem-barrier.

Out of curiousity, which platform did you observe this on?

Thanks,
John

> If we use rcu for host_eh_scheduled. we should add rcu_head in Scsi_Host,
> replace scsi_schedule_eh(ap->scsi_host) with call_rcu in ata_std_sched_eh, sas_queue_reset??
> This will trigger a kernel hang or oops because of double or more call_rcu() call.
>
> Any more suggestions?
>>
>> On 3/25/19 12:05 PM, zhengbin wrote:
>>> When I use fio test kernel in the following steps:
>>> 1.The sas controller mixes SAS/SATA disks
>>> 2.Use fio test all disks
>>> 3.Simultaneous enable/disable/link_reset/hard_reset PHY
>>>
>>> it will hung in ata_port_wait_eh
>>> Call trace:
>>>   __switch_to+0xb4/0x1b8
>>>   __schedule+0x1e8/0x718
>>>   schedule+0x38/0x90
>>>   ata_port_wait_eh+0x70/0xf8
>>>   sas_ata_wait_eh+0x24/0x30 [libsas]
>>>   transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
>>>   phy_reset_work+0x20/0x30 [libsas]
>>>   process_one_work+0x1e4/0x460
>>>   worker_thread+0x40/0x450
>>>   kthread+0x12c/0x130
>>>   ret_from_fork+0x10/0x18
>>>
>>> The key code process is like this:
>>> scsi_dec_host_busy
>>> 	atomic_dec(&shost->host_busy);
>>> 	if (unlikely(scsi_host_in_recovery(shost))) {
>>> 		spin_lock_irqsave(shost->host_lock, flags);
>>> 		...
>>> 		scsi_eh_wakeup(shost)
>>> 		...
>>> 	}
>>>
>>> scsi_schedule_eh
>>> 	spin_lock_irqsave(shost->host_lock, flags);
>>> 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>> 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>> 		...
>>> 		scsi_eh_wakeup(shost);
>>> 	}
>>>
>>> scsi_eh_wakeup
>>> 	if (scsi_host_busy(shost) == shost->host_failed)
>>> 		wake_up_process(shost->ehandler);
>>>
>>> In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
>>> function wakes up the SCSI error handler in the following timing:
>>>
>>> CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
>>> LOAD shost_state(!=recovery)
>>>                                    scsi_host_set_state(SHOST_RECOVERY)
>>>                                    scsi_eh_wakeup(host_busy != host_failed)
>>> atomic_dec(&shost->host_busy);
>>> if (scsi_host_in_recovery(shost))
>>>
>>> Add a smp_mb between host_busy and shost_state.
>>>
>>> Signed-off-by: zhengbin <zhengbin13@huawei.com>
>>> ---
>>>   drivers/scsi/scsi_error.c | 7 +++++++
>>>   drivers/scsi/scsi_lib.c   | 5 +++++
>>>   2 files changed, 12 insertions(+)
>>>
>>> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
>>> index 1b8378f..c605a71 100644
>>> --- a/drivers/scsi/scsi_error.c
>>> +++ b/drivers/scsi/scsi_error.c
>>> @@ -88,6 +88,13 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
>>>
>>>   	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>   	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>> +		/*
>>> +		 * We have to order shost_state store above and test of
>>> +		 * the host_busy(scsi_eh_wakeup will test it), because
>>> +		 * scsi_dec_host_busy accesses these variables without
>>> +		 * host_lock.
>>> +		 */
>>> +		smp_mb();
>>>   		shost->host_eh_scheduled++;
>>>   		scsi_eh_wakeup(shost);
>>>   	}
>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>> index 601b9f1..9094d20 100644
>>> --- a/drivers/scsi/scsi_lib.c
>>> +++ b/drivers/scsi/scsi_lib.c
>>> @@ -337,6 +337,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost)
>>>
>>>   	rcu_read_lock();
>>>   	atomic_dec(&shost->host_busy);
>>> +	/*
>>> +	 * We have to order host_busy dec above and test of the shost_state
>>> +	 * below outside the host_lock.
>>> +	 */
>>> +	smp_mb();
>>>   	if (unlikely(scsi_host_in_recovery(shost))) {
>>>   		spin_lock_irqsave(shost->host_lock, flags);
>>>   		if (shost->host_failed || shost->host_eh_scheduled)
>>> --
>>> 2.7.4
>>>
>>
>
>
> .
>
Zheng Bin March 25, 2019, 12:49 p.m. UTC | #4
On 2019/3/25 20:20, John Garry wrote:
> On 25/03/2019 11:55, zhengbin (A) wrote:
>> On 2019/3/25 18:02, Pavel Tikhomirov wrote:
>>> Likely we should do the same for host_eh_scheduled++ as we do for
>>> host_failed++ in scsi_eh_inc_host_failed. Just put it in call_rcu. These
>>> way rcu_read_lock would be enough: if we don't see RECOVERY state in
>>> scsi_dec_host_busy, that means that host_eh_scheduled++ and
>>> corresponding scsi_eh_wakeup not yet happened, and they will handle
>>> wakeup for us.
>>>
>>> Bart did these rcu-way to make common path faster, so better we do it
>>> without slow mem-barrier.
> 
> Out of curiousity, which platform did you observe this on?
ARM64
> 
> Thanks,
> John
> 
>> If we use rcu for host_eh_scheduled. we should add rcu_head in Scsi_Host,
>> replace scsi_schedule_eh(ap->scsi_host) with call_rcu in ata_std_sched_eh, sas_queue_reset??
>> This will trigger a kernel hang or oops because of double or more call_rcu() call.
>>
>> Any more suggestions?
>>>
>>> On 3/25/19 12:05 PM, zhengbin wrote:
>>>> When I use fio test kernel in the following steps:
>>>> 1.The sas controller mixes SAS/SATA disks
>>>> 2.Use fio test all disks
>>>> 3.Simultaneous enable/disable/link_reset/hard_reset PHY
>>>>
>>>> it will hung in ata_port_wait_eh
>>>> Call trace:
>>>>   __switch_to+0xb4/0x1b8
>>>>   __schedule+0x1e8/0x718
>>>>   schedule+0x38/0x90
>>>>   ata_port_wait_eh+0x70/0xf8
>>>>   sas_ata_wait_eh+0x24/0x30 [libsas]
>>>>   transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
>>>>   phy_reset_work+0x20/0x30 [libsas]
>>>>   process_one_work+0x1e4/0x460
>>>>   worker_thread+0x40/0x450
>>>>   kthread+0x12c/0x130
>>>>   ret_from_fork+0x10/0x18
>>>>
>>>> The key code process is like this:
>>>> scsi_dec_host_busy
>>>>     atomic_dec(&shost->host_busy);
>>>>     if (unlikely(scsi_host_in_recovery(shost))) {
>>>>         spin_lock_irqsave(shost->host_lock, flags);
>>>>         ...
>>>>         scsi_eh_wakeup(shost)
>>>>         ...
>>>>     }
>>>>
>>>> scsi_schedule_eh
>>>>     spin_lock_irqsave(shost->host_lock, flags);
>>>>     if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>>         scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>>>         ...
>>>>         scsi_eh_wakeup(shost);
>>>>     }
>>>>
>>>> scsi_eh_wakeup
>>>>     if (scsi_host_busy(shost) == shost->host_failed)
>>>>         wake_up_process(shost->ehandler);
>>>>
>>>> In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
>>>> function wakes up the SCSI error handler in the following timing:
>>>>
>>>> CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
>>>> LOAD shost_state(!=recovery)
>>>>                                    scsi_host_set_state(SHOST_RECOVERY)
>>>>                                    scsi_eh_wakeup(host_busy != host_failed)
>>>> atomic_dec(&shost->host_busy);
>>>> if (scsi_host_in_recovery(shost))
>>>>
>>>> Add a smp_mb between host_busy and shost_state.
>>>>
>>>> Signed-off-by: zhengbin <zhengbin13@huawei.com>
>>>> ---
>>>>   drivers/scsi/scsi_error.c | 7 +++++++
>>>>   drivers/scsi/scsi_lib.c   | 5 +++++
>>>>   2 files changed, 12 insertions(+)
>>>>
>>>> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
>>>> index 1b8378f..c605a71 100644
>>>> --- a/drivers/scsi/scsi_error.c
>>>> +++ b/drivers/scsi/scsi_error.c
>>>> @@ -88,6 +88,13 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
>>>>
>>>>       if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>>           scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>>> +        /*
>>>> +         * We have to order shost_state store above and test of
>>>> +         * the host_busy(scsi_eh_wakeup will test it), because
>>>> +         * scsi_dec_host_busy accesses these variables without
>>>> +         * host_lock.
>>>> +         */
>>>> +        smp_mb();
>>>>           shost->host_eh_scheduled++;
>>>>           scsi_eh_wakeup(shost);
>>>>       }
>>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>>> index 601b9f1..9094d20 100644
>>>> --- a/drivers/scsi/scsi_lib.c
>>>> +++ b/drivers/scsi/scsi_lib.c
>>>> @@ -337,6 +337,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost)
>>>>
>>>>       rcu_read_lock();
>>>>       atomic_dec(&shost->host_busy);
>>>> +    /*
>>>> +     * We have to order host_busy dec above and test of the shost_state
>>>> +     * below outside the host_lock.
>>>> +     */
>>>> +    smp_mb();
>>>>       if (unlikely(scsi_host_in_recovery(shost))) {
>>>>           spin_lock_irqsave(shost->host_lock, flags);
>>>>           if (shost->host_failed || shost->host_eh_scheduled)
>>>> -- 
>>>> 2.7.4
>>>>
>>>
>>
>>
>> .
>>
> 
> 
> 
> .
>
Zheng Bin March 26, 2019, 1:29 p.m. UTC | #5
On 2019/3/25 19:55, zhengbin (A) wrote:
> On 2019/3/25 18:02, Pavel Tikhomirov wrote:
>> Likely we should do the same for host_eh_scheduled++ as we do for 
>> host_failed++ in scsi_eh_inc_host_failed. Just put it in call_rcu. These 
>> way rcu_read_lock would be enough: if we don't see RECOVERY state in 
>> scsi_dec_host_busy, that means that host_eh_scheduled++ and 
>> corresponding scsi_eh_wakeup not yet happened, and they will handle 
>> wakeup for us.
>>
>> Bart did these rcu-way to make common path faster, so better we do it 
>> without slow mem-barrier.
Bart did these rcu-way, and protecting the host_failed checks with the SCSI host lock.
If we use rcu-way, maybe we need to put host_busy & shost_state in host_lock too.
I think we should use smp_mb().

Looking forward to reply, thanks.
> If we use rcu for host_eh_scheduled. we should add rcu_head in Scsi_Host,
> replace scsi_schedule_eh(ap->scsi_host) with call_rcu in ata_std_sched_eh, sas_queue_reset??
> This will trigger a kernel hang or oops because of double or more call_rcu() call.
> 
> Any more suggestions?>>
>> On 3/25/19 12:05 PM, zhengbin wrote:
>>> When I use fio test kernel in the following steps:
>>> 1.The sas controller mixes SAS/SATA disks
>>> 2.Use fio test all disks
>>> 3.Simultaneous enable/disable/link_reset/hard_reset PHY
>>>
>>> it will hung in ata_port_wait_eh
>>> Call trace:
>>>   __switch_to+0xb4/0x1b8
>>>   __schedule+0x1e8/0x718
>>>   schedule+0x38/0x90
>>>   ata_port_wait_eh+0x70/0xf8
>>>   sas_ata_wait_eh+0x24/0x30 [libsas]
>>>   transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
>>>   phy_reset_work+0x20/0x30 [libsas]
>>>   process_one_work+0x1e4/0x460
>>>   worker_thread+0x40/0x450
>>>   kthread+0x12c/0x130
>>>   ret_from_fork+0x10/0x18
>>>
>>> The key code process is like this:
>>> scsi_dec_host_busy
>>> 	atomic_dec(&shost->host_busy);
>>> 	if (unlikely(scsi_host_in_recovery(shost))) {
>>> 		spin_lock_irqsave(shost->host_lock, flags);
>>> 		...
>>> 		scsi_eh_wakeup(shost)
>>> 		...
>>> 	}
>>>
>>> scsi_schedule_eh
>>> 	spin_lock_irqsave(shost->host_lock, flags);
>>> 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>> 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>> 		...
>>> 		scsi_eh_wakeup(shost);
>>> 	}
>>>
>>> scsi_eh_wakeup
>>> 	if (scsi_host_busy(shost) == shost->host_failed)
>>> 		wake_up_process(shost->ehandler);
>>>
>>> In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
>>> function wakes up the SCSI error handler in the following timing:
>>>
>>> CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
>>> LOAD shost_state(!=recovery)
>>>                                    scsi_host_set_state(SHOST_RECOVERY)
>>>                                    scsi_eh_wakeup(host_busy != host_failed)
>>> atomic_dec(&shost->host_busy);
>>> if (scsi_host_in_recovery(shost))
>>>
>>> Add a smp_mb between host_busy and shost_state.
>>>
>>> Signed-off-by: zhengbin <zhengbin13@huawei.com>
>>> ---
>>>   drivers/scsi/scsi_error.c | 7 +++++++
>>>   drivers/scsi/scsi_lib.c   | 5 +++++
>>>   2 files changed, 12 insertions(+)
>>>
>>> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
>>> index 1b8378f..c605a71 100644
>>> --- a/drivers/scsi/scsi_error.c
>>> +++ b/drivers/scsi/scsi_error.c
>>> @@ -88,6 +88,13 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
>>>
>>>   	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>   	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>> +		/*
>>> +		 * We have to order shost_state store above and test of
>>> +		 * the host_busy(scsi_eh_wakeup will test it), because
>>> +		 * scsi_dec_host_busy accesses these variables without
>>> +		 * host_lock.
>>> +		 */
>>> +		smp_mb();
>>>   		shost->host_eh_scheduled++;
>>>   		scsi_eh_wakeup(shost);
>>>   	}
>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>> index 601b9f1..9094d20 100644
>>> --- a/drivers/scsi/scsi_lib.c
>>> +++ b/drivers/scsi/scsi_lib.c
>>> @@ -337,6 +337,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost)
>>>
>>>   	rcu_read_lock();
>>>   	atomic_dec(&shost->host_busy);
>>> +	/*
>>> +	 * We have to order host_busy dec above and test of the shost_state
>>> +	 * below outside the host_lock.
>>> +	 */
>>> +	smp_mb();
>>>   	if (unlikely(scsi_host_in_recovery(shost))) {
>>>   		spin_lock_irqsave(shost->host_lock, flags);
>>>   		if (shost->host_failed || shost->host_eh_scheduled)
>>> --
>>> 2.7.4
>>>
>>
Zheng Bin March 30, 2019, 7:38 a.m. UTC | #6
ping

On 2019/3/26 21:29, zhengbin (A) wrote:
> On 2019/3/25 19:55, zhengbin (A) wrote:
>> On 2019/3/25 18:02, Pavel Tikhomirov wrote:
>>> Likely we should do the same for host_eh_scheduled++ as we do for 
>>> host_failed++ in scsi_eh_inc_host_failed. Just put it in call_rcu. These 
>>> way rcu_read_lock would be enough: if we don't see RECOVERY state in 
>>> scsi_dec_host_busy, that means that host_eh_scheduled++ and 
>>> corresponding scsi_eh_wakeup not yet happened, and they will handle 
>>> wakeup for us.
>>>
>>> Bart did these rcu-way to make common path faster, so better we do it 
>>> without slow mem-barrier.
> Bart did these rcu-way, and protecting the host_failed checks with the SCSI host lock.
> If we use rcu-way, maybe we need to put host_busy & shost_state in host_lock too.
> I think we should use smp_mb().
> 
> Looking forward to reply, thanks.
>> If we use rcu for host_eh_scheduled. we should add rcu_head in Scsi_Host,
>> replace scsi_schedule_eh(ap->scsi_host) with call_rcu in ata_std_sched_eh, sas_queue_reset??
>> This will trigger a kernel hang or oops because of double or more call_rcu() call.
>>
>> Any more suggestions?>>
>>> On 3/25/19 12:05 PM, zhengbin wrote:
>>>> When I use fio test kernel in the following steps:
>>>> 1.The sas controller mixes SAS/SATA disks
>>>> 2.Use fio test all disks
>>>> 3.Simultaneous enable/disable/link_reset/hard_reset PHY
>>>>
>>>> it will hung in ata_port_wait_eh
>>>> Call trace:
>>>>   __switch_to+0xb4/0x1b8
>>>>   __schedule+0x1e8/0x718
>>>>   schedule+0x38/0x90
>>>>   ata_port_wait_eh+0x70/0xf8
>>>>   sas_ata_wait_eh+0x24/0x30 [libsas]
>>>>   transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
>>>>   phy_reset_work+0x20/0x30 [libsas]
>>>>   process_one_work+0x1e4/0x460
>>>>   worker_thread+0x40/0x450
>>>>   kthread+0x12c/0x130
>>>>   ret_from_fork+0x10/0x18
>>>>
>>>> The key code process is like this:
>>>> scsi_dec_host_busy
>>>> 	atomic_dec(&shost->host_busy);
>>>> 	if (unlikely(scsi_host_in_recovery(shost))) {
>>>> 		spin_lock_irqsave(shost->host_lock, flags);
>>>> 		...
>>>> 		scsi_eh_wakeup(shost)
>>>> 		...
>>>> 	}
>>>>
>>>> scsi_schedule_eh
>>>> 	spin_lock_irqsave(shost->host_lock, flags);
>>>> 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>> 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>>> 		...
>>>> 		scsi_eh_wakeup(shost);
>>>> 	}
>>>>
>>>> scsi_eh_wakeup
>>>> 	if (scsi_host_busy(shost) == shost->host_failed)
>>>> 		wake_up_process(shost->ehandler);
>>>>
>>>> In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
>>>> function wakes up the SCSI error handler in the following timing:
>>>>
>>>> CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
>>>> LOAD shost_state(!=recovery)
>>>>                                    scsi_host_set_state(SHOST_RECOVERY)
>>>>                                    scsi_eh_wakeup(host_busy != host_failed)
>>>> atomic_dec(&shost->host_busy);
>>>> if (scsi_host_in_recovery(shost))
>>>>
>>>> Add a smp_mb between host_busy and shost_state.
>>>>
>>>> Signed-off-by: zhengbin <zhengbin13@huawei.com>
>>>> ---
>>>>   drivers/scsi/scsi_error.c | 7 +++++++
>>>>   drivers/scsi/scsi_lib.c   | 5 +++++
>>>>   2 files changed, 12 insertions(+)
>>>>
>>>> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
>>>> index 1b8378f..c605a71 100644
>>>> --- a/drivers/scsi/scsi_error.c
>>>> +++ b/drivers/scsi/scsi_error.c
>>>> @@ -88,6 +88,13 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
>>>>
>>>>   	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>>   	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>>> +		/*
>>>> +		 * We have to order shost_state store above and test of
>>>> +		 * the host_busy(scsi_eh_wakeup will test it), because
>>>> +		 * scsi_dec_host_busy accesses these variables without
>>>> +		 * host_lock.
>>>> +		 */
>>>> +		smp_mb();
>>>>   		shost->host_eh_scheduled++;
>>>>   		scsi_eh_wakeup(shost);
>>>>   	}
>>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>>> index 601b9f1..9094d20 100644
>>>> --- a/drivers/scsi/scsi_lib.c
>>>> +++ b/drivers/scsi/scsi_lib.c
>>>> @@ -337,6 +337,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost)
>>>>
>>>>   	rcu_read_lock();
>>>>   	atomic_dec(&shost->host_busy);
>>>> +	/*
>>>> +	 * We have to order host_busy dec above and test of the shost_state
>>>> +	 * below outside the host_lock.
>>>> +	 */
>>>> +	smp_mb();
>>>>   	if (unlikely(scsi_host_in_recovery(shost))) {
>>>>   		spin_lock_irqsave(shost->host_lock, flags);
>>>>   		if (shost->host_failed || shost->host_eh_scheduled)
>>>> --
>>>> 2.7.4
>>>>
>>>
Pavel Tikhomirov April 1, 2019, 9:49 a.m. UTC | #7
On 3/30/19 10:38 AM, zhengbin (A) wrote:
> ping
> 
> On 2019/3/26 21:29, zhengbin (A) wrote:
>> On 2019/3/25 19:55, zhengbin (A) wrote:
>>> On 2019/3/25 18:02, Pavel Tikhomirov wrote:
>>>> Likely we should do the same for host_eh_scheduled++ as we do for
>>>> host_failed++ in scsi_eh_inc_host_failed. Just put it in call_rcu. These
>>>> way rcu_read_lock would be enough: if we don't see RECOVERY state in
>>>> scsi_dec_host_busy, that means that host_eh_scheduled++ and
>>>> corresponding scsi_eh_wakeup not yet happened, and they will handle
>>>> wakeup for us.
>>>>
>>>> Bart did these rcu-way to make common path faster, so better we do it
>>>> without slow mem-barrier.
>> Bart did these rcu-way, and protecting the host_failed checks with the SCSI host lock.
>> If we use rcu-way, maybe we need to put host_busy & shost_state in host_lock too.
>> I think we should use smp_mb().

I'm not staying against using barrier(or spinlock implicit one). 
Actually I still use my old 'slow' spinlock fix because it is much simpler.

>>
>> Looking forward to reply, thanks.
>>> If we use rcu for host_eh_scheduled. we should add rcu_head in Scsi_Host,
>>> replace scsi_schedule_eh(ap->scsi_host) with call_rcu in ata_std_sched_eh, sas_queue_reset??
>>> This will trigger a kernel hang or oops because of double or more call_rcu() call.

I'm not an expert in rcu but if double rcu warning is a protection from 
leaking resources like double free, and as we don't free anything from 
our rcu callback, double calling it might be fine.

>>>
>>> Any more suggestions?>>
>>>> On 3/25/19 12:05 PM, zhengbin wrote:
>>>>> When I use fio test kernel in the following steps:
>>>>> 1.The sas controller mixes SAS/SATA disks
>>>>> 2.Use fio test all disks
>>>>> 3.Simultaneous enable/disable/link_reset/hard_reset PHY
>>>>>
>>>>> it will hung in ata_port_wait_eh
>>>>> Call trace:
>>>>>    __switch_to+0xb4/0x1b8
>>>>>    __schedule+0x1e8/0x718
>>>>>    schedule+0x38/0x90
>>>>>    ata_port_wait_eh+0x70/0xf8
>>>>>    sas_ata_wait_eh+0x24/0x30 [libsas]
>>>>>    transport_sas_phy_reset.isra.3+0x128/0x160 [libsas]
>>>>>    phy_reset_work+0x20/0x30 [libsas]
>>>>>    process_one_work+0x1e4/0x460
>>>>>    worker_thread+0x40/0x450
>>>>>    kthread+0x12c/0x130
>>>>>    ret_from_fork+0x10/0x18
>>>>>
>>>>> The key code process is like this:
>>>>> scsi_dec_host_busy
>>>>> 	atomic_dec(&shost->host_busy);
>>>>> 	if (unlikely(scsi_host_in_recovery(shost))) {
>>>>> 		spin_lock_irqsave(shost->host_lock, flags);
>>>>> 		...
>>>>> 		scsi_eh_wakeup(shost)
>>>>> 		...
>>>>> 	}
>>>>>
>>>>> scsi_schedule_eh
>>>>> 	spin_lock_irqsave(shost->host_lock, flags);
>>>>> 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>>> 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>>>> 		...
>>>>> 		scsi_eh_wakeup(shost);
>>>>> 	}
>>>>>
>>>>> scsi_eh_wakeup
>>>>> 	if (scsi_host_busy(shost) == shost->host_failed)
>>>>> 		wake_up_process(shost->ehandler);
>>>>>
>>>>> In scsi_dec_host_busy, host_busy & shost_state not in spinlock. Neither
>>>>> function wakes up the SCSI error handler in the following timing:
>>>>>
>>>>> CPU 0(call scsi_dec_host_busy)    CPU 1(call scsi_schedule_eh)
>>>>> LOAD shost_state(!=recovery)
>>>>>                                     scsi_host_set_state(SHOST_RECOVERY)
>>>>>                                     scsi_eh_wakeup(host_busy != host_failed)
>>>>> atomic_dec(&shost->host_busy);
>>>>> if (scsi_host_in_recovery(shost))
>>>>>
>>>>> Add a smp_mb between host_busy and shost_state.
>>>>>
>>>>> Signed-off-by: zhengbin <zhengbin13@huawei.com>
>>>>> ---
>>>>>    drivers/scsi/scsi_error.c | 7 +++++++
>>>>>    drivers/scsi/scsi_lib.c   | 5 +++++
>>>>>    2 files changed, 12 insertions(+)
>>>>>
>>>>> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
>>>>> index 1b8378f..c605a71 100644
>>>>> --- a/drivers/scsi/scsi_error.c
>>>>> +++ b/drivers/scsi/scsi_error.c
>>>>> @@ -88,6 +88,13 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
>>>>>
>>>>>    	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
>>>>>    	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
>>>>> +		/*
>>>>> +		 * We have to order shost_state store above and test of
>>>>> +		 * the host_busy(scsi_eh_wakeup will test it), because
>>>>> +		 * scsi_dec_host_busy accesses these variables without
>>>>> +		 * host_lock.
>>>>> +		 */
>>>>> +		smp_mb();
>>>>>    		shost->host_eh_scheduled++;
>>>>>    		scsi_eh_wakeup(shost);
>>>>>    	}
>>>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>>>> index 601b9f1..9094d20 100644
>>>>> --- a/drivers/scsi/scsi_lib.c
>>>>> +++ b/drivers/scsi/scsi_lib.c
>>>>> @@ -337,6 +337,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost)
>>>>>
>>>>>    	rcu_read_lock();
>>>>>    	atomic_dec(&shost->host_busy);
>>>>> +	/*
>>>>> +	 * We have to order host_busy dec above and test of the shost_state
>>>>> +	 * below outside the host_lock.
>>>>> +	 */
>>>>> +	smp_mb();
>>>>>    	if (unlikely(scsi_host_in_recovery(shost))) {
>>>>>    		spin_lock_irqsave(shost->host_lock, flags);
>>>>>    		if (shost->host_failed || shost->host_eh_scheduled)
>>>>> --
>>>>> 2.7.4
>>>>>
>>>>
>
diff mbox series

Patch

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 1b8378f..c605a71 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -88,6 +88,13 @@  void scsi_schedule_eh(struct Scsi_Host *shost)

 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
+		/*
+		 * We have to order shost_state store above and test of
+		 * the host_busy(scsi_eh_wakeup will test it), because
+		 * scsi_dec_host_busy accesses these variables without
+		 * host_lock.
+		 */
+		smp_mb();
 		shost->host_eh_scheduled++;
 		scsi_eh_wakeup(shost);
 	}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 601b9f1..9094d20 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -337,6 +337,11 @@  static void scsi_dec_host_busy(struct Scsi_Host *shost)

 	rcu_read_lock();
 	atomic_dec(&shost->host_busy);
+	/*
+	 * We have to order host_busy dec above and test of the shost_state
+	 * below outside the host_lock.
+	 */
+	smp_mb();
 	if (unlikely(scsi_host_in_recovery(shost))) {
 		spin_lock_irqsave(shost->host_lock, flags);
 		if (shost->host_failed || shost->host_eh_scheduled)