
[RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING

Message ID 20231101230214.57190-1-junxiao.bi@oracle.com (mailing list archive)
State Superseded, archived
Delegated to: Song Liu
Series [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING

Commit Message

Junxiao Bi Nov. 1, 2023, 11:02 p.m. UTC
Looks like there is a race between md_write_start() and raid5d() which
can cause a deadlock. We ran into this issue while raid_check was running.

 md_write_start:                                                                        raid5d:
 if (mddev->safemode == 1)
     mddev->safemode = 0;
 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
 if (mddev->in_sync || mddev->sync_checkers) {
     spin_lock(&mddev->lock);
     if (mddev->in_sync) {
         mddev->in_sync = 0;
         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
         set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
                                                                                        >>> running before md_write_start wake up it
                                                                                        if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
                                                                                              spin_unlock_irq(&conf->device_lock);
                                                                                              md_check_recovery(mddev);
                                                                                              spin_lock_irq(&conf->device_lock);

                                                                                              /*
                                                                                               * Waiting on MD_SB_CHANGE_PENDING below may deadlock
                                                                                               * seeing md_check_recovery() is needed to clear
                                                                                               * the flag when using mdmon.
                                                                                               */
                                                                                              continue;
                                                                                         }

                                                                                         wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
                                                                                             !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
                                                                                             conf->device_lock);
         md_wakeup_thread(mddev->thread);
         did_change = 1;
     }
     spin_unlock(&mddev->lock);
 }

 ...

 wait_event(mddev->sb_wait,    >>>>>>>>>> hung
    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
    mddev->suspended);

Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
introduced this issue. Since its intent was to force a reschedule so the
blk_plug gets flushed, do that flush explicitly instead to address the issue.

Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
---
 block/blk-core.c   | 1 +
 drivers/md/raid5.c | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)
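
As an aside, the shape of this hang can be reproduced outside the kernel. The
following is a minimal user-space analogue in plain C with pthreads -- the
names sb_change_pending, writer and daemon_thread are illustrative stand-ins,
not kernel APIs: a writer raises a flag and then sleeps until it is cleared,
while the daemon that is supposed to clear it sleeps on the same condition,
so both stay blocked, mirroring the md0_raid5 and kworker hangs.

/* Build with: gcc -pthread deadlock_sketch.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative stand-ins for mddev->sb_flags and mddev->sb_wait. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sb_wait = PTHREAD_COND_INITIALIZER;
static bool sb_change_pending;

/* Analogue of md_write_start(): raise the flag, "wake" the daemon, then
 * sleep until the flag is cleared -- which never happens. */
static void *writer(void *arg)
{
	pthread_mutex_lock(&lock);
	sb_change_pending = true;          /* set_bit(MD_SB_CHANGE_PENDING, ...) */
	pthread_cond_broadcast(&sb_wait);  /* md_wakeup_thread(mddev->thread)    */
	while (sb_change_pending)          /* wait_event(mddev->sb_wait, ...)    */
		pthread_cond_wait(&sb_wait, &lock);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Analogue of raid5d() after 5e2cf333b7bd: once work is pending, it waits
 * for someone else to clear the flag instead of clearing it itself. */
static void *daemon_thread(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!sb_change_pending)         /* idle until there is sb work */
		pthread_cond_wait(&sb_wait, &lock);
	while (sb_change_pending)          /* wait_event_lock_irq(...) -- hangs */
		pthread_cond_wait(&sb_wait, &lock);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t d, w;

	pthread_create(&d, NULL, daemon_thread, NULL);
	pthread_create(&w, NULL, writer, NULL);
	sleep(2);
	/* Both threads are still blocked: nobody clears sb_change_pending. */
	puts("writer and daemon are both stuck waiting on sb_change_pending");
	return 0;
}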

Comments

Yu Kuai Nov. 2, 2023, 1:24 a.m. UTC | #1
Hi,

On 2023/11/02 7:02, Junxiao Bi wrote:
> Looks like there is a race between md_write_start() and raid5d() which

Is this a real issue or just based on code review?
> can cause deadlock. Run into this issue while raid_check is running.
> 
>   md_write_start:                                                                        raid5d:
>   if (mddev->safemode == 1)
>       mddev->safemode = 0;
>   /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>   if (mddev->in_sync || mddev->sync_checkers) {
>       spin_lock(&mddev->lock);
>       if (mddev->in_sync) {
>           mddev->in_sync = 0;
>           set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>           set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>                                                                                          >>> running before md_write_start wake up it
>                                                                                          if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>                                                                                                spin_unlock_irq(&conf->device_lock);
>                                                                                                md_check_recovery(mddev);
>                                                                                                spin_lock_irq(&conf->device_lock);
> 
>                                                                                                /*
>                                                                                                 * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>                                                                                                 * seeing md_check_recovery() is needed to clear
>                                                                                                 * the flag when using mdmon.
>                                                                                                 */
>                                                                                                continue;
>                                                                                           }
> 
>                                                                                           wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
>                                                                                               !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>                                                                                               conf->device_lock);
>           md_wakeup_thread(mddev->thread);
>           did_change = 1;
>       }
>       spin_unlock(&mddev->lock);
>   }
> 
>   ...
> 
>   wait_event(mddev->sb_wait,    >>>>>>>>>> hung
>      !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>      mddev->suspended);
> 

This is not correct: if the daemon thread is running, md_wakeup_thread()
will make sure that the daemon thread runs again; see how THREAD_WAKEUP
works in md_thread() for the details.
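
For context, a simplified paraphrase of the mechanism referred to here
(condensed from drivers/md/md.c; signal handling and thread parking are
elided, so this is not the verbatim source): md_wakeup_thread() sets
THREAD_WAKEUP before waking the waitqueue, so a wakeup that arrives while
run() is executing is not lost -- the next wait returns immediately and
run() is invoked again.

/* Simplified paraphrase, not verbatim kernel code. */
void md_wakeup_thread(struct md_thread *thread)
{
	if (thread) {
		set_bit(THREAD_WAKEUP, &thread->flags);  /* set before wake_up() */
		wake_up(&thread->wqueue);
	}
}

static int md_thread(void *arg)
{
	struct md_thread *thread = arg;

	while (!kthread_should_stop()) {
		/* If THREAD_WAKEUP was set while run() was executing, this
		 * wait returns immediately and run() is called again. */
		wait_event_interruptible_timeout(thread->wqueue,
			test_bit(THREAD_WAKEUP, &thread->flags) ||
			kthread_should_stop(),
			thread->timeout);

		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (!kthread_should_stop())
			thread->run(thread);
	}
	return 0;
}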

Thanks,
Kuai

> commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
> introduced this issue, since it want to a reschedule for flushing blk_plug,
> let do it explicitly to address that issue.
> 
> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> ---
>   block/blk-core.c   | 1 +
>   drivers/md/raid5.c | 9 +++++----
>   2 files changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 9d51e9894ece..bc8757a78477 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
>   	if (unlikely(!rq_list_empty(plug->cached_rq)))
>   		blk_mq_free_plug_rqs(plug);
>   }
> +EXPORT_SYMBOL(__blk_flush_plug);
>   
>   /**
>    * blk_finish_plug - mark the end of a batch of submitted I/O
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 284cd71bcc68..25ec82f2813f 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>   			 * the flag when using mdmon.
>   			 */
>   			continue;
> +		} else {
> +			spin_unlock_irq(&conf->device_lock);
> +			blk_flush_plug(current);
> +			cond_resched();
> +			spin_lock_irq(&conf->device_lock);
>   		}
> -
> -		wait_event_lock_irq(mddev->sb_wait,
> -			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
> -			conf->device_lock);
>   	}
>   	pr_debug("%d stripes handled\n", handled);
>   
>
Junxiao Bi Nov. 2, 2023, 4:32 a.m. UTC | #2
On 11/1/23 6:24 PM, Yu Kuai wrote:

> Hi,
>
> On 2023/11/02 7:02, Junxiao Bi wrote:
>> Looks like there is a race between md_write_start() and raid5d() which
>
> Is this a real issue or just based on code review?

It's a real issue; we saw this hang in a production system. It was on
v5.4, but I didn't see much difference in these functions.

crash> bt 2683
PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
  #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
  #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
  #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
  #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
  #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
  #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
crash> ps -m 2683
[ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
crash>
crash> bt 96352
PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
  #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
  #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
  #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
  #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
  #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
  #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
  #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
  #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
  #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
  #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
crash> ps -m 96352
[ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
crash>
crash> bt 25542
PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
  #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
  #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
  #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
  #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
  #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
  #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
crash>
crash> ps -m 25542
[ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"


>> can cause deadlock. Run into this issue while raid_check is running.
>>
>> md_write_start: raid5d:
>>   if (mddev->safemode == 1)
>>       mddev->safemode = 0;
>>   /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>>   if (mddev->in_sync || mddev->sync_checkers) {
>>       spin_lock(&mddev->lock);
>>       if (mddev->in_sync) {
>>           mddev->in_sync = 0;
>>           set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>           set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>> >>> running before md_write_start wake up it
>> if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>> spin_unlock_irq(&conf->device_lock);
>> md_check_recovery(mddev);
>> spin_lock_irq(&conf->device_lock);
>>
>> /*
>> * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>> * seeing md_check_recovery() is needed to clear
>> * the flag when using mdmon.
>> */
>> continue;
>> }
>>
>> wait_event_lock_irq(mddev->sb_wait, >>>>>>>>>>> hung
>> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>> conf->device_lock);
>>           md_wakeup_thread(mddev->thread);
>>           did_change = 1;
>>       }
>>       spin_unlock(&mddev->lock);
>>   }
>>
>>   ...
>>
>>   wait_event(mddev->sb_wait, >>>>>>>>>> hung
>>      !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>      mddev->suspended);
>>
>
> This is not correct, if daemon thread is running, md_wakeup_thread()
> will make sure that daemon thread will run again, see details how
> THREAD_WAKEUP worked in md_thread().

The daemon thread was waiting for MD_SB_CHANGE_PENDING to be cleared;
even if it is woken up, it will hang again since that flag is still not
cleared, won't it?

Thanks,

Junxiao.

>
> Thanks,
> Kuai
>
>> commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>> raid5d")
>> introduced this issue, since it want to a reschedule for flushing 
>> blk_plug,
>> let do it explicitly to address that issue.
>>
>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>> raid5d")
>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>> ---
>>   block/blk-core.c   | 1 +
>>   drivers/md/raid5.c | 9 +++++----
>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 9d51e9894ece..bc8757a78477 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, 
>> bool from_schedule)
>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>           blk_mq_free_plug_rqs(plug);
>>   }
>> +EXPORT_SYMBOL(__blk_flush_plug);
>>     /**
>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 284cd71bcc68..25ec82f2813f 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>                * the flag when using mdmon.
>>                */
>>               continue;
>> +        } else {
>> +            spin_unlock_irq(&conf->device_lock);
>> +            blk_flush_plug(current);
>> +            cond_resched();
>> +            spin_lock_irq(&conf->device_lock);
>>           }
>> -
>> -        wait_event_lock_irq(mddev->sb_wait,
>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>> -            conf->device_lock);
>>       }
>>       pr_debug("%d stripes handled\n", handled);
>>
>
Yu Kuai Nov. 2, 2023, 7:16 a.m. UTC | #3
Hi,

On 2023/11/02 12:32, junxiao.bi@oracle.com wrote:
> On 11/1/23 6:24 PM, Yu Kuai wrote:
> 
>> Hi,
>>
>> On 2023/11/02 7:02, Junxiao Bi wrote:
>>> Looks like there is a race between md_write_start() and raid5d() which
>>
>> Is this a real issue or just based on code review?
> 
> It's a real issue, we see this hung in a production system, it's with 
> v5.4, but i didn't see these function has much difference.
> 
> crash> bt 2683
> PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>   #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
>   #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
>   #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
>   #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
>   #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
>   #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
> crash> ps -m 2683
> [ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65 
> COMMAND: "md0_raid5"
> crash>
> crash> bt 96352
> PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>   #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
>   #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
>   #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
>   #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
>   #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
>   #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
>   #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
>   #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
>   #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
>   #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
> crash> ps -m 96352
> [ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00  CPU: 64 
> COMMAND: "kworker/64:0"
> crash>
> crash> bt 25542
> PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>   #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
>   #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
>   #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
>   #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
>   #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
>   #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
> crash>
> crash> ps -m 25542
> [ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000  CPU: 32 
> COMMAND: "md0_resync"
> 
> 
>>> can cause deadlock. Run into this issue while raid_check is running.
>>>
>>> md_write_start: raid5d:
>>>   if (mddev->safemode == 1)
>>>       mddev->safemode = 0;
>>>   /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>>>   if (mddev->in_sync || mddev->sync_checkers) {
>>>       spin_lock(&mddev->lock);
>>>       if (mddev->in_sync) {
>>>           mddev->in_sync = 0;
>>>           set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>>           set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>>> >>> running before md_write_start wake up it
>>> if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>>> spin_unlock_irq(&conf->device_lock);
>>> md_check_recovery(mddev);
>>> spin_lock_irq(&conf->device_lock);
>>>
>>> /*
>>> * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>>> * seeing md_check_recovery() is needed to clear
>>> * the flag when using mdmon.
>>> */
>>> continue;
>>> }
>>>
>>> wait_event_lock_irq(mddev->sb_wait, >>>>>>>>>>> hung
>>> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>> conf->device_lock);
>>>           md_wakeup_thread(mddev->thread);
>>>           did_change = 1;
>>>       }
>>>       spin_unlock(&mddev->lock);
>>>   }
>>>
>>>   ...
>>>
>>>   wait_event(mddev->sb_wait, >>>>>>>>>> hung
>>>      !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>>      mddev->suspended);
>>>
>>
>> This is not correct, if daemon thread is running, md_wakeup_thread()
>> will make sure that daemon thread will run again, see details how
>> THREAD_WAKEUP worked in md_thread().
> 
> The daemon thread was waiting MD_SB_CHANGE_PENDING to be cleared, even 
> wake up it, it will hung again as that flag is still not cleared?

I agree that the daemon thread should not use wait_event(); however,
taking a look at 5e2cf333b7bd, I think this is a common issue for all
personalities, and the better fix is to let the bio submitted from
md_super_write() bypass wbt. This is reasonable because wbt is used to
throttle background writeback I/O, and writing the superblock should not
be throttled by wbt.
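
For context, the wbt decision being referred to looks roughly like the
following (a simplified sketch of block/blk-wbt.c written from memory, not
the verbatim source): writes that carry both REQ_SYNC and REQ_IDLE are not
treated as background writeback and bypass throttling, which is why adding
REQ_IDLE to the superblock bio keeps it out of wbt.

/* Simplified sketch of the wbt throttling decision, not verbatim. */
static bool wbt_should_throttle(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		/* Writes marked both REQ_SYNC and REQ_IDLE are not counted
		 * as background writeback and bypass wbt throttling. */
		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
		    (REQ_SYNC | REQ_IDLE))
			return false;
		fallthrough;
	case REQ_OP_DISCARD:
		return true;
	default:
		return false;
	}
}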

Thanks,
Kuai

> 
> Thanks,
> 
> Junxiao.
> 
>>
>> Thanks,
>> Kuai
>>
>>> commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>>> raid5d")
>>> introduced this issue, since it want to a reschedule for flushing 
>>> blk_plug,
>>> let do it explicitly to address that issue.
>>>
>>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>>> raid5d")
>>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>>> ---
>>>   block/blk-core.c   | 1 +
>>>   drivers/md/raid5.c | 9 +++++----
>>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 9d51e9894ece..bc8757a78477 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, 
>>> bool from_schedule)
>>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>>           blk_mq_free_plug_rqs(plug);
>>>   }
>>> +EXPORT_SYMBOL(__blk_flush_plug);
>>>     /**
>>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>> index 284cd71bcc68..25ec82f2813f 100644
>>> --- a/drivers/md/raid5.c
>>> +++ b/drivers/md/raid5.c
>>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>>                * the flag when using mdmon.
>>>                */
>>>               continue;
>>> +        } else {
>>> +            spin_unlock_irq(&conf->device_lock);
>>> +            blk_flush_plug(current);
>>> +            cond_resched();
>>> +            spin_lock_irq(&conf->device_lock);
>>>           }
>>> -
>>> -        wait_event_lock_irq(mddev->sb_wait,
>>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>> -            conf->device_lock);
>>>       }
>>>       pr_debug("%d stripes handled\n", handled);
>>>
>>
> .
>
Junxiao Bi Nov. 3, 2023, 7:11 p.m. UTC | #4
On 11/2/23 12:16 AM, Yu Kuai wrote:

> Hi,
>
> On 2023/11/02 12:32, junxiao.bi@oracle.com wrote:
>> On 11/1/23 6:24 PM, Yu Kuai wrote:
>>
>>> Hi,
>>>
>>> On 2023/11/02 7:02, Junxiao Bi wrote:
>>>> Looks like there is a race between md_write_start() and raid5d() which
>>>
>>> Is this a real issue or just based on code review?
>>
>> It's a real issue, we see this hung in a production system, it's with 
>> v5.4, but i didn't see these function has much difference.
>>
>> crash> bt 2683
>> PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>>   #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
>>   #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
>>   #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
>>   #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
>>   #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
>>   #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
>> crash> ps -m 2683
>> [ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00 CPU: 65 
>> COMMAND: "md0_raid5"
>> crash>
>> crash> bt 96352
>> PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>>   #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
>>   #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
>>   #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
>>   #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
>>   #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
>>   #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
>>   #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
>>   #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
>>   #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
>>   #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
>> crash> ps -m 96352
>> [ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00 CPU: 64 
>> COMMAND: "kworker/64:0"
>> crash>
>> crash> bt 25542
>> PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>>   #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
>>   #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
>>   #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
>>   #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
>>   #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
>>   #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
>> crash>
>> crash> ps -m 25542
>> [ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000 CPU: 32 
>> COMMAND: "md0_resync"
>>
>>
>>>> can cause deadlock. Run into this issue while raid_check is running.
>>>>
>>>> md_write_start: raid5d:
>>>>   if (mddev->safemode == 1)
>>>>       mddev->safemode = 0;
>>>>   /* sync_checkers is always 0 when writes_pending is in per-cpu 
>>>> mode */
>>>>   if (mddev->in_sync || mddev->sync_checkers) {
>>>>       spin_lock(&mddev->lock);
>>>>       if (mddev->in_sync) {
>>>>           mddev->in_sync = 0;
>>>>           set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>>>           set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>>>> >>> running before md_write_start wake up it
>>>> if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>>>> spin_unlock_irq(&conf->device_lock);
>>>> md_check_recovery(mddev);
>>>> spin_lock_irq(&conf->device_lock);
>>>>
>>>> /*
>>>> * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>>>> * seeing md_check_recovery() is needed to clear
>>>> * the flag when using mdmon.
>>>> */
>>>> continue;
>>>> }
>>>>
>>>> wait_event_lock_irq(mddev->sb_wait, >>>>>>>>>>> hung
>>>> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>> conf->device_lock);
>>>>           md_wakeup_thread(mddev->thread);
>>>>           did_change = 1;
>>>>       }
>>>>       spin_unlock(&mddev->lock);
>>>>   }
>>>>
>>>>   ...
>>>>
>>>>   wait_event(mddev->sb_wait, >>>>>>>>>> hung
>>>>      !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>>>      mddev->suspended);
>>>>
>>>
>>> This is not correct, if daemon thread is running, md_wakeup_thread()
>>> will make sure that daemon thread will run again, see details how
>>> THREAD_WAKEUP worked in md_thread().
>>
>> The daemon thread was waiting MD_SB_CHANGE_PENDING to be cleared, 
>> even wake up it, it will hung again as that flag is still not cleared?
>
> I aggree that daemon thread should not use wait_event(), however, take a
> look at 5e2cf333b7bd, I think this is a common issue for all
> personalities, and the better fix is that let bio submitted from
> md_write_super() bypass wbt, this is reasonable because wbt is used to
> throttle backgroup writeback io, and writing superblock should not be
> throttled by wbt.

So the fix is the following plus reverting commit 5e2cf333b7bd?


diff --git a/drivers/md/md.c b/drivers/md/md.c
index 839e79e567ee..841bd4459817 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -931,7 +931,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 
 	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
 			       1,
-			       REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
+			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_PREFLUSH | REQ_FUA,
 			       GFP_NOIO, &mddev->sync_set);
 
 	atomic_inc(&rdev->nr_pending);


Thanks,

Junxiao.

>
> Thanks,
> Kuai
>
>>
>> Thanks,
>>
>> Junxiao.
>>
>>>
>>> Thanks,
>>> Kuai
>>>
>>>> commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>>>> raid5d")
>>>> introduced this issue, since it want to a reschedule for flushing 
>>>> blk_plug,
>>>> let do it explicitly to address that issue.
>>>>
>>>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>>>> raid5d")
>>>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>> ---
>>>>   block/blk-core.c   | 1 +
>>>>   drivers/md/raid5.c | 9 +++++----
>>>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index 9d51e9894ece..bc8757a78477 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, 
>>>> bool from_schedule)
>>>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>>>           blk_mq_free_plug_rqs(plug);
>>>>   }
>>>> +EXPORT_SYMBOL(__blk_flush_plug);
>>>>     /**
>>>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>> index 284cd71bcc68..25ec82f2813f 100644
>>>> --- a/drivers/md/raid5.c
>>>> +++ b/drivers/md/raid5.c
>>>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>>>                * the flag when using mdmon.
>>>>                */
>>>>               continue;
>>>> +        } else {
>>>> +            spin_unlock_irq(&conf->device_lock);
>>>> +            blk_flush_plug(current);
>>>> +            cond_resched();
>>>> +            spin_lock_irq(&conf->device_lock);
>>>>           }
>>>> -
>>>> -        wait_event_lock_irq(mddev->sb_wait,
>>>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>> -            conf->device_lock);
>>>>       }
>>>>       pr_debug("%d stripes handled\n", handled);
>>>>
>>>
>> .
>>
>
Yu Kuai Nov. 4, 2023, 1:56 a.m. UTC | #5
Hi,

On 2023/11/04 3:11, junxiao.bi@oracle.com wrote:
> On 11/2/23 12:16 AM, Yu Kuai wrote:
> 
>> Hi,
>>
>> On 2023/11/02 12:32, junxiao.bi@oracle.com wrote:
>>> On 11/1/23 6:24 PM, Yu Kuai wrote:
>>>
>>>> Hi,
>>>>
>>>> On 2023/11/02 7:02, Junxiao Bi wrote:
>>>>> Looks like there is a race between md_write_start() and raid5d() which
>>>>
>>>> Is this a real issue or just based on code review?
>>>
>>> It's a real issue, we see this hung in a production system, it's with 
>>> v5.4, but i didn't see these function has much difference.
>>>
>>> crash> bt 2683
>>> PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>>>   #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
>>>   #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
>>>   #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
>>>   #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
>>>   #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
>>>   #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
>>> crash> ps -m 2683
>>> [ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00 CPU: 65 
>>> COMMAND: "md0_raid5"
>>> crash>
>>> crash> bt 96352
>>> PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>>>   #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
>>>   #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
>>>   #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
>>>   #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
>>>   #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
>>>   #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
>>>   #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
>>>   #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
>>>   #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
>>>   #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
>>> crash> ps -m 96352
>>> [ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00 CPU: 64 
>>> COMMAND: "kworker/64:0"
>>> crash>
>>> crash> bt 25542
>>> PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>>>   #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
>>>   #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
>>>   #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
>>>   #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
>>>   #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
>>>   #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
>>> crash>
>>> crash> ps -m 25542
>>> [ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000 CPU: 32 
>>> COMMAND: "md0_resync"
>>>
>>>
>>>>> can cause deadlock. Run into this issue while raid_check is running.
>>>>>
>>>>> md_write_start: raid5d:
>>>>>   if (mddev->safemode == 1)
>>>>>       mddev->safemode = 0;
>>>>>   /* sync_checkers is always 0 when writes_pending is in per-cpu 
>>>>> mode */
>>>>>   if (mddev->in_sync || mddev->sync_checkers) {
>>>>>       spin_lock(&mddev->lock);
>>>>>       if (mddev->in_sync) {
>>>>>           mddev->in_sync = 0;
>>>>>           set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>>>>           set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>>>>> >>> running before md_write_start wake up it
>>>>> if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>>>>> spin_unlock_irq(&conf->device_lock);
>>>>> md_check_recovery(mddev);
>>>>> spin_lock_irq(&conf->device_lock);
>>>>>
>>>>> /*
>>>>> * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>>>>> * seeing md_check_recovery() is needed to clear
>>>>> * the flag when using mdmon.
>>>>> */
>>>>> continue;
>>>>> }
>>>>>
>>>>> wait_event_lock_irq(mddev->sb_wait, >>>>>>>>>>> hung
>>>>> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>>> conf->device_lock);
>>>>>           md_wakeup_thread(mddev->thread);
>>>>>           did_change = 1;
>>>>>       }
>>>>>       spin_unlock(&mddev->lock);
>>>>>   }
>>>>>
>>>>>   ...
>>>>>
>>>>>   wait_event(mddev->sb_wait, >>>>>>>>>> hung
>>>>>      !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>>>>      mddev->suspended);
>>>>>
>>>>
>>>> This is not correct, if daemon thread is running, md_wakeup_thread()
>>>> will make sure that daemon thread will run again, see details how
>>>> THREAD_WAKEUP worked in md_thread().
>>>
>>> The daemon thread was waiting MD_SB_CHANGE_PENDING to be cleared, 
>>> even wake up it, it will hung again as that flag is still not cleared?
>>
>> I aggree that daemon thread should not use wait_event(), however, take a
>> look at 5e2cf333b7bd, I think this is a common issue for all
>> personalities, and the better fix is that let bio submitted from
>> md_write_super() bypass wbt, this is reasonable because wbt is used to
>> throttle backgroup writeback io, and writing superblock should not be
>> throttled by wbt.
> 
> So the fix is the following plus reverting commit 5e2cf333b7bd?

Yes, I think this can work, and REQ_META should be added for the same
reason, see bio_issue_as_root_blkg().
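
For reference, that helper is roughly the following (paraphrased from
include/linux/blk-cgroup.h, not verbatim): bios marked REQ_META or REQ_SWAP
are issued as if attached to the root blkg to avoid priority inversions, so
tagging the superblock write with REQ_META would also keep it out of
cgroup-based I/O throttling.

/* Paraphrase, not verbatim: metadata and swap bios are submitted as if
 * attached to the root blkg so cgroup I/O controllers cannot stall them. */
static inline bool bio_issue_as_root_blkg(struct bio *bio)
{
	return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
}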

Thanks,
Kuai

> 
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 839e79e567ee..841bd4459817 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -931,7 +931,7 @@ void md_super_write(struct mddev *mddev, struct 
> md_rdev *rdev,
> 
>          bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : 
> rdev->bdev,
>                                 1,
> -                              REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | 
> REQ_FUA,
> +                              REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | 
> REQ_PREFLUSH | REQ_FUA,
>                                 GFP_NOIO, &mddev->sync_set);
> 
>          atomic_inc(&rdev->nr_pending);
> 
> 
> Thanks,
> 
> Junxiao.
> 
>>
>> Thanks,
>> Kuai
>>
>>>
>>> Thanks,
>>>
>>> Junxiao.
>>>
>>>>
>>>> Thanks,
>>>> Kuai
>>>>
>>>>> commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>>>>> raid5d")
>>>>> introduced this issue, since it want to a reschedule for flushing 
>>>>> blk_plug,
>>>>> let do it explicitly to address that issue.
>>>>>
>>>>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in 
>>>>> raid5d")
>>>>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>> ---
>>>>>   block/blk-core.c   | 1 +
>>>>>   drivers/md/raid5.c | 9 +++++----
>>>>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>> index 9d51e9894ece..bc8757a78477 100644
>>>>> --- a/block/blk-core.c
>>>>> +++ b/block/blk-core.c
>>>>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, 
>>>>> bool from_schedule)
>>>>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>>>>           blk_mq_free_plug_rqs(plug);
>>>>>   }
>>>>> +EXPORT_SYMBOL(__blk_flush_plug);
>>>>>     /**
>>>>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>>> index 284cd71bcc68..25ec82f2813f 100644
>>>>> --- a/drivers/md/raid5.c
>>>>> +++ b/drivers/md/raid5.c
>>>>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>>>>                * the flag when using mdmon.
>>>>>                */
>>>>>               continue;
>>>>> +        } else {
>>>>> +            spin_unlock_irq(&conf->device_lock);
>>>>> +            blk_flush_plug(current);
>>>>> +            cond_resched();
>>>>> +            spin_lock_irq(&conf->device_lock);
>>>>>           }
>>>>> -
>>>>> -        wait_event_lock_irq(mddev->sb_wait,
>>>>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>>> -            conf->device_lock);
>>>>>       }
>>>>>       pr_debug("%d stripes handled\n", handled);
>>>>>
>>>>
>>> .
>>>
>>
> .
>

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 9d51e9894ece..bc8757a78477 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1149,6 +1149,7 @@  void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
 	if (unlikely(!rq_list_empty(plug->cached_rq)))
 		blk_mq_free_plug_rqs(plug);
 }
+EXPORT_SYMBOL(__blk_flush_plug);
 
 /**
  * blk_finish_plug - mark the end of a batch of submitted I/O
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 284cd71bcc68..25ec82f2813f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6850,11 +6850,12 @@  static void raid5d(struct md_thread *thread)
 			 * the flag when using mdmon.
 			 */
 			continue;
+		} else {
+			spin_unlock_irq(&conf->device_lock);
+			blk_flush_plug(current);
+			cond_resched();
+			spin_lock_irq(&conf->device_lock);
 		}
-
-		wait_event_lock_irq(mddev->sb_wait,
-			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
-			conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);