diff mbox series

[PATCHv3,2/3] scsi: Do not rely on blk-mq for double completions

Message ID 20181115175820.13391-3-keith.busch@intel.com (mailing list archive)
State Changes Requested
Headers show
Series scsi timeout handling updates | expand

Commit Message

Keith Busch Nov. 15, 2018, 5:58 p.m. UTC
The scsi timeout error handling had been directly updating the block
layer's request state to prevent a error handling and a natural completion
from completing the same request twice. Fix this layering violation
by having scsi control the fate of its commands with scsi owned flags
rather than use blk-mq's.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/scsi/scsi_error.c | 22 +++++++++++-----------
 drivers/scsi/scsi_lib.c   |  6 +++++-
 include/scsi/scsi_cmnd.h  |  5 ++++-
 3 files changed, 20 insertions(+), 13 deletions(-)

Comments

Hannes Reinecke Nov. 16, 2018, 9:53 a.m. UTC | #1
On 11/15/18 6:58 PM, Keith Busch wrote:
> The scsi timeout error handling had been directly updating the block
> layer's request state to prevent a error handling and a natural completion
> from completing the same request twice. Fix this layering violation
> by having scsi control the fate of its commands with scsi owned flags
> rather than use blk-mq's.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>   drivers/scsi/scsi_error.c | 22 +++++++++++-----------
>   drivers/scsi/scsi_lib.c   |  6 +++++-
>   include/scsi/scsi_cmnd.h  |  5 ++++-
>   3 files changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> index dd338a8cd275..e92e088f636f 100644
> --- a/drivers/scsi/scsi_error.c
> +++ b/drivers/scsi/scsi_error.c
> @@ -297,19 +297,19 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
>   
>   	if (rtn == BLK_EH_DONE) {
>   		/*
> -		 * For blk-mq, we must set the request state to complete now
> -		 * before sending the request to the scsi error handler. This
> -		 * will prevent a use-after-free in the event the LLD manages
> -		 * to complete the request before the error handler finishes
> -		 * processing this timed out request.
> +		 * Set the command to complete first in order to prevent a real
> +		 * completion from releasing the command while error handling
> +		 * is using it. If the command was already completed, then the
> +		 * lower level driver beat the timeout handler, and it is safe
> +		 * to return without escalating error recovery.
>   		 *
> -		 * If the request was already completed, then the LLD beat the
> -		 * time out handler from transferring the request to the scsi
> -		 * error handler. In that case we can return immediately as no
> -		 * further action is required.
> +		 * If timeout handling lost the race to a real completion, the
> +		 * block layer may ignore that due to a fake timeout injection,
> +		 * so return RESET_TIMER to allow error handling another shot
> +		 * at this command.
>   		 */
> -		if (!blk_mq_mark_complete(req))
> -			return rtn;
> +		if (test_and_set_bit(__SCMD_COMPLETE, &scmd->flags))
> +			return BLK_EH_RESET_TIMER;
>   		if (scsi_abort_command(scmd) != SUCCESS) {
>   			set_host_byte(scmd, DID_TIME_OUT);
>   			scsi_eh_scmd_add(scmd);
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 5d83a162d03b..c1d5e4e36125 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
>   
>   static void scsi_mq_done(struct scsi_cmnd *cmd)
>   {
> +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
> +		return;
>   	trace_scsi_dispatch_cmd_done(cmd);
> -	blk_mq_complete_request(cmd->request);
> +	if (unlikely(!blk_mq_complete_request(cmd->request)))
> +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
>   }
>   
>   static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
> @@ -1701,6 +1704,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
>   			goto out_dec_host_busy;
>   		req->rq_flags |= RQF_DONTPREP;
>   	} else {
> +		cmd->flags &= ~SCMD_COMPLETE;
>   		blk_mq_start_request(req);
>   	}
>   
Why do you use a normal assignment here (and not a clear_bit() as in the 
above hunk)?

And shouldn't you add a barrier here to broadcast the state change to 
other cores?

Cheers,

Hannes
Bart Van Assche Nov. 16, 2018, 2:46 p.m. UTC | #2
On Fri, 2018-11-16 at 10:53 +0100, Hannes Reinecke wrote:
> On 11/15/18 6:58 PM, Keith Busch wrote:
> > The scsi timeout error handling had been directly updating the block
> > layer's request state to prevent a error handling and a natural completion
> > from completing the same request twice. Fix this layering violation
> > by having scsi control the fate of its commands with scsi owned flags
> > rather than use blk-mq's.
> > 
> > Signed-off-by: Keith Busch <keith.busch@intel.com>
> > ---
> >   drivers/scsi/scsi_error.c | 22 +++++++++++-----------
> >   drivers/scsi/scsi_lib.c   |  6 +++++-
> >   include/scsi/scsi_cmnd.h  |  5 ++++-
> >   3 files changed, 20 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> > index dd338a8cd275..e92e088f636f 100644
> > --- a/drivers/scsi/scsi_error.c
> > +++ b/drivers/scsi/scsi_error.c
> > @@ -297,19 +297,19 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
> >   
> >   	if (rtn == BLK_EH_DONE) {
> >   		/*
> > -		 * For blk-mq, we must set the request state to complete now
> > -		 * before sending the request to the scsi error handler. This
> > -		 * will prevent a use-after-free in the event the LLD manages
> > -		 * to complete the request before the error handler finishes
> > -		 * processing this timed out request.
> > +		 * Set the command to complete first in order to prevent a real
> > +		 * completion from releasing the command while error handling
> > +		 * is using it. If the command was already completed, then the
> > +		 * lower level driver beat the timeout handler, and it is safe
> > +		 * to return without escalating error recovery.
> >   		 *
> > -		 * If the request was already completed, then the LLD beat the
> > -		 * time out handler from transferring the request to the scsi
> > -		 * error handler. In that case we can return immediately as no
> > -		 * further action is required.
> > +		 * If timeout handling lost the race to a real completion, the
> > +		 * block layer may ignore that due to a fake timeout injection,
> > +		 * so return RESET_TIMER to allow error handling another shot
> > +		 * at this command.
> >   		 */
> > -		if (!blk_mq_mark_complete(req))
> > -			return rtn;
> > +		if (test_and_set_bit(__SCMD_COMPLETE, &scmd->flags))
> > +			return BLK_EH_RESET_TIMER;
> >   		if (scsi_abort_command(scmd) != SUCCESS) {
> >   			set_host_byte(scmd, DID_TIME_OUT);
> >   			scsi_eh_scmd_add(scmd);
> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > index 5d83a162d03b..c1d5e4e36125 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
> >   
> >   static void scsi_mq_done(struct scsi_cmnd *cmd)
> >   {
> > +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
> > +		return;
> >   	trace_scsi_dispatch_cmd_done(cmd);
> > -	blk_mq_complete_request(cmd->request);
> > +	if (unlikely(!blk_mq_complete_request(cmd->request)))
> > +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
> >   }
> >   
> >   static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
> > @@ -1701,6 +1704,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
> >   			goto out_dec_host_busy;
> >   		req->rq_flags |= RQF_DONTPREP;
> >   	} else {
> > +		cmd->flags &= ~SCMD_COMPLETE;
> >   		blk_mq_start_request(req);
> >   	}
> >   
> 
> Why do you use a normal assignment here (and not a clear_bit() as in the 
> above hunk)?
> 
> And shouldn't you add a barrier here to broadcast the state change to 
> other cores?

Although I don't like it that request state information is being moved from 
the block layer core into the SCSI core, I think that this patch is fine.
scsi_queue_rq() is only called after a request structure has been recycled
so I don't think that there can be any kind of race between the code paths
that use atomic instructions to manipulate the __SCMD_COMPLETE bit and
scsi_queue_rq().

Bart.
Christoph Hellwig Nov. 19, 2018, 8:58 a.m. UTC | #3
> index 5d83a162d03b..c1d5e4e36125 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
>  
>  static void scsi_mq_done(struct scsi_cmnd *cmd)
>  {
> +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
> +		return;
>  	trace_scsi_dispatch_cmd_done(cmd);
> -	blk_mq_complete_request(cmd->request);
> +	if (unlikely(!blk_mq_complete_request(cmd->request)))
> +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
>  }

This looks a little odd to me.  If we didn't complete the command
someone else did.  Why would we clear the bit in this case?

>  static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
> @@ -1701,6 +1704,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
>  			goto out_dec_host_busy;
>  		req->rq_flags |= RQF_DONTPREP;
>  	} else {
> +		cmd->flags &= ~SCMD_COMPLETE;
>  		blk_mq_start_request(req);
>  	}
>  
> diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
> index d6fd2aba0380..ded7c7194a28 100644
> --- a/include/scsi/scsi_cmnd.h
> +++ b/include/scsi/scsi_cmnd.h
> @@ -58,6 +58,9 @@ struct scsi_pointer {
>  #define SCMD_TAGGED		(1 << 0)
>  #define SCMD_UNCHECKED_ISA_DMA	(1 << 1)
>  #define SCMD_INITIALIZED	(1 << 2)
> +
> +#define __SCMD_COMPLETE		3
> +#define SCMD_COMPLETE		(1 << __SCMD_COMPLETE)

This mixing of atomic and non-atomic bitops looks rather dangerous
to me.  Can you add a new atomic_flags just for the completed flag,
and always use the bitops on it for now? I think we can eventually
kill most of the existing flags except for SCMD_TAGGED over the
next merge window or two and then move that over as well.

Otherwise the concept looks fine to me.
Jens Axboe Nov. 19, 2018, 3:17 p.m. UTC | #4
On 11/19/18 1:58 AM, Christoph Hellwig wrote:
>> index 5d83a162d03b..c1d5e4e36125 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
>>  
>>  static void scsi_mq_done(struct scsi_cmnd *cmd)
>>  {
>> +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
>> +		return;
>>  	trace_scsi_dispatch_cmd_done(cmd);
>> -	blk_mq_complete_request(cmd->request);
>> +	if (unlikely(!blk_mq_complete_request(cmd->request)))
>> +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
>>  }
> 
> This looks a little odd to me.  If we didn't complete the command
> someone else did.  Why would we clear the bit in this case?

It's strictly for the fake timeout, it didn't complete it, it just
ignored it. This is an artifact of the weird way that works.
Keith Busch Nov. 19, 2018, 3:19 p.m. UTC | #5
On Mon, Nov 19, 2018 at 12:58:15AM -0800, Christoph Hellwig wrote:
> > index 5d83a162d03b..c1d5e4e36125 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
> >  
> >  static void scsi_mq_done(struct scsi_cmnd *cmd)
> >  {
> > +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
> > +		return;
> >  	trace_scsi_dispatch_cmd_done(cmd);
> > -	blk_mq_complete_request(cmd->request);
> > +	if (unlikely(!blk_mq_complete_request(cmd->request)))
> > +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
> >  }
> 
> This looks a little odd to me.  If we didn't complete the command
> someone else did.  Why would we clear the bit in this case?

It's only to go along with the fake timeout. If we don't clear the bit,
then then scsi timeout handler will believe it has nothing to do because
scsi did its required part. The block layer just pretends the LLD didn't
do its part, so scsi has to play along too.

> >  static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
> > @@ -1701,6 +1704,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  			goto out_dec_host_busy;
> >  		req->rq_flags |= RQF_DONTPREP;
> >  	} else {
> > +		cmd->flags &= ~SCMD_COMPLETE;
> >  		blk_mq_start_request(req);
> >  	}
> >  
> > diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
> > index d6fd2aba0380..ded7c7194a28 100644
> > --- a/include/scsi/scsi_cmnd.h
> > +++ b/include/scsi/scsi_cmnd.h
> > @@ -58,6 +58,9 @@ struct scsi_pointer {
> >  #define SCMD_TAGGED		(1 << 0)
> >  #define SCMD_UNCHECKED_ISA_DMA	(1 << 1)
> >  #define SCMD_INITIALIZED	(1 << 2)
> > +
> > +#define __SCMD_COMPLETE		3
> > +#define SCMD_COMPLETE		(1 << __SCMD_COMPLETE)
> 
> This mixing of atomic and non-atomic bitops looks rather dangerous
> to me.  Can you add a new atomic_flags just for the completed flag,
> and always use the bitops on it for now? I think we can eventually
> kill most of the existing flags except for SCMD_TAGGED over the
> next merge window or two and then move that over as well.

The only concurrent access is completion + timeout, otherwise access is
single-threaded. I'm using the atomic operations only where it is
needed.

We implicitly clear the SCMD_COMPLETED flag along with SCMD_TAGGED in
scsi_init_command() too, and I didn't want to add new overhead with
new atomics.

> Otherwise the concept looks fine to me.
Christoph Hellwig Nov. 21, 2018, 1:12 p.m. UTC | #6
On Mon, Nov 19, 2018 at 08:19:00AM -0700, Keith Busch wrote:
> On Mon, Nov 19, 2018 at 12:58:15AM -0800, Christoph Hellwig wrote:
> > > index 5d83a162d03b..c1d5e4e36125 100644
> > > --- a/drivers/scsi/scsi_lib.c
> > > +++ b/drivers/scsi/scsi_lib.c
> > > @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
> > >  
> > >  static void scsi_mq_done(struct scsi_cmnd *cmd)
> > >  {
> > > +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
> > > +		return;
> > >  	trace_scsi_dispatch_cmd_done(cmd);
> > > -	blk_mq_complete_request(cmd->request);
> > > +	if (unlikely(!blk_mq_complete_request(cmd->request)))
> > > +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
> > >  }
> > 
> > This looks a little odd to me.  If we didn't complete the command
> > someone else did.  Why would we clear the bit in this case?
> 
> It's only to go along with the fake timeout. If we don't clear the bit,
> then then scsi timeout handler will believe it has nothing to do because
> scsi did its required part. The block layer just pretends the LLD didn't
> do its part, so scsi has to play along too.

This just looks way to magic to me.  In other word - it needs a big fat
comment explaining the situation.

> > > +#define __SCMD_COMPLETE		3
> > > +#define SCMD_COMPLETE		(1 << __SCMD_COMPLETE)
> > 
> > This mixing of atomic and non-atomic bitops looks rather dangerous
> > to me.  Can you add a new atomic_flags just for the completed flag,
> > and always use the bitops on it for now? I think we can eventually
> > kill most of the existing flags except for SCMD_TAGGED over the
> > next merge window or two and then move that over as well.
> 
> The only concurrent access is completion + timeout, otherwise access is
> single-threaded. I'm using the atomic operations only where it is
> needed.
> 
> We implicitly clear the SCMD_COMPLETED flag along with SCMD_TAGGED in
> scsi_init_command() too, and I didn't want to add new overhead with
> new atomics.

In general mixing access types on a single field (nevermind bit)
is going to cause us problems further down the road sooner or later.

I'd be much happier with a separate field.
Keith Busch Nov. 26, 2018, 3:31 p.m. UTC | #7
On Mon, Nov 26, 2018 at 08:32:45AM -0700, Jens Axboe wrote:
> On 11/21/18 6:12 AM, Christoph Hellwig wrote:
> > On Mon, Nov 19, 2018 at 08:19:00AM -0700, Keith Busch wrote:
> >> On Mon, Nov 19, 2018 at 12:58:15AM -0800, Christoph Hellwig wrote:
> >>>> index 5d83a162d03b..c1d5e4e36125 100644
> >>>> --- a/drivers/scsi/scsi_lib.c
> >>>> +++ b/drivers/scsi/scsi_lib.c
> >>>> @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
> >>>>  
> >>>>  static void scsi_mq_done(struct scsi_cmnd *cmd)
> >>>>  {
> >>>> +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
> >>>> +		return;
> >>>>  	trace_scsi_dispatch_cmd_done(cmd);
> >>>> -	blk_mq_complete_request(cmd->request);
> >>>> +	if (unlikely(!blk_mq_complete_request(cmd->request)))
> >>>> +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
> >>>>  }
> >>>
> >>> This looks a little odd to me.  If we didn't complete the command
> >>> someone else did.  Why would we clear the bit in this case?
> >>
> >> It's only to go along with the fake timeout. If we don't clear the bit,
> >> then then scsi timeout handler will believe it has nothing to do because
> >> scsi did its required part. The block layer just pretends the LLD didn't
> >> do its part, so scsi has to play along too.
> > 
> > This just looks way to magic to me.  In other word - it needs a big fat
> > comment explaining the situation.
> > 
> >>>> +#define __SCMD_COMPLETE		3
> >>>> +#define SCMD_COMPLETE		(1 << __SCMD_COMPLETE)
> >>>
> >>> This mixing of atomic and non-atomic bitops looks rather dangerous
> >>> to me.  Can you add a new atomic_flags just for the completed flag,
> >>> and always use the bitops on it for now? I think we can eventually
> >>> kill most of the existing flags except for SCMD_TAGGED over the
> >>> next merge window or two and then move that over as well.
> >>
> >> The only concurrent access is completion + timeout, otherwise access is
> >> single-threaded. I'm using the atomic operations only where it is
> >> needed.
> >>
> >> We implicitly clear the SCMD_COMPLETED flag along with SCMD_TAGGED in
> >> scsi_init_command() too, and I didn't want to add new overhead with
> >> new atomics.
> > 
> > In general mixing access types on a single field (nevermind bit)
> > is going to cause us problems further down the road sooner or later.
> > 
> > I'd be much happier with a separate field.
> 
> Keith, will you please respin with the separate field? Would be nice
> to get this merged for 4.21.

I'll send out a new version today.
Jens Axboe Nov. 26, 2018, 3:32 p.m. UTC | #8
On 11/21/18 6:12 AM, Christoph Hellwig wrote:
> On Mon, Nov 19, 2018 at 08:19:00AM -0700, Keith Busch wrote:
>> On Mon, Nov 19, 2018 at 12:58:15AM -0800, Christoph Hellwig wrote:
>>>> index 5d83a162d03b..c1d5e4e36125 100644
>>>> --- a/drivers/scsi/scsi_lib.c
>>>> +++ b/drivers/scsi/scsi_lib.c
>>>> @@ -1635,8 +1635,11 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
>>>>  
>>>>  static void scsi_mq_done(struct scsi_cmnd *cmd)
>>>>  {
>>>> +	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
>>>> +		return;
>>>>  	trace_scsi_dispatch_cmd_done(cmd);
>>>> -	blk_mq_complete_request(cmd->request);
>>>> +	if (unlikely(!blk_mq_complete_request(cmd->request)))
>>>> +		clear_bit(__SCMD_COMPLETE, &cmd->flags);
>>>>  }
>>>
>>> This looks a little odd to me.  If we didn't complete the command
>>> someone else did.  Why would we clear the bit in this case?
>>
>> It's only to go along with the fake timeout. If we don't clear the bit,
>> then then scsi timeout handler will believe it has nothing to do because
>> scsi did its required part. The block layer just pretends the LLD didn't
>> do its part, so scsi has to play along too.
> 
> This just looks way to magic to me.  In other word - it needs a big fat
> comment explaining the situation.
> 
>>>> +#define __SCMD_COMPLETE		3
>>>> +#define SCMD_COMPLETE		(1 << __SCMD_COMPLETE)
>>>
>>> This mixing of atomic and non-atomic bitops looks rather dangerous
>>> to me.  Can you add a new atomic_flags just for the completed flag,
>>> and always use the bitops on it for now? I think we can eventually
>>> kill most of the existing flags except for SCMD_TAGGED over the
>>> next merge window or two and then move that over as well.
>>
>> The only concurrent access is completion + timeout, otherwise access is
>> single-threaded. I'm using the atomic operations only where it is
>> needed.
>>
>> We implicitly clear the SCMD_COMPLETED flag along with SCMD_TAGGED in
>> scsi_init_command() too, and I didn't want to add new overhead with
>> new atomics.
> 
> In general mixing access types on a single field (nevermind bit)
> is going to cause us problems further down the road sooner or later.
> 
> I'd be much happier with a separate field.

Keith, will you please respin with the separate field? Would be nice
to get this merged for 4.21.
diff mbox series

Patch

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index dd338a8cd275..e92e088f636f 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -297,19 +297,19 @@  enum blk_eh_timer_return scsi_times_out(struct request *req)
 
 	if (rtn == BLK_EH_DONE) {
 		/*
-		 * For blk-mq, we must set the request state to complete now
-		 * before sending the request to the scsi error handler. This
-		 * will prevent a use-after-free in the event the LLD manages
-		 * to complete the request before the error handler finishes
-		 * processing this timed out request.
+		 * Set the command to complete first in order to prevent a real
+		 * completion from releasing the command while error handling
+		 * is using it. If the command was already completed, then the
+		 * lower level driver beat the timeout handler, and it is safe
+		 * to return without escalating error recovery.
 		 *
-		 * If the request was already completed, then the LLD beat the
-		 * time out handler from transferring the request to the scsi
-		 * error handler. In that case we can return immediately as no
-		 * further action is required.
+		 * If timeout handling lost the race to a real completion, the
+		 * block layer may ignore that due to a fake timeout injection,
+		 * so return RESET_TIMER to allow error handling another shot
+		 * at this command.
 		 */
-		if (!blk_mq_mark_complete(req))
-			return rtn;
+		if (test_and_set_bit(__SCMD_COMPLETE, &scmd->flags))
+			return BLK_EH_RESET_TIMER;
 		if (scsi_abort_command(scmd) != SUCCESS) {
 			set_host_byte(scmd, DID_TIME_OUT);
 			scsi_eh_scmd_add(scmd);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5d83a162d03b..c1d5e4e36125 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1635,8 +1635,11 @@  static blk_status_t scsi_mq_prep_fn(struct request *req)
 
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
+	if (unlikely(test_and_set_bit(__SCMD_COMPLETE, &cmd->flags)))
+		return;
 	trace_scsi_dispatch_cmd_done(cmd);
-	blk_mq_complete_request(cmd->request);
+	if (unlikely(!blk_mq_complete_request(cmd->request)))
+		clear_bit(__SCMD_COMPLETE, &cmd->flags);
 }
 
 static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
@@ -1701,6 +1704,7 @@  static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto out_dec_host_busy;
 		req->rq_flags |= RQF_DONTPREP;
 	} else {
+		cmd->flags &= ~SCMD_COMPLETE;
 		blk_mq_start_request(req);
 	}
 
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index d6fd2aba0380..ded7c7194a28 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -58,6 +58,9 @@  struct scsi_pointer {
 #define SCMD_TAGGED		(1 << 0)
 #define SCMD_UNCHECKED_ISA_DMA	(1 << 1)
 #define SCMD_INITIALIZED	(1 << 2)
+
+#define __SCMD_COMPLETE		3
+#define SCMD_COMPLETE		(1 << __SCMD_COMPLETE)
 /* flags preserved across unprep / reprep */
 #define SCMD_PRESERVED_FLAGS	(SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED)
 
@@ -144,7 +147,7 @@  struct scsi_cmnd {
 					 * to be at an address < 16Mb). */
 
 	int result;		/* Status code from lower level driver */
-	int flags;		/* Command flags */
+	unsigned long flags;	/* Command flags */
 
 	unsigned char tag;	/* SCSI-II queued command tag */
 };