[RFC,3/3] blk-mq: Remove generation seqeunce

Message ID 20180521231131.6685-4-keith.busch@intel.com (mailing list archive)
State New, archived

Commit Message

Keith Busch May 21, 2018, 11:11 p.m. UTC
This patch simplifies the timeout handling by relying on the request
reference counting to ensure the iterator is operating on an inflight
and truly timed out request. Since the reference counting prevents the
tag from being reallocated, the block layer no longer needs to prevent
drivers from completing their requests while the timeout handler is
operating on it: a driver completing a request is allowed to proceed to
the next state without additional synchronization with the block layer.

This also removes any need for generation sequence numbers, since the
reference count prevents the request from being reallocated as a new
command while timeout handling is operating on it.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 block/blk-core.c       |   6 --
 block/blk-mq-debugfs.c |   1 -
 block/blk-mq.c         | 259 ++++++++++---------------------------------------
 block/blk-mq.h         |  20 +---
 block/blk-timeout.c    |   1 -
 include/linux/blkdev.h |  26 +----
 6 files changed, 58 insertions(+), 255 deletions(-)
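
The core mechanical change is a pin-and-recheck pattern in the timeout
iterator. A condensed sketch, assembled from the hunks quoted in the replies
below (not the complete patch):

static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
	unsigned long deadline;

	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
		return false;

	deadline = blk_rq_deadline(rq);
	if (time_after_eq(jiffies, deadline))
		return true;

	/* not expired: remember the earliest pending deadline */
	if (*next == 0 || time_after(*next, deadline))
		*next = deadline;
	return false;
}

static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	unsigned long *next = priv;

	/* cheap check before synchronizing across CPUs */
	if (!blk_mq_req_expired(rq, next))
		return;

	/*
	 * Pin the request so its tag cannot be reallocated while the
	 * timeout handler looks at it.  If the reference already dropped
	 * to zero, the normal completion won the race.
	 */
	if (!kref_get_unless_zero(&rq->ref))
		return;

	/* re-verify: if it no longer looks expired, it was completed and reused */
	if (blk_mq_req_expired(rq, next))
		blk_mq_rq_timed_out(rq, reserved);
	blk_mq_put_request(rq);
}

Whichever path drops the last reference frees the tag for reuse; the commit
message above relies on that to keep the request from being reallocated while
the timeout handler still holds it.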

Comments

Bart Van Assche May 21, 2018, 11:29 p.m. UTC | #1
On Mon, 2018-05-21 at 17:11 -0600, Keith Busch wrote:
> @@ -650,27 +600,10 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
>  void blk_mq_complete_request(struct request *rq)
>  {
>  	struct request_queue *q = rq->q;
> -	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
> -	int srcu_idx;
>  
>  	if (unlikely(blk_should_fake_timeout(q)))
>  		return;
> -	[ ... ]
> +	__blk_mq_complete_request(rq);
>  }
>  EXPORT_SYMBOL(blk_mq_complete_request);
> [ ... ]
>  static void blk_mq_rq_timed_out(struct request *req, bool reserved)
>  {
>  	const struct blk_mq_ops *ops = req->q->mq_ops;
>  	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
>  
> -	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
> -
>  	if (ops->timeout)
>  		ret = ops->timeout(req, reserved);
>  
>  	switch (ret) {
>  	case BLK_EH_HANDLED:
> -		__blk_mq_complete_request(req);
> -		break;
> -	case BLK_EH_RESET_TIMER:
>  		/*
> -		 * As nothing prevents from completion happening while
> -		 * ->aborted_gstate is set, this may lead to ignored
> -		 * completions and further spurious timeouts.
> +		 * If the request is still in flight, the driver is requesting
> +		 * blk-mq complete it.
>  		 */
> -		blk_mq_rq_update_aborted_gstate(req, 0);
> +		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
> +			__blk_mq_complete_request(req);
> +		break;
> +	case BLK_EH_RESET_TIMER:
>  		blk_add_timer(req);
>  		break;
>  	case BLK_EH_NOT_HANDLED:
> @@ -880,64 +782,64 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
>  	}
>  }

I think the above changes can lead to concurrent calls of
__blk_mq_complete_request() from the regular completion path and the timeout
path. That's wrong: the __blk_mq_complete_request() caller should guarantee
that no concurrent calls from another context to that function can occur.

Bart.
Ming Lei May 22, 2018, 2:49 a.m. UTC | #2
On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
> This patch simplifies the timeout handling by relying on the request
> reference counting to ensure the iterator is operating on an inflight

The reference counting isn't free, what is the real benefit in this way?

> and truly timed out request. Since the reference counting prevents the
> tag from being reallocated, the block layer no longer needs to prevent
> drivers from completing their requests while the timeout handler is
> operating on it: a driver completing a request is allowed to proceed to
> the next state without additional syncronization with the block layer.

This might cause trouble for drivers, since the previous behaviour
is that one request is only completed from one path, and now you change
the behaviour.

> 
> This also removes any need for generation sequence numbers since the
> request lifetime is prevented from being reallocated as a new sequence
> while timeout handling is operating on it.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>  block/blk-core.c       |   6 --
>  block/blk-mq-debugfs.c |   1 -
>  block/blk-mq.c         | 259 ++++++++++---------------------------------------
>  block/blk-mq.h         |  20 +---
>  block/blk-timeout.c    |   1 -
>  include/linux/blkdev.h |  26 +----
>  6 files changed, 58 insertions(+), 255 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 43370faee935..cee03cad99f2 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -198,12 +198,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
>  	rq->internal_tag = -1;
>  	rq->start_time_ns = ktime_get_ns();
>  	rq->part = NULL;
> -	seqcount_init(&rq->gstate_seq);
> -	u64_stats_init(&rq->aborted_gstate_sync);
> -	/*
> -	 * See comment of blk_mq_init_request
> -	 */
> -	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
>  }
>  EXPORT_SYMBOL(blk_rq_init);
>  
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 3080e18cb859..ffa622366922 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -344,7 +344,6 @@ static const char *const rqf_name[] = {
>  	RQF_NAME(STATS),
>  	RQF_NAME(SPECIAL_PAYLOAD),
>  	RQF_NAME(ZONE_WRITE_LOCKED),
> -	RQF_NAME(MQ_TIMEOUT_EXPIRED),
>  	RQF_NAME(MQ_POLL_SLEPT),
>  };
>  #undef RQF_NAME
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 66e5c768803f..4858876fd364 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -589,56 +589,6 @@ static void __blk_mq_complete_request(struct request *rq)
>  	put_cpu();
>  }
>  
> -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> -	__releases(hctx->srcu)
> -{
> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> -		rcu_read_unlock();
> -	else
> -		srcu_read_unlock(hctx->srcu, srcu_idx);
> -}
> -
> -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> -	__acquires(hctx->srcu)
> -{
> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> -		/* shut up gcc false positive */
> -		*srcu_idx = 0;
> -		rcu_read_lock();
> -	} else
> -		*srcu_idx = srcu_read_lock(hctx->srcu);
> -}
> -
> -static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
> -{
> -	unsigned long flags;
> -
> -	/*
> -	 * blk_mq_rq_aborted_gstate() is used from the completion path and
> -	 * can thus be called from irq context.  u64_stats_fetch in the
> -	 * middle of update on the same CPU leads to lockup.  Disable irq
> -	 * while updating.
> -	 */
> -	local_irq_save(flags);
> -	u64_stats_update_begin(&rq->aborted_gstate_sync);
> -	rq->aborted_gstate = gstate;
> -	u64_stats_update_end(&rq->aborted_gstate_sync);
> -	local_irq_restore(flags);
> -}
> -
> -static u64 blk_mq_rq_aborted_gstate(struct request *rq)
> -{
> -	unsigned int start;
> -	u64 aborted_gstate;
> -
> -	do {
> -		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
> -		aborted_gstate = rq->aborted_gstate;
> -	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
> -
> -	return aborted_gstate;
> -}
> -
>  /**
>   * blk_mq_complete_request - end I/O on a request
>   * @rq:		the request being processed
> @@ -650,27 +600,10 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
>  void blk_mq_complete_request(struct request *rq)
>  {
>  	struct request_queue *q = rq->q;
> -	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
> -	int srcu_idx;
>  
>  	if (unlikely(blk_should_fake_timeout(q)))
>  		return;
> -
> -	/*
> -	 * If @rq->aborted_gstate equals the current instance, timeout is
> -	 * claiming @rq and we lost.  This is synchronized through
> -	 * hctx_lock().  See blk_mq_timeout_work() for details.
> -	 *
> -	 * Completion path never blocks and we can directly use RCU here
> -	 * instead of hctx_lock() which can be either RCU or SRCU.
> -	 * However, that would complicate paths which want to synchronize
> -	 * against us.  Let stay in sync with the issue path so that
> -	 * hctx_lock() covers both issue and completion paths.
> -	 */
> -	hctx_lock(hctx, &srcu_idx);
> -	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
> -		__blk_mq_complete_request(rq);
> -	hctx_unlock(hctx, srcu_idx);
> +	__blk_mq_complete_request(rq);
>  }
>  EXPORT_SYMBOL(blk_mq_complete_request);
>  
> @@ -699,26 +632,9 @@ void blk_mq_start_request(struct request *rq)
>  
>  	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
>  
> -	/*
> -	 * Mark @rq in-flight which also advances the generation number,
> -	 * and register for timeout.  Protect with a seqcount to allow the
> -	 * timeout path to read both @rq->gstate and @rq->deadline
> -	 * coherently.
> -	 *
> -	 * This is the only place where a request is marked in-flight.  If
> -	 * the timeout path reads an in-flight @rq->gstate, the
> -	 * @rq->deadline it reads together under @rq->gstate_seq is
> -	 * guaranteed to be the matching one.
> -	 */
> -	preempt_disable();
> -	write_seqcount_begin(&rq->gstate_seq);
> -
>  	blk_add_timer(rq);
>  	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
>  
> -	write_seqcount_end(&rq->gstate_seq);
> -	preempt_enable();
> -
>  	if (q->dma_drain_size && blk_rq_bytes(rq)) {
>  		/*
>  		 * Make sure space for the drain appears.  We know we can do
> @@ -730,11 +646,6 @@ void blk_mq_start_request(struct request *rq)
>  }
>  EXPORT_SYMBOL(blk_mq_start_request);
>  
> -/*
> - * When we reach here because queue is busy, it's safe to change the state
> - * to IDLE without checking @rq->aborted_gstate because we should still be
> - * holding the RCU read lock and thus protected against timeout.
> - */
>  static void __blk_mq_requeue_request(struct request *rq)
>  {
>  	struct request_queue *q = rq->q;
> @@ -843,33 +754,24 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
>  }
>  EXPORT_SYMBOL(blk_mq_tag_to_rq);
>  
> -struct blk_mq_timeout_data {
> -	unsigned long next;
> -	unsigned int next_set;
> -	unsigned int nr_expired;
> -};
> -
>  static void blk_mq_rq_timed_out(struct request *req, bool reserved)
>  {
>  	const struct blk_mq_ops *ops = req->q->mq_ops;
>  	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
>  
> -	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
> -
>  	if (ops->timeout)
>  		ret = ops->timeout(req, reserved);
>  
>  	switch (ret) {
>  	case BLK_EH_HANDLED:
> -		__blk_mq_complete_request(req);
> -		break;
> -	case BLK_EH_RESET_TIMER:
>  		/*
> -		 * As nothing prevents from completion happening while
> -		 * ->aborted_gstate is set, this may lead to ignored
> -		 * completions and further spurious timeouts.
> +		 * If the request is still in flight, the driver is requesting
> +		 * blk-mq complete it.
>  		 */
> -		blk_mq_rq_update_aborted_gstate(req, 0);
> +		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
> +			__blk_mq_complete_request(req);
> +		break;
> +	case BLK_EH_RESET_TIMER:
>  		blk_add_timer(req);
>  		break;
>  	case BLK_EH_NOT_HANDLED:
> @@ -880,64 +782,64 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
>  	}
>  }
>  
> -static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
> -		struct request *rq, void *priv, bool reserved)
> +static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
>  {
> -	struct blk_mq_timeout_data *data = priv;
> -	unsigned long gstate, deadline;
> -	int start;
> +	unsigned long deadline;
>  
> -	might_sleep();
> -
> -	if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
> -		return;
> +	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
> +		return false;
>  
> -	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
> -	while (true) {
> -		start = read_seqcount_begin(&rq->gstate_seq);
> -		gstate = READ_ONCE(rq->gstate);
> -		deadline = blk_rq_deadline(rq);
> -		if (!read_seqcount_retry(&rq->gstate_seq, start))
> -			break;
> -		cond_resched();
> -	}
> +	deadline = blk_rq_deadline(rq);
> +	if (time_after_eq(jiffies, deadline))
> +		return true;
>  
> -	/* if in-flight && overdue, mark for abortion */
> -	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
> -	    time_after_eq(jiffies, deadline)) {
> -		blk_mq_rq_update_aborted_gstate(rq, gstate);
> -		data->nr_expired++;
> -		hctx->nr_expired++;
> -	} else if (!data->next_set || time_after(data->next, deadline)) {
> -		data->next = deadline;
> -		data->next_set = 1;
> -	}
> +	if (*next == 0)
> +		*next = deadline;
> +	else if (time_after(*next, deadline))
> +		*next = deadline;
> +	return false;
>  }
>  
> -static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
> +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
>  		struct request *rq, void *priv, bool reserved)
>  {
> +	unsigned long *next = priv;
> +
>  	/*
> -	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
> -	 * completions that we lost to, they would have finished and
> -	 * updated @rq->gstate by now; otherwise, the completion path is
> -	 * now guaranteed to see @rq->aborted_gstate and yield.  If
> -	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
> +	 * Just do a quick check if it is expired before locking the request in
> +	 * so we're not unnecessarilly synchronizing across CPUs.
>  	 */
> -	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
> -	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
> +	if (!blk_mq_req_expired(rq, next))
> +		return;
> +
> +	/*
> +	 * We have reason to believe the request may be expired. Take a
> +	 * reference on the request to lock this request lifetime into its
> +	 * currently allocated context to prevent it from being reallocated in
> +	 * the event the completion by-passes this timeout handler.
> +	 * 
> +	 * If the reference was already released, then the driver beat the
> +	 * timeout handler to posting a natural completion.
> +	 */
> +	if (!kref_get_unless_zero(&rq->ref))
> +		return;

If this request has just been completed in the normal path but its state
isn't updated yet, the timeout handler will hold the request and may
complete it again, so the request can be completed twice.

Thanks,
Ming
Jens Axboe May 22, 2018, 3:16 a.m. UTC | #3
On 5/21/18 8:49 PM, Ming Lei wrote:
> On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
>> This patch simplifies the timeout handling by relying on the request
>> reference counting to ensure the iterator is operating on an inflight
> 
> The reference counting isn't free, what is the real benefit in this way?

Neither is the current scheme and locking, and this is a hell of a lot
simpler. I'd get rid of the kref stuff and just do a simple
atomic_dec_and_test(). Most of the time we should be uncontended on
that, which would make it pretty darn cheap. I'd be surprised if it
wasn't better than the current alternatives.
Ming Lei May 22, 2018, 3:47 a.m. UTC | #4
On Mon, May 21, 2018 at 09:16:33PM -0600, Jens Axboe wrote:
> On 5/21/18 8:49 PM, Ming Lei wrote:
> > On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
> >> This patch simplifies the timeout handling by relying on the request
> >> reference counting to ensure the iterator is operating on an inflight
> > 
> > The reference counting isn't free, what is the real benefit in this way?
> 
> Neither is the current scheme and locking, and this is a hell of a lot
> simpler. I'd get rid of the kref stuff and just do a simple
> atomic_dec_and_test(). Most of the time we should be uncontended on
> that, which would make it pretty darn cheap. I'd be surprised if it
> wasn't better than the current alternatives.

The explicit memory barriers implied by atomic_dec_and_test() aren't free.

Also the double completion issue needs to be fixed before discussing
this approach further.


Thanks,
Ming
Jens Axboe May 22, 2018, 3:51 a.m. UTC | #5
On May 21, 2018, at 9:47 PM, Ming Lei <ming.lei@redhat.com> wrote:
> 
>> On Mon, May 21, 2018 at 09:16:33PM -0600, Jens Axboe wrote:
>>> On 5/21/18 8:49 PM, Ming Lei wrote:
>>>> On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
>>>> This patch simplifies the timeout handling by relying on the request
>>>> reference counting to ensure the iterator is operating on an inflight
>>> 
>>> The reference counting isn't free, what is the real benefit in this way?
>> 
>> Neither is the current scheme and locking, and this is a hell of a lot
>> simpler. I'd get rid of the kref stuff and just do a simple
>> atomic_dec_and_test(). Most of the time we should be uncontended on
>> that, which would make it pretty darn cheap. I'd be surprised if it
>> wasn't better than the current alternatives.
> 
> The explicit memory barriers by atomic_dec_and_test() isn't free.

I’m not saying it’s free. Neither is our current synchronization.

> Also the double completion issue need to be fixed before discussing
> this approach further.

Certainly. Also not saying that the current patch is perfect. But it’s a lot more palatable than the alternatives, hence my interest. And I’d like for this issue to get solved, we seem to be a bit stuck atm. 


I’ll take a closer look tomorrow.
Ming Lei May 22, 2018, 8:51 a.m. UTC | #6
On Mon, May 21, 2018 at 09:51:19PM -0600, Jens Axboe wrote:
> On May 21, 2018, at 9:47 PM, Ming Lei <ming.lei@redhat.com> wrote:
> > 
> >> On Mon, May 21, 2018 at 09:16:33PM -0600, Jens Axboe wrote:
> >>> On 5/21/18 8:49 PM, Ming Lei wrote:
> >>>> On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
> >>>> This patch simplifies the timeout handling by relying on the request
> >>>> reference counting to ensure the iterator is operating on an inflight
> >>> 
> >>> The reference counting isn't free, what is the real benefit in this way?
> >> 
> >> Neither is the current scheme and locking, and this is a hell of a lot
> >> simpler. I'd get rid of the kref stuff and just do a simple
> >> atomic_dec_and_test(). Most of the time we should be uncontended on
> >> that, which would make it pretty darn cheap. I'd be surprised if it
> >> wasn't better than the current alternatives.
> > 
> > The explicit memory barriers by atomic_dec_and_test() isn't free.
> 
> I’m not saying it’s free. Neither is our current synchronization.
> 
> > Also the double completion issue need to be fixed before discussing
> > this approach further.
> 
> Certainly. Also not saying that the current patch is perfect. But it’s a lot more palatable than the alternatives, hence my interest. And I’d like for this issue to get solved, we seem to be a bit stuck atm. 
> 

It may not be something we are stuck at, and there seem to be no
alternatives to this patchset.

It is a new requirement from NVMe: Keith wants the driver to complete the
timed-out request in .timeout(). We have never supported that before in
either the mq or non-mq code path.

Thanks,
Ming
Keith Busch May 22, 2018, 2:15 p.m. UTC | #7
On Mon, May 21, 2018 at 11:29:06PM +0000, Bart Van Assche wrote:
> On Mon, 2018-05-21 at 17:11 -0600, Keith Busch wrote:
> >  	switch (ret) {
> >  	case BLK_EH_HANDLED:
> > -		__blk_mq_complete_request(req);
> > -		break;
> > -	case BLK_EH_RESET_TIMER:
> >  		/*
> > -		 * As nothing prevents from completion happening while
> > -		 * ->aborted_gstate is set, this may lead to ignored
> > -		 * completions and further spurious timeouts.
> > +		 * If the request is still in flight, the driver is requesting
> > +		 * blk-mq complete it.
> >  		 */
> > -		blk_mq_rq_update_aborted_gstate(req, 0);
> > +		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
> > +			__blk_mq_complete_request(req);
> > +		break;
> > +	case BLK_EH_RESET_TIMER:
> >  		blk_add_timer(req);
> >  		break;
> >  	case BLK_EH_NOT_HANDLED:
> > @@ -880,64 +782,64 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
> >  	}
> >  }
> 
> I think the above changes can lead to concurrent calls of
> __blk_mq_complete_request() from the regular completion path and the timeout
> path. That's wrong: the __blk_mq_complete_request() caller should guarantee
> that no concurrent calls from another context to that function can occur.

Hi Bart,

This shouldn't be introducing any new concurrent calls to
__blk_mq_complete_request if they don't already exist. The timeout work
calls it only if the driver's timeout returns BLK_EH_HANDLED, which means
the driver is claiming the command is now done, but the driver didn't call
blk_mq_complete_request as indicated by the request's IN_FLIGHT state.

In order to get a second call to __blk_mq_complete_request(), then,
the driver would have to end up calling blk_mq_complete_request()
in another context. But there's nothing stopping that from happening
today, and would be bad if any driver actually did that: the request
may have been re-allocated and issued as a new command, and calling
blk_mq_complete_request() the second time introduces data corruption.

Thanks,
Keith
Keith Busch May 22, 2018, 2:20 p.m. UTC | #8
On Tue, May 22, 2018 at 10:49:11AM +0800, Ming Lei wrote:
> On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
> > -static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
> > +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
> >  		struct request *rq, void *priv, bool reserved)
> >  {
> > +	unsigned long *next = priv;
> > +
> >  	/*
> > -	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
> > -	 * completions that we lost to, they would have finished and
> > -	 * updated @rq->gstate by now; otherwise, the completion path is
> > -	 * now guaranteed to see @rq->aborted_gstate and yield.  If
> > -	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
> > +	 * Just do a quick check if it is expired before locking the request in
> > +	 * so we're not unnecessarilly synchronizing across CPUs.
> >  	 */
> > -	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
> > -	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
> > +	if (!blk_mq_req_expired(rq, next))
> > +		return;
> > +
> > +	/*
> > +	 * We have reason to believe the request may be expired. Take a
> > +	 * reference on the request to lock this request lifetime into its
> > +	 * currently allocated context to prevent it from being reallocated in
> > +	 * the event the completion by-passes this timeout handler.
> > +	 * 
> > +	 * If the reference was already released, then the driver beat the
> > +	 * timeout handler to posting a natural completion.
> > +	 */
> > +	if (!kref_get_unless_zero(&rq->ref))
> > +		return;
> 
> If this request is just completed in normal path and its state isn't
> updated yet, timeout will hold the request, and may complete this
> request again, then this req can be completed two times.

Hi Ming,

In the event the driver requests a normal completion, the timeout work
releasing the last reference doesn't do a second completion: it only
releases the request's tag back for re-allocation.

Thanks,
Keith
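
A minimal sketch of the release behaviour being described here, with
hypothetical helper names (only blk_mq_put_request() appears in the patch
hunks quoted in this thread; the actual tag-release path is not shown):

/*
 * Hypothetical sketch: dropping the last reference only returns the tag
 * for reallocation, it does not run the completion path a second time.
 */
static void blk_mq_rq_release(struct kref *ref)
{
	struct request *rq = container_of(ref, struct request, ref);

	/* no bio completion here - just make the tag allocatable again */
	blk_mq_release_tag(rq);		/* placeholder for the real tag-release helper */
}

static void blk_mq_put_request(struct request *rq)
{
	kref_put(&rq->ref, blk_mq_rq_release);
}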
Jens Axboe May 22, 2018, 2:35 p.m. UTC | #9
On 5/22/18 2:51 AM, Ming Lei wrote:
> On Mon, May 21, 2018 at 09:51:19PM -0600, Jens Axboe wrote:
>> On May 21, 2018, at 9:47 PM, Ming Lei <ming.lei@redhat.com> wrote:
>>>
>>>> On Mon, May 21, 2018 at 09:16:33PM -0600, Jens Axboe wrote:
>>>>> On 5/21/18 8:49 PM, Ming Lei wrote:
>>>>>> On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
>>>>>> This patch simplifies the timeout handling by relying on the request
>>>>>> reference counting to ensure the iterator is operating on an inflight
>>>>>
>>>>> The reference counting isn't free, what is the real benefit in this way?
>>>>
>>>> Neither is the current scheme and locking, and this is a hell of a lot
>>>> simpler. I'd get rid of the kref stuff and just do a simple
>>>> atomic_dec_and_test(). Most of the time we should be uncontended on
>>>> that, which would make it pretty darn cheap. I'd be surprised if it
>>>> wasn't better than the current alternatives.
>>>
>>> The explicit memory barriers by atomic_dec_and_test() isn't free.
>>
>> I’m not saying it’s free. Neither is our current synchronization.
>>
>>> Also the double completion issue need to be fixed before discussing
>>> this approach further.
>>
>> Certainly. Also not saying that the current patch is perfect. But
>> it’s a lot more palatable than the alternatives, hence my interest.
>> And I’d like for this issue to get solved, we seem to be a bit stuck
>> atm. 
>>
> 
> It may not be something we are stuck at, and seems no alternatives for
> this patchset.
> 
> It is a new requirement from NVMe, and Keith wants driver to complete
> timed-out request in .timeout(). We never support that before for both
> mq and non-mq code path.

No, that's not what he wants to do. He wants to use referencing to
release the final resources of the request (the tag), to prevent
premature reuse of the request.
Ming Lei May 22, 2018, 2:37 p.m. UTC | #10
On Tue, May 22, 2018 at 10:20 PM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Tue, May 22, 2018 at 10:49:11AM +0800, Ming Lei wrote:
>> On Mon, May 21, 2018 at 05:11:31PM -0600, Keith Busch wrote:
>> > -static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
>> > +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
>> >             struct request *rq, void *priv, bool reserved)
>> >  {
>> > +   unsigned long *next = priv;
>> > +
>> >     /*
>> > -    * We marked @rq->aborted_gstate and waited for RCU.  If there were
>> > -    * completions that we lost to, they would have finished and
>> > -    * updated @rq->gstate by now; otherwise, the completion path is
>> > -    * now guaranteed to see @rq->aborted_gstate and yield.  If
>> > -    * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
>> > +    * Just do a quick check if it is expired before locking the request in
>> > +    * so we're not unnecessarilly synchronizing across CPUs.
>> >      */
>> > -   if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
>> > -       READ_ONCE(rq->gstate) == rq->aborted_gstate)
>> > +   if (!blk_mq_req_expired(rq, next))
>> > +           return;
>> > +
>> > +   /*
>> > +    * We have reason to believe the request may be expired. Take a
>> > +    * reference on the request to lock this request lifetime into its
>> > +    * currently allocated context to prevent it from being reallocated in
>> > +    * the event the completion by-passes this timeout handler.
>> > +    *
>> > +    * If the reference was already released, then the driver beat the
>> > +    * timeout handler to posting a natural completion.
>> > +    */
>> > +   if (!kref_get_unless_zero(&rq->ref))
>> > +           return;
>>
>> If this request is just completed in normal path and its state isn't
>> updated yet, timeout will hold the request, and may complete this
>> request again, then this req can be completed two times.
>
> Hi Ming,
>
> In the event the driver requests a normal completion, the timeout work
> releasing the last reference doesn't do a second completion: it only

The reference only covers the request's lifetime; it is not related to completion.

It isn't the last reference. When the driver returns EH_HANDLED, blk-mq
will complete this request one extra time.

Yes, if the driver's timeout code and normal completion code can sync
on this completion, that should be fine, but the current behaviour
doesn't depend on the driver's sync, since the req is always completed
atomically in the following way:

1) timeout

if (mark_completed(rq))
   timed_out(rq)

2) normal completion
if (mark_completed(rq))
    complete(rq)

For example, before nvme_timeout() is trying to run nvme_dev_disable(),
irq comes and this req is completed from normal completion path, but
nvme_timeout() still returns EH_HANDLED, and blk-mq may complete
the req one extra time since the normal completion path may not update
req's state yet.

Thanks,
Ming Lei
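
For reference, the "completed atomically" behaviour described above is the
old REQ_ATOM_COMPLETE pattern (pre-dating Tejun's generation rework; jianchao
quotes its callers later in this thread). Roughly:

/*
 * test_and_set_bit() makes completion winner-takes-all, so a request is
 * completed exactly once regardless of which path gets there first.
 */
static inline int blk_mark_rq_complete(struct request *rq)
{
	return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}

/* normal completion path */
void blk_mq_complete_request(struct request *rq)
{
	if (!blk_mark_rq_complete(rq))
		__blk_mq_complete_request(rq);
}

/* timeout path */
static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	if (time_after_eq(jiffies, rq->deadline) &&
	    !blk_mark_rq_complete(rq))
		blk_mq_rq_timed_out(rq, reserved);
}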
Keith Busch May 22, 2018, 2:46 p.m. UTC | #11
On Tue, May 22, 2018 at 10:37:32PM +0800, Ming Lei wrote:
> On Tue, May 22, 2018 at 10:20 PM, Keith Busch
> <keith.busch@linux.intel.com> wrote:
> > In the event the driver requests a normal completion, the timeout work
> > releasing the last reference doesn't do a second completion: it only
> 
> The reference only covers request's lifetime, not related with completion.
> 
> It isn't the last reference. When driver returns EH_HANDLED, blk-mq
> will complete this request on extra time.
> 
> Yes, if driver's timeout code and normal completion code can sync
> about this completion, that should be fine, but the current behaviour
> doesn't depend driver's sync since the req is always completed atomically
> via the following way:
> 
> 1) timeout
> 
> if (mark_completed(rq))
>    timed_out(rq)
> 
> 2) normal completion
> if (mark_completed(rq))
>     complete(rq)
> 
> For example, before nvme_timeout() is trying to run nvme_dev_disable(),
> irq comes and this req is completed from normal completion path, but
> nvme_timeout() still returns EH_HANDLED, and blk-mq may complete
> the req one extra time since the normal completion path may not update
> req's state yet.

nvme_dev_disable tears down irq's, meaning their handling is already
sync'ed before nvme_dev_disable can proceed. Whether the completion
comes through nvme_irq, or through nvme_dev_disable, there is no way
possible for nvme's timeout to return EH_HANDLED if the state was not
updated prior to returning that status.
Ming Lei May 22, 2018, 2:57 p.m. UTC | #12
On Tue, May 22, 2018 at 10:46 PM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Tue, May 22, 2018 at 10:37:32PM +0800, Ming Lei wrote:
>> On Tue, May 22, 2018 at 10:20 PM, Keith Busch
>> <keith.busch@linux.intel.com> wrote:
>> > In the event the driver requests a normal completion, the timeout work
>> > releasing the last reference doesn't do a second completion: it only
>>
>> The reference only covers request's lifetime, not related with completion.
>>
>> It isn't the last reference. When driver returns EH_HANDLED, blk-mq
>> will complete this request on extra time.
>>
>> Yes, if driver's timeout code and normal completion code can sync
>> about this completion, that should be fine, but the current behaviour
>> doesn't depend driver's sync since the req is always completed atomically
>> via the following way:
>>
>> 1) timeout
>>
>> if (mark_completed(rq))
>>    timed_out(rq)
>>
>> 2) normal completion
>> if (mark_completed(rq))
>>     complete(rq)
>>
>> For example, before nvme_timeout() is trying to run nvme_dev_disable(),
>> irq comes and this req is completed from normal completion path, but
>> nvme_timeout() still returns EH_HANDLED, and blk-mq may complete
>> the req one extra time since the normal completion path may not update
>> req's state yet.
>
> nvme_dev_disable tears down irq's, meaing their handling is already
> sync'ed before nvme_dev_disable can proceed. Whether the completion
> comes through nvme_irq, or through nvme_dev_disable, there is no way
> possible for nvme's timeout to return EH_HANDLED if the state was not
> updated prior to returning that status.

OK, but that still depends on the driver's behaviour. Even though it is true
for NVMe, you still have to audit other drivers for this sync,
because it is required by your patch.


thanks,
Ming Lei
Keith Busch May 22, 2018, 3:01 p.m. UTC | #13
On Tue, May 22, 2018 at 10:57:40PM +0800, Ming Lei wrote:
> OK, that still depends on driver's behaviour, even though it is true
> for NVMe,  you still have to audit other drivers about this sync
> because it is required by your patch.

Okay, forget about nvme for a moment here. Please run this thought
experiment to decide if what you're saying is even plausible for any
block driver with the existing implementation:

Your scenario has a driver return EH_HANDLED for a timed out request. The
timeout work then completes the request. The request may then be
reallocated for a new command and dispatched.

At approximately the same time, you're saying the driver that returned
EH_HANDLED may then call blk_mq_complete_request() in reference to the
*old* request that it returned EH_HANDLED for, incorrectly completing
the new request before it is done. That will inevitably lead to data
corruption. Is that happening today in any driver?
Ming Lei May 22, 2018, 3:07 p.m. UTC | #14
On Tue, May 22, 2018 at 11:01 PM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Tue, May 22, 2018 at 10:57:40PM +0800, Ming Lei wrote:
>> OK, that still depends on driver's behaviour, even though it is true
>> for NVMe,  you still have to audit other drivers about this sync
>> because it is required by your patch.
>
> Okay, forget about nvme for a moment here. Please run this thought
> experiment to decide if what you're saying is even plausible for any
> block driver with the existing implementation:
>
> Your scenario has a driver return EH_HANDLED for a timed out request. The
> timeout work then completes the request. The request may then be
> reallocated for a new command and dispatched.

Yes.

>
> At approximately the same time, you're saying the driver that returned
> EH_HANDLED may then call blk_mq_complete_request() in reference to the
> *old* request that it returned EH_HANDLED for, incorrectly completing

Because this request has been completed above by blk-mq timeout,
then this old request won't be completed any more via blk_mq_complete_request()
either from normal path or what ever, such as cancel.

> the new request before it is done. That will inevitably lead to data
> corruption. Is that happening today in any driver?

There is no such issue, since the current blk-mq makes sure one req is only
completed once, but your patch changes that to depend on the driver to make sure of it.


thanks,
Ming Lei
Keith Busch May 22, 2018, 3:17 p.m. UTC | #15
On Tue, May 22, 2018 at 11:07:07PM +0800, Ming Lei wrote:
> > At approximately the same time, you're saying the driver that returned
> > EH_HANDLED may then call blk_mq_complete_request() in reference to the
> > *old* request that it returned EH_HANDLED for, incorrectly completing
> 
> Because this request has been completed above by blk-mq timeout,
> then this old request won't be completed any more via blk_mq_complete_request()
> either from normal path or what ever, such as cancel.
 
> > the new request before it is done. That will inevitably lead to data
> > corruption. Is that happening today in any driver?
> 
> No such issue since current blk-mq makes sure one req is only completed
> once, but your patch changes to depend on driver to make sure that.

The blk-mq timeout complete makes the request available for allocation
as a new command, at which point blk_mq_complete_request can be called
again. If a driver is somehow relying on blk-mq to prevent a double
completion for a previously completed request context, they're already
in a lot of trouble.
Ming Lei May 22, 2018, 3:23 p.m. UTC | #16
On Tue, May 22, 2018 at 11:17 PM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Tue, May 22, 2018 at 11:07:07PM +0800, Ming Lei wrote:
>> > At approximately the same time, you're saying the driver that returned
>> > EH_HANDLED may then call blk_mq_complete_request() in reference to the
>> > *old* request that it returned EH_HANDLED for, incorrectly completing
>>
>> Because this request has been completed above by blk-mq timeout,
>> then this old request won't be completed any more via blk_mq_complete_request()
>> either from normal path or what ever, such as cancel.
>
>> > the new request before it is done. That will inevitably lead to data
>> > corruption. Is that happening today in any driver?
>>
>> No such issue since current blk-mq makes sure one req is only completed
>> once, but your patch changes to depend on driver to make sure that.
>
> The blk-mq timeout complete makes the request available for allocation
> as a new command, at which point blk_mq_complete_request can be called
> again. If a driver is somehow relying on blk-mq to prevent a double
> completion for a previously completed request context, they're already
> in a lot of trouble.

Yes, previously there was the atomic flag REQ_ATOM_COMPLETE covering the
atomic completion, and recently Tejun changed it to the aborted state with a
generation counter, but both provide a form of atomic completion.

So even though it is much simplified by using the request refcount, the atomic
completion should still be provided by blk-mq, or drivers have to be audited
to avoid double completion.

Thanks,
Ming Lei
Christoph Hellwig May 22, 2018, 4:17 p.m. UTC | #17
Hi Keith,

I like this series a lot.  One comment that is probably close
to the big discussion in the thread:

>  	switch (ret) {
>  	case BLK_EH_HANDLED:
>  		/*
> +		 * If the request is still in flight, the driver is requesting
> +		 * blk-mq complete it.
>  		 */
> +		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
> +			__blk_mq_complete_request(req);
> +		break;

The state check here really irked me, and from the thread it seems like
I'm not the only one.  At least for the NVMe case I think it is perfectly
safe, although I agree I'd rather audit what other drivers do carefully.

That being said I think BLK_EH_HANDLED seems like a fundamentally broken
idea, and I'd actually prefer to get rid of it over adding things like
the MQ_RQ_IN_FLIGHT check above.

E.g. if we look at the cases where nvme-pci returns it:

 - if we did call nvme_dev_disable, we already canceled all requests,
   so we might as well just return BLK_EH_NOT_HANDLED
 - the poll for completion case already completed the command,
   so we should return BLK_EH_NOT_HANDLED

So I think we need to fix up nvme and if needed any other driver
to return the right value and then assert that the request is
still in in-flight status for the BLK_EH_HANDLED case.

> @@ -124,16 +119,7 @@ static inline int blk_mq_rq_state(struct request *rq)
>  static inline void blk_mq_rq_update_state(struct request *rq,
>  					  enum mq_rq_state state)
>  {
> +	WRITE_ONCE(rq->state, state);
>  }

I think this helper can go away now.  But we should have a comment
near the state field documenting the concurrency implications.



> +	u64 state;

This should probably be a mq_rq_state instead.  Which means it needs
to be moved to blkdev.h, but that should be ok.
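
A sketch of the direction suggested here (the reader below already exists in
blk-mq.h; the comment wording and exact call-site form are assumptions, not
part of the posted patch):

/*
 * The state becomes a plainly typed field in struct request:
 *
 *	enum mq_rq_state state;
 *
 * documented with its concurrency rules: the issue path moves
 * IDLE -> IN_FLIGHT, the completion path moves IN_FLIGHT -> COMPLETE,
 * and the timeout iterator only ever reads it, so all accesses go
 * through READ_ONCE()/WRITE_ONCE().
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

/*
 * ... and writers open-code the transition instead of calling
 * blk_mq_rq_update_state(), e.g. in blk_mq_start_request():
 *
 *	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
 */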
Bart Van Assche May 22, 2018, 4:29 p.m. UTC | #18
On Tue, 2018-05-22 at 08:15 -0600, Keith Busch wrote:
> This shouldn't be introducing any new concorrent calls to
> __blk_mq_complete_request if they don't already exist. The timeout work
> calls it only if the driver's timeout returns BLK_EH_HANDLED, which means
> the driver is claiming the command is now done, but the driver didn't call
> blk_mq_complete_request as indicated by the request's IN_FLIGHT state.
> 
> In order to get a second call to __blk_mq_complete_request(), then,
> the driver would have to end up calling blk_mq_complete_request()
> in another context. But there's nothing stopping that from happening
> today, and would be bad if any driver actually did that: the request
> may have been re-allocated and issued as a new command, and calling
> blk_mq_complete_request() the second time introduces data corruption.

Hello Keith,

Please have another look at the current code that handles request timeouts
and completions. The current implementation guarantees that no double
completions can occur but your patch removes essential aspects of that
implementation.

Thanks,

Bart.
Keith Busch May 22, 2018, 4:34 p.m. UTC | #19
On Tue, May 22, 2018 at 04:29:07PM +0000, Bart Van Assche wrote:
> Please have another look at the current code that handles request timeouts
> and completions. The current implementation guarantees that no double
> completions can occur but your patch removes essential aspects of that
> implementation.

How does the current implementation guarantee a double completion doesn't
happen when the request is allocated for a new command?
Bart Van Assche May 22, 2018, 4:48 p.m. UTC | #20
On Tue, 2018-05-22 at 10:34 -0600, Keith Busch wrote:
> On Tue, May 22, 2018 at 04:29:07PM +0000, Bart Van Assche wrote:
> > Please have another look at the current code that handles request timeouts
> > and completions. The current implementation guarantees that no double
> > completions can occur but your patch removes essential aspects of that
> > implementation.
> 
> How does the current implementation guarantee a double completion doesn't
> happen when the request is allocated for a new command?

Hello Keith,

If a request is completes and is reused after the timeout handler has set
aborted_gstate and before blk_mq_terminate_expired() is called then the latter
function will skip the request because restarting a request causes the
generation number in rq->gstate to be incremented. From blk_mq_rq_update_state():

	if (state == MQ_RQ_IN_FLIGHT) {
		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
		new_val += MQ_RQ_GEN_INC;
	}

Bart.
Ming Lei May 23, 2018, 12:34 a.m. UTC | #21
On Tue, May 22, 2018 at 06:17:04PM +0200, Christoph Hellwig wrote:
> Hi Keith,
> 
> I like this series a lot.  One comment that is probably close
> to the big discussion in the thread:
> 
> >  	switch (ret) {
> >  	case BLK_EH_HANDLED:
> >  		/*
> > +		 * If the request is still in flight, the driver is requesting
> > +		 * blk-mq complete it.
> >  		 */
> > +		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
> > +			__blk_mq_complete_request(req);
> > +		break;
> 
> The state check here really irked me, and from the thread it seems like
> I'm not the only one.  At least for the NVMe case I think it is perfectly
> safe, although I agree I'd rather audit what other drivers do carefully.

Let's consider the normal NVMe timeout code path:

1) one request is timed out;

2) controller is shutdown, this timed-out request is requeued from
nvme_cancel_request(), but can't dispatch because queues are quiesced

3) reset is done from another context, and this request is dispatched
again, and completed exactly before returning EH_HANDLED to blk-mq, but
its state isn't updated to COMPLETE yet.

4) then double completions are done from both normal completion and timeout
path.

It seems the same issue exists on the poll path.

Thanks,
Ming
Hannes Reinecke May 23, 2018, 5:48 a.m. UTC | #22
On 05/22/2018 06:17 PM, Christoph Hellwig wrote:
> Hi Keith,
> 
> I like this series a lot.  One comment that is probably close
> to the big discussion in the thread:
> 
>>   	switch (ret) {
>>   	case BLK_EH_HANDLED:
>>   		/*
>> +		 * If the request is still in flight, the driver is requesting
>> +		 * blk-mq complete it.
>>   		 */
>> +		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
>> +			__blk_mq_complete_request(req);
>> +		break;
> 
> The state check here really irked me, and from the thread it seems like
> I'm not the only one.  At least for the NVMe case I think it is perfectly
> safe, although I agree I'd rather audit what other drivers do carefully.
> 
> That being said I think BLK_EH_HANDLED seems like a fundamentally broken
> idea, and I'd actually prefer to get rid of it over adding things like
> the MQ_RQ_IN_FLIGHT check above.
> 
I can't agree more here.

BLK_EH_HANDLED is breaking all sorts of assumptions, and I'd be glad to 
see it go.

> E.g. if we look at the cases where nvme-pci returns it:
> 
>   - if we did call nvme_dev_disable, we already canceled all requests,
>     so we might as well just return BLK_EH_NOT_HANDLED
>   - the poll for completion case already completed the command,
>     so we should return BLK_EH_NOT_HANDLED
> 
> So I think we need to fix up nvme and if needed any other driver
> to return the right value and then assert that the request is
> still in in-flight status for the BLK_EH_HANDLED case.
> 
... and then kill BLK_EH_HANDLED :-)

Cheers,

Hannes
Keith Busch May 23, 2018, 2:35 p.m. UTC | #23
On Wed, May 23, 2018 at 08:34:48AM +0800, Ming Lei wrote:
> Let's consider the normal NVMe timeout code path:
> 
> 1) one request is timed out;
> 
> 2) controller is shutdown, this timed-out request is requeued from
> nvme_cancel_request(), but can't dispatch because queues are quiesced
> 
> 3) reset is done from another context, and this request is dispatched
> again, and completed exactly before returning EH_HANDLED to blk-mq, but
> its state isn't updated to COMPLETE yet.
> 
> 4) then double completions are done from both normal completion and timeout
> path.

We're definitely fixing this, but I must admit that's an impressive
cognitive traversal across 5 thread contexts to arrive at that race. :)
Ming Lei May 24, 2018, 1:52 a.m. UTC | #24
On Wed, May 23, 2018 at 08:35:40AM -0600, Keith Busch wrote:
> On Wed, May 23, 2018 at 08:34:48AM +0800, Ming Lei wrote:
> > Let's consider the normal NVMe timeout code path:
> > 
> > 1) one request is timed out;
> > 
> > 2) controller is shutdown, this timed-out request is requeued from
> > nvme_cancel_request(), but can't dispatch because queues are quiesced
> > 
> > 3) reset is done from another context, and this request is dispatched
> > again, and completed exactly before returning EH_HANDLED to blk-mq, but
> > its state isn't updated to COMPLETE yet.
> > 
> > 4) then double completions are done from both normal completion and timeout
> > path.
> 
> We're definitely fixing this, but I must admit that's an impressive
> cognitive traversal across 5 thread contexts to arrive at that race. :)

It can be only 2 thread contexts if the requeue is done on the polled request
from nvme_timeout(). :-)

Thanks,
Ming
Bart Van Assche July 12, 2018, 6:16 p.m. UTC | #25
On Mon, 2018-05-21 at 17:11 -0600, Keith Busch wrote:
>  	/*
> -	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
> -	 * completions that we lost to, they would have finished and
> -	 * updated @rq->gstate by now; otherwise, the completion path is
> -	 * now guaranteed to see @rq->aborted_gstate and yield.  If
> -	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
> +	 * Just do a quick check if it is expired before locking the request in
> +	 * so we're not unnecessarilly synchronizing across CPUs.
>  	 */
> -	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
> -	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
> +	if (!blk_mq_req_expired(rq, next))
> +		return;
> +
> +	/*
> +	 * We have reason to believe the request may be expired. Take a
> +	 * reference on the request to lock this request lifetime into its
> +	 * currently allocated context to prevent it from being reallocated in
> +	 * the event the completion by-passes this timeout handler.
> +	 *
> +	 * If the reference was already released, then the driver beat the
> +	 * timeout handler to posting a natural completion.
> +	 */
> +	if (!kref_get_unless_zero(&rq->ref))
> +		return;
> +
> +	/*
> +	 * The request is now locked and cannot be reallocated underneath the
> +	 * timeout handler's processing. Re-verify this exact request is truly
> +	 * expired; if it is not expired, then the request was completed and
> +	 * reallocated as a new request.
> +	 */
> +	if (blk_mq_req_expired(rq, next))
>  		blk_mq_rq_timed_out(rq, reserved);
> +	blk_mq_put_request(rq);
>  }

Hello Keith and Christoph,

What prevents that a request finishes and gets reused after the
blk_mq_req_expired() call has finished and before kref_get_unless_zero() is
called? Is this perhaps a race condition that has not yet been triggered by
any existing block layer test? Please note that there is no such race
condition in the patch I had posted ("blk-mq: Rework blk-mq timeout handling
again" - https://www.spinics.net/lists/linux-block/msg26489.html).

Thanks,

Bart.
Keith Busch July 12, 2018, 7:24 p.m. UTC | #26
On Thu, Jul 12, 2018 at 06:16:12PM +0000, Bart Van Assche wrote:
> On Mon, 2018-05-21 at 17:11 -0600, Keith Busch wrote:
> >  	/*
> > -	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
> > -	 * completions that we lost to, they would have finished and
> > -	 * updated @rq->gstate by now; otherwise, the completion path is
> > -	 * now guaranteed to see @rq->aborted_gstate and yield.  If
> > -	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
> > +	 * Just do a quick check if it is expired before locking the request in
> > +	 * so we're not unnecessarilly synchronizing across CPUs.
> >  	 */
> > -	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
> > -	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
> > +	if (!blk_mq_req_expired(rq, next))
> > +		return;
> > +
> > +	/*
> > +	 * We have reason to believe the request may be expired. Take a
> > +	 * reference on the request to lock this request lifetime into its
> > +	 * currently allocated context to prevent it from being reallocated in
> > +	 * the event the completion by-passes this timeout handler.
> > +	 * 
> > +	 * If the reference was already released, then the driver beat the
> > +	 * timeout handler to posting a natural completion.
> > +	 */
> > +	if (!kref_get_unless_zero(&rq->ref))
> > +		return;
> > +
> > +	/*
> > +	 * The request is now locked and cannot be reallocated underneath the
> > +	 * timeout handler's processing. Re-verify this exact request is truly
> > +	 * expired; if it is not expired, then the request was completed and
> > +	 * reallocated as a new request.
> > +	 */
> > +	if (blk_mq_req_expired(rq, next))
> >  		blk_mq_rq_timed_out(rq, reserved);
> > +	blk_mq_put_request(rq);
> >  }
> 
> Hello Keith and Christoph,
> 
> What prevents that a request finishes and gets reused after the
> blk_mq_req_expired() call has finished and before kref_get_unless_zero() is
> called? Is this perhaps a race condition that has not yet been triggered by
> any existing block layer test? Please note that there is no such race
> condition in the patch I had posted ("blk-mq: Rework blk-mq timeout handling
> again" - https://www.spinics.net/lists/linux-block/msg26489.html).

I don't think there's any such race in the merged implementation
either. If the request is reallocated, then the kref check may succeed,
but the blk_mq_req_expired() check would surely fail, allowing the
request to proceed as normal. The code comments at least say as much.
Bart Van Assche July 12, 2018, 10:24 p.m. UTC | #27
On Thu, 2018-07-12 at 13:24 -0600, Keith Busch wrote:
> On Thu, Jul 12, 2018 at 06:16:12PM +0000, Bart Van Assche wrote:
> > What prevents that a request finishes and gets reused after the
> > blk_mq_req_expired() call has finished and before kref_get_unless_zero() is
> > called? Is this perhaps a race condition that has not yet been triggered by
> > any existing block layer test? Please note that there is no such race
> > condition in the patch I had posted ("blk-mq: Rework blk-mq timeout handling
> > again" - https://www.spinics.net/lists/linux-block/msg26489.html).
> 
> I don't think there's any such race in the merged implementation
> either. If the request is reallocated, then the kref check may succeed,
> but the blk_mq_req_expired() check would surely fail, allowing the
> request to proceed as normal. The code comments at least say as much.

Hello Keith,

Before commit 12f5b9314545 ("blk-mq: Remove generation seqeunce"), if a
request completion was reported after request timeout processing had
started, completion handling was skipped. The following code in
blk_mq_complete_request() realized that:

	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
		__blk_mq_complete_request(rq);

Since commit 12f5b9314545, if a completion occurs after request timeout
processing has started, that completion is processed if the request has the
state MQ_RQ_IN_FLIGHT. blk_mq_rq_timed_out() does not modify the request
state unless the block driver timeout handler modifies it, e.g. by calling
blk_mq_end_request() or by calling blk_mq_requeue_request(). The typical
behavior of scsi_times_out() is to queue sending of a SCSI abort and hence
not to change the request state immediately. In other words, if a request
completion occurs during or shortly after a timeout occurred then
blk_mq_complete_request() will call __blk_mq_complete_request() and will
complete the request, although that is not allowed because timeout handling
has already started. Do you agree with this analysis?

Thanks,

Bart.
jianchao.wang July 13, 2018, 1:12 a.m. UTC | #28
On 07/13/2018 06:24 AM, Bart Van Assche wrote:
> On Thu, 2018-07-12 at 13:24 -0600, Keith Busch wrote:
>> On Thu, Jul 12, 2018 at 06:16:12PM +0000, Bart Van Assche wrote:
>>> What prevents that a request finishes and gets reused after the
>>> blk_mq_req_expired() call has finished and before kref_get_unless_zero() is
>>> called? Is this perhaps a race condition that has not yet been triggered by
>>> any existing block layer test? Please note that there is no such race
>>> condition in the patch I had posted ("blk-mq: Rework blk-mq timeout handling
>>> again" - https://urldefense.proofpoint.com/v2/url?u=https-3A__www.spinics.net_lists_linux-2Dblock_msg26489.html&d=DwIGaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=zqZd2myYLkxjU6DWtRKpls-gvzEGEB4vv8bdYq5CiBs&s=-d79KAhEM83ShMp8xCHKoE6Dp5Gxf98L94DuamLEAaU&e=).
>>
>> I don't think there's any such race in the merged implementation
>> either. If the request is reallocated, then the kref check may succeed,
>> but the blk_mq_req_expired() check would surely fail, allowing the
>> request to proceed as normal. The code comments at least say as much.
> 
> Hello Keith,
> 
> Before commit 12f5b9314545 ("blk-mq: Remove generation seqeunce"), if a
> request completion was reported after request timeout processing had
> started, completion handling was skipped. The following code in
> blk_mq_complete_request() realized that:
> 
> 	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
> 		__blk_mq_complete_request(rq);
> 
> Since commit 12f5b9314545, if a completion occurs after request timeout
> processing has started, that completion is processed if the request has the
> state MQ_RQ_IN_FLIGHT. blk_mq_rq_timed_out() does not modify the request
> state unless the block driver timeout handler modifies it, e.g. by calling
> blk_mq_end_request() or by calling blk_mq_requeue_request(). The typical
> behavior of scsi_times_out() is to queue sending of a SCSI abort and hence
> not to change the request state immediately. In other words, if a request
> completion occurs during or shortly after a timeout occurred then
> blk_mq_complete_request() will call __blk_mq_complete_request() and will
> complete the request, although that is not allowed because timeout handling
> has already started. Do you agree with this analysis?
>

Oh, thank goodness Bart has said this.
I have been saying the same thing in the mail
https://marc.info/?l=linux-block&m=152950093831738&w=2
Even though my voice is tiny, I definitely support Bart's point.

Thanks
Jianchao
 
> Thanks,
> 
> Bart.
jianchao.wang July 13, 2018, 2:40 a.m. UTC | #29
On 07/13/2018 06:24 AM, Bart Van Assche wrote:
> Hello Keith,
> 
> Before commit 12f5b9314545 ("blk-mq: Remove generation seqeunce"), if a
> request completion was reported after request timeout processing had
> started, completion handling was skipped. The following code in
> blk_mq_complete_request() realized that:
> 
> 	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
> 		__blk_mq_complete_request(rq);

Even before Tejun's patch, we also had this for both the blk-mq and blk-legacy code:

blk_rq_check_expired

    if (time_after_eq(jiffies, rq->deadline)) {
        list_del_init(&rq->timeout_list);

        /*
         * Check if we raced with end io completion
         */
        if (!blk_mark_rq_complete(rq))
            blk_rq_timed_out(rq);
    } 
 

blk_complete_request
    
    if (!blk_mark_rq_complete(req))
        __blk_complete_request(req);

blk_mq_check_expired
    
    if (time_after_eq(jiffies, rq->deadline)) {
        if (!blk_mark_rq_complete(rq))
            blk_mq_rq_timed_out(rq, reserved);
    }


blk_mq_complete_request

    if (!blk_mark_rq_complete(rq))
        __blk_mq_complete_request(rq);

Thanks
Jianchao

> 
> Since commit 12f5b9314545, if a completion occurs after request timeout
> processing has started, that completion is processed if the request has the
> state MQ_RQ_IN_FLIGHT. blk_mq_rq_timed_out() does not modify the request
> state unless the block driver timeout handler modifies it, e.g. by calling
> blk_mq_end_request() or by calling blk_mq_requeue_request(). The typical
> behavior of scsi_times_out() is to queue sending of a SCSI abort and hence
> not to change the request state immediately. In other words, if a request
> completion occurs during or shortly after a timeout occurred then
> blk_mq_complete_request() will call __blk_mq_complete_request() and will
> complete the request, although that is not allowed because timeout handling
> has already started. Do you agree with this analysis?
Keith Busch July 13, 2018, 3:43 p.m. UTC | #30
On Thu, Jul 12, 2018 at 10:24:42PM +0000, Bart Van Assche wrote:
> Before commit 12f5b9314545 ("blk-mq: Remove generation seqeunce"), if a
> request completion was reported after request timeout processing had
> started, completion handling was skipped. The following code in
> blk_mq_complete_request() realized that:
> 
> 	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
> 		__blk_mq_complete_request(rq);
> 
> Since commit 12f5b9314545, if a completion occurs after request timeout
> processing has started, that completion is processed if the request has the
> state MQ_RQ_IN_FLIGHT. blk_mq_rq_timed_out() does not modify the request
> state unless the block driver timeout handler modifies it, e.g. by calling
> blk_mq_end_request() or by calling blk_mq_requeue_request(). The typical
> behavior of scsi_times_out() is to queue sending of a SCSI abort and hence
> not to change the request state immediately. In other words, if a request
> completion occurs during or shortly after a timeout occurred then
> blk_mq_complete_request() will call __blk_mq_complete_request() and will
> complete the request, although that is not allowed because timeout handling
> has already started. Do you agree with this analysis?

Yes, it's different, and that was the whole point. No one made that a
secret either. Are you saying you want timeout software to take priority
over handling hardware events?
Bart Van Assche July 13, 2018, 3:52 p.m. UTC | #31
On Fri, 2018-07-13 at 09:43 -0600, Keith Busch wrote:
> On Thu, Jul 12, 2018 at 10:24:42PM +0000, Bart Van Assche wrote:
> > Before commit 12f5b9314545 ("blk-mq: Remove generation seqeunce"), if a
> > request completion was reported after request timeout processing had
> > started, completion handling was skipped. The following code in
> > blk_mq_complete_request() realized that:
> > 
> > 	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
> > 		__blk_mq_complete_request(rq);
> > 
> > Since commit 12f5b9314545, if a completion occurs after request timeout
> > processing has started, that completion is processed if the request has the
> > state MQ_RQ_IN_FLIGHT. blk_mq_rq_timed_out() does not modify the request
> > state unless the block driver timeout handler modifies it, e.g. by calling
> > blk_mq_end_request() or by calling blk_mq_requeue_request(). The typical
> > behavior of scsi_times_out() is to queue sending of a SCSI abort and hence
> > not to change the request state immediately. In other words, if a request
> > completion occurs during or shortly after a timeout occurred then
> > blk_mq_complete_request() will call __blk_mq_complete_request() and will
> > complete the request, although that is not allowed because timeout handling
> > has already started. Do you agree with this analysis?
> 
> Yes, it's different, and that was the whole point. No one made that a
> secret either. Are you saying you want timeout software to take priority
> over handling hardware events?

No. What I'm saying is that a behavior change has been introduced in the block
layer that was not documented in the patch description. I think that behavior
change even can trigger a kernel crash.
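
To make that concrete, here is a rough sketch of the two paths as they
look after commit 12f5b9314545 (heavily simplified, not the literal
kernel code), showing how a normal completion and the timeout handler
can both end up calling __blk_mq_complete_request() for the same
request:

    /* Normal completion, e.g. from the driver's interrupt handler. */
    void blk_mq_complete_request(struct request *rq)
    {
        /* Nothing here checks whether timeout handling has already
         * started working on rq. */
        __blk_mq_complete_request(rq);
    }

    /* Timeout path, running concurrently from the timeout work. */
    static void blk_mq_rq_timed_out(struct request *req, bool reserved)
    {
        enum blk_eh_timer_return ret = ops->timeout(req, reserved);

        if (ret == BLK_EH_HANDLED &&
            blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
            /* A completion from the interrupt path can slip in between
             * the state check and this call. */
            __blk_mq_complete_request(req);
    }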

Bart.
Keith Busch July 13, 2018, 6:47 p.m. UTC | #32
On Fri, Jul 13, 2018 at 03:52:38PM +0000, Bart Van Assche wrote:
> No. What I'm saying is that a behavior change has been introduced in the block
> layer that was not documented in the patch description. 

Did you read the changelog?

> I think that behavior change even can trigger a kernel crash.

I think we are past acknowledging issues exist with timeouts.
Bart Van Assche July 13, 2018, 11:03 p.m. UTC | #33
On Fri, 2018-07-13 at 12:47 -0600, Keith Busch wrote:
> On Fri, Jul 13, 2018 at 03:52:38PM +0000, Bart Van Assche wrote:
> > I think that behavior change even can trigger a kernel crash.
> 
> I think we are past acknowledging issues exist with timeouts.

Hello Keith,

How do you want to go forward from here? Do you prefer the approach of the
patch I had posted (https://www.spinics.net/lists/linux-block/msg26489.html), 
Jianchao's approach (https://marc.info/?l=linux-block&m=152950093831738) or
perhaps yet another approach? Note: I think Jianchao's patch is a good start
but also that it needs further improvement.

Thanks,

Bart.
Keith Busch July 13, 2018, 11:58 p.m. UTC | #34
On Fri, Jul 13, 2018 at 11:03:18PM +0000, Bart Van Assche wrote:
> How do you want to go forward from here? Do you prefer the approach of the
> patch I had posted (https://www.spinics.net/lists/linux-block/msg26489.html), 
> Jianchao's approach (https://marc.info/?l=linux-block&m=152950093831738) or
> perhaps yet another approach? Note: I think Jianchao's patch is a good start
> but also that it needs further improvement.

Of the two you mentioned, yours is preferable IMO. While I appreciate
Jianchao's detailed analysis, it's hard to take a proposal seriously
that so colourfully calls everyone else "dangerous" while advocating
for silently losing requests on purpose.

But where's the option that fixes scsi to handle hardware completions
concurrently with arbitrary timeout software? Propping up that house of
cards can't be the only recourse.
Christoph Hellwig July 18, 2018, 7:56 p.m. UTC | #35
On Fri, Jul 13, 2018 at 05:58:08PM -0600, Keith Busch wrote:
> Of the two you mentioned, yours is preferable IMO. While I appreciate
> Jianchao's detailed analysis, it's hard to take a proposal seriously
> that so colourfully calls everyone else "dangerous" while advocating
> for silently losing requests on purpose.
> 
> But where's the option that fixes scsi to handle hardware completions
> concurrently with arbitrary timeout software? Propping up that house of
> cards can't be the only recourse.

The important bit is that we need to fix this issue quickly.  We are
past -rc5 so I'm rather concerned about anything too complicated.

I'm not even sure SCSI has a problem with multiple completions happening
at the same time, but it certainly has a problem with bypassing
blk_mq_complete_request from the EH path.

I think we can solve this properly, but I also think we are way too late
in the 4.18 cycle to fix it properly.  For now I fear we'll just have
to revert the changes and try again for 4.19 or even 4.20 if we don't
act quickly enough.
Christoph Hellwig July 18, 2018, 8:39 p.m. UTC | #36
On Wed, Jul 18, 2018 at 09:56:50PM +0200, hch@lst.de wrote:
> On Fri, Jul 13, 2018 at 05:58:08PM -0600, Keith Busch wrote:
> > Of the two you mentioned, yours is preferable IMO. While I appreciate
> > Jianchao's detailed analysis, it's hard to take a proposal seriously
> > that so colourfully calls everyone else "dangerous" while advocating
> > for silently losing requests on purpose.
> > 
> > But where's the option that fixes scsi to handle hardware completions
> > concurrently with arbitrary timeout software? Propping up that house of
> > cards can't be the only recourse.
> 
> The important bit is that we need to fix this issue quickly.  We are
> past -rc5 so I'm rather concerned about anything too complicated.
> 
> I'm not even sure SCSI has a problem with multiple completions happening
> at the same time, but it certainly has a problem with bypassing
> blk_mq_complete_request from the EH path.
> 
> I think we can solve this properly, but I also think we are way too late
> in the 4.18 cycle to fix it properly.  For now I fear we'll just have
> to revert the changes and try again for 4.19 or even 4.20 if we don't
> act quickly enough.

So here is a quick attempt at the revert while also trying to keep
nvme working.  Keith, Bart, Jianchao - does this look reasonable
as a 4.18 band aid?

http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-eh-revert
Bart Van Assche July 18, 2018, 9:05 p.m. UTC | #37
On Wed, 2018-07-18 at 22:39 +0200, hch@lst.de wrote:
> On Wed, Jul 18, 2018 at 09:56:50PM +0200, hch@lst.de wrote:
> > The important bit is that we need to fix this issue quickly.  We are
> > past -rc5 so I'm rather concerned about anything too complicated.
> > 
> > I'm not even sure SCSI has a problem with multiple completions happening
> > at the same time, but it certainly has a problem with bypassing
> > blk_mq_complete_request from the EH path.
> > 
> > I think we can solve this properly, but I also think we are way too late
> > in the 4.18 cycle to fix it properly.  For now I fear we'll just have
> > to revert the changes and try again for 4.19 or even 4.20 if we don't
> > act quickly enough.
> 
> So here is a quick attempt at the revert while also trying to keep
> nvme working.  Keith, Bart, Jianchao - does this look reasonable
> as a 4.18 band aid?
> 
> http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-eh-revert

Hello Christoph,

A patch series that first reverts the following patches:
* blk-mq: Fix timeout handling in case the timeout handler returns BLK_EH_DONE
* block: fix timeout changes for legacy request drivers
* blk-mq: don't time out requests again that are in the timeout handler
* blk-mq: simplify blk_mq_rq_timed_out
* block: remove BLK_EH_HANDLED
* block: rename BLK_EH_NOT_HANDLED to BLK_EH_DONE
* blk-mq: Remove generation seqeunce

and then renames BLK_EH_NOT_HANDLED to BLK_EH_DONE again would probably be
a lot easier to review.

Thanks,

Bart.
diff mbox

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 43370faee935..cee03cad99f2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -198,12 +198,6 @@  void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->internal_tag = -1;
 	rq->start_time_ns = ktime_get_ns();
 	rq->part = NULL;
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * See comment of blk_mq_init_request
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3080e18cb859..ffa622366922 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -344,7 +344,6 @@  static const char *const rqf_name[] = {
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
-	RQF_NAME(MQ_TIMEOUT_EXPIRED),
 	RQF_NAME(MQ_POLL_SLEPT),
 };
 #undef RQF_NAME
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 66e5c768803f..4858876fd364 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -589,56 +589,6 @@  static void __blk_mq_complete_request(struct request *rq)
 	put_cpu();
 }
 
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
-	__releases(hctx->srcu)
-{
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-		rcu_read_unlock();
-	else
-		srcu_read_unlock(hctx->srcu, srcu_idx);
-}
-
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
-	__acquires(hctx->srcu)
-{
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-		/* shut up gcc false positive */
-		*srcu_idx = 0;
-		rcu_read_lock();
-	} else
-		*srcu_idx = srcu_read_lock(hctx->srcu);
-}
-
-static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
-{
-	unsigned long flags;
-
-	/*
-	 * blk_mq_rq_aborted_gstate() is used from the completion path and
-	 * can thus be called from irq context.  u64_stats_fetch in the
-	 * middle of update on the same CPU leads to lockup.  Disable irq
-	 * while updating.
-	 */
-	local_irq_save(flags);
-	u64_stats_update_begin(&rq->aborted_gstate_sync);
-	rq->aborted_gstate = gstate;
-	u64_stats_update_end(&rq->aborted_gstate_sync);
-	local_irq_restore(flags);
-}
-
-static u64 blk_mq_rq_aborted_gstate(struct request *rq)
-{
-	unsigned int start;
-	u64 aborted_gstate;
-
-	do {
-		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
-		aborted_gstate = rq->aborted_gstate;
-	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
-
-	return aborted_gstate;
-}
-
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:		the request being processed
@@ -650,27 +600,10 @@  static u64 blk_mq_rq_aborted_gstate(struct request *rq)
 void blk_mq_complete_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
-	int srcu_idx;
 
 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
-
-	/*
-	 * If @rq->aborted_gstate equals the current instance, timeout is
-	 * claiming @rq and we lost.  This is synchronized through
-	 * hctx_lock().  See blk_mq_timeout_work() for details.
-	 *
-	 * Completion path never blocks and we can directly use RCU here
-	 * instead of hctx_lock() which can be either RCU or SRCU.
-	 * However, that would complicate paths which want to synchronize
-	 * against us.  Let stay in sync with the issue path so that
-	 * hctx_lock() covers both issue and completion paths.
-	 */
-	hctx_lock(hctx, &srcu_idx);
-	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
-		__blk_mq_complete_request(rq);
-	hctx_unlock(hctx, srcu_idx);
+	__blk_mq_complete_request(rq);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -699,26 +632,9 @@  void blk_mq_start_request(struct request *rq)
 
 	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 
-	/*
-	 * Mark @rq in-flight which also advances the generation number,
-	 * and register for timeout.  Protect with a seqcount to allow the
-	 * timeout path to read both @rq->gstate and @rq->deadline
-	 * coherently.
-	 *
-	 * This is the only place where a request is marked in-flight.  If
-	 * the timeout path reads an in-flight @rq->gstate, the
-	 * @rq->deadline it reads together under @rq->gstate_seq is
-	 * guaranteed to be the matching one.
-	 */
-	preempt_disable();
-	write_seqcount_begin(&rq->gstate_seq);
-
 	blk_add_timer(rq);
 	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
 
-	write_seqcount_end(&rq->gstate_seq);
-	preempt_enable();
-
 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
 		/*
 		 * Make sure space for the drain appears.  We know we can do
@@ -730,11 +646,6 @@  void blk_mq_start_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
-/*
- * When we reach here because queue is busy, it's safe to change the state
- * to IDLE without checking @rq->aborted_gstate because we should still be
- * holding the RCU read lock and thus protected against timeout.
- */
 static void __blk_mq_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
@@ -843,33 +754,24 @@  struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-struct blk_mq_timeout_data {
-	unsigned long next;
-	unsigned int next_set;
-	unsigned int nr_expired;
-};
-
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
 	const struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
-	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
-
 	if (ops->timeout)
 		ret = ops->timeout(req, reserved);
 
 	switch (ret) {
 	case BLK_EH_HANDLED:
-		__blk_mq_complete_request(req);
-		break;
-	case BLK_EH_RESET_TIMER:
 		/*
-		 * As nothing prevents from completion happening while
-		 * ->aborted_gstate is set, this may lead to ignored
-		 * completions and further spurious timeouts.
+		 * If the request is still in flight, the driver is requesting
+		 * blk-mq complete it.
 		 */
-		blk_mq_rq_update_aborted_gstate(req, 0);
+		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
+			__blk_mq_complete_request(req);
+		break;
+	case BLK_EH_RESET_TIMER:
 		blk_add_timer(req);
 		break;
 	case BLK_EH_NOT_HANDLED:
@@ -880,64 +782,64 @@  static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 	}
 }
 
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
-		struct request *rq, void *priv, bool reserved)
+static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 {
-	struct blk_mq_timeout_data *data = priv;
-	unsigned long gstate, deadline;
-	int start;
+	unsigned long deadline;
 
-	might_sleep();
-
-	if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
-		return;
+	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
+		return false;
 
-	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
-	while (true) {
-		start = read_seqcount_begin(&rq->gstate_seq);
-		gstate = READ_ONCE(rq->gstate);
-		deadline = blk_rq_deadline(rq);
-		if (!read_seqcount_retry(&rq->gstate_seq, start))
-			break;
-		cond_resched();
-	}
+	deadline = blk_rq_deadline(rq);
+	if (time_after_eq(jiffies, deadline))
+		return true;
 
-	/* if in-flight && overdue, mark for abortion */
-	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
-	    time_after_eq(jiffies, deadline)) {
-		blk_mq_rq_update_aborted_gstate(rq, gstate);
-		data->nr_expired++;
-		hctx->nr_expired++;
-	} else if (!data->next_set || time_after(data->next, deadline)) {
-		data->next = deadline;
-		data->next_set = 1;
-	}
+	if (*next == 0)
+		*next = deadline;
+	else if (time_after(*next, deadline))
+		*next = deadline;
+	return false;
 }
 
-static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
+	unsigned long *next = priv;
+
 	/*
-	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
-	 * completions that we lost to, they would have finished and
-	 * updated @rq->gstate by now; otherwise, the completion path is
-	 * now guaranteed to see @rq->aborted_gstate and yield.  If
-	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+	 * Just do a quick check if it is expired before locking the request in
+	 * so we're not unnecessarily synchronizing across CPUs.
 	 */
-	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
-	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
+	if (!blk_mq_req_expired(rq, next))
+		return;
+
+	/*
+	 * We have reason to believe the request may be expired. Take a
+	 * reference on the request to lock this request lifetime into its
+	 * currently allocated context to prevent it from being reallocated in
+	 * the event the completion by-passes this timeout handler.
+	 * 
+	 * If the reference was already released, then the driver beat the
+	 * timeout handler to posting a natural completion.
+	 */
+	if (!kref_get_unless_zero(&rq->ref))
+		return;
+
+	/*
+	 * The request is now locked and cannot be reallocated underneath the
+	 * timeout handler's processing. Re-verify this exact request is truly
+	 * expired; if it is not expired, then the request was completed and
+	 * reallocated as a new request.
+	 */
+	if (blk_mq_req_expired(rq, next))
 		blk_mq_rq_timed_out(rq, reserved);
+	blk_mq_put_request(rq);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
 		container_of(work, struct request_queue, timeout_work);
-	struct blk_mq_timeout_data data = {
-		.next		= 0,
-		.next_set	= 0,
-		.nr_expired	= 0,
-	};
+	unsigned long next = 0;
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
@@ -957,39 +859,10 @@  static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
-	/* scan for the expired ones and set their ->aborted_gstate */
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
 
-	if (data.nr_expired) {
-		bool has_rcu = false;
-
-		/*
-		 * Wait till everyone sees ->aborted_gstate.  The
-		 * sequential waits for SRCUs aren't ideal.  If this ever
-		 * becomes a problem, we can add per-hw_ctx rcu_head and
-		 * wait in parallel.
-		 */
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (!hctx->nr_expired)
-				continue;
-
-			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-				has_rcu = true;
-			else
-				synchronize_srcu(hctx->srcu);
-
-			hctx->nr_expired = 0;
-		}
-		if (has_rcu)
-			synchronize_rcu();
-
-		/* terminate the ones we won */
-		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
-	}
-
-	if (data.next_set) {
-		data.next = blk_rq_timeout(round_jiffies_up(data.next));
-		mod_timer(&q->timeout, data.next);
+	if (next != 0) {
+		mod_timer(&q->timeout, next);
 	} else {
 		/*
 		 * Request timeouts are handled as a forward rolling timer. If
@@ -1334,8 +1207,6 @@  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	int srcu_idx;
-
 	/*
 	 * We should be running this queue from one of the CPUs that
 	 * are mapped to it.
@@ -1369,9 +1240,7 @@  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-	hctx_lock(hctx, &srcu_idx);
 	blk_mq_sched_dispatch_requests(hctx);
-	hctx_unlock(hctx, srcu_idx);
 }
 
 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -1458,7 +1327,6 @@  EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
 
 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-	int srcu_idx;
 	bool need_run;
 
 	/*
@@ -1469,10 +1337,8 @@  bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
 	 * quiesced.
 	 */
-	hctx_lock(hctx, &srcu_idx);
 	need_run = !blk_queue_quiesced(hctx->queue) &&
 		blk_mq_hctx_has_pending(hctx);
-	hctx_unlock(hctx, srcu_idx);
 
 	if (need_run) {
 		__blk_mq_delay_run_hw_queue(hctx, async, 0);
@@ -1828,34 +1694,23 @@  static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, blk_qc_t *cookie)
 {
 	blk_status_t ret;
-	int srcu_idx;
 
 	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-	hctx_lock(hctx, &srcu_idx);
-
 	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
 		blk_mq_sched_insert_request(rq, false, true, false);
 	else if (ret != BLK_STS_OK)
 		blk_mq_end_request(rq, ret);
-
-	hctx_unlock(hctx, srcu_idx);
 }
 
 blk_status_t blk_mq_request_issue_directly(struct request *rq)
 {
-	blk_status_t ret;
-	int srcu_idx;
 	blk_qc_t unused_cookie;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
 
-	hctx_lock(hctx, &srcu_idx);
-	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
-	hctx_unlock(hctx, srcu_idx);
-
-	return ret;
+	return __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
 }
 
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
@@ -2065,15 +1920,7 @@  static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 			return ret;
 	}
 
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * start gstate with gen 1 instead of 0, otherwise it will be equal
-	 * to aborted_gstate, and be identified timed out by
-	 * blk_mq_terminate_expired.
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
-
+	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 	return 0;
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e1bb420dc5d6..53452df16343 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,17 +31,12 @@  struct blk_mq_ctx {
 } ____cacheline_aligned_in_smp;
 
 /*
- * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value
- * and the upper bits the generation number.
+ * Request state.
  */
 enum mq_rq_state {
 	MQ_RQ_IDLE		= 0,
 	MQ_RQ_IN_FLIGHT		= 1,
 	MQ_RQ_COMPLETE		= 2,
-
-	MQ_RQ_STATE_BITS	= 2,
-	MQ_RQ_STATE_MASK	= (1 << MQ_RQ_STATE_BITS) - 1,
-	MQ_RQ_GEN_INC		= 1 << MQ_RQ_STATE_BITS,
 };
 
 void blk_mq_freeze_queue(struct request_queue *q);
@@ -109,7 +104,7 @@  void blk_mq_release(struct request_queue *q);
  */
 static inline int blk_mq_rq_state(struct request *rq)
 {
-	return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
+	return READ_ONCE(rq->state);
 }
 
 /**
@@ -124,16 +119,7 @@  static inline int blk_mq_rq_state(struct request *rq)
 static inline void blk_mq_rq_update_state(struct request *rq,
 					  enum mq_rq_state state)
 {
-	u64 old_val = READ_ONCE(rq->gstate);
-	u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
-
-	if (state == MQ_RQ_IN_FLIGHT) {
-		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
-		new_val += MQ_RQ_GEN_INC;
-	}
-
-	/* avoid exposing interim values */
-	WRITE_ONCE(rq->gstate, new_val);
+	WRITE_ONCE(rq->state, state);
 }
 
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 652d4d4d3e97..f95d6e6cbc96 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -214,7 +214,6 @@  void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	blk_rq_set_deadline(req, jiffies + req->timeout);
-	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 26bf2c1e3502..8812a7e3c0a3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -125,10 +125,8 @@  typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
-/* timeout is expired */
-#define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
 /* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 21))
+#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -236,27 +234,7 @@  struct request {
 
 	unsigned int extra_len;	/* length of alignment and padding */
 
-	/*
-	 * On blk-mq, the lower bits of ->gstate (generation number and
-	 * state) carry the MQ_RQ_* state value and the upper bits the
-	 * generation number which is monotonically incremented and used to
-	 * distinguish the reuse instances.
-	 *
-	 * ->gstate_seq allows updates to ->gstate and other fields
-	 * (currently ->deadline) during request start to be read
-	 * atomically from the timeout path, so that it can operate on a
-	 * coherent set of information.
-	 */
-	seqcount_t gstate_seq;
-	u64 gstate;
-
-	/*
-	 * ->aborted_gstate is used by the timeout to claim a specific
-	 * recycle instance of this request.  See blk_mq_timeout_work().
-	 */
-	struct u64_stats_sync aborted_gstate_sync;
-	u64 aborted_gstate;
-
+	u64 state;
 	struct kref ref;
 
 	/* access through blk_rq_set_deadline, blk_rq_deadline */