
[1/2] nvme: pci: simplify timeout handling

Message ID 20180427175157.GB5073@localhost.localdomain (mailing list archive)
State New, archived

Commit Message

Keith Busch April 27, 2018, 5:51 p.m. UTC
On Thu, Apr 26, 2018 at 08:39:55PM +0800, Ming Lei wrote:
> +/*
> + * This one is called after queues are quiesced, and there is no in-flight
> + * timeout or nvme interrupt handling.
> + */
> +static void nvme_pci_cancel_request(struct request *req, void *data,
> +		bool reserved)
> +{
> +	/* make sure timed-out requests are covered too */
> +	if (req->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) {
> +		req->aborted_gstate = 0;
> +		req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
> +	}
> +
> +	nvme_cancel_request(req, data, reserved);
> +}

I don't know about this. I feel like blk-mq owns these flags and LLDs
shouldn't require such knowledge of their implementation details.

I understand how the problems are happening a bit better now. It used
to be that blk-mq would lock an expired command one at a time, so when
we had a batch of IO timeouts, the driver was able to complete all of
them inside a single IO timeout handler.

That's not the case anymore, so the driver is called for every IO
timeout even if it already reaped all the commands at once.

IMO, the block layer should allow the driver to complete all timed out
IOs at once rather than go through ->timeout() for each of them and
without knowing about the request state details. Much of the problems
would go away in that case.

But I think much of this can be fixed by just syncing the queues in the
probe work, which may be a more reasonable bug-fix for 4.17 than rewriting
the entire timeout handler. What do you think of this patch? It's an update
of one I posted a few months ago, but it uses Jianchao's safe namespace
list locking.

---
--

Comments

Ming Lei April 28, 2018, 3:50 a.m. UTC | #1
On Fri, Apr 27, 2018 at 11:51:57AM -0600, Keith Busch wrote:
> On Thu, Apr 26, 2018 at 08:39:55PM +0800, Ming Lei wrote:
> > +/*
> > + * This one is called after queues are quiesced, and there is no in-flight
> > + * timeout or nvme interrupt handling.
> > + */
> > +static void nvme_pci_cancel_request(struct request *req, void *data,
> > +		bool reserved)
> > +{
> > +	/* make sure timed-out requests are covered too */
> > +	if (req->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) {
> > +		req->aborted_gstate = 0;
> > +		req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
> > +	}
> > +
> > +	nvme_cancel_request(req, data, reserved);
> > +}
> 
> I don't know about this. I feel like blk-mq owns these flags and LLD's
> shouldn't require such knowledge of their implementation details.

If the driver returns BLK_EH_NOT_HANDLED from .timeout(), it may
need to deal with this flag.

But it isn't necessary to use this flag here; the same can be done by
always clearing 'req->aborted_gstate'.
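
Something like the following untested sketch is what I mean (same function
as in the patch above, just without touching the blk-mq internal flag):

/*
 * This one is called after queues are quiesced, and there is no in-flight
 * timeout or nvme interrupt handling.
 */
static void nvme_pci_cancel_request(struct request *req, void *data,
		bool reserved)
{
	/*
	 * Always clear the aborted state so that a request which timed
	 * out can still be completed here, without looking at
	 * RQF_MQ_TIMEOUT_EXPIRED at all.
	 */
	req->aborted_gstate = 0;

	nvme_cancel_request(req, data, reserved);
}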

> 
> I understand how the problems are happening a bit better now. It used
> to be that blk-mq would lock an expired command one at a time, so when
> we had a batch of IO timeouts, the driver was able to complete all of
> them inside a single IO timeout handler.
> 
> That's not the case anymore, so the driver is called for every IO
> timeout even if it already reaped all the commands at once.

Actually that wasn't the case before either: even for the legacy path, one
.timeout() handles only one request.

> 
> IMO, the block layer should allow the driver to complete all timed out
> IOs at once rather than go through ->timeout() for each of them and
> without knowing about the request state details. Much of the problems
> would go away in that case.

That would require changing every driver, which doesn't look easy to do, and
I'm also not sure other drivers need to work this way.

> 
> But I think much of this can be fixed by just syncing the queues in the
> probe work, and may be a more reasonable bug-fix for 4.17 rather than
> rewrite the entire timeout handler. What do you think of this patch? It's
> an update of one I posted a few months ago, but uses Jianchao's safe
> namespace list locking.
> 
> ---
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index a3771c5729f5..198b4469c3e2 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3562,12 +3562,25 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
>  	struct nvme_ns *ns;
>  
>  	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list)
> +	list_for_each_entry(ns, &ctrl->namespaces, list) {
>  		blk_mq_unquiesce_queue(ns->queue);
> +		blk_mq_kick_requeue_list(ns->queue);
> +	}
>  	up_read(&ctrl->namespaces_rwsem);
>  }
>  EXPORT_SYMBOL_GPL(nvme_start_queues);
>  
> +void nvme_sync_queues(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns;
> +
> +	down_read(&ctrl->namespaces_rwsem);
> +	list_for_each_entry(ns, &ctrl->namespaces, list)
> +		blk_sync_queue(ns->queue);
> +	up_read(&ctrl->namespaces_rwsem);
> +}
> +EXPORT_SYMBOL_GPL(nvme_sync_queues);
> +
>  int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
>  {
>  	if (!ctrl->ops->reinit_request)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 7ded7a51c430..e62273198725 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -402,6 +402,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
>  void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
>  		union nvme_result *res);
>  
> +void nvme_sync_queues(struct nvme_ctrl *ctrl);
>  void nvme_stop_queues(struct nvme_ctrl *ctrl);
>  void nvme_start_queues(struct nvme_ctrl *ctrl);
>  void nvme_kill_queues(struct nvme_ctrl *ctrl);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index fbc71fac6f1e..a2f3ad105620 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2303,6 +2303,7 @@ static void nvme_reset_work(struct work_struct *work)
>  	 */
>  	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
>  		nvme_dev_disable(dev, false);
> +	nvme_sync_queues(&dev->ctrl);

This sync may race with a timed-out request, which may be handled
as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above queue sync can't
work reliably.

There are more issues beyond the above one if disabling the controller
and resetting it run in different contexts:

1) during resetting, nvme_dev_disable() may be called, which
may cause double completion of the previously timed-out request.

2) the timed-out request can't be covered by nvme_dev_disable(), and
this way is too tricky and prone to causing trouble.

IMO, it is much easier to avoid the above issues by handling controller
recovery in a single thread. I will address all your comments and post
V3 for review.


Thanks,
Ming
Keith Busch April 28, 2018, 1:35 p.m. UTC | #2
On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
> > I understand how the problems are happening a bit better now. It used
> > to be that blk-mq would lock an expired command one at a time, so when
> > we had a batch of IO timeouts, the driver was able to complete all of
> > them inside a single IO timeout handler.
> > 
> > That's not the case anymore, so the driver is called for every IO
> > timeout even if it already reaped all the commands at once.
> 
> Actually there isn't the case before, even for legacy path, one .timeout()
> handles one request only.

That's not quite what I was talking about.

Before, only the command that was about to be sent to the driver's
.timeout() was marked completed. The driver could (and did) complete
other timed out commands in a single .timeout(), and the tag would
clear, so we could handle all timeouts in a single .timeout().

Now, blk-mq marks all timed out commands as aborted prior to calling
the driver's .timeout(). If the driver completes any of those commands,
the tag does not clear, so the driver's .timeout() just gets to be called
again for commands it already reaped.



> > IMO, the block layer should allow the driver to complete all timed out
> > IOs at once rather than go through ->timeout() for each of them and
> > without knowing about the request state details. Much of the problems
> > would go away in that case.
> 
> That need to change every driver, looks not easy to do, also not sure
> if other drivers need this way.
> 
> > 
> > But I think much of this can be fixed by just syncing the queues in the
> > probe work, and may be a more reasonable bug-fix for 4.17 rather than
> > rewrite the entire timeout handler. What do you think of this patch? It's
> > an update of one I posted a few months ago, but uses Jianchao's safe
> > namespace list locking.
> > 
> > ---
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index a3771c5729f5..198b4469c3e2 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -3562,12 +3562,25 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
> >  	struct nvme_ns *ns;
> >  
> >  	down_read(&ctrl->namespaces_rwsem);
> > -	list_for_each_entry(ns, &ctrl->namespaces, list)
> > +	list_for_each_entry(ns, &ctrl->namespaces, list) {
> >  		blk_mq_unquiesce_queue(ns->queue);
> > +		blk_mq_kick_requeue_list(ns->queue);
> > +	}
> >  	up_read(&ctrl->namespaces_rwsem);
> >  }
> >  EXPORT_SYMBOL_GPL(nvme_start_queues);
> >  
> > +void nvme_sync_queues(struct nvme_ctrl *ctrl)
> > +{
> > +	struct nvme_ns *ns;
> > +
> > +	down_read(&ctrl->namespaces_rwsem);
> > +	list_for_each_entry(ns, &ctrl->namespaces, list)
> > +		blk_sync_queue(ns->queue);
> > +	up_read(&ctrl->namespaces_rwsem);
> > +}
> > +EXPORT_SYMBOL_GPL(nvme_sync_queues);
> > +
> >  int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
> >  {
> >  	if (!ctrl->ops->reinit_request)
> > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > index 7ded7a51c430..e62273198725 100644
> > --- a/drivers/nvme/host/nvme.h
> > +++ b/drivers/nvme/host/nvme.h
> > @@ -402,6 +402,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
> >  void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
> >  		union nvme_result *res);
> >  
> > +void nvme_sync_queues(struct nvme_ctrl *ctrl);
> >  void nvme_stop_queues(struct nvme_ctrl *ctrl);
> >  void nvme_start_queues(struct nvme_ctrl *ctrl);
> >  void nvme_kill_queues(struct nvme_ctrl *ctrl);
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index fbc71fac6f1e..a2f3ad105620 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2303,6 +2303,7 @@ static void nvme_reset_work(struct work_struct *work)
> >  	 */
> >  	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
> >  		nvme_dev_disable(dev, false);
> > +	nvme_sync_queues(&dev->ctrl);
> 
> This sync may be raced with one timed-out request, which may be handled
> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
> work reliably. 
> 
> There are more issues except for the above one if disabling controller
> and resetting it are run in different context:
> 
> 1) inside resetting, nvme_dev_disable() may be called, but this way
> may cause double completion on the previous timed-out request.
> 
> 2) the timed-out request can't be covered by nvme_dev_disable(), and
> this way is too tricky and easy to cause trouble.
> 
> IMO, it is much easier to avoid the above issues by handling controller
> recover in one single thread. I will address all your comments and post
> V3 for review.
> 
> 
> Thanks,
> Ming
> 
jianchao.wang April 28, 2018, 2:31 p.m. UTC | #3
Hi Ming and Keith

Let me expand on the details a bit more here. :)

On 04/28/2018 09:35 PM, Keith Busch wrote:
>> Actually there isn't the case before, even for legacy path, one .timeout()
>> handles one request only.

Yes, .timeout should be invoked for every timed-out request, and in principle
.timeout should handle only that one request;
however, nvme_timeout will invoke nvme_dev_disable.

> That's not quite what I was talking about.
> 
> Before, only the command that was about to be sent to the driver's
> .timeout() was marked completed. The driver could (and did) complete
> other timed out commands in a single .timeout(), and the tag would
> clear, so we could handle all timeouts in a single .timeout().

I think Keith is saying that,
before this new blk-mq timeout implementation, the logic of blk_mq_timeout_work was:

get _only_ _one_ timed-out request
mark it completed
invoke .timeout (in nvme, it is nvme_timeout)
then nvme_dev_disable is invoked, and thus other requests can be completed by blk_mq_complete_request
because they have not been marked completed

> 
> Now, blk-mq marks all timed out commands as aborted prior to calling
> the driver's .timeout(). If the driver completes any of those commands,
> the tag does not clear, so the driver's .timeout() just gets to be called
> again for commands it already reaped.
> 

With the new blk-mq timeout implementation:

set the aborted_gstate of _all_ the timed-out requests
invoke .timeout one by one
for the first timed-out request's .timeout (in nvme, it is nvme_timeout),
nvme_dev_disable is invoked and tries to complete all the in-flight requests through blk_mq_complete_request
but timed-out requests whose aborted_gstate has been set cannot be handled by blk_mq_complete_request
so some requests are leaked by nvme_dev_disable
these residual timed-out requests will still be handled by blk_mq_timeout_work, which invokes .timeout for them one by one
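
The reason blk_mq_complete_request cannot handle them is the aborted_gstate
check in blk-mq, which looks roughly like this (simplified and paraphrased,
not a verbatim copy of the 4.16 code):

void blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
	int srcu_idx;

	hctx_lock(hctx, &srcu_idx);
	/* if the timeout code already claimed this request, skip it */
	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
		__blk_mq_complete_request(rq);
	hctx_unlock(hctx, srcu_idx);
}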


Thanks
Jianchao


>
Ming Lei April 28, 2018, 9:39 p.m. UTC | #4
On Sat, Apr 28, 2018 at 9:35 PM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
>> > I understand how the problems are happening a bit better now. It used
>> > to be that blk-mq would lock an expired command one at a time, so when
>> > we had a batch of IO timeouts, the driver was able to complete all of
>> > them inside a single IO timeout handler.
>> >
>> > That's not the case anymore, so the driver is called for every IO
>> > timeout even if it already reaped all the commands at once.
>>
>> Actually there isn't the case before, even for legacy path, one .timeout()
>> handles one request only.
>
> That's not quite what I was talking about.
>
> Before, only the command that was about to be sent to the driver's
> .timeout() was marked completed. The driver could (and did) complete
> other timed out commands in a single .timeout(), and the tag would
> clear, so we could handle all timeouts in a single .timeout().
>
> Now, blk-mq marks all timed out commands as aborted prior to calling
> the driver's .timeout(). If the driver completes any of those commands,
> the tag does not clear, so the driver's .timeout() just gets to be called
> again for commands it already reaped.

That won't happen, because the new timeout model marks timed-out requests
as aborted first, then runs synchronize_rcu() before making these requests
really expired, and the RCU lock is now held in the normal completion
handler (blk_mq_complete_request).

Yes, Bart is working towards that, but there is still the same race
between the timeout handler (nvme_dev_disable()) and reset_work(), and nothing
changes wrt. the timeout model:

- reset may take a while to complete because of nvme_wait_freeze(), and a
timeout can happen during resetting, so the reset may hang forever. Even
without nvme_wait_freeze(), it is in theory possible for a timeout to happen
during reset work too.

Actually, for non-shutdown it isn't necessary to freeze queues at all; it
is enough to just quiesce queues to make the hardware happy for recovery.
That has been part of my V2 patchset.

But it is really simple and clean to run the recovery (nvme_dev_disable and
reset_work) in one single context to avoid this race, and the two
parts should be done together to make our life easier; that is why
I was trying to work towards this direction.

Thanks,
Ming
Keith Busch April 30, 2018, 7:52 p.m. UTC | #5
On Sun, Apr 29, 2018 at 05:39:52AM +0800, Ming Lei wrote:
> On Sat, Apr 28, 2018 at 9:35 PM, Keith Busch
> <keith.busch@linux.intel.com> wrote:
> > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
> >> > I understand how the problems are happening a bit better now. It used
> >> > to be that blk-mq would lock an expired command one at a time, so when
> >> > we had a batch of IO timeouts, the driver was able to complete all of
> >> > them inside a single IO timeout handler.
> >> >
> >> > That's not the case anymore, so the driver is called for every IO
> >> > timeout even if it already reaped all the commands at once.
> >>
> >> Actually there isn't the case before, even for legacy path, one .timeout()
> >> handles one request only.
> >
> > That's not quite what I was talking about.
> >
> > Before, only the command that was about to be sent to the driver's
> > .timeout() was marked completed. The driver could (and did) complete
> > other timed out commands in a single .timeout(), and the tag would
> > clear, so we could handle all timeouts in a single .timeout().
> >
> > Now, blk-mq marks all timed out commands as aborted prior to calling
> > the driver's .timeout(). If the driver completes any of those commands,
> > the tag does not clear, so the driver's .timeout() just gets to be called
> > again for commands it already reaped.
> 
> That won't happen because new timeout model will mark aborted on timed-out
> request first, then run synchronize_rcu() before making these requests
> really expired, and now rcu lock is held in normal completion
> handler(blk_mq_complete_request).
> 
> Yes, Bart is working towards that way, but there is still the same race
> between timeout handler(nvme_dev_disable()) and reset_work(), and nothing
> changes wrt. the timeout model:

Yeah, the driver makes sure there are no possible outstanding commands at
the end of nvme_dev_disable. This should mean there's no timeout handler
running because there are no possible commands for that handler. But that's
not really the case anymore, so we had been inadvertently depending on
that behavior.

> - reset may take a while to complete because of nvme_wait_freeze(), and
> timeout can happen during resetting, then reset may hang forever. Even
> without nvme_wait_freeze(), it is possible for timeout to happen during
> reset work too in theory.
>
> Actually for non-shutdown, it isn't necessary to freeze queue at all, and it
> is enough to just quiesce queues to make hardware happy for recovery,
> that has been part of my V2 patchset.

When we freeze, we prevent IOs from entering contexts that may not be
valid on the other side of the reset. It's not very common for the
context count to change, but it can happen.

Anyway, will take a look at your series and catch up on the notes from
you and Jianchao.
Ming Lei April 30, 2018, 11:14 p.m. UTC | #6
On Mon, Apr 30, 2018 at 01:52:17PM -0600, Keith Busch wrote:
> On Sun, Apr 29, 2018 at 05:39:52AM +0800, Ming Lei wrote:
> > On Sat, Apr 28, 2018 at 9:35 PM, Keith Busch
> > <keith.busch@linux.intel.com> wrote:
> > > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
> > >> > I understand how the problems are happening a bit better now. It used
> > >> > to be that blk-mq would lock an expired command one at a time, so when
> > >> > we had a batch of IO timeouts, the driver was able to complete all of
> > >> > them inside a single IO timeout handler.
> > >> >
> > >> > That's not the case anymore, so the driver is called for every IO
> > >> > timeout even if it already reaped all the commands at once.
> > >>
> > >> Actually there isn't the case before, even for legacy path, one .timeout()
> > >> handles one request only.
> > >
> > > That's not quite what I was talking about.
> > >
> > > Before, only the command that was about to be sent to the driver's
> > > .timeout() was marked completed. The driver could (and did) complete
> > > other timed out commands in a single .timeout(), and the tag would
> > > clear, so we could handle all timeouts in a single .timeout().
> > >
> > > Now, blk-mq marks all timed out commands as aborted prior to calling
> > > the driver's .timeout(). If the driver completes any of those commands,
> > > the tag does not clear, so the driver's .timeout() just gets to be called
> > > again for commands it already reaped.
> > 
> > That won't happen because new timeout model will mark aborted on timed-out
> > request first, then run synchronize_rcu() before making these requests
> > really expired, and now rcu lock is held in normal completion
> > handler(blk_mq_complete_request).
> > 
> > Yes, Bart is working towards that way, but there is still the same race
> > between timeout handler(nvme_dev_disable()) and reset_work(), and nothing
> > changes wrt. the timeout model:
> 
> Yeah, the driver makes sure there are no possible outstanding commands at
> the end of nvme_dev_disable. This should mean there's no timeout handler
> running because there's no possible commands for that handler. But that's
> not really the case anymore, so we had been inadvertently depending on
> that behavior.

I guess we may not depend on that behavior, because the timeout work
is per request queue (namespace), and timeouts from all namespaces and the
admin queue may happen at the same time; meanwhile the .timeout() handlers may
run at different times because of scheduling delay, and one of them may
cause nvme_dev_disable() to be called during resetting, not to mention the
case of a timeout triggered by reset_work().

That means we may have to drain timeouts too, even after Bart's patch is merged.

In short, there are several issues wrt. NVMe recovery:

1) a timeout may be triggered in reset_work() by draining IO in
wait_freeze()

2) a timeout may still be triggered by another queue, and nvme_dev_disable()
may be called during a reset that was scheduled by another queue's timeout

In both 1) and 2), queues can be quiesced and wait_freeze() in reset_work()
may never complete, so the controller can't be recovered at all.

3) races related to start_freeze & unfreeze()


And it may be fixed by changing the model into the following two parts:

1) recovering the controller:
- freeze queues
- nvme_dev_disable()
- resetting & setting up queues

2) post-reset or post-recovery
- wait for freezing & unfreezing

And make sure that #1 can always make progress to recover the controller even
when #2 is blocked by a timeout.

If freezing can be removed, #2 may not be necessary, but that may
cause more requests to be handled while the hardware is being recovered, so it
is still reasonable to keep freezing as before.
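
In other words, roughly the following sketch (the helper names and the
post_reset_work member are only illustrative, not the actual patch):

static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);

	/* part 1: recovering the controller, never blocked by timeout */
	nvme_start_freeze(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_setup_ctrl_and_queues(dev);	/* resetting & setting up queues */

	/*
	 * part 2: post-recovery, which may be blocked by timed-out IO, so
	 * run it from a separate work item instead of blocking part 1.
	 */
	queue_work(nvme_wq, &dev->post_reset_work);	/* wait_freeze + unfreeze */
}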

> 
> > - reset may take a while to complete because of nvme_wait_freeze(), and
> > timeout can happen during resetting, then reset may hang forever. Even
> > without nvme_wait_freeze(), it is possible for timeout to happen during
> > reset work too in theory.
> >
> > Actually for non-shutdown, it isn't necessary to freeze queue at all, and it
> > is enough to just quiesce queues to make hardware happy for recovery,
> > that has been part of my V2 patchset.
> 
> When we freeze, we prevent IOs from entering contexts that may not be
> valid on the other side of the reset. It's not very common for the
> context count to change, but it can happen.
> 
> Anyway, will take a look at your series and catch up on the notes from
> you and Jianchao.

V2 has been posted; freezing isn't removed, but moved to
post-reset.

The main approach in V2 should be fine, but there are still issues
(the change may break resets from other contexts, such as PCI reset, and
freezing caused by update_nr_hw_queues), and the implementation can
be made simpler by partitioning the reset work into two parts.

I am working on V3, but any comments on V2 are welcome, especially about
the approach taken.

Thanks,
Ming
Keith Busch May 8, 2018, 3:30 p.m. UTC | #7
On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
> This sync may be raced with one timed-out request, which may be handled
> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
> work reliably. 

Ming,

As proposed, that scenario is impossible to encounter. Resetting the
controller inline with the timeout reaps all the commands, and then
sets the controller state to RESETTING. While blk-mq may not allow the
driver to complete those requests, having the driver sync with the queues
will hold the controller in the reset state until blk-mq is done with
its timeout work; therefore, it is impossible for the NVMe driver to
return "BLK_EH_RESET_TIMER", and all commands will be completed through
nvme_timeout's BLK_EH_HANDLED exactly as desired.
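
For reference, the relevant branch in nvme_timeout looks roughly like this
(paraphrased from memory, not a verbatim copy of the upstream code):

	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
	case NVME_CTRL_RESETTING:
		/* recovery is already in progress; just reap the command */
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_HANDLED;
	default:
		break;
	}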

Could you please recheck my suggestion? The alternatives proposed are
far too risky for a 4.17 consideration, and I'm hoping we can stabilize
this behavior in the current release if possible.

Thanks,
Keith
Ming Lei May 10, 2018, 8:52 p.m. UTC | #8
Hi Keith,

On Tue, May 8, 2018 at 11:30 PM, Keith Busch <keith.busch@intel.com> wrote:
> On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
>> This sync may be raced with one timed-out request, which may be handled
>> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
>> work reliably.
>
> Ming,
>
> As proposed, that scenario is impossible to encounter. Resetting the
> controller inline with the timeout reaps all the commands, and then
> sets the controller state to RESETTING. While blk-mq may not allow the
> driver to complete those requests, having the driver sync with the queues
> will hold the controller in the reset state until blk-mq is done with
> its timeout work; therefore, it is impossible for the NVMe driver to
> return "BLK_EH_RESET_TIMER", and all commands will be completed through
> nvme_timeout's BLK_EH_HANDLED exactly as desired.

That isn't true for the multiple-namespace case: each request queue has its
own timeout work, and all these timeout works can be triggered concurrently.

>
> Could you please recheck my suggestion? The alternatives proposed are
> far too risky for a 4.17 consideration, and I'm hoping we can stabilize
> this behavior in the current release if possible.

Bart's timeout rework won't cover the multiple-namespace case, so
we have to sync timeouts inside the driver.

The approach taken in my V4 is simpler; could you take a look at it?

Thanks,
Ming Lei
Keith Busch May 10, 2018, 9:05 p.m. UTC | #9
On Fri, May 11, 2018 at 04:52:11AM +0800, Ming Lei wrote:
> Hi Keith,
> 
> On Tue, May 8, 2018 at 11:30 PM, Keith Busch <keith.busch@intel.com> wrote:
> > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
> >> This sync may be raced with one timed-out request, which may be handled
> >> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
> >> work reliably.
> >
> > Ming,
> >
> > As proposed, that scenario is impossible to encounter. Resetting the
> > controller inline with the timeout reaps all the commands, and then
> > sets the controller state to RESETTING. While blk-mq may not allow the
> > driver to complete those requests, having the driver sync with the queues
> > will hold the controller in the reset state until blk-mq is done with
> > its timeout work; therefore, it is impossible for the NVMe driver to
> > return "BLK_EH_RESET_TIMER", and all commands will be completed through
> > nvme_timeout's BLK_EH_HANDLED exactly as desired.
> 
> That isn't true for multiple namespace case,  each request queue has its
> own timeout work, and all these timeout work can be triggered concurrently.

The controller state is most certainly not per queue/namespace. It's
global to the controller. Once the reset is triggered, nvme_timeout can
only return EH_HANDLED.
Ming Lei May 10, 2018, 9:10 p.m. UTC | #10
On Fri, May 11, 2018 at 5:05 AM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Fri, May 11, 2018 at 04:52:11AM +0800, Ming Lei wrote:
>> Hi Keith,
>>
>> On Tue, May 8, 2018 at 11:30 PM, Keith Busch <keith.busch@intel.com> wrote:
>> > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
>> >> This sync may be raced with one timed-out request, which may be handled
>> >> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
>> >> work reliably.
>> >
>> > Ming,
>> >
>> > As proposed, that scenario is impossible to encounter. Resetting the
>> > controller inline with the timeout reaps all the commands, and then
>> > sets the controller state to RESETTING. While blk-mq may not allow the
>> > driver to complete those requests, having the driver sync with the queues
>> > will hold the controller in the reset state until blk-mq is done with
>> > its timeout work; therefore, it is impossible for the NVMe driver to
>> > return "BLK_EH_RESET_TIMER", and all commands will be completed through
>> > nvme_timeout's BLK_EH_HANDLED exactly as desired.
>>
>> That isn't true for multiple namespace case,  each request queue has its
>> own timeout work, and all these timeout work can be triggered concurrently.
>
> The controller state is most certainly not per queue/namespace. It's
> global to the controller. Once the reset is triggered, nvme_timeout can
> only return EH_HANDLED.

It is related to EH_HANDLED; please see the following case:

1) when req A from N1 times out, nvme_timeout() handles
it as EH_HANDLED: nvme_dev_disable() is called and a reset is scheduled.

2) when req B from N2 times out, nvme_timeout() handles
it as EH_HANDLED, then nvme_dev_disable() is called exactly
when the reset is in progress, so queues become quiesced, and nothing
can move on in the reset triggered by N1.

Thanks,
Ming Lei
Keith Busch May 10, 2018, 9:18 p.m. UTC | #11
On Fri, May 11, 2018 at 05:10:40AM +0800, Ming Lei wrote:
> On Fri, May 11, 2018 at 5:05 AM, Keith Busch
> <keith.busch@linux.intel.com> wrote:
> > On Fri, May 11, 2018 at 04:52:11AM +0800, Ming Lei wrote:
> >> Hi Keith,
> >>
> >> On Tue, May 8, 2018 at 11:30 PM, Keith Busch <keith.busch@intel.com> wrote:
> >> > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
> >> >> This sync may be raced with one timed-out request, which may be handled
> >> >> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
> >> >> work reliably.
> >> >
> >> > Ming,
> >> >
> >> > As proposed, that scenario is impossible to encounter. Resetting the
> >> > controller inline with the timeout reaps all the commands, and then
> >> > sets the controller state to RESETTING. While blk-mq may not allow the
> >> > driver to complete those requests, having the driver sync with the queues
> >> > will hold the controller in the reset state until blk-mq is done with
> >> > its timeout work; therefore, it is impossible for the NVMe driver to
> >> > return "BLK_EH_RESET_TIMER", and all commands will be completed through
> >> > nvme_timeout's BLK_EH_HANDLED exactly as desired.
> >>
> >> That isn't true for multiple namespace case,  each request queue has its
> >> own timeout work, and all these timeout work can be triggered concurrently.
> >
> > The controller state is most certainly not per queue/namespace. It's
> > global to the controller. Once the reset is triggered, nvme_timeout can
> > only return EH_HANDLED.
> 
> It is related with EH_HANDLED, please see the following case:
> 
> 1) when req A from N1 is timed out, nvme_timeout() handles
> it as EH_HANDLED: nvme_dev_disable() and reset is scheduled.
> 
> 2) when req B from N2 is timed out, nvme_timeout() handles
> it as EH_HANDLED, then nvme_dev_disable() is called exactly
> when reset is in-progress, so queues become quiesced, and nothing
> can move on in the resetting triggered by N1.

Huh? The nvme_sync_queues ensures that doesn't happen. That was the
whole point.
Ming Lei May 10, 2018, 9:24 p.m. UTC | #12
On Thu, May 10, 2018 at 03:18:29PM -0600, Keith Busch wrote:
> On Fri, May 11, 2018 at 05:10:40AM +0800, Ming Lei wrote:
> > On Fri, May 11, 2018 at 5:05 AM, Keith Busch
> > <keith.busch@linux.intel.com> wrote:
> > > On Fri, May 11, 2018 at 04:52:11AM +0800, Ming Lei wrote:
> > >> Hi Keith,
> > >>
> > >> On Tue, May 8, 2018 at 11:30 PM, Keith Busch <keith.busch@intel.com> wrote:
> > >> > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
> > >> >> This sync may be raced with one timed-out request, which may be handled
> > >> >> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
> > >> >> work reliably.
> > >> >
> > >> > Ming,
> > >> >
> > >> > As proposed, that scenario is impossible to encounter. Resetting the
> > >> > controller inline with the timeout reaps all the commands, and then
> > >> > sets the controller state to RESETTING. While blk-mq may not allow the
> > >> > driver to complete those requests, having the driver sync with the queues
> > >> > will hold the controller in the reset state until blk-mq is done with
> > >> > its timeout work; therefore, it is impossible for the NVMe driver to
> > >> > return "BLK_EH_RESET_TIMER", and all commands will be completed through
> > >> > nvme_timeout's BLK_EH_HANDLED exactly as desired.
> > >>
> > >> That isn't true for multiple namespace case,  each request queue has its
> > >> own timeout work, and all these timeout work can be triggered concurrently.
> > >
> > > The controller state is most certainly not per queue/namespace. It's
> > > global to the controller. Once the reset is triggered, nvme_timeout can
> > > only return EH_HANDLED.
> > 
> > It is related with EH_HANDLED, please see the following case:
> > 
> > 1) when req A from N1 is timed out, nvme_timeout() handles
> > it as EH_HANDLED: nvme_dev_disable() and reset is scheduled.
> > 
> > 2) when req B from N2 is timed out, nvme_timeout() handles
> > it as EH_HANDLED, then nvme_dev_disable() is called exactly
> > when reset is in-progress, so queues become quiesced, and nothing
> > can move on in the resetting triggered by N1.
> 
> Huh? The nvme_sync_queues ensures that doesn't happen. That was the
> whole point.

Could you share the link?

Firstly, the previous nvme_sync_queues() won't work reliably, so this
patch introduces blk_unquiesce_timeout() and blk_quiesce_timeout() for
this purpose.

Secondly, I remember that you only call nvme_sync_queues() at the
entry of nvme_reset_work(), but a timeout (either admin or normal IO)
can happen again during resetting; that is another race addressed by
this patchset, but not covered by your proposal.

Thanks,
Ming
Keith Busch May 10, 2018, 9:44 p.m. UTC | #13
On Fri, May 11, 2018 at 05:24:46AM +0800, Ming Lei wrote:
> Could you share me the link?

The diff was in this reply here:

http://lists.infradead.org/pipermail/linux-nvme/2018-April/017019.html

> Firstly, the previous nvme_sync_queues() won't work reliably, so this
> patch introduces blk_unquiesce_timeout() and blk_quiesce_timeout() for
> this purpose.
>
> Secondly, I remembered that you only call nvme_sync_queues() at the
> entry of nvme_reset_work(), but timeout(either admin or normal IO)
> can happen again during resetting, that is another race addressed by
> this patchset, but can't cover by your proposal.

I sync the queues at the beginning because it ensures there is not
a single in-flight request for the entire controller (all namespaces
plus admin queue) before transitioning to the connecting state.

If a command times out during the connecting state, we go to the dead state.
Ming Lei May 10, 2018, 9:50 p.m. UTC | #14
On Thu, May 10, 2018 at 03:44:41PM -0600, Keith Busch wrote:
> On Fri, May 11, 2018 at 05:24:46AM +0800, Ming Lei wrote:
> > Could you share me the link?
> 
> The diff was in this reply here:
> 
> http://lists.infradead.org/pipermail/linux-nvme/2018-April/017019.html
> 
> > Firstly, the previous nvme_sync_queues() won't work reliably, so this
> > patch introduces blk_unquiesce_timeout() and blk_quiesce_timeout() for
> > this purpose.
> >
> > Secondly, I remembered that you only call nvme_sync_queues() at the
> > entry of nvme_reset_work(), but timeout(either admin or normal IO)
> > can happen again during resetting, that is another race addressed by
> > this patchset, but can't cover by your proposal.
> 
> I sync the queues at the beginning because it ensures there is not
> a single in flight request for the entire controller (all namespaces
> plus admin queue) before transitioning to the connecting state.

But it can't avoid the race I mentioned, since nvme_dev_disable() can
happen again during resetting.

> 
> If a command times out during connecting state, we go to the dead state.

That is too risky, since IO during resetting isn't much different
from IO submitted via other IO paths.

For example, in the case of blktests block/011, the controller shouldn't have
been put into the dead state, and this patchset (V4 & V5) covers this case well.

Thanks,
Ming
Ming Lei May 10, 2018, 9:53 p.m. UTC | #15
On Thu, May 10, 2018 at 03:44:41PM -0600, Keith Busch wrote:
> On Fri, May 11, 2018 at 05:24:46AM +0800, Ming Lei wrote:
> > Could you share me the link?
> 
> The diff was in this reply here:
> 
> http://lists.infradead.org/pipermail/linux-nvme/2018-April/017019.html
> 
> > Firstly, the previous nvme_sync_queues() won't work reliably, so this
> > patch introduces blk_unquiesce_timeout() and blk_quiesce_timeout() for
> > this purpose.
> >
> > Secondly, I remembered that you only call nvme_sync_queues() at the
> > entry of nvme_reset_work(), but timeout(either admin or normal IO)
> > can happen again during resetting, that is another race addressed by
> > this patchset, but can't cover by your proposal.
> 
> I sync the queues at the beginning because it ensures there is not
> a single in flight request for the entire controller (all namespaces
> plus admin queue) before transitioning to the connecting state.
> 
> If a command times out during connecting state, we go to the dead state.

Actually, it hangs forever in blk_mq_freeze_queue_wait() from
nvme_reset_dev(); it is not a dead state of the controller.

Thanks,
Ming
Ming Lei May 10, 2018, 10:03 p.m. UTC | #16
On Fri, May 11, 2018 at 5:18 AM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Fri, May 11, 2018 at 05:10:40AM +0800, Ming Lei wrote:
>> On Fri, May 11, 2018 at 5:05 AM, Keith Busch
>> <keith.busch@linux.intel.com> wrote:
>> > On Fri, May 11, 2018 at 04:52:11AM +0800, Ming Lei wrote:
>> >> Hi Keith,
>> >>
>> >> On Tue, May 8, 2018 at 11:30 PM, Keith Busch <keith.busch@intel.com> wrote:
>> >> > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
>> >> >> This sync may be raced with one timed-out request, which may be handled
>> >> >> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
>> >> >> work reliably.
>> >> >
>> >> > Ming,
>> >> >
>> >> > As proposed, that scenario is impossible to encounter. Resetting the
>> >> > controller inline with the timeout reaps all the commands, and then
>> >> > sets the controller state to RESETTING. While blk-mq may not allow the
>> >> > driver to complete those requests, having the driver sync with the queues
>> >> > will hold the controller in the reset state until blk-mq is done with
>> >> > its timeout work; therefore, it is impossible for the NVMe driver to
>> >> > return "BLK_EH_RESET_TIMER", and all commands will be completed through
>> >> > nvme_timeout's BLK_EH_HANDLED exactly as desired.
>> >>
>> >> That isn't true for multiple namespace case,  each request queue has its
>> >> own timeout work, and all these timeout work can be triggered concurrently.
>> >
>> > The controller state is most certainly not per queue/namespace. It's
>> > global to the controller. Once the reset is triggered, nvme_timeout can
>> > only return EH_HANDLED.
>>
>> It is related with EH_HANDLED, please see the following case:
>>
>> 1) when req A from N1 is timed out, nvme_timeout() handles
>> it as EH_HANDLED: nvme_dev_disable() and reset is scheduled.
>>
>> 2) when req B from N2 is timed out, nvme_timeout() handles
>> it as EH_HANDLED, then nvme_dev_disable() is called exactly
>> when reset is in-progress, so queues become quiesced, and nothing
>> can move on in the resetting triggered by N1.
>
> Huh? The nvme_sync_queues ensures that doesn't happen. That was the
> whole point.

Sorry, I forgot to mention: it isn't enough to simply sync timeouts inside reset().

Another tricky thing is freeze & unfreeze: currently freeze is done in
nvme_dev_disable(), and unfreeze is done in nvme_reset_work. That means
we have to make sure both are paired, otherwise queues may be kept
frozen forever.

This issue is covered by my V4 & V5 too.

thanks,
Ming Lei
Keith Busch May 10, 2018, 10:43 p.m. UTC | #17
On Fri, May 11, 2018 at 06:03:59AM +0800, Ming Lei wrote:
> Sorry, forget to mention, it isn't enough to simply sync timeout inside reset().
> 
> Another tricky thing is about freeze & unfreeze, now freeze is done in
> nvme_dev_disable(), and unfreeze is done in nvme_reset_work. That means
> we have to make sure both are paired, otherwise queues may be kept as
> freeze for ever.
> 
> This issue is covered by my V4 & V5 too.

That isn't an issue in my patch either.
Ming Lei May 11, 2018, 12:14 a.m. UTC | #18
On Thu, May 10, 2018 at 04:43:57PM -0600, Keith Busch wrote:
> On Fri, May 11, 2018 at 06:03:59AM +0800, Ming Lei wrote:
> > Sorry, forget to mention, it isn't enough to simply sync timeout inside reset().
> > 
> > Another tricky thing is about freeze & unfreeze, now freeze is done in
> > nvme_dev_disable(), and unfreeze is done in nvme_reset_work. That means
> > we have to make sure both are paired, otherwise queues may be kept as
> > freeze for ever.
> > 
> > This issue is covered by my V4 & V5 too.
> 
> That isn't an issue in my patch either.

When blk_sync_queue() is used in your patch to drain timeouts,
several nvme_timeout() instances may be drained; that means
nvme_start_freeze() may be called more than once from all these
nvme_timeout() instances, but finally only one nvme_reset_work()
is run, so queues are kept frozen forever even though
the reset completes successfully.
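
To illustrate (simplified sketch; blk_freeze_queue_start() bumps a per-queue
freeze depth and blk_mq_unfreeze_queue() drops it, so IO only resumes when
the depth reaches zero):

nvme_start_freeze(&dev->ctrl);	/* 1st nvme_timeout -> nvme_dev_disable(): depth = 1 */
nvme_start_freeze(&dev->ctrl);	/* 2nd nvme_timeout -> nvme_dev_disable(): depth = 2 */
	...
nvme_unfreeze(&dev->ctrl);	/* the single nvme_reset_work():          depth = 1 */
/* the depth never reaches zero, so the namespace queues stay frozen forever */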

So your patch won't fix this issue.

All these problems are mentioned in the commit log of V4, and I
am proposing this approach to try to address them all.

Thanks,
Ming
Ming Lei May 11, 2018, 2:10 a.m. UTC | #19
On Fri, May 11, 2018 at 5:05 AM, Keith Busch
<keith.busch@linux.intel.com> wrote:
> On Fri, May 11, 2018 at 04:52:11AM +0800, Ming Lei wrote:
>> Hi Keith,
>>
>> On Tue, May 8, 2018 at 11:30 PM, Keith Busch <keith.busch@intel.com> wrote:
>> > On Sat, Apr 28, 2018 at 11:50:17AM +0800, Ming Lei wrote:
>> >> This sync may be raced with one timed-out request, which may be handled
>> >> as BLK_EH_HANDLED or BLK_EH_RESET_TIMER, so the above sync queues can't
>> >> work reliably.
>> >
>> > Ming,
>> >
>> > As proposed, that scenario is impossible to encounter. Resetting the
>> > controller inline with the timeout reaps all the commands, and then
>> > sets the controller state to RESETTING. While blk-mq may not allow the
>> > driver to complete those requests, having the driver sync with the queues
>> > will hold the controller in the reset state until blk-mq is done with
>> > its timeout work; therefore, it is impossible for the NVMe driver to
>> > return "BLK_EH_RESET_TIMER", and all commands will be completed through
>> > nvme_timeout's BLK_EH_HANDLED exactly as desired.
>>
>> That isn't true for multiple namespace case,  each request queue has its
>> own timeout work, and all these timeout work can be triggered concurrently.
>
> The controller state is most certainly not per queue/namespace. It's
> global to the controller. Once the reset is triggered, nvme_timeout can
> only return EH_HANDLED.

One exception is PCI error recovery, in which EH_RESET_TIMER may still
be returned at any time.

Also, two timeouts can happen at the same time from more than one
NS, just before resetting is started (before updating to NVME_CTRL_RESETTING).

Or one timeout is from the admin queue and another one is from an NS, both happening
at the same time, still before updating to NVME_CTRL_RESETTING.

In the above two situations, one timeout can be handled as EH_HANDLED, and
the other can be handled as EH_RESET_TIMER.

So it isn't enough to simply drain timeouts with blk_sync_queue().


Thanks,
Ming Lei

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a3771c5729f5..198b4469c3e2 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3562,12 +3562,25 @@  void nvme_start_queues(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns;
 
 	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
 		blk_mq_unquiesce_queue(ns->queue);
+		blk_mq_kick_requeue_list(ns->queue);
+	}
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
 {
 	if (!ctrl->ops->reinit_request)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7ded7a51c430..e62273198725 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -402,6 +402,7 @@  int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fbc71fac6f1e..a2f3ad105620 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2303,6 +2303,7 @@  static void nvme_reset_work(struct work_struct *work)
 	 */
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
 
 	/*
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the