diff mbox series

[3/3] nvme-pci: use queue close instead of queue freeze

Message ID 1536120586-3378-4-git-send-email-jianchao.w.wang@oracle.com (mailing list archive)
State New, archived
Headers show
Series Introduce a light-weight queue close feature | expand

Commit Message

jianchao.wang Sept. 5, 2018, 4:09 a.m. UTC
nvme_dev_disable freezes queues to prevent new IO. nvme_reset_work
will unfreeze and wait to drain the queues. However, if an IO times out
at that moment, nobody can perform recovery because nvme_reset_work is
waiting. We will encounter an IO hang.

To avoid this scenario, use queue close to prevent new IO, which
doesn't require draining the queues. And just use queue freeze to
try to wait for in-flight IO in the shutdown case.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
---
 drivers/nvme/host/core.c | 22 ++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  3 +++
 drivers/nvme/host/pci.c  | 27 ++++++++++++++-------------
 3 files changed, 39 insertions(+), 13 deletions(-)

Comments

Ming Lei Sept. 5, 2018, 10:09 p.m. UTC | #1
On Wed, Sep 05, 2018 at 12:09:46PM +0800, Jianchao Wang wrote:
> nvme_dev_disable freezes queues to prevent new IO. nvme_reset_work
> will unfreeze and wait to drain the queues. However, if IO timeout
> at the moment, no body could do recovery as nvme_reset_work is
> waiting. We will encounter IO hang.
> 
> To avoid this scenario, use queue close to prevent new IO which
> doesn't need to drain the queues. And just use queue freeze to
> try to wait for in-flight IO for shutdown case.
> 
> Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
> ---
>  drivers/nvme/host/core.c | 22 ++++++++++++++++++++++
>  drivers/nvme/host/nvme.h |  3 +++
>  drivers/nvme/host/pci.c  | 27 ++++++++++++++-------------
>  3 files changed, 39 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index dd8ec1d..ce5b35b 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3602,6 +3602,28 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
>  }
>  EXPORT_SYMBOL_GPL(nvme_kill_queues);
>  
> +void nvme_close_queues(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns;
> +
> +	down_read(&ctrl->namespaces_rwsem);
> +	list_for_each_entry(ns, &ctrl->namespaces, list)
> +		blk_set_queue_closed(ns->queue);
> +	up_read(&ctrl->namespaces_rwsem);
> +}
> +EXPORT_SYMBOL_GPL(nvme_close_queues);
> +
> +void nvme_open_queues(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns;
> +
> +	down_read(&ctrl->namespaces_rwsem);
> +	list_for_each_entry(ns, &ctrl->namespaces, list)
> +		blk_clear_queue_closed(ns->queue);
> +	up_read(&ctrl->namespaces_rwsem);
> +}
> +EXPORT_SYMBOL_GPL(nvme_open_queues);
> +
>  void nvme_unfreeze(struct nvme_ctrl *ctrl)
>  {
>  	struct nvme_ns *ns;
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index bb4a200..fcd44cb 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -437,6 +437,9 @@ void nvme_wait_freeze(struct nvme_ctrl *ctrl);
>  void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
>  void nvme_start_freeze(struct nvme_ctrl *ctrl);
>  
> +void nvme_close_queues(struct nvme_ctrl *ctrl);
> +void nvme_open_queues(struct nvme_ctrl *ctrl);
> +
>  #define NVME_QID_ANY -1
>  struct request *nvme_alloc_request(struct request_queue *q,
>  		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index d668682..c0ccd04 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2145,23 +2145,25 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>  	struct pci_dev *pdev = to_pci_dev(dev->dev);
>  
>  	mutex_lock(&dev->shutdown_lock);
> +	nvme_close_queues(&dev->ctrl);
>  	if (pci_is_enabled(pdev)) {
>  		u32 csts = readl(dev->bar + NVME_REG_CSTS);
>  
> -		if (dev->ctrl.state == NVME_CTRL_LIVE ||
> -		    dev->ctrl.state == NVME_CTRL_RESETTING)
> -			nvme_start_freeze(&dev->ctrl);
>  		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
>  			pdev->error_state  != pci_channel_io_normal);
> -	}
>  
> -	/*
> -	 * Give the controller a chance to complete all entered requests if
> -	 * doing a safe shutdown.
> -	 */
> -	if (!dead) {
> -		if (shutdown)
> -			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
> +		if (dev->ctrl.state == NVME_CTRL_LIVE ||
> +		    dev->ctrl.state == NVME_CTRL_RESETTING) {
> +			/*
> +			 * Give the controller a chance to complete all entered
> +			 * requests if doing a safe shutdown.
> +			 */
> +			if (!dead && shutdown) {
> +				nvme_start_freeze(&dev->ctrl);
> +				nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
> +				nvme_unfreeze(&dev->ctrl);
> +			}
> +		}
>  	}
>  
>  	nvme_stop_queues(&dev->ctrl);
> @@ -2328,11 +2330,9 @@ static void nvme_reset_work(struct work_struct *work)
>  		new_state = NVME_CTRL_ADMIN_ONLY;
>  	} else {
>  		nvme_start_queues(&dev->ctrl);
> -		nvme_wait_freeze(&dev->ctrl);
>  		/* hit this only when allocate tagset fails */
>  		if (nvme_dev_add(dev))
>  			new_state = NVME_CTRL_ADMIN_ONLY;
> -		nvme_unfreeze(&dev->ctrl);

nvme_dev_add() still may call freeze & unfreeze queue, so your patch
can't avoid draining queue completely here.

Thanks,
Ming
jianchao.wang Sept. 6, 2018, 1:28 a.m. UTC | #2
Hi Ming

On 09/06/2018 06:09 AM, Ming Lei wrote:
> nvme_dev_add() still may call freeze & unfreeze queue, so your patch
> can't avoid draining queue completely here.

Yes, I know this. We still need to freeze the queue when updating nr_hw_queues.
But we move forward a step at least. :)
We don't need to drain the request queue in the normal case of nvme_reset_work.

As for updating nr_hw_queues, we could try some other method on it next.


Thanks
Jianchao
Ming Lei Sept. 6, 2018, 1:07 p.m. UTC | #3
On Thu, Sep 06, 2018 at 09:28:46AM +0800, jianchao.wang wrote:
> Hi Ming
> 
> On 09/06/2018 06:09 AM, Ming Lei wrote:
> > nvme_dev_add() still may call freeze & unfreeze queue, so your patch
> > can't avoid draining queue completely here.
> 
> Yes, I know this. We still need to freeze queue when update nr_hw_queues.
> But we move forward a step at least. :)
> We don't need to drain request queue in normal case of nvme_reset_work.

It is hard to say who is the normal case. In case of CPU hotplug, it is quite
easy to trigger updating nr_hw_queues.

> 
> As for updating nr_hw_queues, we could try some other method on it next.

The thing is that draining the queue may be inevitable inside the reset work
function because of updating nr_hw_queues, which means the approach in
this patchset is just a partial solution.

Or it may not make sense to do that because we may never remove the draining
queue in the code path of updating nr_hw_queues.

Given they belong to same issue, I suggest to solve them all.

Thanks,
Ming
jianchao.wang Sept. 6, 2018, 2:19 p.m. UTC | #4
On 09/06/2018 09:07 PM, Ming Lei wrote:
> On Thu, Sep 06, 2018 at 09:28:46AM +0800, jianchao.wang wrote:
>> Hi Ming
>>
>> On 09/06/2018 06:09 AM, Ming Lei wrote:
>>> nvme_dev_add() still may call freeze & unfreeze queue, so your patch
>>> can't avoid draining queue completely here.
>>
>> Yes, I know this. We still need to freeze queue when update nr_hw_queues.
>> But we move forward a step at least. :)
>> We don't need to drain request queue in normal case of nvme_reset_work.
> 
> It is hard to say who is the normal case. In case of CPU hotplug, it is quite
> easy to trigger updating nr_hw_queues.
> 
>>
>> As for updating nr_hw_queues, we could try some other method on it next.
> 
> The thing is that draining queue may be inevitable inside reset work
> function because of updating nr_hw_queues, that means the approach in
> this patchset is just a partial solution
> 
> Or it may not make sense to do that because we may never remove the draining
> queue in the code path of updating nr_hw_queues.
> 
> Given they belong to same issue, I suggest to solve them all.

I still think that if the goal is just to prevent new IO, freezing the queue is overkill.

With respect to updating nr_hw_queues, we could try to take the bios off the
queued requests, as blk_steal_bios does, and then re-issue these bios after
updating nr_hw_queues.

Thanks
Jianchao

> 
> Thanks,
> Ming
> 
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme@lists.infradead.org
> https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_linux-2Dnvme&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=cybJ83konrbKh9jtodiaZRmNrKAo1YtEI_oyaNIbuvk&s=EBhkv_XTOQb3K3-4esnkBrs_ErKuNzXDqZwrIzEA0ig&e=
>
Ming Lei Sept. 7, 2018, 4:23 p.m. UTC | #5
On Thu, Sep 06, 2018 at 10:19:27PM +0800, jianchao.wang wrote:
> 
> 
> On 09/06/2018 09:07 PM, Ming Lei wrote:
> > On Thu, Sep 06, 2018 at 09:28:46AM +0800, jianchao.wang wrote:
> >> Hi Ming
> >>
> >> On 09/06/2018 06:09 AM, Ming Lei wrote:
> >>> nvme_dev_add() still may call freeze & unfreeze queue, so your patch
> >>> can't avoid draining queue completely here.
> >>
> >> Yes, I know this. We still need to freeze queue when update nr_hw_queues.
> >> But we move forward a step at least. :)
> >> We don't need to drain request queue in normal case of nvme_reset_work.
> > 
> > It is hard to say who is the normal case. In case of CPU hotplug, it is quite
> > easy to trigger updating nr_hw_queues.
> > 
> >>
> >> As for updating nr_hw_queues, we could try some other method on it next.
> > 
> > The thing is that draining queue may be inevitable inside reset work
> > function because of updating nr_hw_queues, that means the approach in
> > this patchset is just a partial solution
> > 
> > Or it may not make sense to do that because we may never remove the draining
> > queue in the code path of updating nr_hw_queues.
> > 
> > Given they belong to same issue, I suggest to solve them all.
> 
> I still think if just prevent new IO, freeze queue is over-kill.
> 
> With respect to update nr_hw_queues, we could try to take the bios down from the
> queued requests as the blk_steal_bios. Then re-issue these bios after updating the
> nr_hw_queues.

It is always easier to say than done, :-)

Please go ahead and post patches, and I am happy to review.


Thanks,
Ming
jianchao.wang Sept. 10, 2018, 1:49 a.m. UTC | #6
On 09/08/2018 12:23 AM, Ming Lei wrote:
> On Thu, Sep 06, 2018 at 10:19:27PM +0800, jianchao.wang wrote:
>>
>>
>> On 09/06/2018 09:07 PM, Ming Lei wrote:
>>> On Thu, Sep 06, 2018 at 09:28:46AM +0800, jianchao.wang wrote:
>>>> Hi Ming
>>>>
>>>> On 09/06/2018 06:09 AM, Ming Lei wrote:
>>>>> nvme_dev_add() still may call freeze & unfreeze queue, so your patch
>>>>> can't avoid draining queue completely here.
>>>>
>>>> Yes, I know this. We still need to freeze queue when update nr_hw_queues.
>>>> But we move forward a step at least. :)
>>>> We don't need to drain request queue in normal case of nvme_reset_work.
>>>
>>> It is hard to say who is the normal case. In case of CPU hotplug, it is quite
>>> easy to trigger updating nr_hw_queues.
>>>
>>>>
>>>> As for updating nr_hw_queues, we could try some other method on it next.
>>>
>>> The thing is that draining queue may be inevitable inside reset work
>>> function because of updating nr_hw_queues, that means the approach in
>>> this patchset is just a partial solution
>>>
>>> Or it may not make sense to do that because we may never remove the draining
>>> queue in the code path of updating nr_hw_queues.
>>>
>>> Given they belong to same issue, I suggest to solve them all.
>>
>> I still think if just prevent new IO, freeze queue is over-kill.
>>
>> With respect to update nr_hw_queues, we could try to take the bios down from the
>> queued requests as the blk_steal_bios. Then re-issue these bios after updating the
>> nr_hw_queues.
> 
> It is always easier to say than done, :-)
> 
> Please go ahead and post patches, and I am happy to review.
> 

Yes, I'm trying it and will post it when complete. :)

Thanks
Jianchao

> 
> Thanks,
> Ming
> 
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme@lists.infradead.org
> https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_linux-2Dnvme&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=8v99qftJpifVTL4l7Od8_RT0UZCUN1aIotRsKWugz28&s=Z56o8mcARcWcRHe1yOr8Hu2a7djUCM6hKtrDTHuA9Qc&e=
>
diff mbox series

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dd8ec1d..ce5b35b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3602,6 +3602,28 @@  void nvme_kill_queues(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_kill_queues);
 
+void nvme_close_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_set_queue_closed(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_close_queues);
+
+void nvme_open_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_clear_queue_closed(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_open_queues);
+
 void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bb4a200..fcd44cb 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -437,6 +437,9 @@  void nvme_wait_freeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
 void nvme_start_freeze(struct nvme_ctrl *ctrl);
 
+void nvme_close_queues(struct nvme_ctrl *ctrl);
+void nvme_open_queues(struct nvme_ctrl *ctrl);
+
 #define NVME_QID_ANY -1
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d668682..c0ccd04 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2145,23 +2145,25 @@  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	mutex_lock(&dev->shutdown_lock);
+	nvme_close_queues(&dev->ctrl);
 	if (pci_is_enabled(pdev)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING)
-			nvme_start_freeze(&dev->ctrl);
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pdev->error_state  != pci_channel_io_normal);
-	}
 
-	/*
-	 * Give the controller a chance to complete all entered requests if
-	 * doing a safe shutdown.
-	 */
-	if (!dead) {
-		if (shutdown)
-			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+		if (dev->ctrl.state == NVME_CTRL_LIVE ||
+		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+			/*
+			 * Give the controller a chance to complete all entered
+			 * requests if doing a safe shutdown.
+			 */
+			if (!dead && shutdown) {
+				nvme_start_freeze(&dev->ctrl);
+				nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+				nvme_unfreeze(&dev->ctrl);
+			}
+		}
 	}
 
 	nvme_stop_queues(&dev->ctrl);
@@ -2328,11 +2330,9 @@  static void nvme_reset_work(struct work_struct *work)
 		new_state = NVME_CTRL_ADMIN_ONLY;
 	} else {
 		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
 		/* hit this only when allocate tagset fails */
 		if (nvme_dev_add(dev))
 			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
 	}
 
 	/*
@@ -2345,6 +2345,7 @@  static void nvme_reset_work(struct work_struct *work)
 		goto out;
 	}
 
+	nvme_open_queues(&dev->ctrl);
 	nvme_start_ctrl(&dev->ctrl);
 	return;