
nvme: allow ANA support to be independent of native multipathing

Message ID 20181115174605.GA19782@redhat.com (mailing list archive)
State New, archived
Series: nvme: allow ANA support to be independent of native multipathing

Commit Message

Mike Snitzer Nov. 15, 2018, 5:46 p.m. UTC
Whether or not ANA is present is a choice of the target implementation;
the host (and whether it supports multipathing) has _zero_ influence on
this.  If the target declares a path as 'inaccessible' the path _is_
inaccessible to the host.  As such, ANA support should be functional
even if native multipathing is not.

Introduce ability to always re-read ANA log page as required due to ANA
error and make current ANA state available via sysfs -- even if native
multipathing is disabled on the host (e.g. nvme_core.multipath=N).

This affords userspace access to the current ANA state independent of
which layer might be doing multipathing.  It also allows multipath-tools
to rely on the NVMe driver for ANA support while dm-multipath takes care
of multipathing.

While implementing these changes care was taken to preserve the exact
ANA functionality and code sequence native multipathing has provided.
This manifests as native multipathing's nvme_failover_req() being
tweaked to call __nvme_update_ana() which was factored out to allow
nvme_update_ana() to be called independent of nvme_failover_req().

And as always, if embedded NVMe users do not want any performance
overhead associated with ANA or native NVMe multipathing they can
disable CONFIG_NVME_MULTIPATH.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/nvme/host/core.c      | 10 +++++----
 drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 48 insertions(+), 15 deletions(-)
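
For context on the sysfs export described above, here is a minimal
userspace sketch (not part of the patch) of how a tool such as
multipath-tools might read the exported ANA state.  The device name is
an illustrative assumption; with native multipathing disabled the
attribute would sit under the regular namespace block device.

	/* Hypothetical reader for the ana_state sysfs attribute; the
	 * path below is an assumption, not taken from the patch. */
	#include <stdio.h>

	int main(void)
	{
		char state[32];
		FILE *f = fopen("/sys/block/nvme0n1/ana_state", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		if (fgets(state, sizeof(state), f))
			printf("ANA state: %s", state);	/* e.g. "optimized" */
		fclose(f);
		return 0;
	}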

Comments

Hannes Reinecke Nov. 16, 2018, 7:25 a.m. UTC | #1
On 11/15/18 6:46 PM, Mike Snitzer wrote:
> Whether or not ANA is present is a choice of the target implementation;
> the host (and whether it supports multipathing) has _zero_ influence on
> this.  If the target declares a path as 'inaccessible' the path _is_
> inaccessible to the host.  As such, ANA support should be functional
> even if native multipathing is not.
> 
> Introduce ability to always re-read ANA log page as required due to ANA
> error and make current ANA state available via sysfs -- even if native
> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> This affords userspace access to the current ANA state independent of
> which layer might be doing multipathing.  It also allows multipath-tools
> to rely on the NVMe driver for ANA support while dm-multipath takes care
> of multipathing.
> 
> While implementing these changes care was taken to preserve the exact
> ANA functionality and code sequence native multipathing has provided.
> This manifests as native multipathing's nvme_failover_req() being
> tweaked to call __nvme_update_ana() which was factored out to allow
> nvme_update_ana() to be called independent of nvme_failover_req().
> 
> And as always, if embedded NVMe users do not want any performance
> overhead associated with ANA or native NVMe multipathing they can
> disable CONFIG_NVME_MULTIPATH.
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>   drivers/nvme/host/core.c      | 10 +++++----
>   drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
>   drivers/nvme/host/nvme.h      |  4 ++++
>   3 files changed, 48 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fe957166c4a9..3df607905628 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
>   		nvme_req(req)->ctrl->comp_seen = true;
>   
>   	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> -		    blk_path_error(status)) {
> -			nvme_failover_req(req);
> -			return;
> +		if (blk_path_error(status)) {
> +			if (req->cmd_flags & REQ_NVME_MPATH) {
> +				nvme_failover_req(req);
> +				return;
> +			}
> +			nvme_update_ana(req);
>   		}
>   
>   		if (!blk_queue_dying(req->q)) {
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 8e03cda770c5..0adbcff5fba2 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
>   
>   inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>   {
> -	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> +	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
>   }
>   
>   /*
> @@ -47,6 +47,35 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	}
>   }
>   
> +static bool nvme_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static void __nvme_update_ana(struct nvme_ns *ns)
> +{
> +	if (!ns->ctrl->ana_log_buf)
> +		return;
> +
> +	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +	queue_work(nvme_wq, &ns->ctrl->ana_work);
> +}
> +
> +void nvme_update_ana(struct request *req)
> +{
> +	struct nvme_ns *ns = req->q->queuedata;
> +	u16 status = nvme_req(req)->status;
> +
> +	if (nvme_ana_error(status))
> +		__nvme_update_ana(ns);
> +}
> +
>   void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> @@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>   	blk_mq_end_request(req, 0);
>   
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> +	if (nvme_ana_error(status)) {
>   		/*
>   		 * If we got back an ANA error we know the controller is alive,
>   		 * but not ready to serve this namespaces.  The spec suggests
>   		 * we should update our general state here, but due to the fact
>   		 * that the admin and I/O queues are not serialized that is
>   		 * fundamentally racy.  So instead just clear the current path,
> -		 * mark the the path as pending and kick of a re-read of the ANA
> +		 * mark the path as pending and kick off a re-read of the ANA
>   		 * log page ASAP.
>   		 */
>   		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> -		}
> -		break;
> +		__nvme_update_ana(ns);
> +		goto kick_requeue;
> +	}
> +
> +	switch (status & 0x7ff) {
>   	case NVME_SC_HOST_PATH_ERROR:
>   		/*
>   		 * Temporary transport disruption in talking to the controller.
> @@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
>   		break;
>   	}
>   
> +kick_requeue:
>   	kblockd_schedule_work(&ns->head->requeue_work);
>   }
>   
Doesn't this need to be protected by 'if (ns->head->disk)' or somesuch?

Cheers,

Hannes
Christoph Hellwig Nov. 16, 2018, 9:14 a.m. UTC | #2
On Thu, Nov 15, 2018 at 12:46:05PM -0500, Mike Snitzer wrote:
> Whether or not ANA is present is a choice of the target implementation;
> the host (and whether it supports multipathing) has _zero_ influence on
> this.  If the target declares a path as 'inaccessible' the path _is_
> inaccessible to the host.  As such, ANA support should be functional
> even if native multipathing is not.
> 
> Introduce ability to always re-read ANA log page as required due to ANA
> error and make current ANA state available via sysfs -- even if native
> multipathing is disabled on the host (e.g. nvme_core.multipath=N).

The first part I could see, but I still want to make it conditional
in some way as nvme is going into deeply embedded setups, and I don't
want to carry the weight of the ANA code around for everyone.

The second I fundamentally disagree with.  And even if you found agreement
it would have to be in a separate patch as it is a separate feature.
Hannes Reinecke Nov. 16, 2018, 9:40 a.m. UTC | #3
On 11/16/18 10:14 AM, Christoph Hellwig wrote:
> On Thu, Nov 15, 2018 at 12:46:05PM -0500, Mike Snitzer wrote:
>> Whether or not ANA is present is a choice of the target implementation;
>> the host (and whether it supports multipathing) has _zero_ influence on
>> this.  If the target declares a path as 'inaccessible' the path _is_
>> inaccessible to the host.  As such, ANA support should be functional
>> even if native multipathing is not.
>>
>> Introduce ability to always re-read ANA log page as required due to ANA
>> error and make current ANA state available via sysfs -- even if native
>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> The first part I could see, but I still want to make it conditional
> in some way as nvme is going into deeply embedded setups, and I don't
> want to carry the weight of the ANA code around for everyone.
> 
Can you clarify this a bit?
We _do_ have the NVME multipath config option to deconfigure the whole 
thing during compile time; that isn't influenced by this patch.
So are you worried about the size of the ANA implementation itself?
Or are you worried about the size of the ANA structures?

> The second I fundamentally disagree with.  And even if you found agreement
> it would have to be in a separate patch as it is a separate feature.
> 
Why? Where's the problem with re-reading the ANA log pages if we get an 
event indicating that we should?

Cheers,

Hannes
Christoph Hellwig Nov. 16, 2018, 9:49 a.m. UTC | #4
On Fri, Nov 16, 2018 at 10:40:40AM +0100, Hannes Reinecke wrote:
>>> Introduce ability to always re-read ANA log page as required due to ANA
>>> error and make current ANA state available via sysfs -- even if native
>>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
>>
>> The first part I could see, but I still want to make it conditional
>> in some way as nvme is going into deeply embedded setups, and I don't
>> want to carry the weight of the ANA code around for everyone.
>>
> Can you clarify this a bit?
> We _do_ have the NVME multipath config option to deconfigure the whole 
> thing during compile time; that isn't influenced by this patch.
> So are you worried about the size of the ANA implementation itself?
> Or are you worried about the size of the ANA structures?

I just see the next step of wanting to move ANA code into the core
which is implied above.
>
>> The second I fundamentally disagree with.  And even if you found agreement
>> it would have to be in a separate patch as it is a separate feature.
>>
> Why? Where's the problem with re-reading the ANA log pages if we get an 
> event indicating that we should?

"second" here means the sysfs file.
Hannes Reinecke Nov. 16, 2018, 10:06 a.m. UTC | #5
On 11/16/18 10:49 AM, Christoph Hellwig wrote:
> On Fri, Nov 16, 2018 at 10:40:40AM +0100, Hannes Reinecke wrote:
>>>> Introduce ability to always re-read ANA log page as required due to ANA
>>>> error and make current ANA state available via sysfs -- even if native
>>>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
>>>
>>> The first part I could see, but I still want to make it conditional
>>> in some way as nvme is going into deeply embedded setups, and I don't
>>> want to carry the weight of the ANA code around for everyone.
>>>
>> Can you clarify this a bit?
>> We _do_ have the NVME multipath config option to deconfigure the whole
>> thing during compile time; that isn't influenced by this patch.
>> So are you worried about the size of the ANA implementation itself?
>> Or are you worried about the size of the ANA structures?
> 
> I just see the next step of wanting to move ANA code into the core
> which is implied above.

Really, I couldn't care less _where_ the ANA code lives.
If its size is a concern we can even make it configurable of sorts.

>>
>>> The second I fundamentally disagree with.  And even if you found agreement
>>> it would have to be in a separate patch as it is a separate feature.
>>>
>> Why? Where's the problem with re-reading the ANA log pages if we get an
>> event indicating that we should?
> 
> "second" here means the sysfs file.
> 
Ok, so would you be happy with making ANA support configurable?

Cheers,

Hannes
Christoph Hellwig Nov. 16, 2018, 10:17 a.m. UTC | #6
On Fri, Nov 16, 2018 at 11:06:32AM +0100, Hannes Reinecke wrote:
> Ok, so would you be happy with making ANA support configurable?

I've looked a bit over the whole situation, and what I think we need
to do is:

 a) warn if we see an ANA-capable device without multipath support
    so people know it is not going to work properly.
 b) deprecate the multipath module option.  It was only intended as
    a migration for any pre-existing PCIe multipath user if there
    were any, not to support any new functionality.  So for 4.20
    put in a patch that prints a clear warning when it is used,
    including a link to the nvme list, and then for 4.25 or so
    remove it entirely unless something unexpected comes up.

This whole drama of optional multipath use has wasted way too much
of everyone's time already.
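
A minimal sketch of what warning (a) could look like, assuming a check
at controller identify time -- the hook point and the message text are
illustrative, not something proposed verbatim in this thread:

	/* Hypothetical: flag an ANA-capable controller when multipath
	 * support is compiled out.  CMIC bit 3 is the same ANA test
	 * used by nvme_ctrl_use_ana() in the patch above. */
	if (!IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
	    ctrl->subsys && (ctrl->subsys->cmic & (1 << 3)))
		dev_warn(ctrl->device,
			 "ANA reported, but multipath support is not compiled in\n");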
Mike Snitzer Nov. 16, 2018, 2:01 p.m. UTC | #7
On Fri, Nov 16 2018 at  2:25am -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 11/15/18 6:46 PM, Mike Snitzer wrote:
> >Whether or not ANA is present is a choice of the target implementation;
> >the host (and whether it supports multipathing) has _zero_ influence on
> >this.  If the target declares a path as 'inaccessible' the path _is_
> >inaccessible to the host.  As such, ANA support should be functional
> >even if native multipathing is not.
> >
> >Introduce ability to always re-read ANA log page as required due to ANA
> >error and make current ANA state available via sysfs -- even if native
> >multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> >
> >This affords userspace access to the current ANA state independent of
> >which layer might be doing multipathing.  It also allows multipath-tools
> >to rely on the NVMe driver for ANA support while dm-multipath takes care
> >of multipathing.
> >
> >While implementing these changes care was taken to preserve the exact
> >ANA functionality and code sequence native multipathing has provided.
> >This manifests as native multipathing's nvme_failover_req() being
> >tweaked to call __nvme_update_ana() which was factored out to allow
> >nvme_update_ana() to be called independent of nvme_failover_req().
> >
> >And as always, if embedded NVMe users do not want any performance
> >overhead associated with ANA or native NVMe multipathing they can
> >disable CONFIG_NVME_MULTIPATH.
> >
> >Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> >---
> >  drivers/nvme/host/core.c      | 10 +++++----
> >  drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
> >  drivers/nvme/host/nvme.h      |  4 ++++
> >  3 files changed, 48 insertions(+), 15 deletions(-)
> >
> >diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> >index fe957166c4a9..3df607905628 100644
> >--- a/drivers/nvme/host/core.c
> >+++ b/drivers/nvme/host/core.c
> >@@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
> >  		nvme_req(req)->ctrl->comp_seen = true;
> >  	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> >-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> >-		    blk_path_error(status)) {
> >-			nvme_failover_req(req);
> >-			return;
> >+		if (blk_path_error(status)) {
> >+			if (req->cmd_flags & REQ_NVME_MPATH) {
> >+				nvme_failover_req(req);
> >+				return;
> >+			}
> >+			nvme_update_ana(req);
> >  		}
> >  		if (!blk_queue_dying(req->q)) {

...

> >diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> >index 8e03cda770c5..0adbcff5fba2 100644
> >--- a/drivers/nvme/host/multipath.c
> >+++ b/drivers/nvme/host/multipath.c
> >@@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
> >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> >  	blk_mq_end_request(req, 0);
> >-	switch (status & 0x7ff) {
> >-	case NVME_SC_ANA_TRANSITION:
> >-	case NVME_SC_ANA_INACCESSIBLE:
> >-	case NVME_SC_ANA_PERSISTENT_LOSS:
> >+	if (nvme_ana_error(status)) {
> >  		/*
> >  		 * If we got back an ANA error we know the controller is alive,
> >  		 * but not ready to serve this namespaces.  The spec suggests
> >  		 * we should update our general state here, but due to the fact
> >  		 * that the admin and I/O queues are not serialized that is
> >  		 * fundamentally racy.  So instead just clear the current path,
> >-		 * mark the the path as pending and kick of a re-read of the ANA
> >+		 * mark the path as pending and kick off a re-read of the ANA
> >  		 * log page ASAP.
> >  		 */
> >  		nvme_mpath_clear_current_path(ns);
> >-		if (ns->ctrl->ana_log_buf) {
> >-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> >-			queue_work(nvme_wq, &ns->ctrl->ana_work);
> >-		}
> >-		break;
> >+		__nvme_update_ana(ns);
> >+		goto kick_requeue;
> >+	}
> >+
> >+	switch (status & 0x7ff) {
> >  	case NVME_SC_HOST_PATH_ERROR:
> >  		/*
> >  		 * Temporary transport disruption in talking to the controller.
> >@@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
> >  		break;
> >  	}
> >+kick_requeue:
> >  	kblockd_schedule_work(&ns->head->requeue_work);
> >  }
> Doesn't this need to be protected by 'if (ns->head->disk)' or somesuch?

No.  nvme_failover_req() is only ever called by native multipathing; see
nvme_complete_rq()'s check for req->cmd_flags & REQ_NVME_MPATH as the
condition for calling nvme_failover_req().
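
To restate the dispatch (a paraphrase of the core.c hunk above with the
guarantee spelled out in comments, not a verbatim kernel excerpt):

	if (blk_path_error(status)) {
		if (req->cmd_flags & REQ_NVME_MPATH) {
			/* Issued through the multipath device, so
			 * ns->head->disk necessarily exists here. */
			nvme_failover_req(req);
			return;
		}
		/* Native multipathing not in use for this request:
		 * still refresh ANA state, then complete normally. */
		nvme_update_ana(req);
	}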

The previous RFC-style patch I posted muddled ANA and multipathing in
nvme_update_ana() but this final patch submission was fixed because I
saw a cleaner way forward by having nvme_failover_req() also do ANA work
just like it always has -- albeit with new helpers that
nvme_update_ana() also calls.

Mike
Mike Snitzer Nov. 16, 2018, 2:12 p.m. UTC | #8
On Fri, Nov 16 2018 at  4:14am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Nov 15, 2018 at 12:46:05PM -0500, Mike Snitzer wrote:
> > Whether or not ANA is present is a choice of the target implementation;
> > the host (and whether it supports multipathing) has _zero_ influence on
> > this.  If the target declares a path as 'inaccessible' the path _is_
> > inaccessible to the host.  As such, ANA support should be functional
> > even if native multipathing is not.
> > 
> > Introduce ability to always re-read ANA log page as required due to ANA
> > error and make current ANA state available via sysfs -- even if native
> > multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> The first part I could see, but I still want to make it conditional
> in some way as nvme is going into deeply embedded setups, and I don't
> want to carry the weight of the ANA code around for everyone.

So disabling CONFIG_NVME_MULTIPATH isn't adequate?  You see a need for
enabling CONFIG_NVME_MULTIPATH but disallowing consideration of ANA
state management for these embedded cases?  That doesn't make much sense
in that ANA won't be enabled on the target, and if it is then shouldn't
the NVMe host enable ANA support?
 
> The second I fundamentally disagree with.  And even if you found agreement
> it would have to be in a separate patch as it is a separate feature.

In a later mail you clarified that by "second" you meant the sysfs
export.

nvme_ns_id_attrs_are_visible() calls nvme_ctrl_use_ana() to know whether
to export the ana state via sysfs.  It makes no sense to enable ANA to
work independent of native multipathing, and then, even though ANA is
active, disallow exporting the associated ANA state.  We need
consistency: if the sysfs file exists it means ANA state is being
managed.
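
For reference, a paraphrased sketch of that visibility check, following
the usual sysfs is_visible() convention -- the exact function body in
the driver may differ:

	static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
						    struct attribute *a, int n)
	{
		struct device *dev = container_of(kobj, struct device, kobj);

		/* Hide the ana_* attributes unless ANA is being managed. */
		if (a == &dev_attr_ana_grpid.attr ||
		    a == &dev_attr_ana_state.attr) {
			if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
				return 0;
		}
		return a->mode;
	}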

All I can figure is that what you're suggesting is born out of
disallowing uses you fundamentally disagree with (e.g. multipath-tools
reading the ANA state from this sysfs file).

Mike
Mike Snitzer Nov. 16, 2018, 7:28 p.m. UTC | #9
On Fri, Nov 16 2018 at  5:17am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Nov 16, 2018 at 11:06:32AM +0100, Hannes Reinecke wrote:
> > Ok, so would you be happy with making ANA support configurable?
> 
> I've looked a bit over the whole situation, and what I think we need
> to do is:
> 
>  a) warn if we see an ANA-capable device without multipath support
>     so people know it is not going to work properly.

I disagree with your cynicism but v2 of this patch now emits a warning
accordingly.

>  b) deprecate the multipath module option.  It was only intended as
>     a migration for any pre-existing PCIe multipath user if there
>     were any, not to support any new functionality.  So for 4.20
>     put in a patch that prints a clear warning when it is used,
>     including a link to the nvme list, and then for 4.25 or so
>     remove it entirely unless something unexpected comes up.

You rejected the idea of allowing fine-grained control over whether
native NVMe multipathing is enabled or not on a per-namespace basis.
All we have is the coarse-grained nvme_core.multipath=N knob.  Now
you're forecasting removing even that.  Please don't do that.

> This whole drama of optional multipath use has wasted way too much
> of everyone's time already.

It has wasted _way_ too much time.

But the drama is born out of you rejecting that we need to preserve
multipath-tools and dm-multipath's ability to work across any
transport.  You don't need to do that work: Hannes, myself and others
have always been willing and able -- if you'd let us.

IIRC it was at 2016's LSF in Boston where Ewan Milne and I had a
face-to-face conversation with you in the hallway track where you agreed
that ANA support would be activated if the capability was advertised by
the target.  The model we discussed is that it would be comparable to
how ALUA gets enabled during SCSI LUN discovery.

I hope you can see your way forward to be more accommodating now.
Especially given the proposed changes are backed by NVMe standards.

Please, PLEASE take v2 of this patch.. please? ;)

Thanks,
Mike
Laurence Oberman Nov. 16, 2018, 7:34 p.m. UTC | #10
On Fri, 2018-11-16 at 14:28 -0500, Mike Snitzer wrote:
> On Fri, Nov 16 2018 at  5:17am -0500,
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Fri, Nov 16, 2018 at 11:06:32AM +0100, Hannes Reinecke wrote:
> > > Ok, so would you be happy with making ANA support configurable?
> > 
> > I've looked a bit over the whole situation, and what I think we
> > need
> > to do is:
> > 
> >  a) warn if we see an ANA-capable device without multipath support
> >     so people know it is not going to work properly.
> 
> I disagree with your cynicism but v2 of this patch now emits a
> warning
> accordingly.
> 
> >  b) deprecate the multipath module option.  It was only intended as
> >     a migration for any pre-existing PCIe multipath user if there
> >     were any, not to support any new functionality.  So for 4.20
> >     put in a patch that prints a clear warning when it is used,
> >     including a link to the nvme list, and then for 4.25 or so
> >     remove it entirely unless something unexpected comes up.
> 
> You rejected the idea of allowing fine-grained control over whether
> native NVMe multipathing is enabled or not on a per-namespace basis.
> All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> you're forecasting removing even that.  Please don't do that.
> 
> > This whole drama of optional multipath use has wasted way too much
> > of everyone's time already.
> 
> It has wasted _way_ too much time.
> 
> But the drama is born out of you rejecting that we need to preserve
> multipath-tools and dm-multipath's ability to work across any
> transport.  You don't need to do that work: Hannes, myself and others
> have always been willing and able -- if you'd let us.
> 
> IIRC it was at 2016's LSF in Boston where Ewan Milne and I had a
> face-to-face conversation with you in the hallway track where you
> agreed
> that ANA support would be activated if the capability was advertised
> by
> the target.  The model we discussed is that it would be comparable to
> how ALUA gets enabled during SCSI LUN discovery.
> 
> I hope you can see your way forward to be more accommodating now.
> Especially given the proposed changes are backed by NVMe standards.
> 
> Please, PLEASE take v2 of this patch.. please? ;)
> 
> Thanks,
> Mike

I am begging you to take it too, please.
Thanks
Laurence
Christoph Hellwig Nov. 19, 2018, 9:39 a.m. UTC | #11
On Fri, Nov 16, 2018 at 02:28:02PM -0500, Mike Snitzer wrote:
> You rejected the idea of allowing fine-grained control over whether
> native NVMe multipathing is enabled or not on a per-namespace basis.
> All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> you're forecasting removing even that.  Please don't do that.

The whole point is that this hook was intended as a band aid for the
hypothetical pre-existing setups.  Not ever for new development.

> Please, PLEASE take v2 of this patch.. please? ;)

See the previous mail for the plan ahead.  I'm sick and tired of you
trying to sneak your new developments back in.
Mike Snitzer Nov. 19, 2018, 2:56 p.m. UTC | #12
On Mon, Nov 19 2018 at  4:39am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Nov 16, 2018 at 02:28:02PM -0500, Mike Snitzer wrote:
> > You rejected the idea of allowing fine-grained control over whether
> > native NVMe multipathing is enabled or not on a per-namespace basis.
> > All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> > you're forecasting removing even that.  Please don't do that.
> 
> The whole point is that this hook was intended as a band aid for the
> hypothetical pre-existing setups.  Not ever for new development.

It pains me that you're marching us towards increased conflict that
needs resolution through more formal means.  And that now I'm going to
have to document the timeline of your authoritarian approach to
stifling another Linux maintainer's freedom to do his job of delivering
on functionality that has been depended on for many years.

> > Please, PLEASE take v2 of this patch.. please? ;)
> 
> See the previous mail for the plan ahead.  I'm sick and tired of you
> trying to sneak your new developments back in.

I've only ever posted patches in the open and never has it been with the
idea of sneaking anything in.  Miscategorizing my actions as such is a
gross abuse that I will not tolerate.

If we were to confine ourselves to this v2 patch I pleaded with you to
take: https://lkml.org/lkml/2018/11/17/4

1) Hannes proposed a simplistic patch that didn't account for the fact
   that NVMe's ana workqueue wouldn't get kicked, his intent was to make
   ANA work independent of your native multipathing.  (The fact he or I
   even need to post such patches, to unwind your tight-coupling of ANA
   and multipathing, speaks to how you've calculatingly undermined any
   effort to implement proper NVMe multipathing outside of the NVMe
   driver)

2) ANA and multipathing are completely disjoint on an NVMe spec level.
   You know this.

SO: will you be taking my v2 patch for 4.21 or not?

Please advise, thanks.
Mike
Christoph Hellwig Nov. 20, 2018, 9:42 a.m. UTC | #13
On Mon, Nov 19, 2018 at 09:56:50AM -0500, Mike Snitzer wrote:
> SO: will you be taking my v2 patch for 4.21 or not?

No.
Mike Snitzer Nov. 20, 2018, 1:37 p.m. UTC | #14
On Tue, Nov 20 2018 at  4:42am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Mon, Nov 19, 2018 at 09:56:50AM -0500, Mike Snitzer wrote:
> > SO: will you be taking my v2 patch for 4.21 or not?
> 
> No.

This isn't how a Linux maintainer engages in technical discussion.

You _clearly_ just want to prevent further use of multipath-tools and
DM-multipath.  You will resort to rejecting a patch that improves the
NVMe driver's standards compliance if it allows you hijack NVMe
multipathing because you think you have the best way and nobody else
should be allowed to use a well established competing _open_ _Linux_
solution.  Simple as that.

You haven't made a single technical argument against my v2 patch, yet
you're rejecting it.  Purely on the basis that having NVMe's ANA updates
work independent on native NVMe multipathing happens to benefit an
alternative (and that benefit is just to not have multipath-tools to be
so crude with a pure userspace ANA state tracking).

Jens, this entire situation has gotten well beyond acceptable and you or
other NVMe co-maintainers need to step in.  We need reasoned _technical_
discussion or this entire process falls apart.

Mike
Christoph Hellwig Nov. 20, 2018, 4:23 p.m. UTC | #15
On Tue, Nov 20, 2018 at 08:37:19AM -0500, Mike Snitzer wrote:
> This isn't how a Linux maintainer engages in technical discussion.

And this isn't a technical discussion.  As I said before, the only
reason not to build the multipath code is to save space, and
the only reason to disable multipath at runtime is for potential
pre-existing setups, which obviously are not ANA-capable.

The right fix is to warn users if they use a driver without
CONFIG_NVME_MULTIPATH on a multi-port device and to phase out the
multipath=0 option in the long run, again with a warning.  Any new
additions to "improve" these cases simply don't make sense.

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fe957166c4a9..3df607905628 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -255,10 +255,12 @@  void nvme_complete_rq(struct request *req)
 		nvme_req(req)->ctrl->comp_seen = true;
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
-			return;
+		if (blk_path_error(status)) {
+			if (req->cmd_flags & REQ_NVME_MPATH) {
+				nvme_failover_req(req);
+				return;
+			}
+			nvme_update_ana(req);
 		}
 
 		if (!blk_queue_dying(req->q)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8e03cda770c5..0adbcff5fba2 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -22,7 +22,7 @@  MODULE_PARM_DESC(multipath,
 
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
-	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
 }
 
 /*
@@ -47,6 +47,35 @@  void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+static bool nvme_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	}
+	return false;
+}
+
+static void __nvme_update_ana(struct nvme_ns *ns)
+{
+	if (!ns->ctrl->ana_log_buf)
+		return;
+
+	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+	queue_work(nvme_wq, &ns->ctrl->ana_work);
+}
+
+void nvme_update_ana(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	u16 status = nvme_req(req)->status;
+
+	if (nvme_ana_error(status))
+		__nvme_update_ana(ns);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,25 +87,22 @@  void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
+	if (nvme_ana_error(status)) {
 		/*
 		 * If we got back an ANA error we know the controller is alive,
 		 * but not ready to serve this namespaces.  The spec suggests
 		 * we should update our general state here, but due to the fact
 		 * that the admin and I/O queues are not serialized that is
 		 * fundamentally racy.  So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
+		 * mark the path as pending and kick off a re-read of the ANA
 		 * log page ASAP.
 		 */
 		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
+		__nvme_update_ana(ns);
+		goto kick_requeue;
+	}
+
+	switch (status & 0x7ff) {
 	case NVME_SC_HOST_PATH_ERROR:
 		/*
 		 * Temporary transport disruption in talking to the controller.
@@ -93,6 +119,7 @@  void nvme_failover_req(struct request *req)
 		break;
 	}
 
+kick_requeue:
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 27663ce3044e..cbe4253f2d02 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -471,6 +471,7 @@  bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
+void nvme_update_ana(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -510,6 +511,9 @@  static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_update_ana(struct request *req)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }