diff mbox series

[5/5] scsi_transport_fc: Added a new sysfs attribute noretries_abort

Message ID 1596595862-11075-6-git-send-email-muneendra.kumar@broadcom.com (mailing list archive)
State Changes Requested
Headers show
Series scsi: Support to handle Intermittent errors | expand

Commit Message

Muneendra Kumar M Aug. 5, 2020, 2:51 a.m. UTC
Added a new sysfs attribute noretries_abort under fc_transport/target*/

This interface sets the SCMD_NORETRIES_ABORT bit in scmd->state for all
pending I/Os on the scsi devices associated with the target port.

Below is the interface provided to abort the io
echo 1 >> /sys/class/fc_transport/targetX\:Y\:Z/noretries_abort

Signed-off-by: Muneendra <muneendra.kumar@broadcom.com>
---
 drivers/scsi/scsi_transport_fc.c | 49 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

Comments

Hannes Reinecke Aug. 10, 2020, 6:24 a.m. UTC | #1
On 8/5/20 4:51 AM, Muneendra wrote:
> Added a new sysfs attribute noretries_abort under fc_transport/target*/
> 
> This interface will set SCMD_NORETRIES_ABORT bit in scmd->state for all
> the pending io's on the scsi device associated with target port.
> 
> Below is the interface provided to abort the io
> echo 1 >> /sys/class/fc_transport/targetX\:Y\:Z/noretries_abort
> 
> Signed-off-by: Muneendra <muneendra.kumar@broadcom.com>
> ---
>   drivers/scsi/scsi_transport_fc.c | 49 ++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 47 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
> index 2732fa6..f7b00ae 100644
> --- a/drivers/scsi/scsi_transport_fc.c
> +++ b/drivers/scsi/scsi_transport_fc.c
> @@ -305,7 +305,7 @@ struct device_attribute device_attr_##_prefix##_##_name = 	\
>    * Attribute counts pre object type...
>    * Increase these values if you add attributes
>    */
> -#define FC_STARGET_NUM_ATTRS 	3
> +#define FC_STARGET_NUM_ATTRS	4
>   #define FC_RPORT_NUM_ATTRS	10
>   #define FC_VPORT_NUM_ATTRS	9
>   #define FC_HOST_NUM_ATTRS	29
> @@ -994,6 +994,44 @@ static FC_DEVICE_ATTR(rport, fast_io_fail_tmo, S_IRUGO | S_IWUSR,
>   /*
>    * FC SCSI Target Attribute Management
>    */
> +static void scsi_target_set_noretries_abort(struct scsi_target *starget)
> +{
> +	struct scsi_device *sdev, *tmp;
> +	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(shost->host_lock, flags);
> +	list_for_each_entry_safe(sdev, tmp, &starget->devices, same_target_siblings) {
> +		if (sdev->sdev_state == SDEV_DEL)
> +			continue;
> +		if (scsi_device_get(sdev))
> +			continue;
> +
> +		spin_unlock_irqrestore(shost->host_lock, flags);
> +		scsi_set_noretries_abort_io_device(sdev);
> +		spin_lock_irqsave(shost->host_lock, flags);
> +		scsi_device_put(sdev);
> +	}
> +	spin_unlock_irqrestore(shost->host_lock, flags);
> +}
> +
> +/*
> + * Sets  no retries on abort in scmd->state for all
> + * outstanding io of all the scsi_devs
> + * write 1 to set the bit for all outstanding io's
> + */
> +static ssize_t fc_target_set_noretries_abort(struct device *dev,
> +						struct device_attribute *attr,
> +						const char *buf, size_t count)
> +{
> +	struct scsi_target *starget = transport_class_to_starget(dev);
> +
> +	scsi_target_set_noretries_abort(starget);
> +	return count;
> +}
> +
> +static FC_DEVICE_ATTR(starget, noretries_abort, 0200,
> +		NULL, fc_target_set_noretries_abort);
>   
>   /*
>    * Note: in the target show function we recognize when the remote
> @@ -1036,6 +1074,13 @@ static FC_DEVICE_ATTR(starget, field, S_IRUGO,			\
>   	if (i->f->show_starget_##field)					\
>   		count++
>   
> +#define SETUP_PRIVATE_STARGET_ATTRIBUTE_RW(field)			\
> +do {									\
> +	i->private_starget_attrs[count] = device_attr_starget_##field; \
> +	i->starget_attrs[count] = &i->private_starget_attrs[count];	\
> +	count++;							\
> +} while (0)
> +
>   #define SETUP_STARGET_ATTRIBUTE_RW(field)				\
>   	i->private_starget_attrs[count] = device_attr_starget_##field; \
>   	if (!i->f->set_starget_##field) {				\
> @@ -2197,7 +2242,7 @@ struct scsi_transport_template *
>   	SETUP_STARGET_ATTRIBUTE_RD(node_name);
>   	SETUP_STARGET_ATTRIBUTE_RD(port_name);
>   	SETUP_STARGET_ATTRIBUTE_RD(port_id);
> -
> +	SETUP_PRIVATE_STARGET_ATTRIBUTE_RW(noretries_abort);
>   	BUG_ON(count > FC_STARGET_NUM_ATTRS);
>   
>   	i->starget_attrs[count] = NULL;
> 
Hmm. Wouldn't it make more sense to introduce a new port state 
'marginal' for this? We might want/need to introduce additional error 
recovery mechanisms here, so having a new state might be easier in the 
long run ...

Additionally, from my understanding the FPIN events will be generated 
with a certain frequency. So we could model the new 'marginal' state 
similar to the dev_loss_tmo mechanism; start a timer whenever the 
'marginal' state is being set, and clear the state back to 'running' if 
the state hasn't been refreshed within that timeframe.
That would give us an automatic state reset back to running, and quite 
easy to implement from userland.

Cheers,

Hannes
Muneendra Kumar M Aug. 11, 2020, 6:01 a.m. UTC | #2
Hi Hannes,

>
>Hmm. Wouldn't it make more sense to introduce a new port state 'marginal'
>for this? We might >want/need to introduce additional error recovery
>mechanisms here, so having a new state >might be easier in the long run ...

>Additionally, from my understanding the FPIN events will be generated with
>a certain >frequency. So we could model the new 'marginal' state similar to
>the dev_loss_tmo >mechanism; start a timer whenever the 'marginal' state is
>being set, and clear the state back to >'running' if the state hasn't been
>refreshed within that timeframe.
>That would give us an automatic state reset back to running, and quite easy
>to implement from >userland.

Thanks for the review.
I have a small doubt.
When the port state moves from marginal to running, does it mean we
expect traffic on the path?

Regards,
Muneendra.
Hannes Reinecke Aug. 11, 2020, 6:35 a.m. UTC | #3
On 8/11/20 8:01 AM, Muneendra Kumar M wrote:
> Hi Hannes,
> 
>>
>> Hmm. Wouldn't it make more sense to introduce a new port state 'marginal'
>> for this? We might >want/need to introduce additional error recovery
>> mechanisms here, so having a new state >might be easier in the long run ...
> 
>> Additionally, from my understanding the FPIN events will be generated with
>> a certain >frequency. So we could model the new 'marginal' state similar to
>> the dev_loss_tmo >mechanism; start a timer whenever the 'marginal' state is
>> being set, and clear the state back to >'running' if the state hasn't been
>> refreshed within that timeframe.
>> That would give us an automatic state reset back to running, and quite easy
>> to implement from >userland.
> 
> Thanks for the review.
> I have a small doubt.
> When the port state moves from marginal to running state does it mean we
> expect a traffic from the path ?
> 
We don't expect traffic; rather we _allow_ traffic.
But moving from marginal to running means that we didn't receive FPIN
events, and the path should be considered healthy again.
So from that perspective it should be back to normal operations.

Cheers,

Hannes
Muneendra Kumar M Aug. 11, 2020, 7:03 a.m. UTC | #4
Hi Hannes,
>>
>> Hmm. Wouldn't it make more sense to introduce a new port state 'marginal'
>> for this? We might >want/need to introduce additional error recovery
>> mechanisms here, so having a new state >might be easier in the long run
>> ...
>
>> Additionally, from my understanding the FPIN events will be generated
>> with a certain >frequency. So we could model the new 'marginal' state
>> similar to the dev_loss_tmo >mechanism; start a timer whenever the
>> 'marginal' state is being set, and clear the state back to >'running'
>> if the state hasn't been refreshed within that timeframe.
>> That would give us an automatic state reset back to running, and
>> quite easy to implement from >userland.
>
> Thanks for the review.
> I have a small doubt.
> When the port state moves from marginal to running state does it mean
> we expect a traffic from the path ?
>
>We don't expect traffic; rather we _allow_ traffic.
>But moving to from marginal to running means that we didn't receive FPIN
>events, and the path should be considered healthy again.
>So from that perspective it should be back to normal operations.


But this would apply only to FPIN-Congestion (FPIN-CN). Only in that case
are FPIN events generated with a certain frequency.
For FPIN-LI this is not the case.
FPIN-LI is used to inform about marginal paths, which need manual
intervention to recover.
For FPIN-LI the path should be re-enabled on any link bounce
(portdisable followed by portenable), which would correlate to a cable/SFP
change.
For now, however, we are primarily addressing FPIN-LI.

Regards,
Muneendra.
Hannes Reinecke Aug. 11, 2020, 2 p.m. UTC | #5
On 8/11/20 9:03 AM, Muneendra Kumar M wrote:
>  Hi Hannes,
>>>
>>> Hmm. Wouldn't it make more sense to introduce a new port state 'marginal'
>>> for this? We might >want/need to introduce additional error recovery
>>> mechanisms here, so having a new state >might be easier in the long run
>>> ...
>>
>>> Additionally, from my understanding the FPIN events will be generated
>>> with a certain >frequency. So we could model the new 'marginal' state
>>> similar to the dev_loss_tmo >mechanism; start a timer whenever the
>>> 'marginal' state is being set, and clear the state back to >'running'
>>> if the state hasn't been refreshed within that timeframe.
>>> That would give us an automatic state reset back to running, and
>>> quite easy to implement from >userland.
>>
>> Thanks for the review.
>> I have a small doubt.
>> When the port state moves from marginal to running state does it mean
>> we expect a traffic from the path ?
>>
>> We don't expect traffic; rather we _allow_ traffic.
>> But moving to from marginal to running means that we didn't receive FPIN
>> events, and the path should be considered healthy again.
>> So from that perspective it should be back to normal operations.
> 
> 
> But this could  apply only to FPIN-Congestion. Only in this case FPIN-CN
> FPIN events will be generated  with a certain  frequency.
> But for FPIN-Li this is not the case.
> FPIN-LI is used to inform about marginal paths, which needs manual
> intervention to recover.
> And for FPIN-LI the path should be re-enabled on any link bounce
> (portdisable followed by portenable) which would correlated to a cable/sfp
> change.
> For now, however, we are addressing FPIN-LI primarily.
> 
Ah. So that changes things slightly; I had hoped we can address things
systematically, but with link integrity issues we don't have any other
choice but to replace the cable (ie wait for user interaction).

But still I'm in favour of the 'marginal' state; that one could be set
manually (or by an FPIN LI event), and would need to be reset either
manually or by link reset.

And have the advantage of being easier to implement :-)

Cheers,

Hannes
Muneendra Kumar M Aug. 14, 2020, 5:33 a.m. UTC | #6
Hi Hannes,


>>>
>>> Hmm. Wouldn't it make more sense to introduce a new port state
>>> 'marginal'
>>> for this? We might >want/need to introduce additional error recovery
>>> mechanisms here, so having a new state >might be easier in the long
>>> run ...
>Ah. So that changes things slightly; I had hoped we can address things
>systematically, but with link integrity issues we don't have any other
>choice but to replace the cable (ie wait for user interaction).

>But still I'm in favour of the 'marginal' state; that one could be set
>manually (or by an FPIN LI event), and would need to be reset either
>manually or by link reset.

>And have the advantage of being easier to implement :-)

Thanks for the review.
I will incorporate all your review comments and will add marginal state in
my next version.

Regards,
Muneendra.
diff mbox series

Patch

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 2732fa6..f7b00ae 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -305,7 +305,7 @@  struct device_attribute device_attr_##_prefix##_##_name = 	\
  * Attribute counts pre object type...
  * Increase these values if you add attributes
  */
-#define FC_STARGET_NUM_ATTRS 	3
+#define FC_STARGET_NUM_ATTRS	4
 #define FC_RPORT_NUM_ATTRS	10
 #define FC_VPORT_NUM_ATTRS	9
 #define FC_HOST_NUM_ATTRS	29
@@ -994,6 +994,44 @@  static FC_DEVICE_ATTR(rport, fast_io_fail_tmo, S_IRUGO | S_IWUSR,
 /*
  * FC SCSI Target Attribute Management
  */
+/*
+ * Per-device callback for starget_for_each_device(): flag the outstanding
+ * I/O of @sdev. @data is unused.
+ */
+static void fc_sdev_set_noretries_abort(struct scsi_device *sdev, void *data)
+{
+	scsi_set_noretries_abort_io_device(sdev);
+}
+
+/*
+ * Flag the outstanding I/O of every scsi_device below @starget.
+ *
+ * Use the midlayer iterator rather than walking starget->devices by hand:
+ * it pins each device while the callback runs and never drops a device
+ * reference or follows a stale list cursor while shost->host_lock is held.
+ */
+static void scsi_target_set_noretries_abort(struct scsi_target *starget)
+{
+	starget_for_each_device(starget, NULL, fc_sdev_set_noretries_abort);
+}
+
+/*
+ * Write 1 to set "no retries on abort" in scmd->state for all outstanding
+ * I/O on every scsi_device of the target; any other input gets -EINVAL.
+ */
+static ssize_t fc_target_set_noretries_abort(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (kstrtoul(buf, 0, &val) || val != 1)
+		return -EINVAL;
+	scsi_target_set_noretries_abort(transport_class_to_starget(dev));
+	return count;
+}
+
+static FC_DEVICE_ATTR(starget, noretries_abort,
+	0200 /* owner write-only */, NULL, fc_target_set_noretries_abort);
 
 /*
  * Note: in the target show function we recognize when the remote
@@ -1036,6 +1074,13 @@  static FC_DEVICE_ATTR(starget, field, S_IRUGO,			\
 	if (i->f->show_starget_##field)					\
 		count++
 
+#define SETUP_PRIVATE_STARGET_ATTRIBUTE_RW(field)			\
+do {	/* always installed; not gated on an i->f callback */		\
+	i->starget_attrs[count] = &i->private_starget_attrs[count];	\
+	*i->starget_attrs[count] = device_attr_starget_##field;		\
+	count++;							\
+} while (0)
+
 #define SETUP_STARGET_ATTRIBUTE_RW(field)				\
 	i->private_starget_attrs[count] = device_attr_starget_##field; \
 	if (!i->f->set_starget_##field) {				\
@@ -2197,7 +2242,7 @@  struct scsi_transport_template *
 	SETUP_STARGET_ATTRIBUTE_RD(node_name);
 	SETUP_STARGET_ATTRIBUTE_RD(port_name);
 	SETUP_STARGET_ATTRIBUTE_RD(port_id);
-
+	SETUP_PRIVATE_STARGET_ATTRIBUTE_RW(noretries_abort);
 	BUG_ON(count > FC_STARGET_NUM_ATTRS);
 
 	i->starget_attrs[count] = NULL;