diff mbox

[04/12] IB/srp: Fix connection state tracking

Message ID 5541EE9F.8090605@sandisk.com (mailing list archive)
State Rejected
Headers show

Commit Message

Bart Van Assche April 30, 2015, 8:58 a.m. UTC
Reception of a DREQ message only causes the state of a single
channel to change. Modify the SRP initiator such that channel
and target connection state are tracked separately. This patch
prevents the following false-positive warning from being
reported by srp_destroy_qp():

WARNING: at drivers/infiniband/ulp/srp/ib_srp.c:617 srp_destroy_qp+0xa6/0x120 [ib_srp]()
Call Trace:
[<ffffffff8106e10f>] warn_slowpath_common+0x7f/0xc0
[<ffffffff8106e16a>] warn_slowpath_null+0x1a/0x20
[<ffffffffa0440226>] srp_destroy_qp+0xa6/0x120 [ib_srp]
[<ffffffffa0440322>] srp_free_ch_ib+0x82/0x1e0 [ib_srp]
[<ffffffffa044408b>] srp_create_target+0x7ab/0x998 [ib_srp]
[<ffffffff81346f60>] dev_attr_store+0x20/0x30
[<ffffffff811dd90f>] sysfs_write_file+0xef/0x170
[<ffffffff8116d248>] vfs_write+0xc8/0x190
[<ffffffff8116d411>] sys_write+0x51/0x90

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
Cc: <stable@vger.kernel.org> #v3.19
---
 drivers/infiniband/ulp/srp/ib_srp.c | 7 ++++---
 drivers/infiniband/ulp/srp/ib_srp.h | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

Comments

Sagi Grimberg April 30, 2015, 9:51 a.m. UTC | #1
On 4/30/2015 11:58 AM, Bart Van Assche wrote:
> Reception of a DREQ message only causes the state of a single
> channel to change. Modify the SRP initiator such that channel
> and target connection state are tracked separately. This patch
> avoids that following false positive warning can be reported
> by srp_destroy_qp():
>
> WARNING: at drivers/infiniband/ulp/srp/ib_srp.c:617 srp_destroy_qp+0xa6/0x120 [ib_srp]()
> Call Trace:
> [<ffffffff8106e10f>] warn_slowpath_common+0x7f/0xc0
> [<ffffffff8106e16a>] warn_slowpath_null+0x1a/0x20
> [<ffffffffa0440226>] srp_destroy_qp+0xa6/0x120 [ib_srp]
> [<ffffffffa0440322>] srp_free_ch_ib+0x82/0x1e0 [ib_srp]
> [<ffffffffa044408b>] srp_create_target+0x7ab/0x998 [ib_srp]
> [<ffffffff81346f60>] dev_attr_store+0x20/0x30
> [<ffffffff811dd90f>] sysfs_write_file+0xef/0x170
> [<ffffffff8116d248>] vfs_write+0xc8/0x190
> [<ffffffff8116d411>] sys_write+0x51/0x90
>
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Sagi Grimberg <sagig@mellanox.com>
> Cc: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
> Cc: <stable@vger.kernel.org> #v3.19
> ---
>   drivers/infiniband/ulp/srp/ib_srp.c | 7 ++++---
>   drivers/infiniband/ulp/srp/ib_srp.h | 1 +
>   2 files changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
> index 5ce6cfd..0eb07d3 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.c
> +++ b/drivers/infiniband/ulp/srp/ib_srp.c
> @@ -465,14 +465,13 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
>    */
>   static void srp_destroy_qp(struct srp_rdma_ch *ch)
>   {
> -	struct srp_target_port *target = ch->target;
>   	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>   	static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
>   	struct ib_recv_wr *bad_wr;
>   	int ret;
>
>   	/* Destroying a QP and reusing ch->done is only safe if not connected */
> -	WARN_ON_ONCE(target->connected);
> +	WARN_ON_ONCE(ch->connected);
>
>   	ret = ib_modify_qp(ch->qp, &attr, IB_QP_STATE);
>   	WARN_ONCE(ret, "ib_cm_init_qp_attr() returned %d\n", ret);
> @@ -836,6 +835,7 @@ static void srp_disconnect_target(struct srp_target_port *target)
>
>   		for (i = 0; i < target->ch_count; i++) {
>   			ch = &target->ch[i];
> +			ch->connected = false;
>   			if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) {
>   				shost_printk(KERN_DEBUG, target->scsi_host,
>   					     PFX "Sending CM DREQ failed\n");
> @@ -1017,6 +1017,7 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
>   		switch (ch->status) {
>   		case 0:
>   			srp_change_conn_state(target, true);
> +			ch->connected = true;
>   			return 0;
>
>   		case SRP_PORT_REDIRECT:
> @@ -2367,7 +2368,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
>   	case IB_CM_DREQ_RECEIVED:
>   		shost_printk(KERN_WARNING, target->scsi_host,
>   			     PFX "DREQ received - connection closed\n");
> -		srp_change_conn_state(target, false);
> +		ch->connected = false;

Shouldn't this be protected by the channel lock (like the target)?

>   		if (ib_send_cm_drep(cm_id, NULL, 0))
>   			shost_printk(KERN_ERR, target->scsi_host,
>   				     PFX "Sending CM DREP failed\n");
> diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
> index a611556..95a4471 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.h
> +++ b/drivers/infiniband/ulp/srp/ib_srp.h
> @@ -170,6 +170,7 @@ struct srp_rdma_ch {
>
>   	struct completion	tsk_mgmt_done;
>   	u8			tsk_mgmt_status;
> +	bool			connected;

I generally agree with this flag, but I'm a bit confused about the
semantics of two connected flags? Also, we check the target connected
flag on TMFs, although we are executing it on a channel (should we
check both?)

I'd say keep only the channel connected flag, the target logic needs to
be mandated by the state.

Sagi.

I think we need to keep only the channel connected flag.



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche April 30, 2015, 11:25 a.m. UTC | #2
On 04/30/15 11:51, Sagi Grimberg wrote:
> On 4/30/2015 11:58 AM, Bart Van Assche wrote:
>> -        srp_change_conn_state(target, false);
>> +        ch->connected = false;
>
> Shouldn't this be protected by the channel lock (like the target)?

On all CPU architectures I'm familiar with changes of boolean variables 
are atomic so I think modifying that variable without locking is fine.

>> @@ -170,6 +170,7 @@ struct srp_rdma_ch {
>>
>>       struct completion    tsk_mgmt_done;
>>       u8            tsk_mgmt_status;
>> +    bool            connected;
>
> I generally agree with this flag, but I'm a bit confused about the
> semantics of two connected flags? Also, we check the target connected
> flag on TMFs, although we are executing it on a channel (should we
> check both?)
>
> I'd say keep only the channel connected flag, the target logic needs to
> be mandated by the state.

I think we need both flags. The ch->connected flag is used only in 
srp_destroy_qp() to verify whether a channel has been disconnected 
before it is destroyed. The target->connected flag provides an easy way 
to test the connected state in e.g. srp_disconnect_target(). And if it 
is attempted to send a task management function over a channel that has 
just been disconnected that should be fine since any channel disconnect 
causes tl_err_work to be queued. That last action will sooner or later 
result in a reconnect.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg April 30, 2015, 3 p.m. UTC | #3
On 4/30/2015 2:25 PM, Bart Van Assche wrote:
> On 04/30/15 11:51, Sagi Grimberg wrote:
>> On 4/30/2015 11:58 AM, Bart Van Assche wrote:
>>> -        srp_change_conn_state(target, false);
>>> +        ch->connected = false;
>>
>> Shouldn't this be protected by the channel lock (like the target)?
>
> On all CPU architectures I'm familiar with changes of boolean variables
> are atomic so I think modifying that variable without locking is fine.
>
>>> @@ -170,6 +170,7 @@ struct srp_rdma_ch {
>>>
>>>       struct completion    tsk_mgmt_done;
>>>       u8            tsk_mgmt_status;
>>> +    bool            connected;
>>
>> I generally agree with this flag, but I'm a bit confused about the
>> semantics of two connected flags? Also, we check the target connected
>> flag on TMFs, although we are executing it on a channel (should we
>> check both?)
>>
>> I'd say keep only the channel connected flag, the target logic needs to
>> be mandated by the state.
>
> I think we need both flags. The ch->connected flag is used only in
> srp_destroy_qp() to verify whether a channel has been disconnected
> before it is destroyed. The target->connected flag provides an easy way
> to test the connected state in e.g. srp_disconnect_target().

We can just as easily check rport state. rport state is modified before
target-connected. I still think one is redundant.

> And if it
> is attempted to send a task management function over a channel that has
> just been disconnected that should be fine since any channel disconnect
> causes tl_err_work to be queued. That last action will sooner or later
> result in a reconnect.

Will it be fine to queue commands when the channel is in reconnecting
stage?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 30, 2015, 4:08 p.m. UTC | #4
On Thu, 2015-04-30 at 10:58 +0200, Bart Van Assche wrote:
> Reception of a DREQ message only causes the state of a single
> channel to change. Modify the SRP initiator such that channel
> and target connection state are tracked separately. This patch
> avoids that following false positive warning can be reported
> by srp_destroy_qp():
> 
> WARNING: at drivers/infiniband/ulp/srp/ib_srp.c:617 srp_destroy_qp+0xa6/0x120 [ib_srp]()
> Call Trace:
> [<ffffffff8106e10f>] warn_slowpath_common+0x7f/0xc0
> [<ffffffff8106e16a>] warn_slowpath_null+0x1a/0x20
> [<ffffffffa0440226>] srp_destroy_qp+0xa6/0x120 [ib_srp]
> [<ffffffffa0440322>] srp_free_ch_ib+0x82/0x1e0 [ib_srp]
> [<ffffffffa044408b>] srp_create_target+0x7ab/0x998 [ib_srp]
> [<ffffffff81346f60>] dev_attr_store+0x20/0x30
> [<ffffffff811dd90f>] sysfs_write_file+0xef/0x170
> [<ffffffff8116d248>] vfs_write+0xc8/0x190
> [<ffffffff8116d411>] sys_write+0x51/0x90
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Sagi Grimberg <sagig@mellanox.com>
> Cc: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
> Cc: <stable@vger.kernel.org> #v3.19
> ---
>  drivers/infiniband/ulp/srp/ib_srp.c | 7 ++++---
>  drivers/infiniband/ulp/srp/ib_srp.h | 1 +
>  2 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
> index 5ce6cfd..0eb07d3 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.c
> +++ b/drivers/infiniband/ulp/srp/ib_srp.c
> @@ -465,14 +465,13 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
>   */
>  static void srp_destroy_qp(struct srp_rdma_ch *ch)
>  {
> -	struct srp_target_port *target = ch->target;
>  	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>  	static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
>  	struct ib_recv_wr *bad_wr;
>  	int ret;
>  
>  	/* Destroying a QP and reusing ch->done is only safe if not connected */
> -	WARN_ON_ONCE(target->connected);
> +	WARN_ON_ONCE(ch->connected);
>  
>  	ret = ib_modify_qp(ch->qp, &attr, IB_QP_STATE);
>  	WARN_ONCE(ret, "ib_cm_init_qp_attr() returned %d\n", ret);
> @@ -836,6 +835,7 @@ static void srp_disconnect_target(struct srp_target_port *target)
>  
>  		for (i = 0; i < target->ch_count; i++) {
>  			ch = &target->ch[i];
> +			ch->connected = false;
>  			if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) {
>  				shost_printk(KERN_DEBUG, target->scsi_host,
>  					     PFX "Sending CM DREQ failed\n");
> @@ -1017,6 +1017,7 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
>  		switch (ch->status) {
>  		case 0:
>  			srp_change_conn_state(target, true);
> +			ch->connected = true;
>  			return 0;
>  
>  		case SRP_PORT_REDIRECT:
> @@ -2367,7 +2368,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
>  	case IB_CM_DREQ_RECEIVED:
>  		shost_printk(KERN_WARNING, target->scsi_host,
>  			     PFX "DREQ received - connection closed\n");
> -		srp_change_conn_state(target, false);
> +		ch->connected = false;

So, in this patch, you modify disconnect to set srp_change_conn_state()
to false for the target, then loop through all the channels sending
cm_dreq's, and on the receiving side, you modify the cm_dreq handler to
set each channel to false.  However, once you get to 0 channels open,
shouldn't you then set the target state to false too just to keep things
consistent?

>  		if (ib_send_cm_drep(cm_id, NULL, 0))
>  			shost_printk(KERN_ERR, target->scsi_host,
>  				     PFX "Sending CM DREP failed\n");
> diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
> index a611556..95a4471 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.h
> +++ b/drivers/infiniband/ulp/srp/ib_srp.h
> @@ -170,6 +170,7 @@ struct srp_rdma_ch {
>  
>  	struct completion	tsk_mgmt_done;
>  	u8			tsk_mgmt_status;
> +	bool			connected;
>  };
>  
>  /**
Bart Van Assche May 5, 2015, 9:21 a.m. UTC | #5
On 04/30/15 18:08, Doug Ledford wrote:
> On Thu, 2015-04-30 at 10:58 +0200, Bart Van Assche wrote:
>> @@ -2367,7 +2368,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
>>   	case IB_CM_DREQ_RECEIVED:
>>   		shost_printk(KERN_WARNING, target->scsi_host,
>>   			     PFX "DREQ received - connection closed\n");
>> -		srp_change_conn_state(target, false);
>> +		ch->connected = false;
>
> So, in this patch, you modify disconnect to set srp_change_conn_state()
> to false for the target, then loop through all the channels sending
> cm_dreq's, and on the receiving side, you modify the cm_dreq handler to
> set each channel to false.  However, once you get to 0 channels open,
> shouldn't you then set the target state to false too just to keep things
> consistent?

Hello Doug,

What is not visible in this patch but only in the ib_srp.c source code 
is that the first received DREQ initiates a reconnect (the queue_work() 
call below):

	case IB_CM_DREQ_RECEIVED:
		shost_printk(KERN_WARNING, target->scsi_host,
			     PFX "DREQ received - connection closed\n");
		ch->connected = false;
		if (ib_send_cm_drep(cm_id, NULL, 0))
			shost_printk(KERN_ERR, target->scsi_host,
				     PFX "Sending CM DREP failed\n");
		queue_work(system_long_wq, &target->tl_err_work);
		break;

That should be sufficient to restore communication after a DREQ has been 
received.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche May 5, 2015, 9:31 a.m. UTC | #6
On 04/30/15 17:00, Sagi Grimberg wrote:
> On 4/30/2015 2:25 PM, Bart Van Assche wrote:
>> On 04/30/15 11:51, Sagi Grimberg wrote:
>>> On 4/30/2015 11:58 AM, Bart Van Assche wrote:
>>>> @@ -170,6 +170,7 @@ struct srp_rdma_ch {
>>>>
>>>>       struct completion    tsk_mgmt_done;
>>>>       u8            tsk_mgmt_status;
>>>> +    bool            connected;
>>>
>>> I generally agree with this flag, but I'm a bit confused about the
>>> semantics of two connected flags? Also, we check the target connected
>>> flag on TMFs, although we are executing it on a channel (should we
>>> check both?)
>>>
>>> I'd say keep only the channel connected flag, the target logic needs to
>>> be mandated by the state.
>>
>> I think we need both flags. The ch->connected flag is used only in
>> srp_destroy_qp() to verify whether a channel has been disconnected
>> before it is destroyed. The target->connected flag provides an easy way
>> to test the connected state in e.g. srp_disconnect_target().
>
> We can just as easily check rport state. rport state is modified before
> target-connected. I still think one is redundant.

Sorry but I disagree. If e.g. both the fast_io_fail and dev_loss_tmo 
timers have been disabled then the rport state is SRP_RPORT_RUNNING all 
the time. At least in that case it is not possible to derive the 
target->connected state from the rport state.

>> And if it
>> is attempted to send a task management function over a channel that has
>> just been disconnected that should be fine since any channel disconnect
>> causes tl_err_work to be queued. That last action will sooner or later
>> result in a reconnect.
>
> Will it be fine to queue commands when the channel is in reconnecting
> stage?

As one can see in srp_reconnect_rport() the associated SCSI devices are 
blocked as long as a reconnect is in progress. This means that 
srp_queuecommand() won't be called while a reconnect is in progress.

Bart.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg May 5, 2015, 9:45 a.m. UTC | #7
On 5/5/2015 12:31 PM, Bart Van Assche wrote:
> On 04/30/15 17:00, Sagi Grimberg wrote:
>> On 4/30/2015 2:25 PM, Bart Van Assche wrote:
>>> On 04/30/15 11:51, Sagi Grimberg wrote:
>>>> On 4/30/2015 11:58 AM, Bart Van Assche wrote:
>>>>> @@ -170,6 +170,7 @@ struct srp_rdma_ch {
>>>>>
>>>>>       struct completion    tsk_mgmt_done;
>>>>>       u8            tsk_mgmt_status;
>>>>> +    bool            connected;
>>>>
>>>> I generally agree with this flag, but I'm a bit confused about the
>>>> semantics of two connected flags? Also, we check the target connected
>>>> flag on TMFs, although we are executing it on a channel (should we
>>>> check both?)
>>>>
>>>> I'd say keep only the channel connected flag, the target logic needs to
>>>> be mandated by the state.
>>>
>>> I think we need both flags. The ch->connected flag is used only in
>>> srp_destroy_qp() to verify whether a channel has been disconnected
>>> before it is destroyed. The target->connected flag provides an easy way
>>> to test the connected state in e.g. srp_disconnect_target().
>>
>> We can just as easily check rport state. rport state is modified before
>> target-connected. I still think one is redundant.
>
> Sorry but I disagree. If e.g. both the fast_io_fail and dev_loss_tmo
> timers have been disabled then the rport state is SRP_RPORT_RUNNING all
> the time. At least in that case it is not possible to derive the
> target->connected state from the rport state.

Can't you rely on ch->connected for that?

We can keep both, but I just think that the meaning of
target->connected is confusing now.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche May 5, 2015, 9:59 a.m. UTC | #8
On 05/05/15 11:45, Sagi Grimberg wrote:
> On 5/5/2015 12:31 PM, Bart Van Assche wrote:
>> On 04/30/15 17:00, Sagi Grimberg wrote:
>>> On 4/30/2015 2:25 PM, Bart Van Assche wrote:
>>>> On 04/30/15 11:51, Sagi Grimberg wrote:
>>>>> On 4/30/2015 11:58 AM, Bart Van Assche wrote:
>>>>>> @@ -170,6 +170,7 @@ struct srp_rdma_ch {
>>>>>>
>>>>>>        struct completion    tsk_mgmt_done;
>>>>>>        u8            tsk_mgmt_status;
>>>>>> +    bool            connected;
>>>>>
>>>>> I generally agree with this flag, but I'm a bit confused about the
>>>>> semantics of two connected flags? Also, we check the target connected
>>>>> flag on TMFs, although we are executing it on a channel (should we
>>>>> check both?)
>>>>>
>>>>> I'd say keep only the channel connected flag, the target logic needs to
>>>>> be mandated by the state.
>>>>
>>>> I think we need both flags. The ch->connected flag is used only in
>>>> srp_destroy_qp() to verify whether a channel has been disconnected
>>>> before it is destroyed. The target->connected flag provides an easy way
>>>> to test the connected state in e.g. srp_disconnect_target().
>>>
>>> We can just as easily check rport state. rport state is modified before
>>> target-connected. I still think one is redundant.
>>
>> Sorry but I disagree. If e.g. both the fast_io_fail and dev_loss_tmo
>> timers have been disabled then the rport state is SRP_RPORT_RUNNING all
>> the time. At least in that case it is not possible to derive the
>> target->connected state from the rport state.
>
> Can't you rely on ch->connected for that?
>
> We can keep both, but I just think that the meaning of
> target->connected is confusing now.

Hello Sagi,

I will see whether I can remove the target->connected state variable.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford May 5, 2015, 2:10 p.m. UTC | #9
On Tue, 2015-05-05 at 11:21 +0200, Bart Van Assche wrote:
> On 04/30/15 18:08, Doug Ledford wrote:
> > On Thu, 2015-04-30 at 10:58 +0200, Bart Van Assche wrote:
> >> @@ -2367,7 +2368,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
> >>   	case IB_CM_DREQ_RECEIVED:
> >>   		shost_printk(KERN_WARNING, target->scsi_host,
> >>   			     PFX "DREQ received - connection closed\n");
> >> -		srp_change_conn_state(target, false);
> >> +		ch->connected = false;
> >
> > So, in this patch, you modify disconnect to set srp_change_conn_state()
> > to false for the target, then loop through all the channels sending
> > cm_dreq's, and on the receiving side, you modify the cm_dreq handler to
> > set each channel to false.  However, once you get to 0 channels open,
> > shouldn't you then set the target state to false too just to keep things
> > consistent?
> 
> Hello Doug,
> 
> What is not visible in this patch but only in the ib_srp.c source code 
> is that the first received DREQ initiates a reconnect (the queue_work() 
> call below):
> 
> 	case IB_CM_DREQ_RECEIVED:
> 		shost_printk(KERN_WARNING, target->scsi_host,
> 			     PFX "DREQ received - connection closed\n");
> 		ch->connected = false;
> 		if (ib_send_cm_drep(cm_id, NULL, 0))
> 			shost_printk(KERN_ERR, target->scsi_host,
> 				     PFX "Sending CM DREP failed\n");
> 		queue_work(system_long_wq, &target->tl_err_work);
> 		break;
> 
> That should be sufficient to restore communication after a DREQ has been 
> received.

Sure, but there is no guarantee that the wq is not busy with something
else, or that the reconnect attempt will succeed.  So, it would seem to
me that if you want to make sure your internal driver state is always
consistent, you should set the device connected state to 0 when there
are no connected channels any more.

However, while looking through the driver to research this, I noticed
something else that seems more important if you ask me.  With this patch
we now implement individual channel connection tracking.  However, in
srp_queuecommand() you pick the channel based on the tag, and the blk
layer has no idea of these disconnects, so the blk layer is free to
assign a tag/channel to a channel that's disconnected, and then as best
I can tell, you will simply try to post a work request to a channel
that's already disconnected, which I would expect to fail if we have
already disconnected this particular qp and not brought up a new one
yet.  So it seems to me there is a race condition between new incoming
SCSI commands and this disconnect/reconnect window, and that maybe we
should be sending these commands back to the mid layer for requeueing
when the channel the blk_mq tag points to is disconnected.  Or am I
missing something in there?
Bart Van Assche May 5, 2015, 2:26 p.m. UTC | #10
On 05/05/15 16:10, Doug Ledford wrote:
> However, while looking through the driver to research this, I noticed
> something else that seems more important if you ask me.  With this patch
> we now implement individual channel connection tracking.  However, in
> srp_queuecommand() you pick the channel based on the tag, and the blk
> layer has no idea of these disconnects, so the blk layer is free to
> assign a tag/channel to a channel that's disconnected, and then as best
> I can tell, you will simply try to post a work request to a channel
> that's already disconnected, which I would expect to fail if we have
> already disconnected this particular qp and not brought up a new one
> yet.  So it seems to me there is a race condition between new incoming
> SCSI commands and this disconnect/reconnect window, and that maybe we
> should be sending these commands back to the mid layer for requeueing
> when the channel the blk_mq tag points to is disconnected.  Or am I
> missing something in there?

Hello Doug,

Around the time a cable disconnect or other link layer failure is 
detected by the SRP initiator or any other SCSI LLD it is unavoidable 
that one or more SCSI requests fail. It is up to a higher layer (e.g. 
dm-multipath + multipathd) to decide what to do with such requests, e.g. 
queue these requests and resend these over another path. The SRP 
initiator driver has been tested thoroughly with the multipath 
queue_if_no_path policy, with a fio job with I/O verification enabled 
running on top of a dm device while concurrently repeatedly simulating 
link layer failures (via ibportstate).

Bart.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford May 5, 2015, 3:10 p.m. UTC | #11
On Tue, 2015-05-05 at 16:26 +0200, Bart Van Assche wrote:
> On 05/05/15 16:10, Doug Ledford wrote:
> > However, while looking through the driver to research this, I noticed
> > something else that seems more important if you ask me.  With this patch
> > we now implement individual channel connection tracking.  However, in
> > srp_queuecommand() you pick the channel based on the tag, and the blk
> > layer has no idea of these disconnects, so the blk layer is free to
> > assign a tag/channel to a channel that's disconnected, and then as best
> > I can tell, you will simply try to post a work request to a channel
> > that's already disconnected, which I would expect to fail if we have
> > already disconnected this particular qp and not brought up a new one
> > yet.  So it seems to me there is a race condition between new incoming
> > SCSI commands and this disconnect/reconnect window, and that maybe we
> > should be sending these commands back to the mid layer for requeueing
> > when the channel the blk_mq tag points to is disconnected.  Or am I
> > missing something in there?
> 
> Hello Doug,
> 
> Around the time a cable disconnect or other link layer failure is 
> detected by the SRP initiator or any other SCSI LLD it is unavoidable 
> that one or more SCSI requests fail. It is up to a higher layer (e.g. 
> dm-multipath + multipathd) to decide what to do with such requests, e.g. 
> queue these requests and resend these over another path.

Sure, but that wasn't my point.  My point was that if you know the
channel is disconnected, then why don't you go immediately to the
correct action in queuecommand (where correct action could be requeue
waiting on reconnect or return with error, whatever is appropriate)?
Instead you attempt to post a command to a known disconnected queue
pair.

>  The SRP 
> initiator driver has been tested thoroughly with the multipath 
> queue_if_no_path policy, with a fio job with I/O verification enabled 
> running on top of a dm device while concurrently repeatedly simulating 
> link layer failures (via ibportstate).

Part of my questions here are because I don't know how the blk_mq
handles certain conditions.  However, your testing above only handles
one case: all channels get dropped.  As unlikely it may be, what if
resource constraints caused just one channel to be dropped out of the
bunch and the others stayed alive?  Then the blk_mq would see requests
on just one queue come back errored, while the others finished
successfully.  Does it drop that one queue out of rotation, or does it
fail over the entire connection?
Bart Van Assche May 5, 2015, 3:27 p.m. UTC | #12
On 05/05/15 17:10, Doug Ledford wrote:
> On Tue, 2015-05-05 at 16:26 +0200, Bart Van Assche wrote:
>> On 05/05/15 16:10, Doug Ledford wrote:
>>> However, while looking through the driver to research this, I noticed
>>> something else that seems more important if you ask me.  With this patch
>>> we now implement individual channel connection tracking.  However, in
>>> srp_queuecommand() you pick the channel based on the tag, and the blk
>>> layer has no idea of these disconnects, so the blk layer is free to
>>> assign a tag/channel to a channel that's disconnected, and then as best
>>> I can tell, you will simply try to post a work request to a channel
>>> that's already disconnected, which I would expect to fail if we have
>>> already disconnected this particular qp and not brought up a new one
>>> yet.  So it seems to me there is a race condition between new incoming
>>> SCSI commands and this disconnect/reconnect window, and that maybe we
>>> should be sending these commands back to the mid layer for requeueing
>>> when the channel the blk_mq tag points to is disconnected.  Or am I
>>> missing something in there?
>>
>> Hello Doug,
>>
>> Around the time a cable disconnect or other link layer failure is
>> detected by the SRP initiator or any other SCSI LLD it is unavoidable
>> that one or more SCSI requests fail. It is up to a higher layer (e.g.
>> dm-multipath + multipathd) to decide what to do with such requests, e.g.
>> queue these requests and resend these over another path.
>
> Sure, but that wasn't my point.  My point was that if you know the
> channel is disconnected, then why don't you go immediately to the
> correct action in queuecommand (where correct action could be requeue
> waiting on reconnect or return with error, whatever is appropriate)?
> Instead you attempt to post a command to a known disconnected queue
> pair.
>
>> The SRP initiator driver has been tested thoroughly with the multipath
>> queue_if_no_path policy, with a fio job with I/O verification enabled
>> running on top of a dm device while concurrently repeatedly simulating
>> link layer failures (via ibportstate).
>
> Part of my questions here are because I don't know how the blk_mq
> handles certain conditions.  However, your testing above only handles
> one case: all channels get dropped.  As unlikely it may be, what if
> resource constraints caused just one channel to be dropped out of the
> bunch and the others stayed alive?  Then the blk_mq would see requests
> on just one queue come back errored, while the others finished
> successfully.  Does it drop that one queue out of rotation, or does it
> fail over the entire connection?

Hello Doug,

Sorry but I don't think that a SCSI LLD is the appropriate layer to 
choose between requeuing or failing a request. If multiple paths are 
available between an initiator system and a SAN and if one path fails 
only the multipath layer knows whether there are other working paths 
available. If a working path is still available then the request should 
be resent as soon as possible over another path. The multipath layer can 
only take such a decision after a SCSI LLD has failed a request.

If only one channel fails all other channels are disconnected and the 
transport layer error handling mechanism is started.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford May 5, 2015, 4:10 p.m. UTC | #13
On Tue, 2015-05-05 at 17:27 +0200, Bart Van Assche wrote:
> On 05/05/15 17:10, Doug Ledford wrote:
> > On Tue, 2015-05-05 at 16:26 +0200, Bart Van Assche wrote:
> >> On 05/05/15 16:10, Doug Ledford wrote:
> >>> However, while looking through the driver to research this, I noticed
> >>> something else that seems more important if you ask me.  With this patch
> >>> we now implement individual channel connection tracking.  However, in
> >>> srp_queuecommand() you pick the channel based on the tag, and the blk
> >>> layer has no idea of these disconnects, so the blk layer is free to
> >>> assign a tag/channel to a channel that's disconnected, and then as best
> >>> I can tell, you will simply try to post a work request to a channel
> >>> that's already disconnected, which I would expect to fail if we have
> >>> already disconnected this particular qp and not brought up a new one
> >>> yet.  So it seems to me there is a race condition between new incoming
> >>> SCSI commands and this disconnect/reconnect window, and that maybe we
> >>> should be sending these commands back to the mid layer for requeueing
> >>> when the channel the blk_mq tag points to is disconnected.  Or am I
> >>> missing something in there?
> >>
> >> Hello Doug,
> >>
> >> Around the time a cable disconnect or other link layer failure is
> >> detected by the SRP initiator or any other SCSI LLD it is unavoidable
> >> that one or more SCSI requests fail. It is up to a higher layer (e.g.
> >> dm-multipath + multipathd) to decide what to do with such requests, e.g.
> >> queue these requests and resend these over another path.
> >
> > Sure, but that wasn't my point.  My point was that if you know the
> > channel is disconnected, then why don't you go immediately to the
> > correct action in queuecommand (where correct action could be requeue
> > waiting on reconnect or return with error, whatever is appropriate)?
> > Instead you attempt to post a command to a known disconnected queue
> > pair.
> >
> >> The SRP initiator driver has been tested thoroughly with the multipath
> >> queue_if_no_path policy, with a fio job with I/O verification enabled
> >> running on top of a dm device while concurrently repeatedly simulating
> >> link layer failures (via ibportstate).
> >
> > Part of my questions here are because I don't know how the blk_mq
> > handles certain conditions.  However, your testing above only handles
> > one case: all channels get dropped.  As unlikely it may be, what if
> > resource constraints caused just one channel to be dropped out of the
> > bunch and the others stayed alive?  Then the blk_mq would see requests
> > on just one queue come back errored, while the others finished
> > successfully.  Does it drop that one queue out of rotation, or does it
> > fail over the entire connection?
> 
> Hello Doug,
> 
> Sorry but I don't think that a SCSI LLD is the appropriate layer to 
> choose between requeuing or failing a request.

Be that as it may, that doesn't change what I said about posting a
command to a known disconnected QP.  You could just fail immediately.
Something like:

if (!ch->connected) {
	scmnd->result = DID_NO_CONNECT;
	goto err;
}

right after getting the channel in queuecommand would work.  That would
save a couple spinlocks, several DMA mappings, a call into the low level
driver, and a few other things.  (And I only left requeue on the table
because I wasn't sure how the blk_mq dealt with just a single channel
being down versus all of them being down)

>  If multiple paths are 
> available between an initiator system and a SAN and if one path fails

Who says the path failed?  The path may be just fine.

> only the multipath layer knows whether there are other working paths 
> available. If a working path is still available then the request should 
> be resent as soon as possible over another path. The multipath layer can 
> only take such a decision after a SCSI LLD has failed a request.

Sure.  I totally get failing fast and unilaterally for multipath managed
devices.  That's all assuming multipath though.  There are uses without
that.

But my point in all of this is that if you have a single qp between
yourself and the target, then any error including a qp resource error ==
path error since you only have one path.  When you have a multi queue
device, that's no longer true.  A transient resource problem on one qp
does not mean a path event (at least not necessarily, although your
statement below converts a QP event into a path event by virtue
disconnecting and reconnecting all of the QPs).  My curiosity is now
moot given what you wrote about tearing everything down and reconnecting
(unless the error handling is modified to be more subtle in its
workings), but the original question in my mind was what happens at the
blk_mq level if you did have a single queue drop but not all of them and
you weren't using multipath.

> If only one channel fails all other channels are disconnected and the 
> transport layer error handling mechanism is started.

I missed that.  I assume it's done in srp_start_tl_fail_timers()?
Bart Van Assche May 6, 2015, 9:29 a.m. UTC | #14
Hello Doug,

On 05/05/15 18:10, Doug Ledford wrote:
> Be that as it may, that doesn't change what I said about posting a
> command to a known disconnected QP.  You could just fail immediately.
> Something like:
>
> if (!ch->connected) {
> 	scmnd->result = DID_NO_CONNECT;
> 	goto err;
> }
>
> right after getting the channel in queuecommand would work.  That would
> save a couple spinlocks, several DMA mappings, a call into the low level
> driver, and a few other things.  (And I only left requeue on the table
> because I wasn't sure how the blk_mq dealt with just a single channel
> being down versus all of them being down)

What you wrote above looks correct to me. However, it is intentional 
that such a check is not present in srp_queuecommand(). The intention 
was to optimize the hot path of that driver as much as possible. Hence 
the choice to post a work request on the QP even after it has been 
disconnected and to let the HCA generate an error completion.

> But my point in all of this is that if you have a single qp between
> yourself and the target, then any error including a qp resource error ==
> path error since you only have one path.  When you have a multi queue
> device, that's no longer true.  A transient resource problem on one qp
> does not mean a path event (at least not necessarily, although your
> statement below converts a QP event into a path event by virtue
> disconnecting and reconnecting all of the QPs).  My curiosity is now
> moot given what you wrote about tearing everything down and reconnecting
> (unless the error handling is modified to be more subtle in its
> workings), but the original question in my mind was what happens at the
> blk_mq level if you did have a single queue drop but not all of them and
> you weren't using multipath.

If we want to support this scenario without adding similar handling code 
to every SCSI LLD, I think we first need to change how blk-mq and 
dm-multipath interact. Today dm-multipath is a layer on top of blk-mq. 
Supporting the above scenario properly is possible, e.g. by integrating 
multipath support into the blk-mq layer. I think Hannes and Christoph have 
already started to work on this.

>> If only one channel fails all other channels are disconnected and the
>> transport layer error handling mechanism is started.
>
> I missed that.  I assume it's done in srp_start_tl_fail_timers()?

Yes, that's correct. Both QP errors and reception of a DREQ trigger a 
call to srp_tl_err_work(). That last function calls 
srp_start_tl_fail_timers(), which starts the reconnection mechanism, at 
least if the reconnect_delay parameter has a positive value (> 0).

Bart.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford May 7, 2015, 1:44 p.m. UTC | #15
On Wed, 2015-05-06 at 11:29 +0200, Bart Van Assche wrote:
> Hello Doug,
> 
> On 05/05/15 18:10, Doug Ledford wrote:
> > Be that as it may, that doesn't change what I said about posting a
> > command to a known disconnected QP.  You could just fail immediately.
> > Something like:
> >
> > if (!ch->connected) {
> > 	scmnd->result = DID_NO_CONNECT;
> > 	goto err;
> > }
> >
> > right after getting the channel in queuecommand would work.  That would
> > save a couple spinlocks, several DMA mappings, a call into the low level
> > driver, and a few other things.  (And I only left requeue on the table
> > because I wasn't sure how the blk_mq dealt with just a single channel
> > being down versus all of them being down)
> 
> What you wrote above looks correct to me. However, it is intentional 
> that such a check is not present in srp_queuecommand(). The intention 
> was to optimize the hot path of that driver as much as possible. Hence 
> the choice to post a work request on the QP even after it has been 
> disconnected and to let the HCA generate an error completion.

Given the amount of time it takes to do all of the dma mapping in that
function on the normal hot path, I suspect the above test, especially if
you added an unlikely annotation, would not even make a blip on your
performance numbers.  In any case, it's not something I would require in
the patch.

> > But my point in all of this is that if you have a single qp between
> > yourself and the target, then any error including a qp resource error ==
> > path error since you only have one path.  When you have a multi queue
> > device, that's no longer true.  A transient resource problem on one qp
> > does not mean a path event (at least not necessarily, although your
> > statement below converts a QP event into a path event by virtue
> > disconnecting and reconnecting all of the QPs).  My curiosity is now
> > moot given what you wrote about tearing everything down and reconnecting
> > (unless the error handling is modified to be more subtle in its
> > workings), but the original question in my mind was what happens at the
> > blk_mq level if you did have a single queue drop but not all of them and
> > you weren't using multipath.
> 
> If we want to support this without adding similar code to handle this in 
> every SCSI LLD I think we need to change first how blk-mq and 
> dm-multipath interact. Today dm-multipath is a layer on top of blk-mq. 
> Supporting the above scenario properly is possible e.g. by integrating 
> multipath support in the blk-mq layer. I think Hannes and Christoph have 
> already started to work on this.

Ok.
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 5ce6cfd..0eb07d3 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -465,14 +465,13 @@  static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
  */
 static void srp_destroy_qp(struct srp_rdma_ch *ch)
 {
-	struct srp_target_port *target = ch->target;
 	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
 	static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
 	struct ib_recv_wr *bad_wr;
 	int ret;
 
 	/* Destroying a QP and reusing ch->done is only safe if not connected */
-	WARN_ON_ONCE(target->connected);
+	WARN_ON_ONCE(ch->connected);
 
 	ret = ib_modify_qp(ch->qp, &attr, IB_QP_STATE);
 	WARN_ONCE(ret, "ib_cm_init_qp_attr() returned %d\n", ret);
@@ -836,6 +835,7 @@  static void srp_disconnect_target(struct srp_target_port *target)
 
 		for (i = 0; i < target->ch_count; i++) {
 			ch = &target->ch[i];
+			ch->connected = false;
 			if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) {
 				shost_printk(KERN_DEBUG, target->scsi_host,
 					     PFX "Sending CM DREQ failed\n");
@@ -1017,6 +1017,7 @@  static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
 		switch (ch->status) {
 		case 0:
 			srp_change_conn_state(target, true);
+			ch->connected = true;
 			return 0;
 
 		case SRP_PORT_REDIRECT:
@@ -2367,7 +2368,7 @@  static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 	case IB_CM_DREQ_RECEIVED:
 		shost_printk(KERN_WARNING, target->scsi_host,
 			     PFX "DREQ received - connection closed\n");
-		srp_change_conn_state(target, false);
+		ch->connected = false;
 		if (ib_send_cm_drep(cm_id, NULL, 0))
 			shost_printk(KERN_ERR, target->scsi_host,
 				     PFX "Sending CM DREP failed\n");
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index a611556..95a4471 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -170,6 +170,7 @@  struct srp_rdma_ch {
 
 	struct completion	tsk_mgmt_done;
 	u8			tsk_mgmt_status;
+	bool			connected;
 };
 
 /**