diff mbox series

[v3] iscsi: Perform connection failure entirely in kernel space

Message ID 20191226204746.2197233-1-krisman@collabora.com (mailing list archive)
State Superseded
Headers show
Series [v3] iscsi: Perform connection failure entirely in kernel space | expand

Commit Message

Gabriel Krisman Bertazi Dec. 26, 2019, 8:47 p.m. UTC
From: Bharath Ravi <rbharath@google.com>

Connection failure processing depends on a daemon being present to (at
least) stop the connection and start recovery.  This is a problem on a
multipath scenario, where if the daemon failed for whatever reason, the
SCSI path is never marked as down, multipath won't perform the
failover and IO to the device will be forever waiting for that
connection to come back.

This patch performs the connection failure entirely inside the kernel.
This way, the failover can happen and pending IO can continue even if
the daemon is dead. Once the daemon comes alive again, it can execute
recovery procedures if applicable.

Changes since v2:
  - Don't hold rx_mutex for too long at once

Changes since v1:
  - Remove module parameter.
  - Always do kernel-side stop work.
  - Block recovery timeout handler if system is dying.
  - send a CONN_TERM stop if the system is dying.

Cc: Mike Christie <mchristi@redhat.com>
Cc: Lee Duncan <LDuncan@suse.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Co-developed-by: Dave Clausen <dclausen@google.com>
Signed-off-by: Dave Clausen <dclausen@google.com>
Co-developed-by: Nick Black <nlb@google.com>
Signed-off-by: Nick Black <nlb@google.com>
Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
Co-developed-by: Anatol Pomazau <anatol@google.com>
Signed-off-by: Anatol Pomazau <anatol@google.com>
Co-developed-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Co-developed-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Co-developed-by: Junho Ryu <jayr@google.com>
Signed-off-by: Junho Ryu <jayr@google.com>
Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Signed-off-by: Bharath Ravi <rbharath@google.com>
Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++
 include/scsi/scsi_transport_iscsi.h |  1 +
 2 files changed, 64 insertions(+)

Comments

Lee Duncan Jan. 1, 2020, 6:45 p.m. UTC | #1
On 12/26/19 12:47 PM, Gabriel Krisman Bertazi wrote:
> From: Bharath Ravi <rbharath@google.com>
> 
> Connection failure processing depends on a daemon being present to (at
> least) stop the connection and start recovery.  This is a problem on a
> multipath scenario, where if the daemon failed for whatever reason, the
> SCSI path is never marked as down, multipath won't perform the
> failover and IO to the device will be forever waiting for that
> connection to come back.
> 
> This patch performs the connection failure entirely inside the kernel.
> This way, the failover can happen and pending IO can continue even if
> the daemon is dead. Once the daemon comes alive again, it can execute
> recovery procedures if applicable.
> 
> Changes since v2:
>   - Don't hold rx_mutex for too long at once
> 
> Changes since v1:
>   - Remove module parameter.
>   - Always do kernel-side stop work.
>   - Block recovery timeout handler if system is dying.
>   - send a CONN_TERM stop if the system is dying.
> 
> Cc: Mike Christie <mchristi@redhat.com>
> Cc: Lee Duncan <LDuncan@suse.com>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Co-developed-by: Dave Clausen <dclausen@google.com>
> Signed-off-by: Dave Clausen <dclausen@google.com>
> Co-developed-by: Nick Black <nlb@google.com>
> Signed-off-by: Nick Black <nlb@google.com>
> Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
> Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
> Co-developed-by: Anatol Pomazau <anatol@google.com>
> Signed-off-by: Anatol Pomazau <anatol@google.com>
> Co-developed-by: Tahsin Erdogan <tahsin@google.com>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> Co-developed-by: Frank Mayhar <fmayhar@google.com>
> Signed-off-by: Frank Mayhar <fmayhar@google.com>
> Co-developed-by: Junho Ryu <jayr@google.com>
> Signed-off-by: Junho Ryu <jayr@google.com>
> Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
> Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
> Signed-off-by: Bharath Ravi <rbharath@google.com>
> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> ---
>  drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++
>  include/scsi/scsi_transport_iscsi.h |  1 +
>  2 files changed, 64 insertions(+)
> 
> diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
> index 271afea654e2..c6db6ded60a1 100644
> --- a/drivers/scsi/scsi_transport_iscsi.c
> +++ b/drivers/scsi/scsi_transport_iscsi.c
> @@ -86,6 +86,12 @@ struct iscsi_internal {
>  	struct transport_container session_cont;
>  };
>  
> +/* Worker to perform connection failure on unresponsive connections
> + * completely in kernel space.
> + */
> +static void stop_conn_work_fn(struct work_struct *work);
> +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
> +
>  static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
>  static struct workqueue_struct *iscsi_eh_timer_workq;
>  
> @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex);
>  static LIST_HEAD(sesslist);
>  static DEFINE_SPINLOCK(sesslock);
>  static LIST_HEAD(connlist);
> +static LIST_HEAD(connlist_err);
>  static DEFINE_SPINLOCK(connlock);
>  
>  static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)
> @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
>  
>  	mutex_init(&conn->ep_mutex);
>  	INIT_LIST_HEAD(&conn->conn_list);
> +	INIT_LIST_HEAD(&conn->conn_list_err);
>  	conn->transport = transport;
>  	conn->cid = cid;
>  
> @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)
>  
>  	spin_lock_irqsave(&connlock, flags);
>  	list_del(&conn->conn_list);
> +	list_del(&conn->conn_list_err);
>  	spin_unlock_irqrestore(&connlock, flags);
>  
>  	transport_unregister_device(&conn->dev);
> @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
>  }
>  EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
>  
> +static void stop_conn_work_fn(struct work_struct *work)
> +{
> +	struct iscsi_cls_conn *conn, *tmp;
> +	unsigned long flags;
> +	LIST_HEAD(recovery_list);
> +
> +	spin_lock_irqsave(&connlock, flags);
> +	if (list_empty(&connlist_err)) {
> +		spin_unlock_irqrestore(&connlock, flags);
> +		return;
> +	}
> +	list_splice_init(&connlist_err, &recovery_list);
> +	spin_unlock_irqrestore(&connlock, flags);
> +
> +	list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
> +		uint32_t sid = iscsi_conn_get_sid(conn);
> +		struct iscsi_cls_session *session;
> +
> +		mutex_lock(&rx_queue_mutex);
> +
> +		session = iscsi_session_lookup(sid);
> +		if (session) {
> +			if (system_state != SYSTEM_RUNNING) {
> +				session->recovery_tmo = 0;
> +				conn->transport->stop_conn(conn,
> +							   STOP_CONN_TERM);
> +			} else {
> +				conn->transport->stop_conn(conn,
> +							   STOP_CONN_RECOVER);
> +			}
> +		}
> +
> +		list_del_init(&conn->conn_list_err);
> +
> +		mutex_unlock(&rx_queue_mutex);
> +
> +		/* we don't want to hold rx_queue_mutex for too long,
> +		 * for instance if many conns failed at the same time,
> +		 * since this stall other iscsi maintenance operations.
> +		 * Give other users a chance to proceed.
> +		 */
> +		cond_resched();
> +	}
> +}
> +
>  void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
>  {
>  	struct nlmsghdr	*nlh;
> @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
>  	struct iscsi_uevent *ev;
>  	struct iscsi_internal *priv;
>  	int len = nlmsg_total_size(sizeof(*ev));
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&connlock, flags);
> +	list_add(&conn->conn_list_err, &connlist_err);
> +	spin_unlock_irqrestore(&connlock, flags);
> +	queue_work(system_unbound_wq, &stop_conn_work);
>  
>  	priv = iscsi_if_transport_lookup(conn->transport);
>  	if (!priv)
> @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev
>  	if (!conn)
>  		return -EINVAL;
>  
> +	if (!list_empty(&conn->conn_list_err))
> +		return -EAGAIN;
> +
>  	ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n");
>  	if (transport->destroy_conn)
>  		transport->destroy_conn(conn);
> diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
> index 325ae731d9ad..2129dc9e2dec 100644
> --- a/include/scsi/scsi_transport_iscsi.h
> +++ b/include/scsi/scsi_transport_iscsi.h
> @@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no,
>  
>  struct iscsi_cls_conn {
>  	struct list_head conn_list;	/* item in connlist */
> +	struct list_head conn_list_err;	/* item in connlist_err */
>  	void *dd_data;			/* LLD private data */
>  	struct iscsi_transport *transport;
>  	uint32_t cid;			/* connection id */
> 

Reviewed-by: Lee Duncan <lduncan@suse.com>
Khazhy Kumykov Jan. 2, 2020, 5:07 p.m. UTC | #2
On Thu, Dec 26, 2019 at 3:48 PM Gabriel Krisman Bertazi
<krisman@collabora.com> wrote:
>
> From: Bharath Ravi <rbharath@google.com>
>
> Connection failure processing depends on a daemon being present to (at
> least) stop the connection and start recovery.  This is a problem on a
> multipath scenario, where if the daemon failed for whatever reason, the
> SCSI path is never marked as down, multipath won't perform the
> failover and IO to the device will be forever waiting for that
> connection to come back.
>
> This patch performs the connection failure entirely inside the kernel.
> This way, the failover can happen and pending IO can continue even if
> the daemon is dead. Once the daemon comes alive again, it can execute
> recovery procedures if applicable.
>
> Changes since v2:
>   - Don't hold rx_mutex for too long at once
>
> Changes since v1:
>   - Remove module parameter.
>   - Always do kernel-side stop work.
>   - Block recovery timeout handler if system is dying.
>   - send a CONN_TERM stop if the system is dying.
>
> Cc: Mike Christie <mchristi@redhat.com>
> Cc: Lee Duncan <LDuncan@suse.com>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Co-developed-by: Dave Clausen <dclausen@google.com>
> Signed-off-by: Dave Clausen <dclausen@google.com>
> Co-developed-by: Nick Black <nlb@google.com>
> Signed-off-by: Nick Black <nlb@google.com>
> Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
> Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
> Co-developed-by: Anatol Pomazau <anatol@google.com>
> Signed-off-by: Anatol Pomazau <anatol@google.com>
> Co-developed-by: Tahsin Erdogan <tahsin@google.com>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> Co-developed-by: Frank Mayhar <fmayhar@google.com>
> Signed-off-by: Frank Mayhar <fmayhar@google.com>
> Co-developed-by: Junho Ryu <jayr@google.com>
> Signed-off-by: Junho Ryu <jayr@google.com>
> Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
> Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
> Signed-off-by: Bharath Ravi <rbharath@google.com>
> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> ---
>  drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++
>  include/scsi/scsi_transport_iscsi.h |  1 +
>  2 files changed, 64 insertions(+)
>
> diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
> index 271afea654e2..c6db6ded60a1 100644
> --- a/drivers/scsi/scsi_transport_iscsi.c
> +++ b/drivers/scsi/scsi_transport_iscsi.c
> @@ -86,6 +86,12 @@ struct iscsi_internal {
>         struct transport_container session_cont;
>  };
>
> +/* Worker to perform connection failure on unresponsive connections
> + * completely in kernel space.
> + */
> +static void stop_conn_work_fn(struct work_struct *work);
> +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
> +
>  static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
>  static struct workqueue_struct *iscsi_eh_timer_workq;
>
> @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex);
>  static LIST_HEAD(sesslist);
>  static DEFINE_SPINLOCK(sesslock);
>  static LIST_HEAD(connlist);
> +static LIST_HEAD(connlist_err);
>  static DEFINE_SPINLOCK(connlock);
>
>  static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)
> @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
>
>         mutex_init(&conn->ep_mutex);
>         INIT_LIST_HEAD(&conn->conn_list);
> +       INIT_LIST_HEAD(&conn->conn_list_err);
>         conn->transport = transport;
>         conn->cid = cid;
>
> @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)
>
>         spin_lock_irqsave(&connlock, flags);
>         list_del(&conn->conn_list);
> +       list_del(&conn->conn_list_err);
>         spin_unlock_irqrestore(&connlock, flags);
>
>         transport_unregister_device(&conn->dev);
> @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
>  }
>  EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
>
> +static void stop_conn_work_fn(struct work_struct *work)
> +{
> +       struct iscsi_cls_conn *conn, *tmp;
> +       unsigned long flags;
> +       LIST_HEAD(recovery_list);
> +
> +       spin_lock_irqsave(&connlock, flags);
> +       if (list_empty(&connlist_err)) {
> +               spin_unlock_irqrestore(&connlock, flags);
> +               return;
> +       }
> +       list_splice_init(&connlist_err, &recovery_list);
> +       spin_unlock_irqrestore(&connlock, flags);
> +
> +       list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
> +               uint32_t sid = iscsi_conn_get_sid(conn);
> +               struct iscsi_cls_session *session;
> +
> +               mutex_lock(&rx_queue_mutex);
This worried me a bit, but it seems we won't destroy_conn while it's
on the err list - cool.
> +
> +               session = iscsi_session_lookup(sid);
> +               if (session) {
> +                       if (system_state != SYSTEM_RUNNING) {
> +                               session->recovery_tmo = 0;
> +                               conn->transport->stop_conn(conn,
> +                                                          STOP_CONN_TERM);
> +                       } else {
> +                               conn->transport->stop_conn(conn,
> +                                                          STOP_CONN_RECOVER);
> +                       }
> +               }
> +
> +               list_del_init(&conn->conn_list_err);
> +
> +               mutex_unlock(&rx_queue_mutex);
> +
> +               /* we don't want to hold rx_queue_mutex for too long,
> +                * for instance if many conns failed at the same time,
> +                * since this stall other iscsi maintenance operations.
> +                * Give other users a chance to proceed.
> +                */
> +               cond_resched();
> +       }
> +}
> +
>  void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
>  {
>         struct nlmsghdr *nlh;
> @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
>         struct iscsi_uevent *ev;
>         struct iscsi_internal *priv;
>         int len = nlmsg_total_size(sizeof(*ev));
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&connlock, flags);
> +       list_add(&conn->conn_list_err, &connlist_err);
> +       spin_unlock_irqrestore(&connlock, flags);
> +       queue_work(system_unbound_wq, &stop_conn_work);
>
>         priv = iscsi_if_transport_lookup(conn->transport);
>         if (!priv)
> @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev
>         if (!conn)
>                 return -EINVAL;
>
> +       if (!list_empty(&conn->conn_list_err))
Does this check need to be under connlock?
> +               return -EAGAIN;
> +
>         ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n");
>         if (transport->destroy_conn)
>                 transport->destroy_conn(conn);
> diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
> index 325ae731d9ad..2129dc9e2dec 100644
> --- a/include/scsi/scsi_transport_iscsi.h
> +++ b/include/scsi/scsi_transport_iscsi.h
> @@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no,
>
>  struct iscsi_cls_conn {
>         struct list_head conn_list;     /* item in connlist */
> +       struct list_head conn_list_err; /* item in connlist_err */
>         void *dd_data;                  /* LLD private data */
>         struct iscsi_transport *transport;
>         uint32_t cid;                   /* connection id */
> --
> 2.24.1
>
Gabriel Krisman Bertazi Jan. 2, 2020, 6:13 p.m. UTC | #3
Khazhismel Kumykov <khazhy@google.com> writes:

> On Thu, Dec 26, 2019 at 3:48 PM Gabriel Krisman Bertazi
> <krisman@collabora.com> wrote:
>>
>> From: Bharath Ravi <rbharath@google.com>
>>
>> Connection failure processing depends on a daemon being present to (at
>> least) stop the connection and start recovery.  This is a problem on a
>> multipath scenario, where if the daemon failed for whatever reason, the
>> SCSI path is never marked as down, multipath won't perform the
>> failover and IO to the device will be forever waiting for that
>> connection to come back.
>>
>> This patch performs the connection failure entirely inside the kernel.
>> This way, the failover can happen and pending IO can continue even if
>> the daemon is dead. Once the daemon comes alive again, it can execute
>> recovery procedures if applicable.
>>
>> Changes since v2:
>>   - Don't hold rx_mutex for too long at once
>>
>> Changes since v1:
>>   - Remove module parameter.
>>   - Always do kernel-side stop work.
>>   - Block recovery timeout handler if system is dying.
>>   - send a CONN_TERM stop if the system is dying.
>>
>> Cc: Mike Christie <mchristi@redhat.com>
>> Cc: Lee Duncan <LDuncan@suse.com>
>> Cc: Bart Van Assche <bvanassche@acm.org>
>> Co-developed-by: Dave Clausen <dclausen@google.com>
>> Signed-off-by: Dave Clausen <dclausen@google.com>
>> Co-developed-by: Nick Black <nlb@google.com>
>> Signed-off-by: Nick Black <nlb@google.com>
>> Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
>> Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
>> Co-developed-by: Anatol Pomazau <anatol@google.com>
>> Signed-off-by: Anatol Pomazau <anatol@google.com>
>> Co-developed-by: Tahsin Erdogan <tahsin@google.com>
>> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
>> Co-developed-by: Frank Mayhar <fmayhar@google.com>
>> Signed-off-by: Frank Mayhar <fmayhar@google.com>
>> Co-developed-by: Junho Ryu <jayr@google.com>
>> Signed-off-by: Junho Ryu <jayr@google.com>
>> Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
>> Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
>> Signed-off-by: Bharath Ravi <rbharath@google.com>
>> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>> ---
>>  drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++
>>  include/scsi/scsi_transport_iscsi.h |  1 +
>>  2 files changed, 64 insertions(+)
>>
>> diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
>> index 271afea654e2..c6db6ded60a1 100644
>> --- a/drivers/scsi/scsi_transport_iscsi.c
>> +++ b/drivers/scsi/scsi_transport_iscsi.c
>> @@ -86,6 +86,12 @@ struct iscsi_internal {
>>         struct transport_container session_cont;
>>  };
>>
>> +/* Worker to perform connection failure on unresponsive connections
>> + * completely in kernel space.
>> + */
>> +static void stop_conn_work_fn(struct work_struct *work);
>> +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
>> +
>>  static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
>>  static struct workqueue_struct *iscsi_eh_timer_workq;
>>
>> @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex);
>>  static LIST_HEAD(sesslist);
>>  static DEFINE_SPINLOCK(sesslock);
>>  static LIST_HEAD(connlist);
>> +static LIST_HEAD(connlist_err);
>>  static DEFINE_SPINLOCK(connlock);
>>
>>  static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)
>> @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
>>
>>         mutex_init(&conn->ep_mutex);
>>         INIT_LIST_HEAD(&conn->conn_list);
>> +       INIT_LIST_HEAD(&conn->conn_list_err);
>>         conn->transport = transport;
>>         conn->cid = cid;
>>
>> @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)
>>
>>         spin_lock_irqsave(&connlock, flags);
>>         list_del(&conn->conn_list);
>> +       list_del(&conn->conn_list_err);
>>         spin_unlock_irqrestore(&connlock, flags);
>>
>>         transport_unregister_device(&conn->dev);
>> @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
>>  }
>>  EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
>>
>> +static void stop_conn_work_fn(struct work_struct *work)
>> +{
>> +       struct iscsi_cls_conn *conn, *tmp;
>> +       unsigned long flags;
>> +       LIST_HEAD(recovery_list);
>> +
>> +       spin_lock_irqsave(&connlock, flags);
>> +       if (list_empty(&connlist_err)) {
>> +               spin_unlock_irqrestore(&connlock, flags);
>> +               return;
>> +       }
>> +       list_splice_init(&connlist_err, &recovery_list);
>> +       spin_unlock_irqrestore(&connlock, flags);
>> +
>> +       list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
>> +               uint32_t sid = iscsi_conn_get_sid(conn);
>> +               struct iscsi_cls_session *session;
>> +
>> +               mutex_lock(&rx_queue_mutex);
> This worried me a bit, but it seems we won't destroy_conn while it's
> on the err list - cool.
>> +
>> +               session = iscsi_session_lookup(sid);
>> +               if (session) {
>> +                       if (system_state != SYSTEM_RUNNING) {
>> +                               session->recovery_tmo = 0;
>> +                               conn->transport->stop_conn(conn,
>> +                                                          STOP_CONN_TERM);
>> +                       } else {
>> +                               conn->transport->stop_conn(conn,
>> +                                                          STOP_CONN_RECOVER);
>> +                       }
>> +               }
>> +
>> +               list_del_init(&conn->conn_list_err);
>> +
>> +               mutex_unlock(&rx_queue_mutex);
>> +
>> +               /* we don't want to hold rx_queue_mutex for too long,
>> +                * for instance if many conns failed at the same time,
>> +                * since this stall other iscsi maintenance operations.
>> +                * Give other users a chance to proceed.
>> +                */
>> +               cond_resched();
>> +       }
>> +}
>> +
>>  void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
>>  {
>>         struct nlmsghdr *nlh;
>> @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
>>         struct iscsi_uevent *ev;
>>         struct iscsi_internal *priv;
>>         int len = nlmsg_total_size(sizeof(*ev));
>> +       unsigned long flags;
>> +
>> +       spin_lock_irqsave(&connlock, flags);
>> +       list_add(&conn->conn_list_err, &connlist_err);
>> +       spin_unlock_irqrestore(&connlock, flags);
>> +       queue_work(system_unbound_wq, &stop_conn_work);
>>
>>         priv = iscsi_if_transport_lookup(conn->transport);
>>         if (!priv)
>> @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev
>>         if (!conn)
>>                 return -EINVAL;
>>
>> +       if (!list_empty(&conn->conn_list_err))
> Does this check need to be under connlock?

My understanding is that it is not necessary, since it is serialized
against the conn removal itself, through the rx_mutex, it seemed safe to
do the verification lockless.

It can only race with the insertion, in which case, it will be safely
removed from the dispatch list here, under rx_mutex, and the worker will
detect and skipped it.
Khazhy Kumykov Jan. 2, 2020, 6:24 p.m. UTC | #4
On Thu, Jan 2, 2020 at 1:13 PM Gabriel Krisman Bertazi
<krisman@collabora.com> wrote:
>
> Khazhismel Kumykov <khazhy@google.com> writes:
>
> > On Thu, Dec 26, 2019 at 3:48 PM Gabriel Krisman Bertazi
> > <krisman@collabora.com> wrote:
> >>
> >> From: Bharath Ravi <rbharath@google.com>
> >>
> >> Connection failure processing depends on a daemon being present to (at
> >> least) stop the connection and start recovery.  This is a problem on a
> >> multipath scenario, where if the daemon failed for whatever reason, the
> >> SCSI path is never marked as down, multipath won't perform the
> >> failover and IO to the device will be forever waiting for that
> >> connection to come back.
> >>
> >> This patch performs the connection failure entirely inside the kernel.
> >> This way, the failover can happen and pending IO can continue even if
> >> the daemon is dead. Once the daemon comes alive again, it can execute
> >> recovery procedures if applicable.
> >>
> >> Changes since v2:
> >>   - Don't hold rx_mutex for too long at once
> >>
> >> Changes since v1:
> >>   - Remove module parameter.
> >>   - Always do kernel-side stop work.
> >>   - Block recovery timeout handler if system is dying.
> >>   - send a CONN_TERM stop if the system is dying.
> >>
> >> Cc: Mike Christie <mchristi@redhat.com>
> >> Cc: Lee Duncan <LDuncan@suse.com>
> >> Cc: Bart Van Assche <bvanassche@acm.org>
> >> Co-developed-by: Dave Clausen <dclausen@google.com>
> >> Signed-off-by: Dave Clausen <dclausen@google.com>
> >> Co-developed-by: Nick Black <nlb@google.com>
> >> Signed-off-by: Nick Black <nlb@google.com>
> >> Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
> >> Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
> >> Co-developed-by: Anatol Pomazau <anatol@google.com>
> >> Signed-off-by: Anatol Pomazau <anatol@google.com>
> >> Co-developed-by: Tahsin Erdogan <tahsin@google.com>
> >> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> >> Co-developed-by: Frank Mayhar <fmayhar@google.com>
> >> Signed-off-by: Frank Mayhar <fmayhar@google.com>
> >> Co-developed-by: Junho Ryu <jayr@google.com>
> >> Signed-off-by: Junho Ryu <jayr@google.com>
> >> Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
> >> Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
> >> Signed-off-by: Bharath Ravi <rbharath@google.com>
> >> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> >> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> >> ---
> >>  drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++
> >>  include/scsi/scsi_transport_iscsi.h |  1 +
> >>  2 files changed, 64 insertions(+)
> >>
> >> diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
> >> index 271afea654e2..c6db6ded60a1 100644
> >> --- a/drivers/scsi/scsi_transport_iscsi.c
> >> +++ b/drivers/scsi/scsi_transport_iscsi.c
> >> @@ -86,6 +86,12 @@ struct iscsi_internal {
> >>         struct transport_container session_cont;
> >>  };
> >>
> >> +/* Worker to perform connection failure on unresponsive connections
> >> + * completely in kernel space.
> >> + */
> >> +static void stop_conn_work_fn(struct work_struct *work);
> >> +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
> >> +
> >>  static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
> >>  static struct workqueue_struct *iscsi_eh_timer_workq;
> >>
> >> @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex);
> >>  static LIST_HEAD(sesslist);
> >>  static DEFINE_SPINLOCK(sesslock);
> >>  static LIST_HEAD(connlist);
> >> +static LIST_HEAD(connlist_err);
> >>  static DEFINE_SPINLOCK(connlock);
> >>
> >>  static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)
> >> @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
> >>
> >>         mutex_init(&conn->ep_mutex);
> >>         INIT_LIST_HEAD(&conn->conn_list);
> >> +       INIT_LIST_HEAD(&conn->conn_list_err);
> >>         conn->transport = transport;
> >>         conn->cid = cid;
> >>
> >> @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)
> >>
> >>         spin_lock_irqsave(&connlock, flags);
> >>         list_del(&conn->conn_list);
> >> +       list_del(&conn->conn_list_err);
> >>         spin_unlock_irqrestore(&connlock, flags);
> >>
> >>         transport_unregister_device(&conn->dev);
> >> @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
> >>  }
> >>  EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
> >>
> >> +static void stop_conn_work_fn(struct work_struct *work)
> >> +{
> >> +       struct iscsi_cls_conn *conn, *tmp;
> >> +       unsigned long flags;
> >> +       LIST_HEAD(recovery_list);
> >> +
> >> +       spin_lock_irqsave(&connlock, flags);
> >> +       if (list_empty(&connlist_err)) {
> >> +               spin_unlock_irqrestore(&connlock, flags);
> >> +               return;
> >> +       }
> >> +       list_splice_init(&connlist_err, &recovery_list);
> >> +       spin_unlock_irqrestore(&connlock, flags);
> >> +
> >> +       list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
> >> +               uint32_t sid = iscsi_conn_get_sid(conn);
> >> +               struct iscsi_cls_session *session;
> >> +
> >> +               mutex_lock(&rx_queue_mutex);
> > This worried me a bit, but it seems we won't destroy_conn while it's
> > on the err list - cool.
> >> +
> >> +               session = iscsi_session_lookup(sid);
> >> +               if (session) {
> >> +                       if (system_state != SYSTEM_RUNNING) {
> >> +                               session->recovery_tmo = 0;
> >> +                               conn->transport->stop_conn(conn,
> >> +                                                          STOP_CONN_TERM);
> >> +                       } else {
> >> +                               conn->transport->stop_conn(conn,
> >> +                                                          STOP_CONN_RECOVER);
> >> +                       }
> >> +               }
> >> +
> >> +               list_del_init(&conn->conn_list_err);
> >> +
> >> +               mutex_unlock(&rx_queue_mutex);
> >> +
> >> +               /* we don't want to hold rx_queue_mutex for too long,
> >> +                * for instance if many conns failed at the same time,
> >> +                * since this stall other iscsi maintenance operations.
> >> +                * Give other users a chance to proceed.
> >> +                */
> >> +               cond_resched();
> >> +       }
> >> +}
> >> +
> >>  void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
> >>  {
> >>         struct nlmsghdr *nlh;
> >> @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
> >>         struct iscsi_uevent *ev;
> >>         struct iscsi_internal *priv;
> >>         int len = nlmsg_total_size(sizeof(*ev));
> >> +       unsigned long flags;
> >> +
> >> +       spin_lock_irqsave(&connlock, flags);
> >> +       list_add(&conn->conn_list_err, &connlist_err);
> >> +       spin_unlock_irqrestore(&connlock, flags);
> >> +       queue_work(system_unbound_wq, &stop_conn_work);
> >>
> >>         priv = iscsi_if_transport_lookup(conn->transport);
> >>         if (!priv)
> >> @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev
> >>         if (!conn)
> >>                 return -EINVAL;
> >>
> >> +       if (!list_empty(&conn->conn_list_err))
> > Does this check need to be under connlock?
>
> My understanding is that it is not necessary, since it is serialized
> against the conn removal itself, through the rx_mutex, it seemed safe to
> do the verification lockless.
>
> It can only race with the insertion, in which case, it will be safely
> removed from the dispatch list here, under rx_mutex, and the worker will
> detect and skipped it.

My worry is the splice, which is under only connlock, not rx_mutex, which
might lead to UB if we're checking empty while modifying the list_head ?

>
> --
> Gabriel Krisman Bertazi
diff mbox series

Patch

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 271afea654e2..c6db6ded60a1 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -86,6 +86,12 @@  struct iscsi_internal {
 	struct transport_container session_cont;
 };
 
+/* Worker to perform connection failure on unresponsive connections
+ * completely in kernel space.
+ */
+static void stop_conn_work_fn(struct work_struct *work);
+static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
+
 static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
 static struct workqueue_struct *iscsi_eh_timer_workq;
 
@@ -1611,6 +1617,7 @@  static DEFINE_MUTEX(rx_queue_mutex);
 static LIST_HEAD(sesslist);
 static DEFINE_SPINLOCK(sesslock);
 static LIST_HEAD(connlist);
+static LIST_HEAD(connlist_err);
 static DEFINE_SPINLOCK(connlock);
 
 static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)
@@ -2247,6 +2254,7 @@  iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
 
 	mutex_init(&conn->ep_mutex);
 	INIT_LIST_HEAD(&conn->conn_list);
+	INIT_LIST_HEAD(&conn->conn_list_err);
 	conn->transport = transport;
 	conn->cid = cid;
 
@@ -2293,6 +2301,7 @@  int iscsi_destroy_conn(struct iscsi_cls_conn *conn)
 
 	spin_lock_irqsave(&connlock, flags);
 	list_del(&conn->conn_list);
+	list_del(&conn->conn_list_err);
 	spin_unlock_irqrestore(&connlock, flags);
 
 	transport_unregister_device(&conn->dev);
@@ -2407,6 +2416,51 @@  int iscsi_offload_mesg(struct Scsi_Host *shost,
 }
 EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
 
+static void stop_conn_work_fn(struct work_struct *work)
+{
+	struct iscsi_cls_conn *conn, *tmp;
+	unsigned long flags;
+	LIST_HEAD(recovery_list);
+
+	spin_lock_irqsave(&connlock, flags);
+	if (list_empty(&connlist_err)) {
+		spin_unlock_irqrestore(&connlock, flags);
+		return;
+	}
+	list_splice_init(&connlist_err, &recovery_list);
+	spin_unlock_irqrestore(&connlock, flags);
+
+	list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
+		uint32_t sid = iscsi_conn_get_sid(conn);
+		struct iscsi_cls_session *session;
+
+		mutex_lock(&rx_queue_mutex);
+
+		session = iscsi_session_lookup(sid);
+		if (session) {
+			if (system_state != SYSTEM_RUNNING) {
+				session->recovery_tmo = 0;
+				conn->transport->stop_conn(conn,
+							   STOP_CONN_TERM);
+			} else {
+				conn->transport->stop_conn(conn,
+							   STOP_CONN_RECOVER);
+			}
+		}
+
+		list_del_init(&conn->conn_list_err);
+
+		mutex_unlock(&rx_queue_mutex);
+
+		/* we don't want to hold rx_queue_mutex for too long,
+		 * for instance if many conns failed at the same time,
+		 * since this stall other iscsi maintenance operations.
+		 * Give other users a chance to proceed.
+		 */
+		cond_resched();
+	}
+}
+
 void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 {
 	struct nlmsghdr	*nlh;
@@ -2414,6 +2468,12 @@  void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
 	struct iscsi_uevent *ev;
 	struct iscsi_internal *priv;
 	int len = nlmsg_total_size(sizeof(*ev));
+	unsigned long flags;
+
+	spin_lock_irqsave(&connlock, flags);
+	list_add(&conn->conn_list_err, &connlist_err);
+	spin_unlock_irqrestore(&connlock, flags);
+	queue_work(system_unbound_wq, &stop_conn_work);
 
 	priv = iscsi_if_transport_lookup(conn->transport);
 	if (!priv)
@@ -2748,6 +2808,9 @@  iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev
 	if (!conn)
 		return -EINVAL;
 
+	if (!list_empty(&conn->conn_list_err))
+		return -EAGAIN;
+
 	ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n");
 	if (transport->destroy_conn)
 		transport->destroy_conn(conn);
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 325ae731d9ad..2129dc9e2dec 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -190,6 +190,7 @@  extern void iscsi_ping_comp_event(uint32_t host_no,
 
 struct iscsi_cls_conn {
 	struct list_head conn_list;	/* item in connlist */
+	struct list_head conn_list_err;	/* item in connlist_err */
 	void *dd_data;			/* LLD private data */
 	struct iscsi_transport *transport;
 	uint32_t cid;			/* connection id */