Message ID | 20191226204746.2197233-1-krisman@collabora.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [v3] iscsi: Perform connection failure entirely in kernel space | expand |
On 12/26/19 12:47 PM, Gabriel Krisman Bertazi wrote: > From: Bharath Ravi <rbharath@google.com> > > Connection failure processing depends on a daemon being present to (at > least) stop the connection and start recovery. This is a problem on a > multipath scenario, where if the daemon failed for whatever reason, the > SCSI path is never marked as down, multipath won't perform the > failover and IO to the device will be forever waiting for that > connection to come back. > > This patch performs the connection failure entirely inside the kernel. > This way, the failover can happen and pending IO can continue even if > the daemon is dead. Once the daemon comes alive again, it can execute > recovery procedures if applicable. > > Changes since v2: > - Don't hold rx_mutex for too long at once > > Changes since v1: > - Remove module parameter. > - Always do kernel-side stop work. > - Block recovery timeout handler if system is dying. > - send a CONN_TERM stop if the system is dying. > > Cc: Mike Christie <mchristi@redhat.com> > Cc: Lee Duncan <LDuncan@suse.com> > Cc: Bart Van Assche <bvanassche@acm.org> > Co-developed-by: Dave Clausen <dclausen@google.com> > Signed-off-by: Dave Clausen <dclausen@google.com> > Co-developed-by: Nick Black <nlb@google.com> > Signed-off-by: Nick Black <nlb@google.com> > Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com> > Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> > Co-developed-by: Anatol Pomazau <anatol@google.com> > Signed-off-by: Anatol Pomazau <anatol@google.com> > Co-developed-by: Tahsin Erdogan <tahsin@google.com> > Signed-off-by: Tahsin Erdogan <tahsin@google.com> > Co-developed-by: Frank Mayhar <fmayhar@google.com> > Signed-off-by: Frank Mayhar <fmayhar@google.com> > Co-developed-by: Junho Ryu <jayr@google.com> > Signed-off-by: Junho Ryu <jayr@google.com> > Co-developed-by: Khazhismel Kumykov <khazhy@google.com> > Signed-off-by: Khazhismel Kumykov <khazhy@google.com> > Signed-off-by: Bharath Ravi 
<rbharath@google.com> > Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com> > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com> > --- > drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++ > include/scsi/scsi_transport_iscsi.h | 1 + > 2 files changed, 64 insertions(+) > > diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c > index 271afea654e2..c6db6ded60a1 100644 > --- a/drivers/scsi/scsi_transport_iscsi.c > +++ b/drivers/scsi/scsi_transport_iscsi.c > @@ -86,6 +86,12 @@ struct iscsi_internal { > struct transport_container session_cont; > }; > > +/* Worker to perform connection failure on unresponsive connections > + * completely in kernel space. > + */ > +static void stop_conn_work_fn(struct work_struct *work); > +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn); > + > static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ > static struct workqueue_struct *iscsi_eh_timer_workq; > > @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex); > static LIST_HEAD(sesslist); > static DEFINE_SPINLOCK(sesslock); > static LIST_HEAD(connlist); > +static LIST_HEAD(connlist_err); > static DEFINE_SPINLOCK(connlock); > > static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn) > @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) > > mutex_init(&conn->ep_mutex); > INIT_LIST_HEAD(&conn->conn_list); > + INIT_LIST_HEAD(&conn->conn_list_err); > conn->transport = transport; > conn->cid = cid; > > @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn) > > spin_lock_irqsave(&connlock, flags); > list_del(&conn->conn_list); > + list_del(&conn->conn_list_err); > spin_unlock_irqrestore(&connlock, flags); > > transport_unregister_device(&conn->dev); > @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, > } > EXPORT_SYMBOL_GPL(iscsi_offload_mesg); > > +static void stop_conn_work_fn(struct 
work_struct *work) > +{ > + struct iscsi_cls_conn *conn, *tmp; > + unsigned long flags; > + LIST_HEAD(recovery_list); > + > + spin_lock_irqsave(&connlock, flags); > + if (list_empty(&connlist_err)) { > + spin_unlock_irqrestore(&connlock, flags); > + return; > + } > + list_splice_init(&connlist_err, &recovery_list); > + spin_unlock_irqrestore(&connlock, flags); > + > + list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) { > + uint32_t sid = iscsi_conn_get_sid(conn); > + struct iscsi_cls_session *session; > + > + mutex_lock(&rx_queue_mutex); > + > + session = iscsi_session_lookup(sid); > + if (session) { > + if (system_state != SYSTEM_RUNNING) { > + session->recovery_tmo = 0; > + conn->transport->stop_conn(conn, > + STOP_CONN_TERM); > + } else { > + conn->transport->stop_conn(conn, > + STOP_CONN_RECOVER); > + } > + } > + > + list_del_init(&conn->conn_list_err); > + > + mutex_unlock(&rx_queue_mutex); > + > + /* we don't want to hold rx_queue_mutex for too long, > + * for instance if many conns failed at the same time, > + * since this stall other iscsi maintenance operations. > + * Give other users a chance to proceed. 
> + */ > + cond_resched(); > + } > +} > + > void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) > { > struct nlmsghdr *nlh; > @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) > struct iscsi_uevent *ev; > struct iscsi_internal *priv; > int len = nlmsg_total_size(sizeof(*ev)); > + unsigned long flags; > + > + spin_lock_irqsave(&connlock, flags); > + list_add(&conn->conn_list_err, &connlist_err); > + spin_unlock_irqrestore(&connlock, flags); > + queue_work(system_unbound_wq, &stop_conn_work); > > priv = iscsi_if_transport_lookup(conn->transport); > if (!priv) > @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev > if (!conn) > return -EINVAL; > > + if (!list_empty(&conn->conn_list_err)) > + return -EAGAIN; > + > ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n"); > if (transport->destroy_conn) > transport->destroy_conn(conn); > diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h > index 325ae731d9ad..2129dc9e2dec 100644 > --- a/include/scsi/scsi_transport_iscsi.h > +++ b/include/scsi/scsi_transport_iscsi.h > @@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no, > > struct iscsi_cls_conn { > struct list_head conn_list; /* item in connlist */ > + struct list_head conn_list_err; /* item in connlist_err */ > void *dd_data; /* LLD private data */ > struct iscsi_transport *transport; > uint32_t cid; /* connection id */ > Reviewed-by: Lee Duncan <lduncan@suse.com>
On Thu, Dec 26, 2019 at 3:48 PM Gabriel Krisman Bertazi <krisman@collabora.com> wrote: > > From: Bharath Ravi <rbharath@google.com> > > Connection failure processing depends on a daemon being present to (at > least) stop the connection and start recovery. This is a problem on a > multipath scenario, where if the daemon failed for whatever reason, the > SCSI path is never marked as down, multipath won't perform the > failover and IO to the device will be forever waiting for that > connection to come back. > > This patch performs the connection failure entirely inside the kernel. > This way, the failover can happen and pending IO can continue even if > the daemon is dead. Once the daemon comes alive again, it can execute > recovery procedures if applicable. > > Changes since v2: > - Don't hold rx_mutex for too long at once > > Changes since v1: > - Remove module parameter. > - Always do kernel-side stop work. > - Block recovery timeout handler if system is dying. > - send a CONN_TERM stop if the system is dying. 
> > Cc: Mike Christie <mchristi@redhat.com> > Cc: Lee Duncan <LDuncan@suse.com> > Cc: Bart Van Assche <bvanassche@acm.org> > Co-developed-by: Dave Clausen <dclausen@google.com> > Signed-off-by: Dave Clausen <dclausen@google.com> > Co-developed-by: Nick Black <nlb@google.com> > Signed-off-by: Nick Black <nlb@google.com> > Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com> > Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> > Co-developed-by: Anatol Pomazau <anatol@google.com> > Signed-off-by: Anatol Pomazau <anatol@google.com> > Co-developed-by: Tahsin Erdogan <tahsin@google.com> > Signed-off-by: Tahsin Erdogan <tahsin@google.com> > Co-developed-by: Frank Mayhar <fmayhar@google.com> > Signed-off-by: Frank Mayhar <fmayhar@google.com> > Co-developed-by: Junho Ryu <jayr@google.com> > Signed-off-by: Junho Ryu <jayr@google.com> > Co-developed-by: Khazhismel Kumykov <khazhy@google.com> > Signed-off-by: Khazhismel Kumykov <khazhy@google.com> > Signed-off-by: Bharath Ravi <rbharath@google.com> > Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com> > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com> > --- > drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++ > include/scsi/scsi_transport_iscsi.h | 1 + > 2 files changed, 64 insertions(+) > > diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c > index 271afea654e2..c6db6ded60a1 100644 > --- a/drivers/scsi/scsi_transport_iscsi.c > +++ b/drivers/scsi/scsi_transport_iscsi.c > @@ -86,6 +86,12 @@ struct iscsi_internal { > struct transport_container session_cont; > }; > > +/* Worker to perform connection failure on unresponsive connections > + * completely in kernel space. 
> + */ > +static void stop_conn_work_fn(struct work_struct *work); > +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn); > + > static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ > static struct workqueue_struct *iscsi_eh_timer_workq; > > @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex); > static LIST_HEAD(sesslist); > static DEFINE_SPINLOCK(sesslock); > static LIST_HEAD(connlist); > +static LIST_HEAD(connlist_err); > static DEFINE_SPINLOCK(connlock); > > static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn) > @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) > > mutex_init(&conn->ep_mutex); > INIT_LIST_HEAD(&conn->conn_list); > + INIT_LIST_HEAD(&conn->conn_list_err); > conn->transport = transport; > conn->cid = cid; > > @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn) > > spin_lock_irqsave(&connlock, flags); > list_del(&conn->conn_list); > + list_del(&conn->conn_list_err); > spin_unlock_irqrestore(&connlock, flags); > > transport_unregister_device(&conn->dev); > @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, > } > EXPORT_SYMBOL_GPL(iscsi_offload_mesg); > > +static void stop_conn_work_fn(struct work_struct *work) > +{ > + struct iscsi_cls_conn *conn, *tmp; > + unsigned long flags; > + LIST_HEAD(recovery_list); > + > + spin_lock_irqsave(&connlock, flags); > + if (list_empty(&connlist_err)) { > + spin_unlock_irqrestore(&connlock, flags); > + return; > + } > + list_splice_init(&connlist_err, &recovery_list); > + spin_unlock_irqrestore(&connlock, flags); > + > + list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) { > + uint32_t sid = iscsi_conn_get_sid(conn); > + struct iscsi_cls_session *session; > + > + mutex_lock(&rx_queue_mutex); This worried me a bit, but it seems we won't destroy_conn while it's on the err list - cool. 
> + > + session = iscsi_session_lookup(sid); > + if (session) { > + if (system_state != SYSTEM_RUNNING) { > + session->recovery_tmo = 0; > + conn->transport->stop_conn(conn, > + STOP_CONN_TERM); > + } else { > + conn->transport->stop_conn(conn, > + STOP_CONN_RECOVER); > + } > + } > + > + list_del_init(&conn->conn_list_err); > + > + mutex_unlock(&rx_queue_mutex); > + > + /* we don't want to hold rx_queue_mutex for too long, > + * for instance if many conns failed at the same time, > + * since this stall other iscsi maintenance operations. > + * Give other users a chance to proceed. > + */ > + cond_resched(); > + } > +} > + > void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) > { > struct nlmsghdr *nlh; > @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) > struct iscsi_uevent *ev; > struct iscsi_internal *priv; > int len = nlmsg_total_size(sizeof(*ev)); > + unsigned long flags; > + > + spin_lock_irqsave(&connlock, flags); > + list_add(&conn->conn_list_err, &connlist_err); > + spin_unlock_irqrestore(&connlock, flags); > + queue_work(system_unbound_wq, &stop_conn_work); > > priv = iscsi_if_transport_lookup(conn->transport); > if (!priv) > @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev > if (!conn) > return -EINVAL; > > + if (!list_empty(&conn->conn_list_err)) Does this check need to be under connlock? 
> + return -EAGAIN; > + > ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n"); > if (transport->destroy_conn) > transport->destroy_conn(conn); > diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h > index 325ae731d9ad..2129dc9e2dec 100644 > --- a/include/scsi/scsi_transport_iscsi.h > +++ b/include/scsi/scsi_transport_iscsi.h > @@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no, > > struct iscsi_cls_conn { > struct list_head conn_list; /* item in connlist */ > + struct list_head conn_list_err; /* item in connlist_err */ > void *dd_data; /* LLD private data */ > struct iscsi_transport *transport; > uint32_t cid; /* connection id */ > -- > 2.24.1 >
Khazhismel Kumykov <khazhy@google.com> writes: > On Thu, Dec 26, 2019 at 3:48 PM Gabriel Krisman Bertazi > <krisman@collabora.com> wrote: >> >> From: Bharath Ravi <rbharath@google.com> >> >> Connection failure processing depends on a daemon being present to (at >> least) stop the connection and start recovery. This is a problem on a >> multipath scenario, where if the daemon failed for whatever reason, the >> SCSI path is never marked as down, multipath won't perform the >> failover and IO to the device will be forever waiting for that >> connection to come back. >> >> This patch performs the connection failure entirely inside the kernel. >> This way, the failover can happen and pending IO can continue even if >> the daemon is dead. Once the daemon comes alive again, it can execute >> recovery procedures if applicable. >> >> Changes since v2: >> - Don't hold rx_mutex for too long at once >> >> Changes since v1: >> - Remove module parameter. >> - Always do kernel-side stop work. >> - Block recovery timeout handler if system is dying. >> - send a CONN_TERM stop if the system is dying. 
>> >> Cc: Mike Christie <mchristi@redhat.com> >> Cc: Lee Duncan <LDuncan@suse.com> >> Cc: Bart Van Assche <bvanassche@acm.org> >> Co-developed-by: Dave Clausen <dclausen@google.com> >> Signed-off-by: Dave Clausen <dclausen@google.com> >> Co-developed-by: Nick Black <nlb@google.com> >> Signed-off-by: Nick Black <nlb@google.com> >> Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com> >> Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> >> Co-developed-by: Anatol Pomazau <anatol@google.com> >> Signed-off-by: Anatol Pomazau <anatol@google.com> >> Co-developed-by: Tahsin Erdogan <tahsin@google.com> >> Signed-off-by: Tahsin Erdogan <tahsin@google.com> >> Co-developed-by: Frank Mayhar <fmayhar@google.com> >> Signed-off-by: Frank Mayhar <fmayhar@google.com> >> Co-developed-by: Junho Ryu <jayr@google.com> >> Signed-off-by: Junho Ryu <jayr@google.com> >> Co-developed-by: Khazhismel Kumykov <khazhy@google.com> >> Signed-off-by: Khazhismel Kumykov <khazhy@google.com> >> Signed-off-by: Bharath Ravi <rbharath@google.com> >> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com> >> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com> >> --- >> drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++ >> include/scsi/scsi_transport_iscsi.h | 1 + >> 2 files changed, 64 insertions(+) >> >> diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c >> index 271afea654e2..c6db6ded60a1 100644 >> --- a/drivers/scsi/scsi_transport_iscsi.c >> +++ b/drivers/scsi/scsi_transport_iscsi.c >> @@ -86,6 +86,12 @@ struct iscsi_internal { >> struct transport_container session_cont; >> }; >> >> +/* Worker to perform connection failure on unresponsive connections >> + * completely in kernel space. 
>> + */ >> +static void stop_conn_work_fn(struct work_struct *work); >> +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn); >> + >> static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ >> static struct workqueue_struct *iscsi_eh_timer_workq; >> >> @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex); >> static LIST_HEAD(sesslist); >> static DEFINE_SPINLOCK(sesslock); >> static LIST_HEAD(connlist); >> +static LIST_HEAD(connlist_err); >> static DEFINE_SPINLOCK(connlock); >> >> static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn) >> @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) >> >> mutex_init(&conn->ep_mutex); >> INIT_LIST_HEAD(&conn->conn_list); >> + INIT_LIST_HEAD(&conn->conn_list_err); >> conn->transport = transport; >> conn->cid = cid; >> >> @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn) >> >> spin_lock_irqsave(&connlock, flags); >> list_del(&conn->conn_list); >> + list_del(&conn->conn_list_err); >> spin_unlock_irqrestore(&connlock, flags); >> >> transport_unregister_device(&conn->dev); >> @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, >> } >> EXPORT_SYMBOL_GPL(iscsi_offload_mesg); >> >> +static void stop_conn_work_fn(struct work_struct *work) >> +{ >> + struct iscsi_cls_conn *conn, *tmp; >> + unsigned long flags; >> + LIST_HEAD(recovery_list); >> + >> + spin_lock_irqsave(&connlock, flags); >> + if (list_empty(&connlist_err)) { >> + spin_unlock_irqrestore(&connlock, flags); >> + return; >> + } >> + list_splice_init(&connlist_err, &recovery_list); >> + spin_unlock_irqrestore(&connlock, flags); >> + >> + list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) { >> + uint32_t sid = iscsi_conn_get_sid(conn); >> + struct iscsi_cls_session *session; >> + >> + mutex_lock(&rx_queue_mutex); > This worried me a bit, but it seems we won't destroy_conn while it's > on the err list - cool. 
>> + >> + session = iscsi_session_lookup(sid); >> + if (session) { >> + if (system_state != SYSTEM_RUNNING) { >> + session->recovery_tmo = 0; >> + conn->transport->stop_conn(conn, >> + STOP_CONN_TERM); >> + } else { >> + conn->transport->stop_conn(conn, >> + STOP_CONN_RECOVER); >> + } >> + } >> + >> + list_del_init(&conn->conn_list_err); >> + >> + mutex_unlock(&rx_queue_mutex); >> + >> + /* we don't want to hold rx_queue_mutex for too long, >> + * for instance if many conns failed at the same time, >> + * since this stall other iscsi maintenance operations. >> + * Give other users a chance to proceed. >> + */ >> + cond_resched(); >> + } >> +} >> + >> void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) >> { >> struct nlmsghdr *nlh; >> @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) >> struct iscsi_uevent *ev; >> struct iscsi_internal *priv; >> int len = nlmsg_total_size(sizeof(*ev)); >> + unsigned long flags; >> + >> + spin_lock_irqsave(&connlock, flags); >> + list_add(&conn->conn_list_err, &connlist_err); >> + spin_unlock_irqrestore(&connlock, flags); >> + queue_work(system_unbound_wq, &stop_conn_work); >> >> priv = iscsi_if_transport_lookup(conn->transport); >> if (!priv) >> @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev >> if (!conn) >> return -EINVAL; >> >> + if (!list_empty(&conn->conn_list_err)) > Does this check need to be under connlock? My understanding is that it is not necessary: since the check is serialized against the conn removal itself through the rx_mutex, it seemed safe to do the verification lockless. It can only race with the insertion, in which case it will be safely removed from the dispatch list here, under rx_mutex, and the worker will detect and skip it.
On Thu, Jan 2, 2020 at 1:13 PM Gabriel Krisman Bertazi <krisman@collabora.com> wrote: > > Khazhismel Kumykov <khazhy@google.com> writes: > > > On Thu, Dec 26, 2019 at 3:48 PM Gabriel Krisman Bertazi > > <krisman@collabora.com> wrote: > >> > >> From: Bharath Ravi <rbharath@google.com> > >> > >> Connection failure processing depends on a daemon being present to (at > >> least) stop the connection and start recovery. This is a problem on a > >> multipath scenario, where if the daemon failed for whatever reason, the > >> SCSI path is never marked as down, multipath won't perform the > >> failover and IO to the device will be forever waiting for that > >> connection to come back. > >> > >> This patch performs the connection failure entirely inside the kernel. > >> This way, the failover can happen and pending IO can continue even if > >> the daemon is dead. Once the daemon comes alive again, it can execute > >> recovery procedures if applicable. > >> > >> Changes since v2: > >> - Don't hold rx_mutex for too long at once > >> > >> Changes since v1: > >> - Remove module parameter. > >> - Always do kernel-side stop work. > >> - Block recovery timeout handler if system is dying. > >> - send a CONN_TERM stop if the system is dying. 
> >> > >> Cc: Mike Christie <mchristi@redhat.com> > >> Cc: Lee Duncan <LDuncan@suse.com> > >> Cc: Bart Van Assche <bvanassche@acm.org> > >> Co-developed-by: Dave Clausen <dclausen@google.com> > >> Signed-off-by: Dave Clausen <dclausen@google.com> > >> Co-developed-by: Nick Black <nlb@google.com> > >> Signed-off-by: Nick Black <nlb@google.com> > >> Co-developed-by: Vaibhav Nagarnaik <vnagarnaik@google.com> > >> Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> > >> Co-developed-by: Anatol Pomazau <anatol@google.com> > >> Signed-off-by: Anatol Pomazau <anatol@google.com> > >> Co-developed-by: Tahsin Erdogan <tahsin@google.com> > >> Signed-off-by: Tahsin Erdogan <tahsin@google.com> > >> Co-developed-by: Frank Mayhar <fmayhar@google.com> > >> Signed-off-by: Frank Mayhar <fmayhar@google.com> > >> Co-developed-by: Junho Ryu <jayr@google.com> > >> Signed-off-by: Junho Ryu <jayr@google.com> > >> Co-developed-by: Khazhismel Kumykov <khazhy@google.com> > >> Signed-off-by: Khazhismel Kumykov <khazhy@google.com> > >> Signed-off-by: Bharath Ravi <rbharath@google.com> > >> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com> > >> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com> > >> --- > >> drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++ > >> include/scsi/scsi_transport_iscsi.h | 1 + > >> 2 files changed, 64 insertions(+) > >> > >> diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c > >> index 271afea654e2..c6db6ded60a1 100644 > >> --- a/drivers/scsi/scsi_transport_iscsi.c > >> +++ b/drivers/scsi/scsi_transport_iscsi.c > >> @@ -86,6 +86,12 @@ struct iscsi_internal { > >> struct transport_container session_cont; > >> }; > >> > >> +/* Worker to perform connection failure on unresponsive connections > >> + * completely in kernel space. 
> >> + */ > >> +static void stop_conn_work_fn(struct work_struct *work); > >> +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn); > >> + > >> static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ > >> static struct workqueue_struct *iscsi_eh_timer_workq; > >> > >> @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex); > >> static LIST_HEAD(sesslist); > >> static DEFINE_SPINLOCK(sesslock); > >> static LIST_HEAD(connlist); > >> +static LIST_HEAD(connlist_err); > >> static DEFINE_SPINLOCK(connlock); > >> > >> static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn) > >> @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) > >> > >> mutex_init(&conn->ep_mutex); > >> INIT_LIST_HEAD(&conn->conn_list); > >> + INIT_LIST_HEAD(&conn->conn_list_err); > >> conn->transport = transport; > >> conn->cid = cid; > >> > >> @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn) > >> > >> spin_lock_irqsave(&connlock, flags); > >> list_del(&conn->conn_list); > >> + list_del(&conn->conn_list_err); > >> spin_unlock_irqrestore(&connlock, flags); > >> > >> transport_unregister_device(&conn->dev); > >> @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, > >> } > >> EXPORT_SYMBOL_GPL(iscsi_offload_mesg); > >> > >> +static void stop_conn_work_fn(struct work_struct *work) > >> +{ > >> + struct iscsi_cls_conn *conn, *tmp; > >> + unsigned long flags; > >> + LIST_HEAD(recovery_list); > >> + > >> + spin_lock_irqsave(&connlock, flags); > >> + if (list_empty(&connlist_err)) { > >> + spin_unlock_irqrestore(&connlock, flags); > >> + return; > >> + } > >> + list_splice_init(&connlist_err, &recovery_list); > >> + spin_unlock_irqrestore(&connlock, flags); > >> + > >> + list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) { > >> + uint32_t sid = iscsi_conn_get_sid(conn); > >> + struct iscsi_cls_session *session; > >> + > >> + mutex_lock(&rx_queue_mutex); > > This 
worried me a bit, but it seems we won't destroy_conn while it's > > on the err list - cool. > >> + > >> + session = iscsi_session_lookup(sid); > >> + if (session) { > >> + if (system_state != SYSTEM_RUNNING) { > >> + session->recovery_tmo = 0; > >> + conn->transport->stop_conn(conn, > >> + STOP_CONN_TERM); > >> + } else { > >> + conn->transport->stop_conn(conn, > >> + STOP_CONN_RECOVER); > >> + } > >> + } > >> + > >> + list_del_init(&conn->conn_list_err); > >> + > >> + mutex_unlock(&rx_queue_mutex); > >> + > >> + /* we don't want to hold rx_queue_mutex for too long, > >> + * for instance if many conns failed at the same time, > >> + * since this stall other iscsi maintenance operations. > >> + * Give other users a chance to proceed. > >> + */ > >> + cond_resched(); > >> + } > >> +} > >> + > >> void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) > >> { > >> struct nlmsghdr *nlh; > >> @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) > >> struct iscsi_uevent *ev; > >> struct iscsi_internal *priv; > >> int len = nlmsg_total_size(sizeof(*ev)); > >> + unsigned long flags; > >> + > >> + spin_lock_irqsave(&connlock, flags); > >> + list_add(&conn->conn_list_err, &connlist_err); > >> + spin_unlock_irqrestore(&connlock, flags); > >> + queue_work(system_unbound_wq, &stop_conn_work); > >> > >> priv = iscsi_if_transport_lookup(conn->transport); > >> if (!priv) > >> @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev > >> if (!conn) > >> return -EINVAL; > >> > >> + if (!list_empty(&conn->conn_list_err)) > > Does this check need to be under connlock? > > My understanding is that it is not necessary, since it is serialized > against the conn removal itself, through the rx_mutex, it seemed safe to > do the verification lockless. 
> > It can only race with the insertion, in which case, it will be safely > > removed from the dispatch list here, under rx_mutex, and the worker will > > detect and skipped it. My worry is the splice, which is done under connlock only, not rx_mutex: could that lead to undefined behavior if we are checking list_empty() while the list_head is being modified? > > -- > Gabriel Krisman Bertazi
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 271afea654e2..c6db6ded60a1 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -86,6 +86,12 @@ struct iscsi_internal { struct transport_container session_cont; }; +/* Worker to perform connection failure on unresponsive connections + * completely in kernel space. + */ +static void stop_conn_work_fn(struct work_struct *work); +static DECLARE_WORK(stop_conn_work, stop_conn_work_fn); + static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ static struct workqueue_struct *iscsi_eh_timer_workq; @@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex); static LIST_HEAD(sesslist); static DEFINE_SPINLOCK(sesslock); static LIST_HEAD(connlist); +static LIST_HEAD(connlist_err); static DEFINE_SPINLOCK(connlock); static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn) @@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) mutex_init(&conn->ep_mutex); INIT_LIST_HEAD(&conn->conn_list); + INIT_LIST_HEAD(&conn->conn_list_err); conn->transport = transport; conn->cid = cid; @@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn) spin_lock_irqsave(&connlock, flags); list_del(&conn->conn_list); + list_del(&conn->conn_list_err); spin_unlock_irqrestore(&connlock, flags); transport_unregister_device(&conn->dev); @@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, } EXPORT_SYMBOL_GPL(iscsi_offload_mesg); +static void stop_conn_work_fn(struct work_struct *work) +{ + struct iscsi_cls_conn *conn, *tmp; + unsigned long flags; + LIST_HEAD(recovery_list); + + spin_lock_irqsave(&connlock, flags); + if (list_empty(&connlist_err)) { + spin_unlock_irqrestore(&connlock, flags); + return; + } + list_splice_init(&connlist_err, &recovery_list); + spin_unlock_irqrestore(&connlock, flags); + + list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) 
{ + uint32_t sid = iscsi_conn_get_sid(conn); + struct iscsi_cls_session *session; + + mutex_lock(&rx_queue_mutex); + + session = iscsi_session_lookup(sid); + if (session) { + if (system_state != SYSTEM_RUNNING) { + session->recovery_tmo = 0; + conn->transport->stop_conn(conn, + STOP_CONN_TERM); + } else { + conn->transport->stop_conn(conn, + STOP_CONN_RECOVER); + } + } + + list_del_init(&conn->conn_list_err); + + mutex_unlock(&rx_queue_mutex); + + /* we don't want to hold rx_queue_mutex for too long, + * for instance if many conns failed at the same time, + * since this stall other iscsi maintenance operations. + * Give other users a chance to proceed. + */ + cond_resched(); + } +} + void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) { struct nlmsghdr *nlh; @@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) struct iscsi_uevent *ev; struct iscsi_internal *priv; int len = nlmsg_total_size(sizeof(*ev)); + unsigned long flags; + + spin_lock_irqsave(&connlock, flags); + list_add(&conn->conn_list_err, &connlist_err); + spin_unlock_irqrestore(&connlock, flags); + queue_work(system_unbound_wq, &stop_conn_work); priv = iscsi_if_transport_lookup(conn->transport); if (!priv) @@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev if (!conn) return -EINVAL; + if (!list_empty(&conn->conn_list_err)) + return -EAGAIN; + ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n"); if (transport->destroy_conn) transport->destroy_conn(conn); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 325ae731d9ad..2129dc9e2dec 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no, struct iscsi_cls_conn { struct list_head conn_list; /* item in connlist */ + struct list_head conn_list_err; /* item in connlist_err */ void 
*dd_data; /* LLD private data */ struct iscsi_transport *transport; uint32_t cid; /* connection id */