@@ -244,12 +244,15 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_DEVICE_REMOVAL:
pr_info("rpcrdma: removing device %s for %pISpc\n",
ep->re_id->device->name, sap);
- fallthrough;
+ ep->re_connect_status = -ENODEV;
+ wake_up_all(&ep->re_connect_wait);
+ goto disconnected;
case RDMA_CM_EVENT_ADDR_CHANGE:
ep->re_connect_status = -ENODEV;
goto disconnected;
case RDMA_CM_EVENT_ESTABLISHED:
rpcrdma_ep_get(ep);
+ ep->re_connect_ref = true;
ep->re_connect_status = 1;
rpcrdma_update_cm_private(ep, &event->param.conn);
trace_xprtrdma_inline_thresh(ep);
@@ -272,7 +275,9 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
ep->re_connect_status = -ECONNABORTED;
disconnected:
rpcrdma_force_disconnect(ep);
- return rpcrdma_ep_put(ep);
+ if (ep->re_connect_ref)
+ return rpcrdma_ep_put(ep);
+ return 0;
default:
break;
}
@@ -84,6 +84,7 @@ struct rpcrdma_ep {
unsigned int re_max_inline_recv;
int re_async_rc;
int re_connect_status;
+ bool re_connect_ref;
atomic_t re_receiving;
atomic_t re_force_disconnect;
struct ib_qp_init_attr re_attr;
In an IB device bonding setup, when bringing down one or all of the ports, we saw xprtrdma enter a non-recoverable state in which it is not even possible to complete the disconnect and shut down the mount, requiring a reboot. A DEVICE_REMOVAL event can arrive irrespective of whether the CM_ID is connected, so ESTABLISHED may not have happened; because the endpoint reference dropped on disconnect is only taken when ESTABLISHED is handled, we need to avoid the decref in that case, and also make sure the connect path is woken up. Fixes: 2acc5cae2923 ('xprtrdma: Prevent dereferencing r_xprt->rx_ep after it is freed') Signed-off-by: Dan Aloni <dan.aloni@vastdata.com> --- net/sunrpc/xprtrdma/verbs.c | 9 +++++++-- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-)