Message ID | 1444568298-17289-2-git-send-email-matanb@mellanox.com (mailing list archive) |
---|---|
State | Deferred |
Headers | show |
Looks good, just one doubt inline: On Sun, Oct 11, 2015 at 6:28 PM, Matan Barak <matanb@mellanox.com> wrote: > When IP based addressing was introduced, ib_create_ah_from_wc was > changed in order to support a suitable AH. Since this AH should > now contains the DMAC (which isn't a simple derivative of the GID). > In order to find the DMAC, an ARP should sometime be sent. This ARP > is a sleeping context. > > ib_create_ah_from_wc is called from cm_alloc_response_msg, which is > sometimes called from an atomic context. This caused a > sleeping-while-atomic bug. Fixing this by splitting > cm_alloc_response_msg to an atomic and sleep-able part. When > cm_alloc_response_msg is used in an atomic context, we try to create > the AH before entering the atomic context. > > Fixes: 66bd20a72d2f ('IB/core: Ethernet L2 attributes in verbs/cm structures') > Signed-off-by: Matan Barak <matanb@mellanox.com> > --- > > Hi Doug, > This patch fixes an old bug in the CM. IP based addressing requires > ARP resolution which isn't sleep-able by its nature. This resolution > was sometimes done in non sleep-able context. Our regression tests > picked up this bug and verified this fix. > > Thanks, > Matan > > drivers/infiniband/core/cm.c | 60 ++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 49 insertions(+), 11 deletions(-) > > diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c > index ea4db9c..f5cf1c4 100644 > --- a/drivers/infiniband/core/cm.c > +++ b/drivers/infiniband/core/cm.c > @@ -287,17 +287,12 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, > return 0; > } > > -static int cm_alloc_response_msg(struct cm_port *port, > - struct ib_mad_recv_wc *mad_recv_wc, > - struct ib_mad_send_buf **msg) > +static int _cm_alloc_response_msg(struct cm_port *port, > + struct ib_mad_recv_wc *mad_recv_wc, > + struct ib_ah *ah, > + struct ib_mad_send_buf **msg) > { > struct ib_mad_send_buf *m; > - struct ib_ah *ah; > - > - ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, > - mad_recv_wc->recv_buf.grh, port->port_num); > - if (IS_ERR(ah)) > - return PTR_ERR(ah); > > m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, > 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, > @@ -312,6 +307,20 @@ static int cm_alloc_response_msg(struct cm_port *port, > return 0; > } > > +static int cm_alloc_response_msg(struct cm_port *port, > + struct ib_mad_recv_wc *mad_recv_wc, > + struct ib_mad_send_buf **msg) > +{ > + struct ib_ah *ah; > + > + ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, > + mad_recv_wc->recv_buf.grh, port->port_num); > + if (IS_ERR(ah)) > + return PTR_ERR(ah); > + > + return _cm_alloc_response_msg(port, mad_recv_wc, ah, msg); > +} > + > static void cm_free_msg(struct ib_mad_send_buf *msg) > { > ib_destroy_ah(msg->ah); > @@ -2201,6 +2210,7 @@ static int cm_dreq_handler(struct cm_work *work) > struct cm_id_private *cm_id_priv; > struct cm_dreq_msg *dreq_msg; > struct ib_mad_send_buf *msg = NULL; > + struct ib_ah *ah; > int ret; > > dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; > @@ -2213,6 +2223,11 @@ static int cm_dreq_handler(struct cm_work *work) > return -EINVAL; > } > > + ah = ib_create_ah_from_wc(work->port->mad_agent->qp->pd, > + work->mad_recv_wc->wc, > + work->mad_recv_wc->recv_buf.grh, > + work->port->port_num); > + Shouldn't below IS_ERR(ah) on ah be here, instead of there? > work->cm_event.private_data = &dreq_msg->private_data; > > spin_lock_irq(&cm_id_priv->lock); > @@ -2234,9 +2249,13 @@ static int cm_dreq_handler(struct cm_work *work) > case IB_CM_TIMEWAIT: > atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. > counter[CM_DREQ_COUNTER]); > - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) > + if (IS_ERR(ah)) > + goto unlock; > + if (_cm_alloc_response_msg(work->port, work->mad_recv_wc, ah, > + &msg)) > goto unlock; > > + ah = NULL; > cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, > cm_id_priv->private_data, > cm_id_priv->private_data_len); > @@ -2259,6 +2278,8 @@ static int cm_dreq_handler(struct cm_work *work) > list_add_tail(&work->list, &cm_id_priv->work_list); > spin_unlock_irq(&cm_id_priv->lock); > > + if (!IS_ERR_OR_NULL(ah)) > + ib_destroy_ah(ah); > if (ret) > cm_process_work(cm_id_priv, work); > else > @@ -2266,6 +2287,8 @@ static int cm_dreq_handler(struct cm_work *work) > return 0; > > unlock: spin_unlock_irq(&cm_id_priv->lock); > + if (!IS_ERR_OR_NULL(ah)) > + ib_destroy_ah(ah); > deref: cm_deref_id(cm_id_priv); > return -EINVAL; > } > @@ -2761,6 +2784,7 @@ static int cm_lap_handler(struct cm_work *work) > struct cm_lap_msg *lap_msg; > struct ib_cm_lap_event_param *param; > struct ib_mad_send_buf *msg = NULL; > + struct ib_ah *ah; > int ret; > > /* todo: verify LAP request and send reject APR if invalid. */ > @@ -2770,6 +2794,12 @@ static int cm_lap_handler(struct cm_work *work) > if (!cm_id_priv) > return -EINVAL; > > + > + ah = ib_create_ah_from_wc(work->port->mad_agent->qp->pd, > + work->mad_recv_wc->wc, > + work->mad_recv_wc->recv_buf.grh, > + work->port->port_num); > + > param = &work->cm_event.param.lap_rcvd; > param->alternate_path = &work->path[0]; > cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); > @@ -2786,9 +2816,13 @@ static int cm_lap_handler(struct cm_work *work) > case IB_CM_MRA_LAP_SENT: > atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. > counter[CM_LAP_COUNTER]); > - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) > + if (IS_ERR(ah)) > + goto unlock; > + if (_cm_alloc_response_msg(work->port, work->mad_recv_wc, ah, > + &msg)) > goto unlock; > > + ah = NULL; > cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, > CM_MSG_RESPONSE_OTHER, > cm_id_priv->service_timeout, > @@ -2818,6 +2852,8 @@ static int cm_lap_handler(struct cm_work *work) > list_add_tail(&work->list, &cm_id_priv->work_list); > spin_unlock_irq(&cm_id_priv->lock); > > + if (!IS_ERR_OR_NULL(ah)) > + ib_destroy_ah(ah); > if (ret) > cm_process_work(cm_id_priv, work); > else > @@ -2825,6 +2861,8 @@ static int cm_lap_handler(struct cm_work *work) > return 0; > > unlock: spin_unlock_irq(&cm_id_priv->lock); > + if (!IS_ERR_OR_NULL(ah)) > + ib_destroy_ah(ah); > deref: cm_deref_id(cm_id_priv); > return -EINVAL; > } > -- > 2.1.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> When IP based addressing was introduced, ib_create_ah_from_wc was > changed in order to support a suitable AH. Since this AH should > now contains the DMAC (which isn't a simple derivative of the GID). > In order to find the DMAC, an ARP should sometime be sent. This ARP > is a sleeping context. Wait - are you saying that the CM may now be waiting for an ARP response before it can send a message? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Oct 12, 2015 at 7:42 PM, Hefty, Sean <sean.hefty@intel.com> wrote: >> When IP based addressing was introduced, ib_create_ah_from_wc was >> changed in order to support a suitable AH. Since this AH should >> now contains the DMAC (which isn't a simple derivative of the GID). >> In order to find the DMAC, an ARP should sometime be sent. This ARP >> is a sleeping context. > > Wait - are you saying that the CM may now be waiting for an ARP response before it can send a message? > ib_create_ah_from_wc needs to resolve the DMAC in order to create the AH (this may result sending an ARP and waiting for response). CM uses this function (which is now sleepable). > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/12/2015 3:59 PM, Devesh Sharma wrote: > Looks good, just one doubt inline: > Thanks for looking at this patch. > On Sun, Oct 11, 2015 at 6:28 PM, Matan Barak <matanb@mellanox.com> wrote: >> When IP based addressing was introduced, ib_create_ah_from_wc was >> changed in order to support a suitable AH. Since this AH should >> now contains the DMAC (which isn't a simple derivative of the GID). >> In order to find the DMAC, an ARP should sometime be sent. This ARP >> is a sleeping context. >> >> ib_create_ah_from_wc is called from cm_alloc_response_msg, which is >> sometimes called from an atomic context. This caused a >> sleeping-while-atomic bug. Fixing this by splitting >> cm_alloc_response_msg to an atomic and sleep-able part. When >> cm_alloc_response_msg is used in an atomic context, we try to create >> the AH before entering the atomic context. >> >> Fixes: 66bd20a72d2f ('IB/core: Ethernet L2 attributes in verbs/cm structures') >> Signed-off-by: Matan Barak <matanb@mellanox.com> >> --- >> >> Hi Doug, >> This patch fixes an old bug in the CM. IP based addressing requires >> ARP resolution which isn't sleep-able by its nature. This resolution >> was sometimes done in non sleep-able context. Our regression tests >> picked up this bug and verified this fix. >> >> Thanks, >> Matan >> >> drivers/infiniband/core/cm.c | 60 ++++++++++++++++++++++++++++++++++++-------- >> 1 file changed, 49 insertions(+), 11 deletions(-) >> >> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c >> index ea4db9c..f5cf1c4 100644 >> --- a/drivers/infiniband/core/cm.c >> +++ b/drivers/infiniband/core/cm.c >> @@ -287,17 +287,12 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, >> return 0; >> } >> >> -static int cm_alloc_response_msg(struct cm_port *port, >> - struct ib_mad_recv_wc *mad_recv_wc, >> - struct ib_mad_send_buf **msg) >> +static int _cm_alloc_response_msg(struct cm_port *port, >> + struct ib_mad_recv_wc *mad_recv_wc, >> + struct ib_ah *ah, >> + struct ib_mad_send_buf **msg) >> { >> struct ib_mad_send_buf *m; >> - struct ib_ah *ah; >> - >> - ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, >> - mad_recv_wc->recv_buf.grh, port->port_num); >> - if (IS_ERR(ah)) >> - return PTR_ERR(ah); >> >> m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, >> 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, >> @@ -312,6 +307,20 @@ static int cm_alloc_response_msg(struct cm_port *port, >> return 0; >> } >> >> +static int cm_alloc_response_msg(struct cm_port *port, >> + struct ib_mad_recv_wc *mad_recv_wc, >> + struct ib_mad_send_buf **msg) >> +{ >> + struct ib_ah *ah; >> + >> + ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, >> + mad_recv_wc->recv_buf.grh, port->port_num); >> + if (IS_ERR(ah)) >> + return PTR_ERR(ah); >> + >> + return _cm_alloc_response_msg(port, mad_recv_wc, ah, msg); >> +} >> + >> static void cm_free_msg(struct ib_mad_send_buf *msg) >> { >> ib_destroy_ah(msg->ah); >> @@ -2201,6 +2210,7 @@ static int cm_dreq_handler(struct cm_work *work) >> struct cm_id_private *cm_id_priv; >> struct cm_dreq_msg *dreq_msg; >> struct ib_mad_send_buf *msg = NULL; >> + struct ib_ah *ah; >> int ret; >> >> dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; >> @@ -2213,6 +2223,11 @@ static int cm_dreq_handler(struct cm_work *work) >> return -EINVAL; >> } >> >> + ah = ib_create_ah_from_wc(work->port->mad_agent->qp->pd, >> + work->mad_recv_wc->wc, >> + work->mad_recv_wc->recv_buf.grh, >> + work->port->port_num); >> + > > Shouldn't below IS_ERR(ah) on ah be here, instead of there? > I don't think we want to fail on error if the state != IB_CM_TIMEWAIT (other states don't use this ah). >> work->cm_event.private_data = &dreq_msg->private_data; >> >> spin_lock_irq(&cm_id_priv->lock); >> @@ -2234,9 +2249,13 @@ static int cm_dreq_handler(struct cm_work *work) >> case IB_CM_TIMEWAIT: >> atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. >> counter[CM_DREQ_COUNTER]); >> - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) >> + if (IS_ERR(ah)) >> + goto unlock; >> + if (_cm_alloc_response_msg(work->port, work->mad_recv_wc, ah, >> + &msg)) >> goto unlock; >> >> + ah = NULL; >> cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, >> cm_id_priv->private_data, >> cm_id_priv->private_data_len); >> @@ -2259,6 +2278,8 @@ static int cm_dreq_handler(struct cm_work *work) >> list_add_tail(&work->list, &cm_id_priv->work_list); >> spin_unlock_irq(&cm_id_priv->lock); >> >> + if (!IS_ERR_OR_NULL(ah)) >> + ib_destroy_ah(ah); >> if (ret) >> cm_process_work(cm_id_priv, work); >> else >> @@ -2266,6 +2287,8 @@ static int cm_dreq_handler(struct cm_work *work) >> return 0; >> >> unlock: spin_unlock_irq(&cm_id_priv->lock); >> + if (!IS_ERR_OR_NULL(ah)) >> + ib_destroy_ah(ah); >> deref: cm_deref_id(cm_id_priv); >> return -EINVAL; >> } >> @@ -2761,6 +2784,7 @@ static int cm_lap_handler(struct cm_work *work) >> struct cm_lap_msg *lap_msg; >> struct ib_cm_lap_event_param *param; >> struct ib_mad_send_buf *msg = NULL; >> + struct ib_ah *ah; >> int ret; >> >> /* todo: verify LAP request and send reject APR if invalid. */ >> @@ -2770,6 +2794,12 @@ static int cm_lap_handler(struct cm_work *work) >> if (!cm_id_priv) >> return -EINVAL; >> >> + >> + ah = ib_create_ah_from_wc(work->port->mad_agent->qp->pd, >> + work->mad_recv_wc->wc, >> + work->mad_recv_wc->recv_buf.grh, >> + work->port->port_num); >> + >> param = &work->cm_event.param.lap_rcvd; >> param->alternate_path = &work->path[0]; >> cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); >> @@ -2786,9 +2816,13 @@ static int cm_lap_handler(struct cm_work *work) >> case IB_CM_MRA_LAP_SENT: >> atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. >> counter[CM_LAP_COUNTER]); >> - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) >> + if (IS_ERR(ah)) >> + goto unlock; >> + if (_cm_alloc_response_msg(work->port, work->mad_recv_wc, ah, >> + &msg)) >> goto unlock; >> >> + ah = NULL; >> cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, >> CM_MSG_RESPONSE_OTHER, >> cm_id_priv->service_timeout, >> @@ -2818,6 +2852,8 @@ static int cm_lap_handler(struct cm_work *work) >> list_add_tail(&work->list, &cm_id_priv->work_list); >> spin_unlock_irq(&cm_id_priv->lock); >> >> + if (!IS_ERR_OR_NULL(ah)) >> + ib_destroy_ah(ah); >> if (ret) >> cm_process_work(cm_id_priv, work); >> else >> @@ -2825,6 +2861,8 @@ static int cm_lap_handler(struct cm_work *work) >> return 0; >> >> unlock: spin_unlock_irq(&cm_id_priv->lock); >> + if (!IS_ERR_OR_NULL(ah)) >> + ib_destroy_ah(ah); >> deref: cm_deref_id(cm_id_priv); >> return -EINVAL; >> } >> -- >> 2.1.0 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> On Mon, Oct 12, 2015 at 7:42 PM, Hefty, Sean <sean.hefty@intel.com> wrote: > >> When IP based addressing was introduced, ib_create_ah_from_wc was > >> changed in order to support a suitable AH. Since this AH should > >> now contains the DMAC (which isn't a simple derivative of the GID). > >> In order to find the DMAC, an ARP should sometime be sent. This ARP > >> is a sleeping context. > > > > Wait - are you saying that the CM may now be waiting for an ARP response > before it can send a message? > > > > ib_create_ah_from_wc needs to resolve the DMAC in order to create the > AH (this may result sending an ARP and waiting for response). > CM uses this function (which is now sleepable). This is a significant change to the CM. The CM calls are invoked assuming that they return relatively quickly. They're invoked from callbacks and internally. Having the calls now wait for an ARP response requires that this be re-architected, so the calling thread doesn't go out to lunch for several seconds. - Sean
On Tue, Oct 13, 2015 at 7:18 PM, Hefty, Sean <sean.hefty@intel.com> wrote: >> On Mon, Oct 12, 2015 at 7:42 PM, Hefty, Sean <sean.hefty@intel.com> wrote: >> >> When IP based addressing was introduced, ib_create_ah_from_wc was >> >> changed in order to support a suitable AH. Since this AH should >> >> now contains the DMAC (which isn't a simple derivative of the GID). >> >> In order to find the DMAC, an ARP should sometime be sent. This ARP >> >> is a sleeping context. >> > >> > Wait - are you saying that the CM may now be waiting for an ARP response >> before it can send a message? >> > >> >> ib_create_ah_from_wc needs to resolve the DMAC in order to create the >> AH (this may result sending an ARP and waiting for response). >> CM uses this function (which is now sleepable). > > This is a significant change to the CM. The CM calls are invoked assuming that they return relatively quickly. They're invoked from callbacks and internally. Having the calls now wait for an ARP response requires that this be re-architected, so the calling thread doesn't go out to lunch for several seconds. Agree - this is a significant change, but it was done a long time ago (at v4.3 if I recall). When we need to send a message we need to figure out the destination MAC. Even the passive side needs to do that as some vendors don't report the source MAC of the packet in their wc. Even if they did, since IP based addressing is rout-able by its nature, it should follow the networking stack rules. Some crazy configurations could force sending responses to packets that came from router1 to router2 - so we have no choice than resolving the DMAC at every side. > > - Sean Matan -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> >> ib_create_ah_from_wc needs to resolve the DMAC in order to create the > >> AH (this may result sending an ARP and waiting for response). > >> CM uses this function (which is now sleepable). > > > > This is a significant change to the CM. The CM calls are invoked > assuming that they return relatively quickly. They're invoked from > callbacks and internally. Having the calls now wait for an ARP response > requires that this be re-architected, so the calling thread doesn't go out > to lunch for several seconds. > > Agree - this is a significant change, but it was done a long time ago > (at v4.3 if I recall). When we need to send a message we need to We're at 4.3-rc5? > figure out the destination MAC. Even the passive side needs to do that > as some vendors don't report the source MAC of the packet in their wc. > Even if they did, since IP based addressing is rout-able by its > nature, it should follow the networking stack rules. Some crazy > configurations could force sending responses to packets that came from > router1 to router2 - so we have no choice than resolving the DMAC at > every side. Ib_create_ah_from_wc is broken. It is now an asynchronous operation, only the call itself was left as synchronous. We can't block kernel threads for a minute, or however long ARP takes to resolve. The call itself must change to be async, and all users of it updated to allocate some request, queue it, and handle all race conditions that result -- such as state changes or destruction of the work that caused the request to be initiated.
On Thu, Oct 15, 2015 at 7:58 PM, Hefty, Sean <sean.hefty@intel.com> wrote: >> >> ib_create_ah_from_wc needs to resolve the DMAC in order to create the >> >> AH (this may result sending an ARP and waiting for response). >> >> CM uses this function (which is now sleepable). >> > >> > This is a significant change to the CM. The CM calls are invoked >> assuming that they return relatively quickly. They're invoked from >> callbacks and internally. Having the calls now wait for an ARP response >> requires that this be re-architected, so the calling thread doesn't go out >> to lunch for several seconds. >> >> Agree - this is a significant change, but it was done a long time ago >> (at v4.3 if I recall). When we need to send a message we need to > > We're at 4.3-rc5? > Sorry, meant v3.14. >> figure out the destination MAC. Even the passive side needs to do that >> as some vendors don't report the source MAC of the packet in their wc. >> Even if they did, since IP based addressing is rout-able by its >> nature, it should follow the networking stack rules. Some crazy >> configurations could force sending responses to packets that came from >> router1 to router2 - so we have no choice than resolving the DMAC at >> every side. > > Ib_create_ah_from_wc is broken. It is now an asynchronous operation, only the call itself was left as synchronous. We can't block kernel threads for a minute, or however long ARP takes to resolve. The call itself must change to be async, and all users of it updated to allocate some request, queue it, and handle all race conditions that result -- such as state changes or destruction of the work that caused the request to be initiated. Today, cm assumes paths are reversible primary_path->reversible = 1. That's true for both IB and RoCE. We could say vendors must report the SMAC in WC and then ib_create_ah_from_wc will be atomic (for these cases). If we wish to lift these limitations, we need to make ib_create_ah_from_wc asynchronous, but that's true even prior the RoCE IP based addressing patch. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Today, cm assumes paths are reversible primary_path->reversible = 1.
I can't quickly find a link, but I believe all CM MADs are reversible, per the spec.
On 10/20/2015 11:57 AM, Hefty, Sean wrote: >> Today, cm assumes paths are reversible primary_path->reversible = 1. > > I can't quickly find a link, but I believe all CM MADs are reversible, per the spec. Spec citation for this is: C12-5.1.3: All responses generated by the CM protocol shall follow the rules for response generation that are enumerated in 13.5.4 Response Generation and Reversible Paths. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Oct 20, 2015 at 03:57:54PM +0000, Hefty, Sean wrote: > > Today, cm assumes paths are reversible primary_path->reversible = 1. > > I can't quickly find a link, but I believe all CM MADs are > reversible, per the spec. But the Linux CM code doesn't always create the reverse CM MAD from the GMP headers, it sometimes will do it by looking into the data path in the MAD, which means it still could need to sleep to do the MAC resolution. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/15/2015 12:58 PM, Hefty, Sean wrote: >>>> ib_create_ah_from_wc needs to resolve the DMAC in order to create the >>>> AH (this may result sending an ARP and waiting for response). >>>> CM uses this function (which is now sleepable). >>> >>> This is a significant change to the CM. The CM calls are invoked >> assuming that they return relatively quickly. They're invoked from >> callbacks and internally. Having the calls now wait for an ARP response >> requires that this be re-architected, so the calling thread doesn't go out >> to lunch for several seconds. >> >> Agree - this is a significant change, but it was done a long time ago >> (at v4.3 if I recall). When we need to send a message we need to > > We're at 4.3-rc5? > >> figure out the destination MAC. Even the passive side needs to do that >> as some vendors don't report the source MAC of the packet in their wc. >> Even if they did, since IP based addressing is rout-able by its >> nature, it should follow the networking stack rules. Some crazy >> configurations could force sending responses to packets that came from >> router1 to router2 - so we have no choice than resolving the DMAC at >> every side. > > Ib_create_ah_from_wc is broken. It is now an asynchronous operation, only the call itself was left as synchronous. We can't block kernel threads for a minute, or however long ARP takes to resolve. The call itself must change to be async, and all users of it updated to allocate some request, queue it, and handle all race conditions that result -- such as state changes or destruction of the work that caused the request to be initiated. > I don't know who had intended to address this, but it got left out of the 4.4 work. We need to not let this drop through the cracks (for another release). Can someone please put fixing this properly on their TODO list?
On Wed, Dec 23, 2015 at 10:04 PM, Doug Ledford <dledford@redhat.com> wrote: > On 10/15/2015 12:58 PM, Hefty, Sean wrote: >>>>> ib_create_ah_from_wc needs to resolve the DMAC in order to create the >>>>> AH (this may result sending an ARP and waiting for response). >>>>> CM uses this function (which is now sleepable). >>>> >>>> This is a significant change to the CM. The CM calls are invoked >>> assuming that they return relatively quickly. They're invoked from >>> callbacks and internally. Having the calls now wait for an ARP response >>> requires that this be re-architected, so the calling thread doesn't go out >>> to lunch for several seconds. >>> >>> Agree - this is a significant change, but it was done a long time ago >>> (at v4.3 if I recall). When we need to send a message we need to >> >> We're at 4.3-rc5? >> >>> figure out the destination MAC. Even the passive side needs to do that >>> as some vendors don't report the source MAC of the packet in their wc. >>> Even if they did, since IP based addressing is rout-able by its >>> nature, it should follow the networking stack rules. Some crazy >>> configurations could force sending responses to packets that came from >>> router1 to router2 - so we have no choice than resolving the DMAC at >>> every side. >> >> Ib_create_ah_from_wc is broken. It is now an asynchronous operation, only the call itself was left as synchronous. We can't block kernel threads for a minute, or however long ARP takes to resolve. The call itself must change to be async, and all users of it updated to allocate some request, queue it, and handle all race conditions that result -- such as state changes or destruction of the work that caused the request to be initiated. >> > > I don't know who had intended to address this, but it got left out of > the 4.4 work. We need to not let this drop through the cracks (for > another release). Can someone please put fixing this properly on their > TODO list? > IMHO, the proposed patch makes things better. Not applying the current patch means we have a "sleeping while atomic" error (in addition to the fact that kernel threads could wait until the ARP process finishes), which is pretty bad. I tend to agree that adding another CM state is probably a better approach, but unless someone steps up and add this for v4.5, I think that's the best thing we have. > -- > Doug Ledford <dledford@redhat.com> > GPG KeyID: 0E572FDD > > Matan -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index ea4db9c..f5cf1c4 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -287,17 +287,12 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, return 0; } -static int cm_alloc_response_msg(struct cm_port *port, - struct ib_mad_recv_wc *mad_recv_wc, - struct ib_mad_send_buf **msg) +static int _cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_ah *ah, + struct ib_mad_send_buf **msg) { struct ib_mad_send_buf *m; - struct ib_ah *ah; - - ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, - mad_recv_wc->recv_buf.grh, port->port_num); - if (IS_ERR(ah)) - return PTR_ERR(ah); m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, @@ -312,6 +307,20 @@ static int cm_alloc_response_msg(struct cm_port *port, return 0; } +static int cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf **msg) +{ + struct ib_ah *ah; + + ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, port->port_num); + if (IS_ERR(ah)) + return PTR_ERR(ah); + + return _cm_alloc_response_msg(port, mad_recv_wc, ah, msg); +} + static void cm_free_msg(struct ib_mad_send_buf *msg) { ib_destroy_ah(msg->ah); @@ -2201,6 +2210,7 @@ static int cm_dreq_handler(struct cm_work *work) struct cm_id_private *cm_id_priv; struct cm_dreq_msg *dreq_msg; struct ib_mad_send_buf *msg = NULL; + struct ib_ah *ah; int ret; dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; @@ -2213,6 +2223,11 @@ static int cm_dreq_handler(struct cm_work *work) return -EINVAL; } + ah = ib_create_ah_from_wc(work->port->mad_agent->qp->pd, + work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + work->port->port_num); + work->cm_event.private_data = &dreq_msg->private_data; spin_lock_irq(&cm_id_priv->lock); @@ -2234,9 +2249,13 @@ static int cm_dreq_handler(struct cm_work *work) case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + if (IS_ERR(ah)) + goto unlock; + if (_cm_alloc_response_msg(work->port, work->mad_recv_wc, ah, + &msg)) goto unlock; + ah = NULL; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, cm_id_priv->private_data, cm_id_priv->private_data_len); @@ -2259,6 +2278,8 @@ static int cm_dreq_handler(struct cm_work *work) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); + if (!IS_ERR_OR_NULL(ah)) + ib_destroy_ah(ah); if (ret) cm_process_work(cm_id_priv, work); else @@ -2266,6 +2287,8 @@ static int cm_dreq_handler(struct cm_work *work) return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); + if (!IS_ERR_OR_NULL(ah)) + ib_destroy_ah(ah); deref: cm_deref_id(cm_id_priv); return -EINVAL; } @@ -2761,6 +2784,7 @@ static int cm_lap_handler(struct cm_work *work) struct cm_lap_msg *lap_msg; struct ib_cm_lap_event_param *param; struct ib_mad_send_buf *msg = NULL; + struct ib_ah *ah; int ret; /* todo: verify LAP request and send reject APR if invalid. */ @@ -2770,6 +2794,12 @@ static int cm_lap_handler(struct cm_work *work) if (!cm_id_priv) return -EINVAL; + + ah = ib_create_ah_from_wc(work->port->mad_agent->qp->pd, + work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + work->port->port_num); + param = &work->cm_event.param.lap_rcvd; param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); @@ -2786,9 +2816,13 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_LAP_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + if (IS_ERR(ah)) + goto unlock; + if (_cm_alloc_response_msg(work->port, work->mad_recv_wc, ah, + &msg)) goto unlock; + ah = NULL; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, cm_id_priv->service_timeout, @@ -2818,6 +2852,8 @@ static int cm_lap_handler(struct cm_work *work) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); + if (!IS_ERR_OR_NULL(ah)) + ib_destroy_ah(ah); if (ret) cm_process_work(cm_id_priv, work); else @@ -2825,6 +2861,8 @@ static int cm_lap_handler(struct cm_work *work) return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); + if (!IS_ERR_OR_NULL(ah)) + ib_destroy_ah(ah); deref: cm_deref_id(cm_id_priv); return -EINVAL; }
When IP based addressing was introduced, ib_create_ah_from_wc was changed in order to support a suitable AH. Since this AH should now contains the DMAC (which isn't a simple derivative of the GID). In order to find the DMAC, an ARP should sometime be sent. This ARP is a sleeping context. ib_create_ah_from_wc is called from cm_alloc_response_msg, which is sometimes called from an atomic context. This caused a sleeping-while-atomic bug. Fixing this by splitting cm_alloc_response_msg to an atomic and sleep-able part. When cm_alloc_response_msg is used in an atomic context, we try to create the AH before entering the atomic context. Fixes: 66bd20a72d2f ('IB/core: Ethernet L2 attributes in verbs/cm structures') Signed-off-by: Matan Barak <matanb@mellanox.com> --- Hi Doug, This patch fixes an old bug in the CM. IP based addressing requires ARP resolution which isn't sleep-able by its nature. This resolution was sometimes done in non sleep-able context. Our regression tests picked up this bug and verified this fix. Thanks, Matan drivers/infiniband/core/cm.c | 60 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 11 deletions(-)