
[rdma-next] IB/cma: Define options to set CM timeouts and retries

Message ID: ZgxeQbxfKXHkUlQG@eaujamesDDN
State: Superseded

Commit Message

Etienne AUJAMES April 2, 2024, 7:36 p.m. UTC
Define new options in 'rdma_set_option' to override default CM retries
("Max CM retries") and timeouts ("Local CM Response Timeout" and "Remote
CM Response Timeout").

These options can be useful for RoCE networks (no SM) to decrease the
overall connection timeout with an unreachable node (by default, it can
take several minutes).
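
For illustration, a minimal userspace sketch of setting these options
through librdmacm's rdma_set_option() (it assumes this patch is applied
and that the new option names are exposed to userspace; the values are
examples only):

	#include <stdint.h>
	#include <rdma/rdma_cma.h>

	/* Must be called before rdma_connect() on the active side. */
	static int set_cm_tunables(struct rdma_cm_id *id)
	{
		uint8_t retries = 6;	/* 4-bit "Max CM Retries" value */
		uint8_t timeout = 18;	/* 5-bit value: 4.096us * 2^18 ~ 1s */
		int ret;

		ret = rdma_set_option(id, RDMA_OPTION_ID,
				      RDMA_OPTION_ID_CM_RETRIES,
				      &retries, sizeof(retries));
		if (ret)
			return ret;
		return rdma_set_option(id, RDMA_OPTION_ID,
				       RDMA_OPTION_ID_CM_TIMEOUTS,
				       &timeout, sizeof(timeout));
	}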

Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
---
 drivers/infiniband/core/cma.c      | 92 ++++++++++++++++++++++++++++--
 drivers/infiniband/core/cma_priv.h |  4 ++
 drivers/infiniband/core/ucma.c     | 14 +++++
 include/rdma/rdma_cm.h             |  5 ++
 include/uapi/rdma/rdma_user_cm.h   |  4 +-
 5 files changed, 113 insertions(+), 6 deletions(-)

Comments

Mark Zhang April 8, 2024, 3:26 a.m. UTC | #1
On 4/3/2024 3:36 AM, Etienne AUJAMES wrote:
> Define new options in 'rdma_set_option' to override default CM retries
> ("Max CM retries") and timeouts ("Local CM Response Timeout" and "Remote
> CM Response Timeout").
> 

Sometimes a user-facing tunable is not preferred, but let's see:

https://lore.kernel.org/linux-rdma/EC1346C3-3888-4FFB-B302-1DB200AAE00D@oracle.com/

> These options can be useful for RoCE networks (no SM) to decrease the
> overall connection timeout with an unreachable node (by default, it can
> take several minutes).
> 

This patch is not only for RoCE, right?

> Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
> ---
>   drivers/infiniband/core/cma.c      | 92 ++++++++++++++++++++++++++++--
>   drivers/infiniband/core/cma_priv.h |  4 ++
>   drivers/infiniband/core/ucma.c     | 14 +++++
>   include/rdma/rdma_cm.h             |  5 ++
>   include/uapi/rdma/rdma_user_cm.h   |  4 +-
>   5 files changed, 113 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> index 1e2cd7c8716e..cc73b9708862 100644
> --- a/drivers/infiniband/core/cma.c
> +++ b/drivers/infiniband/core/cma.c
> @@ -1002,6 +1002,8 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
>          id_priv->tos_set = false;
>          id_priv->timeout_set = false;
>          id_priv->min_rnr_timer_set = false;
> +       id_priv->max_cm_retries = false;
> +       id_priv->cm_timeout = false;
>          id_priv->gid_type = IB_GID_TYPE_IB;
>          spin_lock_init(&id_priv->lock);
>          mutex_init(&id_priv->qp_mutex);
> @@ -2845,6 +2847,80 @@ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
>   }
>   EXPORT_SYMBOL(rdma_set_min_rnr_timer);
> 
> +/**
> + * rdma_set_cm_retries() - Set the maximum of CM retries of the QP associated
> + *                        with a connection identifier.

This comment (and the one below) seems not accurate, as it's not for
the QP. This is different from rdma_set_ack_timeout().

> + * @id: Communication identifier associated with service type.
> + * @max_cm_retries: 4-bit value definied as "Max CM Retries" REQ field

typo: definied -> defined

> + *                 (Table 99 "REQ Message Contents" in the IBTA specification).
> + *
> + * This function should be called before rdma_connect() on active side.
> + * The value will determine the maximum number of times the CM should
> + * resend a connection request or reply (REQ/REP) before giving up (UNREACHABLE
> + * event).
> + *
> + * Return: 0 for success
> + */
> +int rdma_set_cm_retries(struct rdma_cm_id *id, u8 max_cm_retries)
> +{
> +       struct rdma_id_private *id_priv;
> +
> +       /* It is a 4-bit value */
> +       if (max_cm_retries & 0xf0)
> +               return -EINVAL;
> +
> +       if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
> +               return -EINVAL;
> +
Maybe we don't need a warning here.
I think UD also needs these 2 parameters, as UD also has Resp.; see
cma_resolve_ib_udp() below.

> +       id_priv = container_of(id, struct rdma_id_private, id);
> +       mutex_lock(&id_priv->qp_mutex);
> +       id_priv->max_cm_retries = max_cm_retries;
> +       id_priv->max_cm_retries_set = true;
> +       mutex_unlock(&id_priv->qp_mutex);
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL(rdma_set_cm_retries);
> +
> +/**
> + * rdma_set_cm_timeout() - Set the CM timeouts of the QP associated with a
> + *                        connection identifier.
> + * @id: Communication identifier associated with service type.
> + * @cm_timeout: 5-bit value, expressed as 4.096 * 2^(timeout) usec.
> + *             This value should be superior than 0.
> + *
> + * This function should be called before rdma_connect() on active side.
> + * The value will affect the "Remote CM Response Timeout" and the
> + * "Local CM Response Timeout" timeouts to respond to a connection request (REQ)
> + * and to wait for connection reply (REP) ack on the remote node.
> + *
> + * Round-trip timeouts for a REQ and REP requests:
> + * REQ_timeout_ms = remote_cm_response_timeout_ms + 2* PacketLifeTime_ms
> + * REP_timeout_ms = local_cm_response_timeout_ms
> + *
> + * Return: 0 for success
> + */
> +int rdma_set_cm_timeout(struct rdma_cm_id *id, u8 cm_timeout)
> +{
> +       struct rdma_id_private *id_priv;
> +
> +       /* It is a 5-bit value */
> +       if (!cm_timeout || (cm_timeout & 0xe0))
> +               return -EINVAL;
> +
> +       if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
> +               return -EINVAL;

likewise

> +
> +       id_priv = container_of(id, struct rdma_id_private, id);
> +       mutex_lock(&id_priv->qp_mutex);
> +       id_priv->cm_timeout = cm_timeout;
> +       id_priv->cm_timeout_set = true;
> +       mutex_unlock(&id_priv->qp_mutex);
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL(rdma_set_cm_timeout);
> +
>   static int route_set_path_rec_inbound(struct cma_work *work,
>                                        struct sa_path_rec *path_rec)
>   {
> @@ -4295,8 +4371,11 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
>          req.path = id_priv->id.route.path_rec;
>          req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
>          req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
> -       req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
> -       req.max_cm_retries = CMA_MAX_CM_RETRIES;
> +       req.timeout_ms = id_priv->cm_timeout_set ?
> +               id_priv->cm_timeout : CMA_CM_RESPONSE_TIMEOUT;
> +       req.timeout_ms = 1 << (req.timeout_ms - 8);

So the timeout value must be at least 8? Maybe we need to check in
rdma_set_cm_timeout().
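A minimal sketch of such a check (hypothetical; clamping the conversion
in cma_resolve_ib_udp() would be an alternative):

	/* 1 << (timeout - 8) shifts by a negative count for timeout < 8,
	 * which is undefined, so reject such values up front. */
	if (cm_timeout < 8 || (cm_timeout & 0xe0))
		return -EINVAL;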
Sean Hefty April 8, 2024, 4:10 p.m. UTC | #2
> Define new options in 'rdma_set_option' to override default CM retries ("Max
> CM retries") and timeouts ("Local CM Response Timeout" and "Remote CM
> Response Timeout").
> 
> These options can be useful for RoCE networks (no SM) to decrease the overall
> connection timeout with an unreachable node (by default, it can take several
> minutes).

I've been looking into this problem, plus related timeout issues myself:

1. Allow a client to timeout quicker when trying to connect to an unreachable node.
2. Prevent a client from ignoring a CM response for an extended period.
   This forces the server to hold resources.
   Problem also occurs if the client node crashes after trying to connect.
3. Improve connection setup time when packets are lost.

I was thinking of aligning closer with the behavior of the TCP stack, plus a couple other adjustments.

a. Reduce the hard-coded CM retries from 15 down to 6.
b. Reduce the hard-coded CM response timeout from 20 (4s) to 18 (1s).
c. Switch CM MADs to use exponential backoff timeouts (1s, 2s, 4s, 8s, etc. + random variation)
d. Selectively send MRA responses -- only in response to a REQ
e. Finally, add tunables to the above options for recovery purposes.

Most of the issues are common to RoCE and IB.  Changes a, b, & c are based on my system's TCP defaults, but my goal was to get timeouts down to about 1 minute.  Change d should help address problem 2.
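
For concreteness, a sketch of the arithmetic behind change c, assuming a 1s
initial timeout and ignoring the random variation (backoff_total_ms() is a
hypothetical helper, not existing CM code):

	/* Worst-case time spent on (retries + 1) sends with exponential
	 * backoff: initial + 2*initial + 4*initial + ... */
	static unsigned int backoff_total_ms(unsigned int initial_ms,
					     unsigned int retries)
	{
		return initial_ms * ((1U << (retries + 1)) - 1);
	}

	/* backoff_total_ms(1000, 5) == 63000, i.e. ~1 minute over 6 sends. */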

If the expectation is that most users will want to change the timeout/retry, which I think would be the case, then adjusting the defaults may avoid the overhead of setting them on every cm_id.  The ability to set the values as proposed can substitute for change e, but may require users to update their librdmacm.

- Sean
Etienne AUJAMES April 9, 2024, 1:07 p.m. UTC | #3
> I was thinking of aligning closer with the behavior of the TCP stack, plus a couple other adjustments.
> 
> a. Reduce the hard-coded CM retries from 15 down to 6.
15 retries is the maximum (the field size is 4 bits). According to the
documentation that I read, there is no minimum number of retries
required. So I don't know why this is statically defined to 15. Maybe
this is related to hardware interoperability/compatibility.

> b. Reduce the hard-coded CM response timeout from 20 (4s) to 18 (1s).
The NVIDIA MOFED sets the CM timeout to 22 (17s) instead of 20. This
makes an overall connection timeout of 5 min for an unreachable node.

Some patches seem to argue that 20 is too short:
https://lore.kernel.org/lkml/20190217170909.1178575-1-haakon.bugge@oracle.com/

> c. Switch CM MADs to use exponential backoff timeouts (1s, 2s, 4s, 8s, etc. + random variation)
> d. Selectively send MRA responses -- only in response to a REQ
> e. Finally, add tunables to the above options for recovery purposes.
>
> Most of the issues are common to RoCE and IB.  Changes a, b, & c are based on my system's TCP defaults, but my goal was to get timeouts down to about 1 minute.  Change d should help address problem 2.
Please note here that we don't hit this timeout issue on an Infiniband
network with an unreachable node.
Infiniband has an SM, so rdma_resolve_route() fails before rdma_connect()
for an unreachable node. The SM returns an empty "path record".

Maybe there are some other ways to mitigate that RoCE issue.
For example, when I was troubleshooting this issue, I saw that the RoCE
HCA received ICMP "Destination unreachable" packets for the CM requests.
So, maybe we could listen to those messages and abort the connection
process.

> If the expectation is that most users will want to change the timeout/retry, which I think would be the case, then adjusting the defaults may avoid the overhead of setting them on every cm_id.  The ability to set the values as proposed can substitute for change e, but may require users update their librdamcm.

Our particular use case is the Lustre RoCE/Infiniband module with a
RoCE network.
Lustre relies on "pings" to monitor nodes/services/routes; this is used,
for example, for High Availability or the selection of Lustre network
routes.
For Lustre FS, a timeout greater than 1 min is not acceptable for
detecting an unreachable node.

More information can be found here:
https://jira.whamcloud.com/browse/LU-17480

I don't think that most users need to tune those parameters. But if
some use cases require a smaller connection timeout, this should be
available.

I agree that finding a common ground to adjust the defaults would be
better, but this can be challenging and may break uncommon fabrics or
use cases.

This looks like the same situation as for "RDMA_OPTION_ID_ACK_TIMEOUT":
not all users need to alter PacketLifeTime for RoCE, but if the network
requires increasing that value, they can do it (on Infiniband this
value is given by the SM).

Etienne
Sean Hefty April 9, 2024, 2:44 p.m. UTC | #4
> > I was thinking of aligning closer with the behavior of the TCP stack, plus a
> > couple other adjustments.
> >
> > a. Reduce the hard-coded CM retries from 15 down to 6.
> 15 retries is the maximum (the field size is 4 bits). According to the
> documentation that I read, there is no minimum number of retries
> required. So I don't know why this is statically defined to 15. Maybe
> this is related to hardware interoperability/compatibility.

15 is the max defined by the IB spec.  With a linear retry timeout, setting this
to the max can make sense.  But if we switched to using a backoff timer, I believe
we can get by with a smaller value.

> > b. Reduce the hard-coded CM response timeout from 20 (4s) to 18 (1s).
> The NVIDIA MOFED sets the CM timeout to 22 (17s) instead of 20. This
> makes an overall connection timeout of 5 min for an unreachable node.
> 
> Some patches seem to argue that 20 is too short:
> https://lore.kernel.org/lkml/20190217170909.1178575-1-haakon.bugge@oracle.com/

A backoff timer can reduce retries.  I don't know how you decide
what the initial backoff should be.  I was going with what seems to be the
behavior with TCP.  Maybe the backoff adjusts based on IB vs RoCE.

In any case, a 5-minute timeout seems unfriendly.

> > c. Switch CM MADs to use exponential backoff timeouts (1s, 2s, 4s, 8s, etc. +
> > random variation)
> > d. Selectively send MRA responses -- only in response to a REQ
> > e. Finally, add tunables to the above options for recovery purposes.
> >
> > Most of the issues are common to RoCE and IB.  Changes a, b, & c are based
> > on my system's TCP defaults, but my goal was to get timeouts down to about
> > 1 minute.  Change d should help address problem 2.
> Please note here that we don't hit this timeout issue on an Infiniband
> network with an unreachable node.
> Infiniband has an SM, so rdma_resolve_route() fails before rdma_connect()
> for an unreachable node. The SM returns an empty "path record".

I guess this depends on when the node goes down and how quickly the SM
can identify it.  But this does suggest that having separate defaults for IB vs
RoCE may be necessary.

> Maybe there are some other ways to mitigate that RoCE issue.
> For example, when I was troubleshooting this issue, I saw that the RoCE
> HCA received ICMP "Destination unreachable" packets for the CM requests.
> So, maybe we could listen to those messages and abort the connection
> process.

Hmm.. I wonder what it would take to do this.

> > If the expectation is that most users will want to change the timeout/retry,
> > which I think would be the case, then adjusting the defaults may avoid the
> > overhead of setting them on every cm_id.  The ability to set the values as
> > proposed can substitute for change e, but may require users to update their
> > librdmacm.
> 
> Our particular use case is the Lustre RoCE/Infiniband module with a
> RoCE network.
> Lustre relies on "pings" to monitor nodes/services/routes; this is used,
> for example, for High Availability or the selection of Lustre network
> routes.
> For Lustre FS, a timeout greater than 1 min is not acceptable for
> detecting an unreachable node.
> 
> More information can be found here:
> https://jira.whamcloud.com/browse/LU-17480
> 
> I don't think that most users need to tune those parameters. But if
> some use cases require a smaller connection timeout, this should be
> available.
> 
> I agree that finding a common ground to adjust the defaults would be
> better, but this can be challenging and may break uncommon fabrics or
> use cases.

IMO, if we can improve that out of the box experience, that would be ideal.
I agree that there will always be situations where the kernel defaults are
not optimal and either require changing them system wide, or possibly 
per rdma_cm_id.

If we believe that switching to a backoff retry timer is a better direction
or should be an option, does that change the approach for this patch?
A retry count still makes sense, but the timeout is more complex.  On that
note, I would specify a timeout in something straightforward, like milliseconds.

- Sean
Etienne AUJAMES April 9, 2024, 4:11 p.m. UTC | #5
Thanks for the review.

On 08/04/24, Mark Zhang wrote:
> On 4/3/2024 3:36 AM, Etienne AUJAMES wrote:
> > Define new options in 'rdma_set_option' to override default CM retries
> > ("Max CM retries") and timeouts ("Local CM Response Timeout" and "Remote
> > CM Response Timeout").
> > 
> 
> Sometimes a user-facing tunable is not preferred, but let's see:
> 
> https://lore.kernel.org/linux-rdma/EC1346C3-3888-4FFB-B302-1DB200AAE00D@oracle.com/

Note here that we hit this timeout issue with RoCE on a routed network.
This means that no ARP is sent directly to the remote host, and
rdma_resolve_addr() always exits without error.

As mentioned in the ticket, this is still possible on a flat network:
if a node becomes unresponsive and the application tries to reconnect to
it, the remote IP is still in the ARP cache, so rdma_resolve_addr() exits
without error.

> 
> > These options can be useful for RoCE networks (no SM) to decrease the
> > overall connection timeout with an unreachable node (by default, it can
> > take several minutes).
> > 
> 
> This patch is not only for RoCE, right?

Yes, you are right. But the connection timeout issue is seen only on RoCE
for an unreachable node.

With an Infiniband network, the "Subnet Manager" will return an empty
"path record" and rdma_resolve_route() will return an error before
calling rdma_connect().

So, the purpose of this patch is to mitigate this RoCE connection
timeout issue.

> 
> > Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
> > ---
> >   drivers/infiniband/core/cma.c      | 92 ++++++++++++++++++++++++++++--
> >   drivers/infiniband/core/cma_priv.h |  4 ++
> >   drivers/infiniband/core/ucma.c     | 14 +++++
> >   include/rdma/rdma_cm.h             |  5 ++
> >   include/uapi/rdma/rdma_user_cm.h   |  4 +-
> >   5 files changed, 113 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> > index 1e2cd7c8716e..cc73b9708862 100644
> > --- a/drivers/infiniband/core/cma.c
> > +++ b/drivers/infiniband/core/cma.c
> > @@ -1002,6 +1002,8 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
> >          id_priv->tos_set = false;
> >          id_priv->timeout_set = false;
> >          id_priv->min_rnr_timer_set = false;
> > +       id_priv->max_cm_retries = false;
> > +       id_priv->cm_timeout = false;
> >          id_priv->gid_type = IB_GID_TYPE_IB;
> >          spin_lock_init(&id_priv->lock);
> >          mutex_init(&id_priv->qp_mutex);
> > @@ -2845,6 +2847,80 @@ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
> >   }
> >   EXPORT_SYMBOL(rdma_set_min_rnr_timer);
> > 
> > +/**
> > + * rdma_set_cm_retries() - Set the maximum of CM retries of the QP associated
> > + *                        with a connection identifier.
> 
> This comment (and the one below) seems not accurate, as it's not for the
> QP. This is different from rdma_set_ack_timeout().

What do you suggest?

CM parameters are not used once the QP is connected; they are used to
establish the connection. For me, those parameters are still associated
with the connection.

What about:
/**
 * rdma_set_cm_retries() - Configure CM retries to establish an active
 *			   connection.
 * @id: Connection identifier to connect.

> 
> > + * @id: Communication identifier associated with service type.
> > + * @max_cm_retries: 4-bit value definied as "Max CM Retries" REQ field
> 
> typo: definied -> defined

Ack

> 
> > + *                 (Table 99 "REQ Message Contents" in the IBTA specification).
> > + *
> > + * This function should be called before rdma_connect() on active side.
> > + * The value will determine the maximum number of times the CM should
> > + * resend a connection request or reply (REQ/REP) before giving up (UNREACHABLE
> > + * event).
> > + *
> > + * Return: 0 for success
> > + */
> > +int rdma_set_cm_retries(struct rdma_cm_id *id, u8 max_cm_retries)
> > +{
> > +       struct rdma_id_private *id_priv;
> > +
> > +       /* It is a 4-bit value */
> > +       if (max_cm_retries & 0xf0)
> > +               return -EINVAL;
> > +
> > +       if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
> > +               return -EINVAL;
> > +
> Maybe we don't need a warning here.
> I think UD also needs these 2 parameters, as UD also has Resp.; see
> cma_resolve_ib_udp() below.

Yes, this seems right. UD uses the CM timeout to compute req.timeout_ms
and the CM retries for req.max_cm_retries. But unlike RC, those values
are not sent on the wire.

> 
> > +       id_priv = container_of(id, struct rdma_id_private, id);
> > +       mutex_lock(&id_priv->qp_mutex);
> > +       id_priv->max_cm_retries = max_cm_retries;
> > +       id_priv->max_cm_retries_set = true;
> > +       mutex_unlock(&id_priv->qp_mutex);
> > +
> > +       return 0;
> > +}
> > +EXPORT_SYMBOL(rdma_set_cm_retries);
> > +
> > +/**
> > + * rdma_set_cm_timeout() - Set the CM timeouts of the QP associated with a
> > + *                        connection identifier.
> > + * @id: Communication identifier associated with service type.
> > + * @cm_timeout: 5-bit value, expressed as 4.096 * 2^(timeout) usec.
> > + *             This value should be superior than 0.
> > + *
> > + * This function should be called before rdma_connect() on active side.
> > + * The value will affect the "Remote CM Response Timeout" and the
> > + * "Local CM Response Timeout" timeouts to respond to a connection request (REQ)
> > + * and to wait for connection reply (REP) ack on the remote node.
> > + *
> > + * Round-trip timeouts for a REQ and REP requests:
> > + * REQ_timeout_ms = remote_cm_response_timeout_ms + 2* PacketLifeTime_ms
> > + * REP_timeout_ms = local_cm_response_timeout_ms
> > + *
> > + * Return: 0 for success
> > + */
> > +int rdma_set_cm_timeout(struct rdma_cm_id *id, u8 cm_timeout)
> > +{
> > +       struct rdma_id_private *id_priv;
> > +
> > +       /* It is a 5-bit value */
> > +       if (!cm_timeout || (cm_timeout & 0xe0))
> > +               return -EINVAL;
> > +
> > +       if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
> > +               return -EINVAL;
> 
> likewise

Ack

> > +
> > +       id_priv = container_of(id, struct rdma_id_private, id);
> > +       mutex_lock(&id_priv->qp_mutex);
> > +       id_priv->cm_timeout = cm_timeout;
> > +       id_priv->cm_timeout_set = true;
> > +       mutex_unlock(&id_priv->qp_mutex);
> > +
> > +       return 0;
> > +}
> > +EXPORT_SYMBOL(rdma_set_cm_timeout);
> > +
> >   static int route_set_path_rec_inbound(struct cma_work *work,
> >                                        struct sa_path_rec *path_rec)
> >   {
> > @@ -4295,8 +4371,11 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
> >          req.path = id_priv->id.route.path_rec;
> >          req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
> >          req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
> > -       req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
> > -       req.max_cm_retries = CMA_MAX_CM_RETRIES;
> > +       req.timeout_ms = id_priv->cm_timeout_set ?
> > +               id_priv->cm_timeout : CMA_CM_RESPONSE_TIMEOUT;
> > +       req.timeout_ms = 1 << (req.timeout_ms - 8);
> 
> So the timeout value must be at least 8? Maybe we need to check in
> rdma_set_cm_timeout().

Yes. For RC, the active-side connection timeout is given by:

static inline int cm_convert_to_ms(int iba_time)
{
        /* approximate conversion to ms from 4.096us x 2^iba_time */
        return 1 << max(iba_time - 8, 0);
}

int ib_send_cm_req(struct ib_cm_id *cm_id,
....
        cm_id_priv->timeout_ms = cm_convert_to_ms(
                                    param->primary_path->packet_life_time) * 2 +
                                 cm_convert_to_ms(                              
                                    param->remote_cm_response_timeout);

So, we can check the timeout value in rdma_set_cm_timeout() or modify cma_resolve_ib_udp() to match the cm_convert_to_ms() implementation (with the clamp, any timeout value below 8 maps to 1 ms instead of shifting by a negative count):

 -       req.timeout_ms = 1 << (req.timeout_ms - 8);
 +       req.timeout_ms = 1 << max(req.timeout_ms - 8, 0);

Etienne
Etienne AUJAMES April 11, 2024, 4:04 p.m. UTC | #6
> A backoff timer can reduce retries.  I don't know how you decide
> what the initial backoff should be.  I was going with what seems to be the
> behavior with TCP.  Maybe the backoff adjusts based on IB vs RoCE.

Ok, I understand it now. So, with a retry count of 5 and an initial
timeout of 18 (~1s), this would give:

connect_timeout = 1 + 2 + 4 + 8 + 16 + 32 = 63s
connect_timeout = initial * (2^(retries + 1) - 1)

> 
> > I don't think that most users need to tune those parameters. But if
> > some use cases require a smaller connection timeout, this should be
> > available.
> > 
> > I agree that finding a common ground to adjust the defaults would be
> > better, but this can be challenging and may break uncommon fabrics or use
> > cases.
> 
> IMO, if we can improve that out of the box experience, that would be ideal.
> I agree that there will always be situations where the kernel defaults are
> not optimal and either require changing them system wide, or possibly 
> per rdma_cm_id.
> 
> If we believe that switching to a backoff retry timer is a better direction
> or should be an option, does that change the approach for this patch?
> A retry count still makes sense, but the timeout is more complex.  On that
> note, I would specify a timeout in something straightforward, like milliseconds.

An exponential backoff timer seems to be a good solution to reduce
temporary contention (when several nodes reconnect simultaneously).
But it makes the overall connection timeout more complex. That's why
you don't want to expose the initial CM timeout to the user.

So, if I follow you here, you suggest exposing only a "connection
timeout in ms" to the user and deriving a retry count from that.

For example, if a user defines a timeout of 50s (with an initial
timeout of 1s), we should configure 4 retries. But this would make an
effective timeout of 31s.

I don't like that idea because it hides what is actually done:
a user will set a value in ms and could see several seconds or
minutes of difference from what they expect.

So, I would prefer the kernel TCP way: it defines "tcp_retries2" to
configure the maximum number of retransmissions for an active
connection. The initial timeout value is not configurable (TCP_RTO_MIN),
and the retransmission timeout is bounded between TCP_RTO_MIN (200ms)
and TCP_RTO_MAX (120s).
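
A sketch of that capped scheme (send_request() and wait_ms() are
hypothetical helpers; the constants mirror the TCP defaults above, not
existing CM code):

	#define RTO_MIN_MS	200	/* cf. TCP_RTO_MIN */
	#define RTO_MAX_MS	120000	/* cf. TCP_RTO_MAX */

	/* Exponential backoff clamped to a fixed ceiling; only the retry
	 * count would be user-tunable. */
	static void retransmit_with_backoff(unsigned int retries)
	{
		unsigned int rto_ms = RTO_MIN_MS;
		unsigned int i;

		for (i = 0; i <= retries; i++) {
			send_request();
			wait_ms(rto_ms);
			rto_ms = (2 * rto_ms > RTO_MAX_MS) ?
				RTO_MAX_MS : 2 * rto_ms;
		}
	}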

Etienne
Sean Hefty April 11, 2024, 5:15 p.m. UTC | #7
> > A backoff timer can reduce retries.  I don't know how you decide what
> > the initial backoff should be.  I was going with what seems to be the
> > behavior with TCP.  Maybe the backoff adjusts based on IB vs RoCE.
> 
> Ok, I understand it now. So, with a retry count of 5 and an initial
> timeout of 18 (~1s), this would give:
> 
> connect_timeout = 1 + 2 + 4 + 8 + 16 + 32 = 63s
> connect_timeout = initial * (2^(retries + 1) - 1)

Correct - plus random additional time added in to stagger bursts.

> > > I don't think that most users need to tune those parameters. But if
> > > some use cases require a smaller connection timeout, this should be
> > > available.
> > >
> > > I agree that finding a common ground to adjust the defaults would be
> > > better, but this can be challenging and may break uncommon fabrics or
> > > use cases.
> >
> > IMO, if we can improve that out of the box experience, that would be ideal.
> > I agree that there will always be situations where the kernel defaults
> > are not optimal and either require changing them system wide, or
> > possibly per rdma_cm_id.
> >
> > If we believe that switching to a backoff retry timer is a better
> > direction or should be an option, does that change the approach for this
> > patch?
> > A retry count still makes sense, but the timeout is more complex.  On
> > that note, I would specify a timeout in something straightforward, like
> > milliseconds.
> 
> An exponential backoff timer seems to be a good solution to reduce
> temporary contention (when several nodes reconnect simultaneously).
> But it makes the overall connection timeout more complex. That's why
> you don't want to expose the initial CM timeout to the user.
> 
> So, if I follow you here, you suggest exposing only a "connection
> timeout in ms" to the user and deriving a retry count from that.

Not quite.  I agree with you and wouldn't go this route.

I was saying *if* we expose a timeout value, that we use ms or seconds, not Infiniband Bizarre Time.

The main point is to avoid exposing options that assume a linear retry timeout.

> For example, if a user defines a timeout of 50s (with an initial timeout
> of 1s), we should configure 4 retries. But this would make an effective
> timeout of 31s.
> 
> I don't like that idea because it hides what is actually done:
> a user will set a value in ms and could see several seconds or minutes
> of difference from what they expect.
> 
> So, I would prefer the kernel TCP way: it defines "tcp_retries2" to
> configure the maximum number of retransmissions for an active connection.
> The initial timeout value is not configurable (TCP_RTO_MIN), and the
> retransmission timeout is bounded between TCP_RTO_MIN (200ms) and
> TCP_RTO_MAX (120s).

I prefer the TCP way as well, including a way to configure the system min/max timeouts in case the defaults don't work in some environment.  Having a per rdma_cm_id option to change the number of retries seems reasonable.  Personally, I'd like it so that apps never need to touch it.  Trying to expose a timeout value is more difficult if we switch to using a backoff retry timer.

- Sean

Patch

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 1e2cd7c8716e..cc73b9708862 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1002,6 +1002,8 @@  __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
 	id_priv->tos_set = false;
 	id_priv->timeout_set = false;
 	id_priv->min_rnr_timer_set = false;
+	id_priv->max_cm_retries = false;
+	id_priv->cm_timeout = false;
 	id_priv->gid_type = IB_GID_TYPE_IB;
 	spin_lock_init(&id_priv->lock);
 	mutex_init(&id_priv->qp_mutex);
@@ -2845,6 +2847,80 @@  int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
 }
 EXPORT_SYMBOL(rdma_set_min_rnr_timer);
 
+/**
+ * rdma_set_cm_retries() - Set the maximum of CM retries of the QP associated
+ *			   with a connection identifier.
+ * @id: Communication identifier associated with service type.
+ * @max_cm_retries: 4-bit value definied as "Max CM Retries" REQ field
+ *		    (Table 99 "REQ Message Contents" in the IBTA specification).
+ *
+ * This function should be called before rdma_connect() on active side.
+ * The value will determine the maximum number of times the CM should
+ * resend a connection request or reply (REQ/REP) before giving up (UNREACHABLE
+ * event).
+ *
+ * Return: 0 for success
+ */
+int rdma_set_cm_retries(struct rdma_cm_id *id, u8 max_cm_retries)
+{
+	struct rdma_id_private *id_priv;
+
+	/* It is a 4-bit value */
+	if (max_cm_retries & 0xf0)
+		return -EINVAL;
+
+	if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	mutex_lock(&id_priv->qp_mutex);
+	id_priv->max_cm_retries = max_cm_retries;
+	id_priv->max_cm_retries_set = true;
+	mutex_unlock(&id_priv->qp_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_set_cm_retries);
+
+/**
+ * rdma_set_cm_timeout() - Set the CM timeouts of the QP associated with a
+ *			   connection identifier.
+ * @id: Communication identifier associated with service type.
+ * @cm_timeout: 5-bit value, expressed as 4.096 * 2^(timeout) usec.
+ *		This value should be superior than 0.
+ *
+ * This function should be called before rdma_connect() on active side.
+ * The value will affect the "Remote CM Response Timeout" and the
+ * "Local CM Response Timeout" timeouts to respond to a connection request (REQ)
+ * and to wait for connection reply (REP) ack on the remote node.
+ *
+ * Round-trip timeouts for a REQ and REP requests:
+ * REQ_timeout_ms = remote_cm_response_timeout_ms + 2* PacketLifeTime_ms
+ * REP_timeout_ms = local_cm_response_timeout_ms
+ *
+ * Return: 0 for success
+ */
+int rdma_set_cm_timeout(struct rdma_cm_id *id, u8 cm_timeout)
+{
+	struct rdma_id_private *id_priv;
+
+	/* It is a 5-bit value */
+	if (!cm_timeout || (cm_timeout & 0xe0))
+		return -EINVAL;
+
+	if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	mutex_lock(&id_priv->qp_mutex);
+	id_priv->cm_timeout = cm_timeout;
+	id_priv->cm_timeout_set = true;
+	mutex_unlock(&id_priv->qp_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_set_cm_timeout);
+
 static int route_set_path_rec_inbound(struct cma_work *work,
 				      struct sa_path_rec *path_rec)
 {
@@ -4295,8 +4371,11 @@  static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
 	req.path = id_priv->id.route.path_rec;
 	req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
 	req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
-	req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
-	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+	req.timeout_ms = id_priv->cm_timeout_set ?
+		id_priv->cm_timeout : CMA_CM_RESPONSE_TIMEOUT;
+	req.timeout_ms = 1 << (req.timeout_ms - 8);
+	req.max_cm_retries = id_priv->max_cm_retries_set ?
+		id_priv->max_cm_retries : CMA_MAX_CM_RETRIES;
 
 	trace_cm_send_sidr_req(id_priv);
 	ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
@@ -4368,9 +4447,12 @@  static int cma_connect_ib(struct rdma_id_private *id_priv,
 	req.flow_control = conn_param->flow_control;
 	req.retry_count = min_t(u8, 7, conn_param->retry_count);
 	req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
-	req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
-	req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
-	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+	req.remote_cm_response_timeout = id_priv->cm_timeout_set ?
+		id_priv->cm_timeout : CMA_CM_RESPONSE_TIMEOUT;
+	req.local_cm_response_timeout = id_priv->cm_timeout_set ?
+		id_priv->cm_timeout : CMA_CM_RESPONSE_TIMEOUT;
+	req.max_cm_retries = id_priv->max_cm_retries_set ?
+		id_priv->max_cm_retries : CMA_MAX_CM_RETRIES;
 	req.srq = id_priv->srq ? 1 : 0;
 	req.ece.vendor_id = id_priv->ece.vendor_id;
 	req.ece.attr_mod = id_priv->ece.attr_mod;
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index b7354c94cf1b..e3a35eb1bf96 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -95,10 +95,14 @@  struct rdma_id_private {
 	u8			tos_set:1;
 	u8                      timeout_set:1;
 	u8			min_rnr_timer_set:1;
+	u8			max_cm_retries_set:1;
+	u8			cm_timeout_set:1;
 	u8			reuseaddr;
 	u8			afonly;
 	u8			timeout;
 	u8			min_rnr_timer;
+	u8			max_cm_retries;
+	u8			cm_timeout;
 	u8 used_resolve_ip;
 	enum ib_gid_type	gid_type;
 
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 5f5ad8faf86e..a95f513077ac 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1284,6 +1284,20 @@  static int ucma_set_option_id(struct ucma_context *ctx, int optname,
 		}
 		ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
 		break;
+	case RDMA_OPTION_ID_CM_RETRIES:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = rdma_set_cm_retries(ctx->cm_id, *((u8 *)optval));
+		break;
+	case RDMA_OPTION_ID_CM_TIMEOUTS:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = rdma_set_cm_timeout(ctx->cm_id, *((u8 *)optval));
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 8a8ab2f793ab..b5923ceb9853 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -344,6 +344,11 @@  int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
 int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout);
 
 int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer);
+
+int rdma_set_cm_retries(struct rdma_cm_id *id, u8 max_cm_retries);
+
+int rdma_set_cm_timeout(struct rdma_cm_id *id, u8 cm_timeout);
+
  /**
  * rdma_get_service_id - Return the IB service ID for a specified address.
  * @id: Communication identifier associated with the address.
diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h
index 7cea03581f79..eadff72ecd54 100644
--- a/include/uapi/rdma/rdma_user_cm.h
+++ b/include/uapi/rdma/rdma_user_cm.h
@@ -313,7 +313,9 @@  enum {
 	RDMA_OPTION_ID_TOS	 = 0,
 	RDMA_OPTION_ID_REUSEADDR = 1,
 	RDMA_OPTION_ID_AFONLY	 = 2,
-	RDMA_OPTION_ID_ACK_TIMEOUT = 3
+	RDMA_OPTION_ID_ACK_TIMEOUT = 3,
+	RDMA_OPTION_ID_CM_RETRIES = 4,
+	RDMA_OPTION_ID_CM_TIMEOUTS = 5
 };
 
 enum {