Message ID | 67b6ea0621b22b77db4cd637a4f9b48a2f447898.1681132096.git.leon@kernel.org (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Jason Gunthorpe |
Headers | show |
Series | Query IBoE speed directly | expand |
On Mon, Apr 10, 2023 at 04:12:06PM +0300, Leon Romanovsky wrote: > From: Mark Zhang <markzhang@nvidia.com> > > Currently the ethtool API is used to get IBoE link speed, which must be > protected with the rtnl lock. This becomes a bottleneck when try to setup > many rdma-cm connections at the same time, especially with multiple > processes. > > In order to avoid it, a new driver API is introduced to query the IBoE > rate. It will be used firstly, and back to ethtool way if it fails. > > Signed-off-by: Mark Zhang <markzhang@nvidia.com> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > drivers/infiniband/core/cma.c | 6 ++++-- > drivers/infiniband/core/device.c | 1 + > include/rdma/ib_addr.h | 31 ++++++++++++++++++++----------- > include/rdma/ib_verbs.h | 3 +++ > 4 files changed, 28 insertions(+), 13 deletions(-) > > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c > index 9c7d26a7d243..ff706d2e39c6 100644 > --- a/drivers/infiniband/core/cma.c > +++ b/drivers/infiniband/core/cma.c > @@ -3296,7 +3296,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) > route->path_rec->traffic_class = tos; > route->path_rec->mtu = iboe_get_mtu(ndev->mtu); > route->path_rec->rate_selector = IB_SA_EQ; > - route->path_rec->rate = iboe_get_rate(ndev); > + route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device, > + id_priv->id.port_num); > dev_put(ndev); > route->path_rec->packet_life_time_selector = IB_SA_EQ; > /* In case ACK timeout is set, use this value to calculate > @@ -4962,7 +4963,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, > if (!ndev) > return -ENODEV; > > - ib.rec.rate = iboe_get_rate(ndev); > + ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device, > + id_priv->id.port_num); > ib.rec.hop_limit = 1; > ib.rec.mtu = iboe_get_mtu(ndev->mtu); What do we even use rate for in roce mode in the first place? Jason
On 4/11/2023 7:23 AM, Jason Gunthorpe wrote: > On Mon, Apr 10, 2023 at 04:12:06PM +0300, Leon Romanovsky wrote: >> From: Mark Zhang <markzhang@nvidia.com> >> >> Currently the ethtool API is used to get IBoE link speed, which must be >> protected with the rtnl lock. This becomes a bottleneck when try to setup >> many rdma-cm connections at the same time, especially with multiple >> processes. >> >> In order to avoid it, a new driver API is introduced to query the IBoE >> rate. It will be used firstly, and back to ethtool way if it fails. >> >> Signed-off-by: Mark Zhang <markzhang@nvidia.com> >> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> >> --- >> drivers/infiniband/core/cma.c | 6 ++++-- >> drivers/infiniband/core/device.c | 1 + >> include/rdma/ib_addr.h | 31 ++++++++++++++++++++----------- >> include/rdma/ib_verbs.h | 3 +++ >> 4 files changed, 28 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c >> index 9c7d26a7d243..ff706d2e39c6 100644 >> --- a/drivers/infiniband/core/cma.c >> +++ b/drivers/infiniband/core/cma.c >> @@ -3296,7 +3296,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) >> route->path_rec->traffic_class = tos; >> route->path_rec->mtu = iboe_get_mtu(ndev->mtu); >> route->path_rec->rate_selector = IB_SA_EQ; >> - route->path_rec->rate = iboe_get_rate(ndev); >> + route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device, >> + id_priv->id.port_num); >> dev_put(ndev); >> route->path_rec->packet_life_time_selector = IB_SA_EQ; >> /* In case ACK timeout is set, use this value to calculate >> @@ -4962,7 +4963,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, >> if (!ndev) >> return -ENODEV; >> >> - ib.rec.rate = iboe_get_rate(ndev); >> + ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device, >> + id_priv->id.port_num); >> ib.rec.hop_limit = 1; >> ib.rec.mtu = iboe_get_mtu(ndev->mtu); > > What do we even use rate for in roce mode in the first place? > In mlx5 it is set to "address_path.stat_rate", but I believe we always set 0 for roce actually. Not sure about other devices?
On Tue, Apr 11, 2023 at 07:42:37AM +0800, Mark Zhang wrote: > On 4/11/2023 7:23 AM, Jason Gunthorpe wrote: > > On Mon, Apr 10, 2023 at 04:12:06PM +0300, Leon Romanovsky wrote: > > > From: Mark Zhang <markzhang@nvidia.com> > > > > > > Currently the ethtool API is used to get IBoE link speed, which must be > > > protected with the rtnl lock. This becomes a bottleneck when try to setup > > > many rdma-cm connections at the same time, especially with multiple > > > processes. > > > > > > In order to avoid it, a new driver API is introduced to query the IBoE > > > rate. It will be used firstly, and back to ethtool way if it fails. > > > > > > Signed-off-by: Mark Zhang <markzhang@nvidia.com> > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > > > --- > > > drivers/infiniband/core/cma.c | 6 ++++-- > > > drivers/infiniband/core/device.c | 1 + > > > include/rdma/ib_addr.h | 31 ++++++++++++++++++++----------- > > > include/rdma/ib_verbs.h | 3 +++ > > > 4 files changed, 28 insertions(+), 13 deletions(-) > > > > > > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c > > > index 9c7d26a7d243..ff706d2e39c6 100644 > > > --- a/drivers/infiniband/core/cma.c > > > +++ b/drivers/infiniband/core/cma.c > > > @@ -3296,7 +3296,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) > > > route->path_rec->traffic_class = tos; > > > route->path_rec->mtu = iboe_get_mtu(ndev->mtu); > > > route->path_rec->rate_selector = IB_SA_EQ; > > > - route->path_rec->rate = iboe_get_rate(ndev); > > > + route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device, > > > + id_priv->id.port_num); > > > dev_put(ndev); > > > route->path_rec->packet_life_time_selector = IB_SA_EQ; > > > /* In case ACK timeout is set, use this value to calculate > > > @@ -4962,7 +4963,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, > > > if (!ndev) > > > return -ENODEV; > > > - ib.rec.rate = iboe_get_rate(ndev); > > > + ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device, > > > + id_priv->id.port_num); > > > ib.rec.hop_limit = 1; > > > ib.rec.mtu = iboe_get_mtu(ndev->mtu); > > > > What do we even use rate for in roce mode in the first place? > > > In mlx5 it is set to "address_path.stat_rate", but I believe we always set 0 > for roce actually. Not sure about other devices? "rate" is to reduce the packet rate of connections, it should always be 0 for roce, AFAIK. Maybe we should look into making that the case instead of this? Jason
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9c7d26a7d243..ff706d2e39c6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3296,7 +3296,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; - route->path_rec->rate = iboe_get_rate(ndev); + route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device, + id_priv->id.port_num); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate @@ -4962,7 +4963,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (!ndev) return -ENODEV; - ib.rec.rate = iboe_get_rate(ndev); + ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device, + id_priv->id.port_num); ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a666847bd714..ba06a08c6497 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2693,6 +2693,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); + SET_DEVICE_OP(dev_ops, query_iboe_speed); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index d808dc3d239e..de762210ebd1 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -194,24 +194,33 @@ static inline enum ib_mtu iboe_get_mtu(int mtu) return 0; } -static inline int iboe_get_rate(struct net_device *dev) +static inline int iboe_get_rate(struct net_device *ndev, + struct ib_device *ibdev, u32 port_num) { struct ethtool_link_ksettings cmd; - int err; + int speed, err; - rtnl_lock(); - err = __ethtool_get_link_ksettings(dev, &cmd); - rtnl_unlock(); - if (err) - return IB_RATE_PORT_CURRENT; + if (ibdev->ops.query_iboe_speed) { + err = ibdev->ops.query_iboe_speed(ibdev, port_num, &speed); + if (err) + return IB_RATE_PORT_CURRENT; + } else { + rtnl_lock(); + err = __ethtool_get_link_ksettings(ndev, &cmd); + rtnl_unlock(); + if (err) + return IB_RATE_PORT_CURRENT; + + speed = cmd.base.speed; + } - if (cmd.base.speed >= 40000) + if (speed >= 40000) return IB_RATE_40_GBPS; - else if (cmd.base.speed >= 30000) + else if (speed >= 30000) return IB_RATE_30_GBPS; - else if (cmd.base.speed >= 20000) + else if (speed >= 20000) return IB_RATE_20_GBPS; - else if (cmd.base.speed >= 10000) + else if (speed >= 10000) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cc2ddd4e6c12..b143258b847f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2678,6 +2678,9 @@ struct ib_device_ops { int (*query_ucontext)(struct ib_ucontext *context, struct uverbs_attr_bundle *attrs); + /* Query driver for IBoE link speed */ + int (*query_iboe_speed)(struct ib_device *device, u32 port_num, + int *speed); /* * Provide NUMA node. This API exists for rdmavt/hfi1 only. * Everyone else relies on Linux memory management model.