diff mbox series

[rdma-next,1/2] IB/core: Query IBoE link speed with a new driver API

Message ID 67b6ea0621b22b77db4cd637a4f9b48a2f447898.1681132096.git.leon@kernel.org (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series Query IBoE speed directly | expand

Commit Message

Leon Romanovsky April 10, 2023, 1:12 p.m. UTC
From: Mark Zhang <markzhang@nvidia.com>

Currently the ethtool API is used to get IBoE link speed, which must be
protected with the rtnl lock. This becomes a bottleneck when try to setup
many rdma-cm connections at the same time, especially with multiple
processes.

In order to avoid it, a new driver API is introduced to query the IBoE
rate. It will be used firstly, and back to ethtool way if it fails.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/cma.c    |  6 ++++--
 drivers/infiniband/core/device.c |  1 +
 include/rdma/ib_addr.h           | 31 ++++++++++++++++++++-----------
 include/rdma/ib_verbs.h          |  3 +++
 4 files changed, 28 insertions(+), 13 deletions(-)

Comments

Jason Gunthorpe April 10, 2023, 11:23 p.m. UTC | #1
On Mon, Apr 10, 2023 at 04:12:06PM +0300, Leon Romanovsky wrote:
> From: Mark Zhang <markzhang@nvidia.com>
> 
> Currently the ethtool API is used to get IBoE link speed, which must be
> protected with the rtnl lock. This becomes a bottleneck when try to setup
> many rdma-cm connections at the same time, especially with multiple
> processes.
> 
> In order to avoid it, a new driver API is introduced to query the IBoE
> rate. It will be used firstly, and back to ethtool way if it fails.
> 
> Signed-off-by: Mark Zhang <markzhang@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/infiniband/core/cma.c    |  6 ++++--
>  drivers/infiniband/core/device.c |  1 +
>  include/rdma/ib_addr.h           | 31 ++++++++++++++++++++-----------
>  include/rdma/ib_verbs.h          |  3 +++
>  4 files changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> index 9c7d26a7d243..ff706d2e39c6 100644
> --- a/drivers/infiniband/core/cma.c
> +++ b/drivers/infiniband/core/cma.c
> @@ -3296,7 +3296,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
>  	route->path_rec->traffic_class = tos;
>  	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
>  	route->path_rec->rate_selector = IB_SA_EQ;
> -	route->path_rec->rate = iboe_get_rate(ndev);
> +	route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device,
> +					      id_priv->id.port_num);
>  	dev_put(ndev);
>  	route->path_rec->packet_life_time_selector = IB_SA_EQ;
>  	/* In case ACK timeout is set, use this value to calculate
> @@ -4962,7 +4963,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
>  	if (!ndev)
>  		return -ENODEV;
>  
> -	ib.rec.rate = iboe_get_rate(ndev);
> +	ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device,
> +				    id_priv->id.port_num);
>  	ib.rec.hop_limit = 1;
>  	ib.rec.mtu = iboe_get_mtu(ndev->mtu);

What do we even use rate for in roce mode in the first place?

Jason
Mark Zhang April 10, 2023, 11:42 p.m. UTC | #2
On 4/11/2023 7:23 AM, Jason Gunthorpe wrote:
> On Mon, Apr 10, 2023 at 04:12:06PM +0300, Leon Romanovsky wrote:
>> From: Mark Zhang <markzhang@nvidia.com>
>>
>> Currently the ethtool API is used to get IBoE link speed, which must be
>> protected with the rtnl lock. This becomes a bottleneck when try to setup
>> many rdma-cm connections at the same time, especially with multiple
>> processes.
>>
>> In order to avoid it, a new driver API is introduced to query the IBoE
>> rate. It will be used firstly, and back to ethtool way if it fails.
>>
>> Signed-off-by: Mark Zhang <markzhang@nvidia.com>
>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>> ---
>>   drivers/infiniband/core/cma.c    |  6 ++++--
>>   drivers/infiniband/core/device.c |  1 +
>>   include/rdma/ib_addr.h           | 31 ++++++++++++++++++++-----------
>>   include/rdma/ib_verbs.h          |  3 +++
>>   4 files changed, 28 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
>> index 9c7d26a7d243..ff706d2e39c6 100644
>> --- a/drivers/infiniband/core/cma.c
>> +++ b/drivers/infiniband/core/cma.c
>> @@ -3296,7 +3296,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
>>   	route->path_rec->traffic_class = tos;
>>   	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
>>   	route->path_rec->rate_selector = IB_SA_EQ;
>> -	route->path_rec->rate = iboe_get_rate(ndev);
>> +	route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device,
>> +					      id_priv->id.port_num);
>>   	dev_put(ndev);
>>   	route->path_rec->packet_life_time_selector = IB_SA_EQ;
>>   	/* In case ACK timeout is set, use this value to calculate
>> @@ -4962,7 +4963,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
>>   	if (!ndev)
>>   		return -ENODEV;
>>   
>> -	ib.rec.rate = iboe_get_rate(ndev);
>> +	ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device,
>> +				    id_priv->id.port_num);
>>   	ib.rec.hop_limit = 1;
>>   	ib.rec.mtu = iboe_get_mtu(ndev->mtu);
> 
> What do we even use rate for in roce mode in the first place?
> 
In mlx5 it is set to "address_path.stat_rate", but I believe we always 
set 0 for roce actually. Not sure about other devices?
Jason Gunthorpe April 11, 2023, 11:20 a.m. UTC | #3
On Tue, Apr 11, 2023 at 07:42:37AM +0800, Mark Zhang wrote:
> On 4/11/2023 7:23 AM, Jason Gunthorpe wrote:
> > On Mon, Apr 10, 2023 at 04:12:06PM +0300, Leon Romanovsky wrote:
> > > From: Mark Zhang <markzhang@nvidia.com>
> > > 
> > > Currently the ethtool API is used to get IBoE link speed, which must be
> > > protected with the rtnl lock. This becomes a bottleneck when try to setup
> > > many rdma-cm connections at the same time, especially with multiple
> > > processes.
> > > 
> > > In order to avoid it, a new driver API is introduced to query the IBoE
> > > rate. It will be used firstly, and back to ethtool way if it fails.
> > > 
> > > Signed-off-by: Mark Zhang <markzhang@nvidia.com>
> > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > ---
> > >   drivers/infiniband/core/cma.c    |  6 ++++--
> > >   drivers/infiniband/core/device.c |  1 +
> > >   include/rdma/ib_addr.h           | 31 ++++++++++++++++++++-----------
> > >   include/rdma/ib_verbs.h          |  3 +++
> > >   4 files changed, 28 insertions(+), 13 deletions(-)
> > > 
> > > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> > > index 9c7d26a7d243..ff706d2e39c6 100644
> > > --- a/drivers/infiniband/core/cma.c
> > > +++ b/drivers/infiniband/core/cma.c
> > > @@ -3296,7 +3296,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
> > >   	route->path_rec->traffic_class = tos;
> > >   	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
> > >   	route->path_rec->rate_selector = IB_SA_EQ;
> > > -	route->path_rec->rate = iboe_get_rate(ndev);
> > > +	route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device,
> > > +					      id_priv->id.port_num);
> > >   	dev_put(ndev);
> > >   	route->path_rec->packet_life_time_selector = IB_SA_EQ;
> > >   	/* In case ACK timeout is set, use this value to calculate
> > > @@ -4962,7 +4963,8 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
> > >   	if (!ndev)
> > >   		return -ENODEV;
> > > -	ib.rec.rate = iboe_get_rate(ndev);
> > > +	ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device,
> > > +				    id_priv->id.port_num);
> > >   	ib.rec.hop_limit = 1;
> > >   	ib.rec.mtu = iboe_get_mtu(ndev->mtu);
> > 
> > What do we even use rate for in roce mode in the first place?
> > 
> In mlx5 it is set to "address_path.stat_rate", but I believe we always set 0
> for roce actually. Not sure about other devices?

"rate" is to reduce the packet rate of connections, it should always
be 0 for roce, AFAIK.

Maybe we should look into making that the case instead of this?

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9c7d26a7d243..ff706d2e39c6 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -3296,7 +3296,8 @@  static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 	route->path_rec->traffic_class = tos;
 	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
 	route->path_rec->rate_selector = IB_SA_EQ;
-	route->path_rec->rate = iboe_get_rate(ndev);
+	route->path_rec->rate = iboe_get_rate(ndev, id_priv->id.device,
+					      id_priv->id.port_num);
 	dev_put(ndev);
 	route->path_rec->packet_life_time_selector = IB_SA_EQ;
 	/* In case ACK timeout is set, use this value to calculate
@@ -4962,7 +4963,8 @@  static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
 	if (!ndev)
 		return -ENODEV;
 
-	ib.rec.rate = iboe_get_rate(ndev);
+	ib.rec.rate = iboe_get_rate(ndev, id_priv->id.device,
+				    id_priv->id.port_num);
 	ib.rec.hop_limit = 1;
 	ib.rec.mtu = iboe_get_mtu(ndev->mtu);
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a666847bd714..ba06a08c6497 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2693,6 +2693,7 @@  void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, query_ah);
 	SET_DEVICE_OP(dev_ops, query_device);
 	SET_DEVICE_OP(dev_ops, query_gid);
+	SET_DEVICE_OP(dev_ops, query_iboe_speed);
 	SET_DEVICE_OP(dev_ops, query_pkey);
 	SET_DEVICE_OP(dev_ops, query_port);
 	SET_DEVICE_OP(dev_ops, query_qp);
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index d808dc3d239e..de762210ebd1 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -194,24 +194,33 @@  static inline enum ib_mtu iboe_get_mtu(int mtu)
 		return 0;
 }
 
-static inline int iboe_get_rate(struct net_device *dev)
+static inline int iboe_get_rate(struct net_device *ndev,
+				struct ib_device *ibdev, u32 port_num)
 {
 	struct ethtool_link_ksettings cmd;
-	int err;
+	int speed, err;
 
-	rtnl_lock();
-	err = __ethtool_get_link_ksettings(dev, &cmd);
-	rtnl_unlock();
-	if (err)
-		return IB_RATE_PORT_CURRENT;
+	if (ibdev->ops.query_iboe_speed) {
+		err = ibdev->ops.query_iboe_speed(ibdev, port_num, &speed);
+		if (err)
+			return IB_RATE_PORT_CURRENT;
+	} else {
+		rtnl_lock();
+		err = __ethtool_get_link_ksettings(ndev, &cmd);
+		rtnl_unlock();
+		if (err)
+			return IB_RATE_PORT_CURRENT;
+
+		speed = cmd.base.speed;
+	}
 
-	if (cmd.base.speed >= 40000)
+	if (speed >= 40000)
 		return IB_RATE_40_GBPS;
-	else if (cmd.base.speed >= 30000)
+	else if (speed >= 30000)
 		return IB_RATE_30_GBPS;
-	else if (cmd.base.speed >= 20000)
+	else if (speed >= 20000)
 		return IB_RATE_20_GBPS;
-	else if (cmd.base.speed >= 10000)
+	else if (speed >= 10000)
 		return IB_RATE_10_GBPS;
 	else
 		return IB_RATE_PORT_CURRENT;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index cc2ddd4e6c12..b143258b847f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2678,6 +2678,9 @@  struct ib_device_ops {
 	int (*query_ucontext)(struct ib_ucontext *context,
 			      struct uverbs_attr_bundle *attrs);
 
+	/* Query driver for IBoE link speed */
+	int (*query_iboe_speed)(struct ib_device *device, u32 port_num,
+				int *speed);
 	/*
 	 * Provide NUMA node. This API exists for rdmavt/hfi1 only.
 	 * Everyone else relies on Linux memory management model.