diff mbox series

[rdma-next,4/8] IB/core: Skip device which doesn't have necessary capabilities

Message ID 20210405055000.215792-5-leon@kernel.org (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series Generalize if ULP supported check | expand

Commit Message

Leon Romanovsky April 5, 2021, 5:49 a.m. UTC
From: Parav Pandit <parav@nvidia.com>

If device doesn't have multicast capability, avoid client registration
for it. This saves 16Kbytes of memory for a RDMA device consist of 128
ports.

If device doesn't support subnet administration, avoid client
registration for it. This saves 8Kbytes of memory for a RDMA device
consist of 128 ports.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/multicast.c | 15 ++++++++++++++-
 drivers/infiniband/core/sa_query.c  | 15 ++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

Comments

Jason Gunthorpe April 6, 2021, 3:46 p.m. UTC | #1
On Mon, Apr 05, 2021 at 08:49:56AM +0300, Leon Romanovsky wrote:
> @@ -2293,6 +2295,17 @@ static void ib_sa_event(struct ib_event_handler *handler,
>  	}
>  }
>  
> +static bool ib_sa_client_supported(struct ib_device *device)
> +{
> +	unsigned int i;
> +
> +	rdma_for_each_port(device, i) {
> +		if (rdma_cap_ib_sa(device, i))
> +			return true;
> +	}
> +	return false;
> +}

This is already done though:

	for (i = 0; i <= e - s; ++i) {
		spin_lock_init(&sa_dev->port[i].ah_lock);
		if (!rdma_cap_ib_sa(device, i + 1))
			continue;
[..]

	if (!count) {
		ret = -EOPNOTSUPP;
		goto free;

Why does it need to be duplicated? The other patches are all basically
like that too.

The add_one function should return -EOPNOTSUPP if it doesn't want to
run on this device and any supported checks should just be at the
front - this is how things work right now

Jason
Parav Pandit April 7, 2021, 3:06 p.m. UTC | #2
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, April 6, 2021 9:17 PM
> 
> On Mon, Apr 05, 2021 at 08:49:56AM +0300, Leon Romanovsky wrote:
> > @@ -2293,6 +2295,17 @@ static void ib_sa_event(struct ib_event_handler
> *handler,
> >  	}
> >  }
> >
> > +static bool ib_sa_client_supported(struct ib_device *device) {
> > +	unsigned int i;
> > +
> > +	rdma_for_each_port(device, i) {
> > +		if (rdma_cap_ib_sa(device, i))
> > +			return true;
> > +	}
> > +	return false;
> > +}
> 
> This is already done though:
It is but, ib_sa_device() allocates ib_sa_device worth of struct for each port without checking the rdma_cap_ib_sa().
This results into allocating 40 * 512 = 20480 rounded of to power of 2 to 32K bytes of memory for the rdma device with 512 ports.
Other modules are also similarly wasting such memory.

> 
> 	for (i = 0; i <= e - s; ++i) {
> 		spin_lock_init(&sa_dev->port[i].ah_lock);
> 		if (!rdma_cap_ib_sa(device, i + 1))
> 			continue;
> [..]
> 
> 	if (!count) {
> 		ret = -EOPNOTSUPP;
> 		goto free;
> 
> Why does it need to be duplicated? The other patches are all basically like
> that too.
> 
> The add_one function should return -EOPNOTSUPP if it doesn't want to run
> on this device and any supported checks should just be at the front - this is
> how things work right now
> 
I am ok to fold this check at the beginning of add callback.
When 512 to 1K RoCE devices are used, they do not have SA, CM, CMA etc caps on and all the client needs to go through refcnt + xa + sem and unroll them.
Is_supported() routine helps to cut down all of it. I didn't calculate the usec saved with it.

Please let me know.
Jason Gunthorpe April 7, 2021, 3:13 p.m. UTC | #3
On Wed, Apr 07, 2021 at 03:06:35PM +0000, Parav Pandit wrote:
> 
> 
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, April 6, 2021 9:17 PM
> > 
> > On Mon, Apr 05, 2021 at 08:49:56AM +0300, Leon Romanovsky wrote:
> > > @@ -2293,6 +2295,17 @@ static void ib_sa_event(struct ib_event_handler
> > *handler,
> > >  	}
> > >  }
> > >
> > > +static bool ib_sa_client_supported(struct ib_device *device) {
> > > +	unsigned int i;
> > > +
> > > +	rdma_for_each_port(device, i) {
> > > +		if (rdma_cap_ib_sa(device, i))
> > > +			return true;
> > > +	}
> > > +	return false;
> > > +}
> > 
> > This is already done though:

> It is but, ib_sa_device() allocates ib_sa_device worth of struct for
> each port without checking the rdma_cap_ib_sa().  This results into
> allocating 40 * 512 = 20480 rounded of to power of 2 to 32K bytes of
> memory for the rdma device with 512 ports.  Other modules are also
> similarly wasting such memory.

If it returns EOPNOTUPP then the remove is never called so if it
allocated memory and left it allocated then it is leaking memory.

If you are saying 32k bytes of temporary allocation matters during
device startup then it needs benchmarks and a use case.

> > The add_one function should return -EOPNOTSUPP if it doesn't want to run
> > on this device and any supported checks should just be at the front - this is
> > how things work right now

> I am ok to fold this check at the beginning of add callback.  When
> 512 to 1K RoCE devices are used, they do not have SA, CM, CMA etc
> caps on and all the client needs to go through refcnt + xa + sem and
> unroll them.  Is_supported() routine helps to cut down all of it. I
> didn't calculate the usec saved with it.

If that is the reason then explain in the cover letter and provide
benchmarks

Jason
Parav Pandit April 7, 2021, 3:44 p.m. UTC | #4
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, April 7, 2021 8:44 PM
> 
> On Wed, Apr 07, 2021 at 03:06:35PM +0000, Parav Pandit wrote:
> >
> >
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Tuesday, April 6, 2021 9:17 PM
> > >
> > > On Mon, Apr 05, 2021 at 08:49:56AM +0300, Leon Romanovsky wrote:
> > > > @@ -2293,6 +2295,17 @@ static void ib_sa_event(struct
> > > > ib_event_handler
> > > *handler,
> > > >  	}
> > > >  }
> > > >
> > > > +static bool ib_sa_client_supported(struct ib_device *device) {
> > > > +	unsigned int i;
> > > > +
> > > > +	rdma_for_each_port(device, i) {
> > > > +		if (rdma_cap_ib_sa(device, i))
> > > > +			return true;
> > > > +	}
> > > > +	return false;
> > > > +}
> > >
> > > This is already done though:
> 
> > It is but, ib_sa_device() allocates ib_sa_device worth of struct for
> > each port without checking the rdma_cap_ib_sa().  This results into
> > allocating 40 * 512 = 20480 rounded of to power of 2 to 32K bytes of
> > memory for the rdma device with 512 ports.  Other modules are also
> > similarly wasting such memory.
> 
> If it returns EOPNOTUPP then the remove is never called so if it allocated
> memory and left it allocated then it is leaking memory.
> 
I probably confused you. There is no leak today because add_one allocates memory, and later on when SA/CM etc per port cap is not present, it is unused left there which is freed on remove_one().
Returning EOPNOTUPP is fine at start of add_one() before allocation.

> If you are saying 32k bytes of temporary allocation matters during device
> startup then it needs benchmarks and a use case.
> 
Use case is clear and explained in commit logs, i.e. to not allocate the memory which is never used.

> > > The add_one function should return -EOPNOTSUPP if it doesn't want to
> > > run on this device and any supported checks should just be at the
> > > front - this is how things work right now
> 
> > I am ok to fold this check at the beginning of add callback.  When
> > 512 to 1K RoCE devices are used, they do not have SA, CM, CMA etc caps
> > on and all the client needs to go through refcnt + xa + sem and unroll
> > them.  Is_supported() routine helps to cut down all of it. I didn't
> > calculate the usec saved with it.
> 
> If that is the reason then explain in the cover letter and provide benchmarks
I doubt it will be significant but I will do a benchmark.
Jason Gunthorpe April 8, 2021, 12:16 p.m. UTC | #5
On Wed, Apr 07, 2021 at 03:44:35PM +0000, Parav Pandit wrote:

> > If it returns EOPNOTUPP then the remove is never called so if it allocated
> > memory and left it allocated then it is leaking memory.
> > 
> I probably confused you. There is no leak today because add_one
> allocates memory, and later on when SA/CM etc per port cap is not
> present, it is unused left there which is freed on remove_one().
> Returning EOPNOTUPP is fine at start of add_one() before allocation.

Most of ULPs are OK, eg umad does:

	umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL);
	if (!umad_dev)
		return -ENOMEM;
	for (i = s; i <= e; ++i) {
		if (!rdma_cap_ib_mad(device, i))
			continue;

	if (!count) {
		ret = -EOPNOTSUPP;
		goto free;
free:
	/* balances kref_init */
	ib_umad_dev_put(umad_dev);

It looks like only cm.c and cma.c need fixing, just fix those two.

The CM using ULPs have a different issue though..

Jason
Parav Pandit April 9, 2021, 12:31 p.m. UTC | #6
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, April 8, 2021 5:46 PM
> On Wed, Apr 07, 2021 at 03:44:35PM +0000, Parav Pandit wrote:
> 
> > > If it returns EOPNOTUPP then the remove is never called so if it
> > > allocated memory and left it allocated then it is leaking memory.
> > >
> > I probably confused you. There is no leak today because add_one
> > allocates memory, and later on when SA/CM etc per port cap is not
> > present, it is unused left there which is freed on remove_one().
> > Returning EOPNOTUPP is fine at start of add_one() before allocation.
> 
> Most of ULPs are OK, eg umad does:
> 
> 	umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1),
> GFP_KERNEL);
> 	if (!umad_dev)
> 		return -ENOMEM;
> 	for (i = s; i <= e; ++i) {
> 		if (!rdma_cap_ib_mad(device, i))
> 			continue;
> 
> 	if (!count) {
> 		ret = -EOPNOTSUPP;
> 		goto free;
> free:
> 	/* balances kref_init */
> 	ib_umad_dev_put(umad_dev);
> 
> It looks like only cm.c and cma.c need fixing, just fix those two.
Only cma.c needs a fixing. cm.c also reports EOPNOTSUPP.
I will send the simplified fix through Leon.
diff mbox series

Patch

diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index a5dd4b7a74bc..8c81acc24e3e 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -44,11 +44,13 @@ 
 
 static int mcast_add_one(struct ib_device *device);
 static void mcast_remove_one(struct ib_device *device, void *client_data);
+static bool mcast_client_supported(struct ib_device *device);
 
 static struct ib_client mcast_client = {
 	.name   = "ib_multicast",
 	.add    = mcast_add_one,
-	.remove = mcast_remove_one
+	.remove = mcast_remove_one,
+	.is_supported = mcast_client_supported,
 };
 
 static struct ib_sa_client	sa_client;
@@ -816,6 +818,17 @@  static void mcast_event_handler(struct ib_event_handler *handler,
 	}
 }
 
+static bool mcast_client_supported(struct ib_device *device)
+{
+	u32 i;
+
+	rdma_for_each_port(device, i) {
+		if (rdma_cap_ib_mcast(device, i))
+			return true;
+	}
+	return false;
+}
+
 static int mcast_add_one(struct ib_device *device)
 {
 	struct mcast_device *dev;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 9a4a49c37922..7e00e24d9423 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -176,11 +176,13 @@  static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
 
 static int ib_sa_add_one(struct ib_device *device);
 static void ib_sa_remove_one(struct ib_device *device, void *client_data);
+static bool ib_sa_client_supported(struct ib_device *device);
 
 static struct ib_client sa_client = {
 	.name   = "sa",
 	.add    = ib_sa_add_one,
-	.remove = ib_sa_remove_one
+	.remove = ib_sa_remove_one,
+	.is_supported = ib_sa_client_supported,
 };
 
 static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
@@ -2293,6 +2295,17 @@  static void ib_sa_event(struct ib_event_handler *handler,
 	}
 }
 
+static bool ib_sa_client_supported(struct ib_device *device)
+{
+	unsigned int i;
+
+	rdma_for_each_port(device, i) {
+		if (rdma_cap_ib_sa(device, i))
+			return true;
+	}
+	return false;
+}
+
 static int ib_sa_add_one(struct ib_device *device)
 {
 	struct ib_sa_device *sa_dev;