diff mbox

[PATCHv12,1/3] rdmacg: Added rdma cgroup controller

Message ID 1472632647-1525-2-git-send-email-pandit.parav@gmail.com (mailing list archive)
State Superseded
Headers show

Commit Message

Parav Pandit Aug. 31, 2016, 8:37 a.m. UTC
Added rdma cgroup controller that does accounting, limit enforcement
on rdma/IB verbs and hw resources.

Added rdma cgroup header file which defines its APIs to perform
charing/uncharing functionality. It also defined APIs for RDMA/IB
stack for device registration. Devices which are registered will
participate in controller functions of accounting and limit
enforcements. It define rdmacg_device structure to bind IB stack
and RDMA cgroup controller.

RDMA resources are tracked using resource pool. Resource pool is per
device, per cgroup entity which allows setting up accounting limits
on per device basis.

Currently resources are defined by the RDMA cgroup.

Resource pool is created/destroyed dynamically whenever
charging/uncharging occurs respectively and whenever user
configuration is done. Its a tradeoff of memory vs little more code
space that creates resource pool object whenever necessary, instead of
creating them during cgroup creation and device registration time.

Signed-off-by: Parav Pandit <pandit.parav@gmail.com>
---
 include/linux/cgroup_rdma.h   |  66 +++++
 include/linux/cgroup_subsys.h |   4 +
 init/Kconfig                  |  10 +
 kernel/Makefile               |   1 +
 kernel/cgroup_rdma.c          | 664 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 745 insertions(+)
 create mode 100644 include/linux/cgroup_rdma.h
 create mode 100644 kernel/cgroup_rdma.c

Comments

Leon Romanovsky Aug. 31, 2016, 9:38 a.m. UTC | #1
On Wed, Aug 31, 2016 at 02:07:25PM +0530, Parav Pandit wrote:
> Added rdma cgroup controller that does accounting, limit enforcement
> on rdma/IB verbs and hw resources.
>
> Added rdma cgroup header file which defines its APIs to perform
> charing/uncharing functionality. It also defined APIs for RDMA/IB
> stack for device registration. Devices which are registered will
> participate in controller functions of accounting and limit
> enforcements. It define rdmacg_device structure to bind IB stack
> and RDMA cgroup controller.
>
> RDMA resources are tracked using resource pool. Resource pool is per
> device, per cgroup entity which allows setting up accounting limits
> on per device basis.
>
> Currently resources are defined by the RDMA cgroup.
>
> Resource pool is created/destroyed dynamically whenever
> charging/uncharging occurs respectively and whenever user
> configuration is done. Its a tradeoff of memory vs little more code
> space that creates resource pool object whenever necessary, instead of
> creating them during cgroup creation and device registration time.
>
> Signed-off-by: Parav Pandit <pandit.parav@gmail.com>
> ---

<...>

> +
> +static struct rdmacg_resource_pool *
> +get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
> +{
> +	struct rdmacg_resource_pool *rpool;
> +
> +	rpool = find_cg_rpool_locked(cg, device);
> +	if (rpool)
> +		return rpool;
> +
> +	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
> +	if (!rpool)
> +		return ERR_PTR(-ENOMEM);
> +
> +	rpool->device = device;
> +	set_all_resource_max_limit(rpool);
> +
> +	INIT_LIST_HEAD(&rpool->cg_node);
> +	INIT_LIST_HEAD(&rpool->dev_node);
> +	list_add_tail(&rpool->cg_node, &cg->rpools);
> +	list_add_tail(&rpool->dev_node, &device->rpools);
> +	return rpool;
> +}

<...>

> +	for (p = cg; p; p = parent_rdmacg(p)) {
> +		rpool = get_cg_rpool_locked(p, device);
> +		if (IS_ERR_OR_NULL(rpool)) {

get_cg_rpool_locked always returns !NULL (error, or pointer)

> +			ret = PTR_ERR(rpool);
> +			goto err;

I didn't review the whole series yet.
Matan Barak Aug. 31, 2016, 3:07 p.m. UTC | #2
On 31/08/2016 11:37, Parav Pandit wrote:
> Added rdma cgroup controller that does accounting, limit enforcement
> on rdma/IB verbs and hw resources.
>
> Added rdma cgroup header file which defines its APIs to perform
> charing/uncharing functionality. It also defined APIs for RDMA/IB
> stack for device registration. Devices which are registered will
> participate in controller functions of accounting and limit
> enforcements. It define rdmacg_device structure to bind IB stack
> and RDMA cgroup controller.
>
> RDMA resources are tracked using resource pool. Resource pool is per
> device, per cgroup entity which allows setting up accounting limits
> on per device basis.
>
> Currently resources are defined by the RDMA cgroup.
>
> Resource pool is created/destroyed dynamically whenever
> charging/uncharging occurs respectively and whenever user
> configuration is done. Its a tradeoff of memory vs little more code
> space that creates resource pool object whenever necessary, instead of
> creating them during cgroup creation and device registration time.
>
> Signed-off-by: Parav Pandit <pandit.parav@gmail.com>
> ---
>  include/linux/cgroup_rdma.h   |  66 +++++
>  include/linux/cgroup_subsys.h |   4 +
>  init/Kconfig                  |  10 +
>  kernel/Makefile               |   1 +
>  kernel/cgroup_rdma.c          | 664 ++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 745 insertions(+)
>  create mode 100644 include/linux/cgroup_rdma.h
>  create mode 100644 kernel/cgroup_rdma.c
>
> diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
> new file mode 100644
> index 0000000..6710e28
> --- /dev/null
> +++ b/include/linux/cgroup_rdma.h
> @@ -0,0 +1,66 @@
> +/*
> + * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
> + *
> + * This file is subject to the terms and conditions of version 2 of the GNU
> + * General Public License. See the file COPYING in the main directory of the
> + * Linux distribution for more details.
> + */
> +
> +#ifndef _CGROUP_RDMA_H
> +#define _CGROUP_RDMA_H
> +
> +#include <linux/cgroup.h>
> +
> +enum rdmacg_resource_type {
> +	RDMACG_VERB_RESOURCE_UCTX,
> +	RDMACG_VERB_RESOURCE_AH,
> +	RDMACG_VERB_RESOURCE_PD,
> +	RDMACG_VERB_RESOURCE_CQ,
> +	RDMACG_VERB_RESOURCE_MR,
> +	RDMACG_VERB_RESOURCE_MW,
> +	RDMACG_VERB_RESOURCE_SRQ,
> +	RDMACG_VERB_RESOURCE_QP,
> +	RDMACG_VERB_RESOURCE_FLOW,
> +	/*
> +	 * add any hw specific resource here as RDMA_HW_RESOURCE_NAME
> +	 */
> +	RDMACG_RESOURCE_MAX,
> +};
> +
> +#ifdef CONFIG_CGROUP_RDMA
> +

Currently, there are some discussions regarding the RDMA ABI. The 
current proposed approach (after a lot of discussions in the OFVWG) is 
to have driver dependent object types rather than the fixed set of IB 
object types we have today.
AFAIK, some vendors might want to use the RDMA subsystem for a different 
fabrics which has a different set of objects.
You could see RFCs for such concepts both from Mellanox and Intel on the 
linux-rdma mailing list.

Saying that, maybe we need to make the resource types a bit more 
flexible and dynamic.

Regards,
Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo Aug. 31, 2016, 9:16 p.m. UTC | #3
Hello,

On Wed, Aug 31, 2016 at 06:07:30PM +0300, Matan Barak wrote:
> Currently, there are some discussions regarding the RDMA ABI. The current
> proposed approach (after a lot of discussions in the OFVWG) is to have
> driver dependent object types rather than the fixed set of IB object types
> we have today.
> AFAIK, some vendors might want to use the RDMA subsystem for a different
> fabrics which has a different set of objects.
> You could see RFCs for such concepts both from Mellanox and Intel on the
> linux-rdma mailing list.
> 
> Saying that, maybe we need to make the resource types a bit more flexible
> and dynamic.

That'd be back to square one and Christoph was dead against it too,
so...

Thanks.
Matan Barak Sept. 1, 2016, 7:25 a.m. UTC | #4
On 01/09/2016 00:16, Tejun Heo wrote:
> Hello,
>
> On Wed, Aug 31, 2016 at 06:07:30PM +0300, Matan Barak wrote:
>> Currently, there are some discussions regarding the RDMA ABI. The current
>> proposed approach (after a lot of discussions in the OFVWG) is to have
>> driver dependent object types rather than the fixed set of IB object types
>> we have today.
>> AFAIK, some vendors might want to use the RDMA subsystem for a different
>> fabrics which has a different set of objects.
>> You could see RFCs for such concepts both from Mellanox and Intel on the
>> linux-rdma mailing list.
>>
>> Saying that, maybe we need to make the resource types a bit more flexible
>> and dynamic.
>
> That'd be back to square one and Christoph was dead against it too,
> so...
>

Well, if I recall, the reason doing so last time was in order to allow 
flexible updating of ib_core independently, which is obviously not a 
good reason (to say the least).

Since the new ABI will probably define new object types (all recent 
proposals go this way), the current approach could lead to either trying 
to map new objects to existing cgroup resource types, which could lead 
to some weird non 1:1 mapping, or having a split definitions - such that 
each driver will declare its objects both in the cgroups mechanism and 
in its driver dispatch table.
Even worse than that, drivers could simply ignore the cgroups support 
while implementing their own resource types and get a very broken 
containers support.


> Thanks.
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Sept. 1, 2016, 8:44 a.m. UTC | #5
On Thu, Sep 01, 2016 at 10:25:40AM +0300, Matan Barak wrote:
> Well, if I recall, the reason doing so last time was in order to allow 
> flexible updating of ib_core independently, which is obviously not a good 
> reason (to say the least).
>
> Since the new ABI will probably define new object types (all recent 
> proposals go this way), the current approach could lead to either trying to 
> map new objects to existing cgroup resource types, which could lead to some 
> weird non 1:1 mapping, or having a split definitions - such that each 
> driver will declare its objects both in the cgroups mechanism and in its 
> driver dispatch table.
> Even worse than that, drivers could simply ignore the cgroups support while 
> implementing their own resource types and get a very broken containers 
> support.

Sorry guys, but arbitrary extensibility for something not finished is the
worst idea ever.  We have two options here:

 a) delay cgroups support until the grand rewrite is done
 b) add it now and deal with the consequences later

That being said, adding random non-Verbs hardwasre to the RDMA core is
the second worst idea ever.  Guess I need to catch up with the
discussion and start using the flame thrower.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Sept. 7, 2016, 7:55 a.m. UTC | #6
Hi Matan,

On Thu, Sep 1, 2016 at 2:14 PM, Christoph Hellwig <hch@lst.de> wrote:
> On Thu, Sep 01, 2016 at 10:25:40AM +0300, Matan Barak wrote:
>> Well, if I recall, the reason doing so last time was in order to allow
>> flexible updating of ib_core independently, which is obviously not a good
>> reason (to say the least).
>>
>> Since the new ABI will probably define new object types (all recent
>> proposals go this way), the current approach could lead to either trying to
>> map new objects to existing cgroup resource types, which could lead to some
>> weird non 1:1 mapping, or having a split definitions - such that each
>> driver will declare its objects both in the cgroups mechanism and in its
>> driver dispatch table.

>> Even worse than that, drivers could simply ignore the cgroups support while
>> implementing their own resource types and get a very broken containers
>> support.
If drivers are broken due to ignorance of not-calling cgroup APIs,
that should be ok.
That particular driver should fix it.
If the resource creation using uverbs is using well defined rdma level
resource, uverbs level will make sure to honor cgroup limits without
involving hw drivers anyway.

RDMA Verb level resource is charged/uncharged by RDMA core.
RDMA HW level resource is charged/uncharged by RDMA HW driver using
well defined API and resource by cgroup core.
This scheme ensures that there is 1:1 mapping.

I don't think current definition of resource type is carved out on stone.
They can be extended as we forward.
As many of us agree that, they should be well defined and it should be
agreed by cgroup and rdma community.

For example, today we have RDMA_VERB_xxx resources.
New well defined RDMA HW resources can be defined in rdma_cgroup.h
file as RDMA_HW_xx in same enum table.

>
> Sorry guys, but arbitrary extensibility for something not finished is the
> worst idea ever.  We have two options here:
>
>  a) delay cgroups support until the grand rewrite is done
>  b) add it now and deal with the consequences later
>
Can we do (b) now and differ adding any HW resources to cgroup until
they are clearly called out.
Architecture and APIs are already in place to support this.

> That being said, adding random non-Verbs hardwasre to the RDMA core is
> the second worst idea ever.

We can differ adding HW resource to core and cgroup until they are
clearly defined.
In that case current architecture still holds good.

> Guess I need to catch up with the
> discussion and start using the flame thrower.

Matan,
Can you please point us to the new RFC/ABI email thread which
describes the design, partitioning of code between core vs hw drivers
etc.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak Sept. 7, 2016, 8:51 a.m. UTC | #7
On 07/09/2016 10:55, Parav Pandit wrote:
> Hi Matan,
>
> On Thu, Sep 1, 2016 at 2:14 PM, Christoph Hellwig <hch@lst.de> wrote:
>> On Thu, Sep 01, 2016 at 10:25:40AM +0300, Matan Barak wrote:
>>> Well, if I recall, the reason doing so last time was in order to allow
>>> flexible updating of ib_core independently, which is obviously not a good
>>> reason (to say the least).
>>>
>>> Since the new ABI will probably define new object types (all recent
>>> proposals go this way), the current approach could lead to either trying to
>>> map new objects to existing cgroup resource types, which could lead to some
>>> weird non 1:1 mapping, or having a split definitions - such that each
>>> driver will declare its objects both in the cgroups mechanism and in its
>>> driver dispatch table.
>
>>> Even worse than that, drivers could simply ignore the cgroups support while
>>> implementing their own resource types and get a very broken containers
>>> support.
> If drivers are broken due to ignorance of not-calling cgroup APIs,
> that should be ok.
> That particular driver should fix it.
> If the resource creation using uverbs is using well defined rdma level
> resource, uverbs level will make sure to honor cgroup limits without
> involving hw drivers anyway.
>

All recent proposals of the new ABI schema deals with extending the 
flexibility of the current schema by letting drivers define their 
specific types, actions, attributes, etc. Even more than that, the 
dispatching starts from the driver and it chooses if it wants to use the 
common RDMA core layer or have it's own wise implementation instead.
Some drivers might even prefer not to implement the current verbs types.
These decisions were made in the OFVWG meetings.

Anyway, maybe we should consider using a more higher-level logic objects 
that could fit multiple drivers requirements.

> RDMA Verb level resource is charged/uncharged by RDMA core.
> RDMA HW level resource is charged/uncharged by RDMA HW driver using
> well defined API and resource by cgroup core.
> This scheme ensures that there is 1:1 mapping.
>

Sounds reasonable, but what about drivers which ignore the common code 
and implement it in their own way? What about drivers which don't 
support the standard RDMA types at all?
I guess we should find a balance between something abstract and common 
enough that will ease administrator configuration but be specific enough 
for the various models we have (or will have) in the RDMA stack.

> I don't think current definition of resource type is carved out on stone.
> They can be extended as we forward.
> As many of us agree that, they should be well defined and it should be
> agreed by cgroup and rdma community.
>

Of course, but since the ABI and cgroups model are somehow related, they 
should be dealt with together and approved by Doug who participated in 
some of the OFVWG meetings.

> For example, today we have RDMA_VERB_xxx resources.
> New well defined RDMA HW resources can be defined in rdma_cgroup.h
> file as RDMA_HW_xx in same enum table.
>

So a driver will change the cgroups file for every new type it adds?
Will we just have a super set enum of all crazy types vendors added?

>>
>> Sorry guys, but arbitrary extensibility for something not finished is the
>> worst idea ever.  We have two options here:
>>
>>  a) delay cgroups support until the grand rewrite is done
>>  b) add it now and deal with the consequences later
>>
> Can we do (b) now and differ adding any HW resources to cgroup until
> they are clearly called out.
> Architecture and APIs are already in place to support this.
>

Since this affect the user, it's better to think how it fits our 
"optional standard"/"vendor types" model first. Maybe we could force all 
standard types even if the driver we use doesn't support any of them.

>> That being said, adding random non-Verbs hardwasre to the RDMA core is
>> the second worst idea ever.
>
> We can differ adding HW resource to core and cgroup until they are
> clearly defined.
> In that case current architecture still holds good.
>

Clearly we should differ adding the actual code until a driver could 
declare such objects, but we need to decide how to expose the standard 
optional RDMA types (basically, the types you've added) and how future 
driver specific types fit in.

>> Guess I need to catch up with the
>> discussion and start using the flame thrower.
>
> Matan,
> Can you please point us to the new RFC/ABI email thread which
> describes the design, partitioning of code between core vs hw drivers
> etc.
>

One proposal is [1]. There's another one from Sean which aims for 
similar targets regards the driver specific types.

[1] https://www.spinics.net/lists/linux-rdma/msg38997.html

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Sept. 7, 2016, 2:54 p.m. UTC | #8
On Wed, Sep 7, 2016 at 2:21 PM, Matan Barak <matanb@mellanox.com> wrote:
> On 07/09/2016 10:55, Parav Pandit wrote:
>>
>> Hi Matan,
>>
>> On Thu, Sep 1, 2016 at 2:14 PM, Christoph Hellwig <hch@lst.de> wrote:
>>>
>>> On Thu, Sep 01, 2016 at 10:25:40AM +0300, Matan Barak wrote:
>>>>
>>>> Well, if I recall, the reason doing so last time was in order to allow
>>>> flexible updating of ib_core independently, which is obviously not a
>>>> good
>>>> reason (to say the least).
>>>>
>>>> Since the new ABI will probably define new object types (all recent
>>>> proposals go this way), the current approach could lead to either trying
>>>> to
>>>> map new objects to existing cgroup resource types, which could lead to
>>>> some
>>>> weird non 1:1 mapping, or having a split definitions - such that each
>>>> driver will declare its objects both in the cgroups mechanism and in its
>>>> driver dispatch table.
>>
>>
>>>> Even worse than that, drivers could simply ignore the cgroups support
>>>> while
>>>> implementing their own resource types and get a very broken containers
>>>> support.
>>
>> If drivers are broken due to ignorance of not-calling cgroup APIs,
>> that should be ok.
>> That particular driver should fix it.
>> If the resource creation using uverbs is using well defined rdma level
>> resource, uverbs level will make sure to honor cgroup limits without
>> involving hw drivers anyway.
>>
>
> All recent proposals of the new ABI schema deals with extending the
> flexibility of the current schema by letting drivers define their specific
> types, actions, attributes, etc. Even more than that, the dispatching starts
> from the driver and it chooses if it wants to use the common RDMA core layer
> or have it's own wise implementation instead.
> Some drivers might even prefer not to implement the current verbs types.
> These decisions were made in the OFVWG meetings.

o.k. If some drivers doesn't implement current verbs type, there is no
point in controlling it either.
In such case application space library won't even invoke resource
allocation/free for unsupported resources.
For resources (type in your word) which are not defined in cgroup, but
exist in hw driver, cannot be controlled by cgroup.
As you highlighted in your [1], "driver's specific attributes could
someday become core's standard attributes", we should be able to add
new resource type to existing rdma cgroup.

>
> Anyway, maybe we should consider using a more higher-level logic objects
> that could fit multiple drivers requirements.
I do not have any other objects other than QP, MR etc in mind currently.
I can think of a RDMA specific socket that encompass one PD, AH,
couple of MRs, and QP.
But we don't have such resource abstraction and its data transport
APIs in place.
There is rsocket, various versions of MPI, libfabric etc in place.
So I am not sure which high level objects to defined at this point.
If you have such objects nailed down, we should be able to add them in
incremental development.

>
>> RDMA Verb level resource is charged/uncharged by RDMA core.
>> RDMA HW level resource is charged/uncharged by RDMA HW driver using
>> well defined API and resource by cgroup core.
>> This scheme ensures that there is 1:1 mapping.
>>
>
> Sounds reasonable, but what about drivers which ignore the common code and
> implement it in their own way?
Shouldn't Doug ask them to use cgroup infrastructure instead of
implementing resource charging/uncharing in their own way.
It still unlikely or difficult for drivers to group processes them
selves like cgroup to implement things in their own way.
I agree, they can completely ignore RDMA verbs resources and implement
their own HW resources.

As you pointed below, we need to find balance between which hw
resource to be defined and which not.
For example, newly added XRQ object, which could be a pure buffer_tag
with receive queue for other vendor. I am not sure how to abstract
them into single object.



> What about drivers which don't support the
> standard RDMA types at all?
> I guess we should find a balance between something abstract and common
> enough that will ease administrator configuration but be specific enough for
> the various models we have (or will have) in the RDMA stack.
>
>> I don't think current definition of resource type is carved out on stone.
>> They can be extended as we forward.
>> As many of us agree that, they should be well defined and it should be
>> agreed by cgroup and rdma community.
>>
>
> Of course, but since the ABI and cgroups model are somehow related, they
> should be dealt with together and approved by Doug who participated in some
> of the OFVWG meetings.
Sure.
>
>> For example, today we have RDMA_VERB_xxx resources.
>> New well defined RDMA HW resources can be defined in rdma_cgroup.h
>> file as RDMA_HW_xx in same enum table.
>>
>
> So a driver will change the cgroups file for every new type it adds?
Well, we wanted to avoid that such churn in cgroup file, thats why v11
was defining resources in IB core. But I guess that was not
acceptable. I had NAK from Christoph and Tejun on that idea.

> Will we just have a super set enum of all crazy types vendors added?
As you said, we need to find balance. I frankly don't know how to do
so. There has to be some reasonable number of types. As we go along
Doug, Tejun and others should approve adding such.
If I am not wrong in past one year, may be two more resource types got
added? XRQ, state-less Queues?

>
>>>
>>> Sorry guys, but arbitrary extensibility for something not finished is the
>>> worst idea ever.  We have two options here:
>>>
>>>  a) delay cgroups support until the grand rewrite is done
>>>  b) add it now and deal with the consequences later
>>>
>> Can we do (b) now and differ adding any HW resources to cgroup until
>> they are clearly called out.
>> Architecture and APIs are already in place to support this.
>>
>
> Since this affect the user, it's better to think how it fits our "optional
> standard"/"vendor types" model first. Maybe we could force all standard
> types even if the driver we use doesn't support any of them.

If vendor doesn't support a given type, user won't allocate it. So its
just don't care condition.
I dont see a need to force standard types either.

>
>>> That being said, adding random non-Verbs hardwasre to the RDMA core is
>>> the second worst idea ever.
>>
>>
>> We can differ adding HW resource to core and cgroup until they are
>> clearly defined.
>> In that case current architecture still holds good.
>>
>
> Clearly we should differ adding the actual code until a driver could declare
> such objects, but we need to decide how to expose the standard optional RDMA
> types (basically, the types you've added) and how future driver specific
> types fit in.
>

o.k. Few more handful of driver specific types should be ok to add in cgroup.
I will let others speak up if thats not acceptable. Current code
already documents and provide infrastructure for that.

>>
>>
>> Matan,
>> Can you please point us to the new RFC/ABI email thread which
>> describes the design, partitioning of code between core vs hw drivers
>> etc.
>>
>
> One proposal is [1]. There's another one from Sean which aims for similar
> targets regards the driver specific types.
>
> [1] https://www.spinics.net/lists/linux-rdma/msg38997.html
>
I looked at the RFC briefly. I can see that old infrastructure (a) and
(b) is not going away.
So It should be ok. to charge/uncharge those standard resources from
those hooks.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Sept. 7, 2016, 3:07 p.m. UTC | #9
Hi Leon,

>> Signed-off-by: Parav Pandit <pandit.parav@gmail.com>
>> +static struct rdmacg_resource_pool *
>> +get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
>> +{
>> +     struct rdmacg_resource_pool *rpool;
>> +
>> +     rpool = find_cg_rpool_locked(cg, device);
>> +     if (rpool)
>> +             return rpool;
>> +
>> +     rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
>> +     if (!rpool)
>> +             return ERR_PTR(-ENOMEM);
>> +
>> +     rpool->device = device;
>> +     set_all_resource_max_limit(rpool);
>> +
>> +     INIT_LIST_HEAD(&rpool->cg_node);
>> +     INIT_LIST_HEAD(&rpool->dev_node);
>> +     list_add_tail(&rpool->cg_node, &cg->rpools);
>> +     list_add_tail(&rpool->dev_node, &device->rpools);
>> +     return rpool;
>> +}
>
> <...>
>
>> +     for (p = cg; p; p = parent_rdmacg(p)) {
>> +             rpool = get_cg_rpool_locked(p, device);
>> +             if (IS_ERR_OR_NULL(rpool)) {
>
> get_cg_rpool_locked always returns !NULL (error, or pointer)

Can this change go as incremental change after this patch, since this
patch is close to completion?
Or I need to revise v13?

>
>> +                     ret = PTR_ERR(rpool);
>> +                     goto err;
>
> I didn't review the whole series yet.

Did you get a chance to review the series?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Sept. 8, 2016, 6:12 a.m. UTC | #10
On Wed, Sep 07, 2016 at 08:37:23PM +0530, Parav Pandit wrote:
> Hi Leon,
>
> >> Signed-off-by: Parav Pandit <pandit.parav@gmail.com>
> >> +static struct rdmacg_resource_pool *
> >> +get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
> >> +{
> >> +     struct rdmacg_resource_pool *rpool;
> >> +
> >> +     rpool = find_cg_rpool_locked(cg, device);
> >> +     if (rpool)
> >> +             return rpool;
> >> +
> >> +     rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
> >> +     if (!rpool)
> >> +             return ERR_PTR(-ENOMEM);
> >> +
> >> +     rpool->device = device;
> >> +     set_all_resource_max_limit(rpool);
> >> +
> >> +     INIT_LIST_HEAD(&rpool->cg_node);
> >> +     INIT_LIST_HEAD(&rpool->dev_node);
> >> +     list_add_tail(&rpool->cg_node, &cg->rpools);
> >> +     list_add_tail(&rpool->dev_node, &device->rpools);
> >> +     return rpool;
> >> +}
> >
> > <...>
> >
> >> +     for (p = cg; p; p = parent_rdmacg(p)) {
> >> +             rpool = get_cg_rpool_locked(p, device);
> >> +             if (IS_ERR_OR_NULL(rpool)) {
> >
> > get_cg_rpool_locked always returns !NULL (error, or pointer)
>
> Can this change go as incremental change after this patch, since this
> patch is close to completion?
> Or I need to revise v13?

Sure, it is cleanup. It is not worth of respinning.

>
> >
> >> +                     ret = PTR_ERR(rpool);
> >> +                     goto err;
> >
> > I didn't review the whole series yet.
>
> Did you get a chance to review the series?

We need to decide on fundamental question before reviewing it, which is
"how to fit rdmacg to new ABI model".

Thanks
Parav Pandit Sept. 8, 2016, 10:20 a.m. UTC | #11
On Thu, Sep 8, 2016 at 11:42 AM, Leon Romanovsky <leon@kernel.org> wrote:
> On Wed, Sep 07, 2016 at 08:37:23PM +0530, Parav Pandit wrote:
>> Did you get a chance to review the series?
>
> We need to decide on fundamental question before reviewing it, which is
> "how to fit rdmacg to new ABI model".

From last discussion with Matan in this email thread, it appears that -
only broken case are:
(a) HW vendor driver specific resources (if they have crazy big list),
which cannot be abstracted out well enough, won't be controlled by
rdma cgroup.
(b) Such resource objects are not well defined today with new ABI model.
If such objects are well defined today, lets call them out and discuss
with Doug, Tejun, Christoph and larger group, whether they qualify for
inclusion or not.

rdma cgroup currently supports including handful of HW resource that
can be abstracted (at least at functionality level).

Please include any other option issue, if any.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Sept. 10, 2016, 4:12 p.m. UTC | #12
On Wed, Sep 07, 2016 at 01:25:13PM +0530, Parav Pandit wrote:
> >  a) delay cgroups support until the grand rewrite is done
> >  b) add it now and deal with the consequences later
> >
> Can we do (b) now and differ adding any HW resources to cgroup until
> they are clearly called out.
> Architecture and APIs are already in place to support this.

Sounds fine to me.  The only thing I want to avoid is pie in the
sky "future proofing" that leads to horrible architectures.  And I assume
that's what Matan proposed.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Sept. 10, 2016, 4:14 p.m. UTC | #13
On Wed, Sep 07, 2016 at 11:51:42AM +0300, Matan Barak wrote:
> All recent proposals of the new ABI schema deals with extending the 
> flexibility of the current schema by letting drivers define their specific 
> types, actions, attributes, etc. Even more than that, the dispatching 
> starts from the driver and it chooses if it wants to use the common RDMA 
> core layer or have it's own wise implementation instead.
> Some drivers might even prefer not to implement the current verbs types.
> These decisions were made in the OFVWG meetings.

OFVWG meetings have absolutely zero relevance for Linux development.
More "flexibility" for drivers just means giving up on designing a
coherent API and leaving it to drivers authors to add crap to their
own drivers.  That's a major step backwards.

> Sounds reasonable, but what about drivers which ignore the common code and 
> implement it in their own way? What about drivers which don't support the 
> standard RDMA types at all?

They should not be using the code in drivers/infiniband.  usnic is such
an example of a driver that should never have been added in it's current
form.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Sept. 10, 2016, 5:01 p.m. UTC | #14
On Sat, Sep 10, 2016 at 06:14:42PM +0200, Christoph Hellwig wrote:
> OFVWG meetings have absolutely zero relevance for Linux development.

Well, to be fair there are a fair number of kernel developers on that
particular call..

> More "flexibility" for drivers just means giving up on designing a
> coherent API and leaving it to drivers authors to add crap to their
> own drivers.  That's a major step backwards.

Sadly, it isn't a step backwards, it is status quo - at least as far
as the uapi is concerned.

Every single user space driver has its own private abi file, carefully
hidden in their driver, and dutifully copied over to user space:

providers/cxgb3/iwch-abi.h
providers/cxgb4/cxgb4-abi.h
providers/hfi1verbs/hfi-abi.h
providers/i40iw/i40iw-abi.h
providers/ipathverbs/ipath-abi.h
providers/mlx4/mlx4-abi.h
providers/mlx5/mlx5-abi.h
providers/mthca/mthca-abi.h
providers/nes/nes-abi.h
providers/ocrdma/ocrdma_abi.h
providers/rxe/rxe-abi.h

Just to pick two random examples:

struct mlx5_create_cq {
        struct ibv_create_cq            ibv_cmd;
        __u64                           buf_addr;
        __u64                           db_addr;
        __u32				cqe_size;
};

struct iwch_create_cq {
        struct ibv_create_cq ibv_cmd;
        uint64_t user_rptr_addr;
};

Love to hear ideas on a way forward that doesn't involve rewriting
everything :(

> They should not be using the code in drivers/infiniband.  usnic is such
> an example of a driver that should never have been added in it's current
> form.

+1

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak Sept. 11, 2016, 7:40 a.m. UTC | #15
On 10/09/2016 19:12, Christoph Hellwig wrote:
> On Wed, Sep 07, 2016 at 01:25:13PM +0530, Parav Pandit wrote:
>>>  a) delay cgroups support until the grand rewrite is done
>>>  b) add it now and deal with the consequences later
>>>
>> Can we do (b) now and differ adding any HW resources to cgroup until
>> they are clearly called out.
>> Architecture and APIs are already in place to support this.
>
> Sounds fine to me.  The only thing I want to avoid is pie in the
> sky "future proofing" that leads to horrible architectures.  And I assume
> that's what Matan proposed.
>

NO, that not what I proposed. User-kernel API/ABI should be designed 
with drivers differences in mind. The internal design or internals APIs 
could ignore such things as they can be changed later.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak Sept. 11, 2016, 8:07 a.m. UTC | #16
On 10/09/2016 20:01, Jason Gunthorpe wrote:
> On Sat, Sep 10, 2016 at 06:14:42PM +0200, Christoph Hellwig wrote:
>> OFVWG meetings have absolutely zero relevance for Linux development.
>
> Well, to be fair there are a fair number of kernel developers on that
> particular call..
>
>> More "flexibility" for drivers just means giving up on designing a
>> coherent API and leaving it to drivers authors to add crap to their
>> own drivers.  That's a major step backwards.
>
> Sadly, it isn't a step backwards, it is status quo - at least as far
> as the uapi is concerned.
>
> Every single user space driver has its own private abi file, carefully
> hidden in their driver, and dutifully copied over to user space:
>
> providers/cxgb3/iwch-abi.h
> providers/cxgb4/cxgb4-abi.h
> providers/hfi1verbs/hfi-abi.h
> providers/i40iw/i40iw-abi.h
> providers/ipathverbs/ipath-abi.h
> providers/mlx4/mlx4-abi.h
> providers/mlx5/mlx5-abi.h
> providers/mthca/mthca-abi.h
> providers/nes/nes-abi.h
> providers/ocrdma/ocrdma_abi.h
> providers/rxe/rxe-abi.h
>
> Just to pick two random examples:
>
> struct mlx5_create_cq {
>         struct ibv_create_cq            ibv_cmd;
>         __u64                           buf_addr;
>         __u64                           db_addr;
>         __u32				cqe_size;
> };
>
> struct iwch_create_cq {
>         struct ibv_create_cq ibv_cmd;
>         uint64_t user_rptr_addr;
> };
>
> Love to hear ideas on a way forward that doesn't involve rewriting
> everything :(
>

Yeah, unfortunately, the RDMA ABI is more driver specific ABI than a 
common user-kernel ABI. I guess this will become even worse, as the RDMA 
subsystem is evolving to serve more drivers with different object types. 
For example, I would like to hear how hfi1 are going to define their 
user-kernel ABI (once they leave the custom ioctls).

>> They should not be using the code in drivers/infiniband.  usnic is such
>> an example of a driver that should never have been added in it's current
>> form.
>
> +1
>
> Jason
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Sept. 11, 2016, 1:34 p.m. UTC | #17
On Sat, Sep 10, 2016 at 11:01:51AM -0600, Jason Gunthorpe wrote:
> Sadly, it isn't a step backwards, it is status quo - at least as far
> as the uapi is concerned.

Sort of, see below:

> struct mlx5_create_cq {
>         struct ibv_create_cq            ibv_cmd;
>         __u64                           buf_addr;
>         __u64                           db_addr;
>         __u32				cqe_size;
> };
> 
> struct iwch_create_cq {
>         struct ibv_create_cq ibv_cmd;
>         uint64_t user_rptr_addr;
> };
> 
> Love to hear ideas on a way forward that doesn't involve rewriting
> everything :(

We stil always have the common structure first.  And at least for
cgroups supports that's what matters.

Re the actual structures - we'll really need to make sure we

 a) expose proper userspace abi headers in the kernel for all code
    in the RDMA subsystem
 b) actually use that in the userspace components

I've posted some initial work toward a) a while ago, and once we
agree on adopting your common repo I'd really like to start through
with that work.  I think it's a pre-requisite for any major new
userspace ABI work.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Sept. 11, 2016, 2:35 p.m. UTC | #18
On Sun, Sep 11, 2016 at 03:34:21PM +0200, Christoph Hellwig wrote:
> On Sat, Sep 10, 2016 at 11:01:51AM -0600, Jason Gunthorpe wrote:
> > Sadly, it isn't a step backwards, it is status quo - at least as far
> > as the uapi is concerned.
>
> Sort of, see below:
>
> > struct mlx5_create_cq {
> >         struct ibv_create_cq            ibv_cmd;
> >         __u64                           buf_addr;
> >         __u64                           db_addr;
> >         __u32				cqe_size;
> > };
> >
> > struct iwch_create_cq {
> >         struct ibv_create_cq ibv_cmd;
> >         uint64_t user_rptr_addr;
> > };
> >
> > Love to hear ideas on a way forward that doesn't involve rewriting
> > everything :(
>
> We stil always have the common structure first.  And at least for
> cgroups supports that's what matters.
>
> Re the actual structures - we'll really need to make sure we
>
>  a) expose proper userspace abi headers in the kernel for all code
>     in the RDMA subsystem
>  b) actually use that in the userspace components
>
> I've posted some initial work toward a) a while ago, and once we
> agree on adopting your common repo I'd really like to start through
> with that work.  I think it's a pre-requisite for any major new
> userspace ABI work.

I started to work on it over weekend and it is worth do not do same work twice.

> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Sept. 11, 2016, 5:14 p.m. UTC | #19
On Sun, Sep 11, 2016 at 05:35:22PM +0300, Leon Romanovsky wrote:

> > We stil always have the common structure first.  And at least for
> > cgroups supports that's what matters.
> >
> > Re the actual structures - we'll really need to make sure we
> >
> >  a) expose proper userspace abi headers in the kernel for all code
> >     in the RDMA subsystem
> >  b) actually use that in the userspace components
> >
> > I've posted some initial work toward a) a while ago, and once we

Did it get merged? Do you have a pointer?

> > agree on adopting your common repo I'd really like to start through
> > with that work.  I think it's a pre-requisite for any major new
> > userspace ABI work.
> 
> I started to work on it over weekend and it is worth do not do same work twice.

Yes, I also agree that it is important before we tackle the uapi
conversion to get this fully sorted.

I've already done several cases working with the existing uapi headers:

https://github.com/jgunthorpe/rdma-plumbing/commit/f4f40689440dbc9c57b55548b04b15fe808a1767
https://github.com/jgunthorpe/rdma-plumbing/commit/0cf1893dce4791dafa035bcb6ee045a6ce0ff3c3
https://github.com/jgunthorpe/rdma-plumbing/commit/0522fc42aac4a5e8fc888dcca4341c9bc1dc58ca

[.. and this is a strong argument why we need the common repo, doing
this without it would be very hard, as everything is cross-linked, I
couldnn't unwind libibcm until I fixed a bit of verbs, and rdmacm can't
even include its uapi header until the duplicate definitions in the
verbs copy are delt with .. and I've also learned we are making
changing to the kernel uapi header and since nothing uses them we never even
compile test :( :( eg
https://github.com/torvalds/linux/commit/b493d91d333e867a043f7ff1397bcba6e2d0dda2]

However, everything under verbs is not straightforward. The files in
userspace are not copies...

user:

struct ibv_query_device {
       __u32 command;
       __u16 in_words;
       __u16 out_words;
       __u64 response;
       __u64 driver_data[0];
};

kernel:

struct ib_uverbs_query_device {
        __u64 response;
        __u64 driver_data[0];
};

eg the userspace version stuffs the header into the struct and the
kernel version does not. Presumably this is for efficiency so that no
copies are required when marshaling. This impacts everything :(

I'm thinking the best way forward might be to use a script and
transform userspace into:

struct ibv_query_device {
	struct ib_uverbs_cmd_hdr hdr;
	struct ib_uverbs_query_device cmd;
};

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Sept. 11, 2016, 5:24 p.m. UTC | #20
On Sun, Sep 11, 2016 at 11:14:09AM -0600, Jason Gunthorpe wrote:
> > > We stil always have the common structure first.  And at least for
> > > cgroups supports that's what matters.
> > >
> > > Re the actual structures - we'll really need to make sure we
> > >
> > >  a) expose proper userspace abi headers in the kernel for all code
> > >     in the RDMA subsystem
> > >  b) actually use that in the userspace components
> > >
> > > I've posted some initial work toward a) a while ago, and once we
> 
> Did it get merged? Do you have a pointer?

http://www.spinics.net/lists/linux-rdma/msg31958.html

> this without it would be very hard, as everything is cross-linked, I
> couldnn't unwind libibcm until I fixed a bit of verbs, and rdmacm can't
> even include its uapi header until the duplicate definitions in the
> verbs copy are delt with .. and I've also learned we are making
> changing to the kernel uapi header and since nothing uses them we never even
> compile test :( :( eg
> https://github.com/torvalds/linux/commit/b493d91d333e867a043f7ff1397bcba6e2d0dda2]

> However, everything under verbs is not straightforward. The files in
> userspace are not copies...
> 
> user:
> 
> struct ibv_query_device {
>        __u32 command;
>        __u16 in_words;
>        __u16 out_words;
>        __u64 response;
>        __u64 driver_data[0];
> };
> 
> kernel:
> 
> struct ib_uverbs_query_device {
>         __u64 response;
>         __u64 driver_data[0];
> };

We'll obviously need different strutures for the libibvers API
and the kernel interface in this case, and we'll need to figure out
how to properly translate them.  I think a cast, plus compile time
type checking ala BUILD_BUG_ON is the way to go.

> eg the userspace version stuffs the header into the struct and the
> kernel version does not. Presumably this is for efficiency so that no
> copies are required when marshaling. This impacts everything :(
> 
> I'm thinking the best way forward might be to use a script and
> transform userspace into:
> 
> struct ibv_query_device {
> 	struct ib_uverbs_cmd_hdr hdr;
> 	struct ib_uverbs_query_device cmd;
> };

That would break the users of the interface.  However automatically
generating the user ABI from the kernel one might still be a good idea
in the long run.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Sept. 11, 2016, 5:52 p.m. UTC | #21
On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig wrote:
> > > > I've posted some initial work toward a) a while ago, and once we
> > 
> > Did it get merged? Do you have a pointer?
> 
> http://www.spinics.net/lists/linux-rdma/msg31958.html

Right, I remember that. Certainly the right direction

> > However, everything under verbs is not straightforward. The files in
> > userspace are not copies...
> > 
> > user:
> > 
> > struct ibv_query_device {
> >        __u32 command;
> >        __u16 in_words;
> >        __u16 out_words;
> >        __u64 response;
> >        __u64 driver_data[0];
> > };
> > 
> > kernel:
> > 
> > struct ib_uverbs_query_device {
> >         __u64 response;
> >         __u64 driver_data[0];
> > };
> 
> We'll obviously need different strutures for the libibvers API
> and the kernel interface in this case, and we'll need to figure out
> how to properly translate them.  I think a cast, plus compile time
> type checking ala BUILD_BUG_ON is the way to go.

I'm not sure I follow, which would I cast?

BUILD_BUG_ON(sizeof(ibv_query_device) == sizeof(ib_uverbs_cmd_hdr) +
             sizeof(ib_uverbs_query_device))

?

> > I'm thinking the best way forward might be to use a script and
> > transform userspace into:
> > 
> > struct ibv_query_device {
> > 	struct ib_uverbs_cmd_hdr hdr;
> > 	struct ib_uverbs_query_device cmd;
> > };
> 
> That would break the users of the interface.

Sorry, I mean doing this inside rdma-plumbing. Since the change is ABI
identical the modified libibverbs would still be binary compatible
with all providers but not source compatible. Since all kernel
supported providers are in rdma-plumbing we can add the '.cmd.' at the
same time.

The kernel uapi header would stay the same.

> However automatically generating the user ABI from the kernel one
> might still be a good idea in the long run.

My preference would be to try and use the kernel headers directly.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Sept. 12, 2016, 5:07 a.m. UTC | #22
On Sun, Sep 11, 2016 at 11:52:35AM -0600, Jason Gunthorpe wrote:
> On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig wrote:
> > > > > I've posted some initial work toward a) a while ago, and once we
> > >
> > > Did it get merged? Do you have a pointer?
> >
> > http://www.spinics.net/lists/linux-rdma/msg31958.html
>
> Right, I remember that. Certainly the right direction
>
> > > However, everything under verbs is not straightforward. The files in
> > > userspace are not copies...
> > >
> > > user:
> > >
> > > struct ibv_query_device {
> > >        __u32 command;
> > >        __u16 in_words;
> > >        __u16 out_words;
> > >        __u64 response;
> > >        __u64 driver_data[0];
> > > };
> > >
> > > kernel:
> > >
> > > struct ib_uverbs_query_device {
> > >         __u64 response;
> > >         __u64 driver_data[0];
> > > };
> >
> > We'll obviously need different strutures for the libibvers API
> > and the kernel interface in this case, and we'll need to figure out
> > how to properly translate them.  I think a cast, plus compile time
> > type checking ala BUILD_BUG_ON is the way to go.
>
> I'm not sure I follow, which would I cast?
>
> BUILD_BUG_ON(sizeof(ibv_query_device) == sizeof(ib_uverbs_cmd_hdr) +
>              sizeof(ib_uverbs_query_device))
>
> ?
>
> > > I'm thinking the best way forward might be to use a script and
> > > transform userspace into:
> > >
> > > struct ibv_query_device {
> > > 	struct ib_uverbs_cmd_hdr hdr;
> > > 	struct ib_uverbs_query_device cmd;
> > > };
> >
> > That would break the users of the interface.
>
> Sorry, I mean doing this inside rdma-plumbing. Since the change is ABI
> identical the modified libibverbs would still be binary compatible
> with all providers but not source compatible. Since all kernel
> supported providers are in rdma-plumbing we can add the '.cmd.' at the
> same time.
>
> The kernel uapi header would stay the same.
>
> > However automatically generating the user ABI from the kernel one
> > might still be a good idea in the long run.
>
> My preference would be to try and use the kernel headers directly.

I thought the same, especially after realizing that they are almost
copy/paste from the vendor *-abi.h files.

>
> Jason
Parav Pandit Sept. 14, 2016, 7:06 a.m. UTC | #23
Hi Dennis,

Do you know how would HFI1 driver would work along with rdma cgroup?

Hi Matan, Leon, Jason,
Apart from HFI1, is there any other concern?
Or Patch is good to go?

4.8 dates are close by (2 weeks) and there are two git trees involved
(that might cause merge error to Linus) so if there are no issues, I
would like to make request to Doug to consider it for 4.8 early on.

Parav

On Mon, Sep 12, 2016 at 10:37 AM, Leon Romanovsky <leon@kernel.org> wrote:
> On Sun, Sep 11, 2016 at 11:52:35AM -0600, Jason Gunthorpe wrote:
>> On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig wrote:
>> > > > > I've posted some initial work toward a) a while ago, and once we
>> > >
>> > > Did it get merged? Do you have a pointer?
>> >
>> > http://www.spinics.net/lists/linux-rdma/msg31958.html
>>
>> Right, I remember that. Certainly the right direction
>>
>> > > However, everything under verbs is not straightforward. The files in
>> > > userspace are not copies...
>> > >
>> > > user:
>> > >
>> > > struct ibv_query_device {
>> > >        __u32 command;
>> > >        __u16 in_words;
>> > >        __u16 out_words;
>> > >        __u64 response;
>> > >        __u64 driver_data[0];
>> > > };
>> > >
>> > > kernel:
>> > >
>> > > struct ib_uverbs_query_device {
>> > >         __u64 response;
>> > >         __u64 driver_data[0];
>> > > };
>> >
>> > We'll obviously need different strutures for the libibvers API
>> > and the kernel interface in this case, and we'll need to figure out
>> > how to properly translate them.  I think a cast, plus compile time
>> > type checking ala BUILD_BUG_ON is the way to go.
>>
>> I'm not sure I follow, which would I cast?
>>
>> BUILD_BUG_ON(sizeof(ibv_query_device) == sizeof(ib_uverbs_cmd_hdr) +
>>              sizeof(ib_uverbs_query_device))
>>
>> ?
>>
>> > > I'm thinking the best way forward might be to use a script and
>> > > transform userspace into:
>> > >
>> > > struct ibv_query_device {
>> > >   struct ib_uverbs_cmd_hdr hdr;
>> > >   struct ib_uverbs_query_device cmd;
>> > > };
>> >
>> > That would break the users of the interface.
>>
>> Sorry, I mean doing this inside rdma-plumbing. Since the change is ABI
>> identical the modified libibverbs would still be binary compatible
>> with all providers but not source compatible. Since all kernel
>> supported providers are in rdma-plumbing we can add the '.cmd.' at the
>> same time.
>>
>> The kernel uapi header would stay the same.
>>
>> > However automatically generating the user ABI from the kernel one
>> > might still be a good idea in the long run.
>>
>> My preference would be to try and use the kernel headers directly.
>
> I thought the same, especially after realizing that they are almost
> copy/paste from the vendor *-abi.h files.
>
>>
>> Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak Sept. 14, 2016, 8:14 a.m. UTC | #24
On 14/09/2016 10:06, Parav Pandit wrote:
> Hi Dennis,
>
> Do you know how would HFI1 driver would work along with rdma cgroup?
>
> Hi Matan, Leon, Jason,
> Apart from HFI1, is there any other concern?

I just wonder how things like RSS will work. For example, a RSS QP 
doesn't really have a queue (if I recall, it's connected to work queues 
via an indirection table). So, when a user creates such a QP, do you 
want to account it as a regular QP?
How are work queues accounted?


> Or Patch is good to go?
>
> 4.8 dates are close by (2 weeks) and there are two git trees involved
> (that might cause merge error to Linus) so if there are no issues, I
> would like to make request to Doug to consider it for 4.8 early on.
>
> Parav
>
> On Mon, Sep 12, 2016 at 10:37 AM, Leon Romanovsky <leon@kernel.org> wrote:
>> On Sun, Sep 11, 2016 at 11:52:35AM -0600, Jason Gunthorpe wrote:
>>> On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig wrote:
>>>>>>> I've posted some initial work toward a) a while ago, and once we
>>>>>
>>>>> Did it get merged? Do you have a pointer?
>>>>
>>>> http://www.spinics.net/lists/linux-rdma/msg31958.html
>>>
>>> Right, I remember that. Certainly the right direction
>>>
>>>>> However, everything under verbs is not straightforward. The files in
>>>>> userspace are not copies...
>>>>>
>>>>> user:
>>>>>
>>>>> struct ibv_query_device {
>>>>>        __u32 command;
>>>>>        __u16 in_words;
>>>>>        __u16 out_words;
>>>>>        __u64 response;
>>>>>        __u64 driver_data[0];
>>>>> };
>>>>>
>>>>> kernel:
>>>>>
>>>>> struct ib_uverbs_query_device {
>>>>>         __u64 response;
>>>>>         __u64 driver_data[0];
>>>>> };
>>>>
>>>> We'll obviously need different strutures for the libibvers API
>>>> and the kernel interface in this case, and we'll need to figure out
>>>> how to properly translate them.  I think a cast, plus compile time
>>>> type checking ala BUILD_BUG_ON is the way to go.
>>>
>>> I'm not sure I follow, which would I cast?
>>>
>>> BUILD_BUG_ON(sizeof(ibv_query_device) == sizeof(ib_uverbs_cmd_hdr) +
>>>              sizeof(ib_uverbs_query_device))
>>>
>>> ?
>>>
>>>>> I'm thinking the best way forward might be to use a script and
>>>>> transform userspace into:
>>>>>
>>>>> struct ibv_query_device {
>>>>>   struct ib_uverbs_cmd_hdr hdr;
>>>>>   struct ib_uverbs_query_device cmd;
>>>>> };
>>>>
>>>> That would break the users of the interface.
>>>
>>> Sorry, I mean doing this inside rdma-plumbing. Since the change is ABI
>>> identical the modified libibverbs would still be binary compatible
>>> with all providers but not source compatible. Since all kernel
>>> supported providers are in rdma-plumbing we can add the '.cmd.' at the
>>> same time.
>>>
>>> The kernel uapi header would stay the same.
>>>
>>>> However automatically generating the user ABI from the kernel one
>>>> might still be a good idea in the long run.
>>>
>>> My preference would be to try and use the kernel headers directly.
>>
>> I thought the same, especially after realizing that they are almost
>> copy/paste from the vendor *-abi.h files.
>>
>>>
>>> Jason

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Sept. 14, 2016, 9:19 a.m. UTC | #25
Hi Matan,

On Wed, Sep 14, 2016 at 1:44 PM, Matan Barak <matanb@mellanox.com> wrote:
> On 14/09/2016 10:06, Parav Pandit wrote:
>>
>> Hi Dennis,
>>
>> Do you know how would HFI1 driver would work along with rdma cgroup?
>>
>> Hi Matan, Leon, Jason,
>> Apart from HFI1, is there any other concern?
>
>
> I just wonder how things like RSS will work. For example, a RSS QP doesn't
> really have a queue (if I recall, it's connected to work queues via an
> indirection table). So, when a user creates such a QP, do you want to
> account it as a regular QP?
> How are work queues accounted?

ib_create_rwq_ind_table verb allows creating indirection table.
I assume it allows creating multiple such tables.
If it is so, than number of tables should be a cgroup resource that we
can add in follow on patch.
By doing so, one container doesn't takeaway all the tables.

>
>
>> Or Patch is good to go?
>>
>> 4.8 dates are close by (2 weeks) and there are two git trees involved
>> (that might cause merge error to Linus) so if there are no issues, I
>> would like to make request to Doug to consider it for 4.8 early on.
>>
>> Parav
>>
>> On Mon, Sep 12, 2016 at 10:37 AM, Leon Romanovsky <leon@kernel.org> wrote:
>>>
>>> On Sun, Sep 11, 2016 at 11:52:35AM -0600, Jason Gunthorpe wrote:
>>>>
>>>> On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig wrote:
>>>>>>>>
>>>>>>>> I've posted some initial work toward a) a while ago, and once we
>>>>>>
>>>>>>
>>>>>> Did it get merged? Do you have a pointer?
>>>>>
>>>>>
>>>>> http://www.spinics.net/lists/linux-rdma/msg31958.html
>>>>
>>>>
>>>> Right, I remember that. Certainly the right direction
>>>>
>>>>>> However, everything under verbs is not straightforward. The files in
>>>>>> userspace are not copies...
>>>>>>
>>>>>> user:
>>>>>>
>>>>>> struct ibv_query_device {
>>>>>>        __u32 command;
>>>>>>        __u16 in_words;
>>>>>>        __u16 out_words;
>>>>>>        __u64 response;
>>>>>>        __u64 driver_data[0];
>>>>>> };
>>>>>>
>>>>>> kernel:
>>>>>>
>>>>>> struct ib_uverbs_query_device {
>>>>>>         __u64 response;
>>>>>>         __u64 driver_data[0];
>>>>>> };
>>>>>
>>>>>
>>>>> We'll obviously need different strutures for the libibvers API
>>>>> and the kernel interface in this case, and we'll need to figure out
>>>>> how to properly translate them.  I think a cast, plus compile time
>>>>> type checking ala BUILD_BUG_ON is the way to go.
>>>>
>>>>
>>>> I'm not sure I follow, which would I cast?
>>>>
>>>> BUILD_BUG_ON(sizeof(ibv_query_device) == sizeof(ib_uverbs_cmd_hdr) +
>>>>              sizeof(ib_uverbs_query_device))
>>>>
>>>> ?
>>>>
>>>>>> I'm thinking the best way forward might be to use a script and
>>>>>> transform userspace into:
>>>>>>
>>>>>> struct ibv_query_device {
>>>>>>   struct ib_uverbs_cmd_hdr hdr;
>>>>>>   struct ib_uverbs_query_device cmd;
>>>>>> };
>>>>>
>>>>>
>>>>> That would break the users of the interface.
>>>>
>>>>
>>>> Sorry, I mean doing this inside rdma-plumbing. Since the change is ABI
>>>> identical the modified libibverbs would still be binary compatible
>>>> with all providers but not source compatible. Since all kernel
>>>> supported providers are in rdma-plumbing we can add the '.cmd.' at the
>>>> same time.
>>>>
>>>> The kernel uapi header would stay the same.
>>>>
>>>>> However automatically generating the user ABI from the kernel one
>>>>> might still be a good idea in the long run.
>>>>
>>>>
>>>> My preference would be to try and use the kernel headers directly.
>>>
>>>
>>> I thought the same, especially after realizing that they are almost
>>> copy/paste from the vendor *-abi.h files.
>>>
>>>>
>>>> Jason
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Sept. 15, 2016, 6:56 p.m. UTC | #26
On Wed, Sep 14, 2016 at 12:36:19PM +0530, Parav Pandit wrote:
> Hi Dennis,
>
> Do you know how would HFI1 driver would work along with rdma cgroup?
>
> Hi Matan, Leon, Jason,
> Apart from HFI1, is there any other concern?
> Or Patch is good to go?

I didn't review it yet :(.
Sorry

>
> 4.8 dates are close by (2 weeks) and there are two git trees involved
> (that might cause merge error to Linus) so if there are no issues, I
> would like to make request to Doug to consider it for 4.8 early on.
>
> Parav
>
> On Mon, Sep 12, 2016 at 10:37 AM, Leon Romanovsky <leon@kernel.org> wrote:
> > On Sun, Sep 11, 2016 at 11:52:35AM -0600, Jason Gunthorpe wrote:
> >> On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig wrote:
> >> > > > > I've posted some initial work toward a) a while ago, and once we
> >> > >
> >> > > Did it get merged? Do you have a pointer?
> >> >
> >> > http://www.spinics.net/lists/linux-rdma/msg31958.html
> >>
> >> Right, I remember that. Certainly the right direction
> >>
> >> > > However, everything under verbs is not straightforward. The files in
> >> > > userspace are not copies...
> >> > >
> >> > > user:
> >> > >
> >> > > struct ibv_query_device {
> >> > >        __u32 command;
> >> > >        __u16 in_words;
> >> > >        __u16 out_words;
> >> > >        __u64 response;
> >> > >        __u64 driver_data[0];
> >> > > };
> >> > >
> >> > > kernel:
> >> > >
> >> > > struct ib_uverbs_query_device {
> >> > >         __u64 response;
> >> > >         __u64 driver_data[0];
> >> > > };
> >> >
> >> > We'll obviously need different strutures for the libibvers API
> >> > and the kernel interface in this case, and we'll need to figure out
> >> > how to properly translate them.  I think a cast, plus compile time
> >> > type checking ala BUILD_BUG_ON is the way to go.
> >>
> >> I'm not sure I follow, which would I cast?
> >>
> >> BUILD_BUG_ON(sizeof(ibv_query_device) == sizeof(ib_uverbs_cmd_hdr) +
> >>              sizeof(ib_uverbs_query_device))
> >>
> >> ?
> >>
> >> > > I'm thinking the best way forward might be to use a script and
> >> > > transform userspace into:
> >> > >
> >> > > struct ibv_query_device {
> >> > >   struct ib_uverbs_cmd_hdr hdr;
> >> > >   struct ib_uverbs_query_device cmd;
> >> > > };
> >> >
> >> > That would break the users of the interface.
> >>
> >> Sorry, I mean doing this inside rdma-plumbing. Since the change is ABI
> >> identical the modified libibverbs would still be binary compatible
> >> with all providers but not source compatible. Since all kernel
> >> supported providers are in rdma-plumbing we can add the '.cmd.' at the
> >> same time.
> >>
> >> The kernel uapi header would stay the same.
> >>
> >> > However automatically generating the user ABI from the kernel one
> >> > might still be a good idea in the long run.
> >>
> >> My preference would be to try and use the kernel headers directly.
> >
> > I thought the same, especially after realizing that they are almost
> > copy/paste from the vendor *-abi.h files.
> >
> >>
> >> Jason
Dennis Dalessandro Sept. 19, 2016, 1:10 p.m. UTC | #27
On Wed, 2016-09-14 at 12:36 +0530, Parav Pandit wrote:
> Hi Dennis,

> 

> Do you know how would HFI1 driver would work along with rdma cgroup?


Keep in mind HFI1 driver has two "modes" of operation. We support
verbs, and would surely fall in line with whatever cgroups do for IB
core. For our psm interface, not sure how cgroups would come into play.
Psm is designed to expose the hw to user and avoid the kernel when
possible adding more kernel control is sort of contrary to that.

Now that being said, Christoph recently made mention of maybe having a
drivers/psm [1]. I really haven't had a chance to think about the
implications of that, but maybe it's worth considering, after all we
have two implementations, qib and hfi1. So anyway I'm not sure we need
to be too concerned about cgroups right now as far as psm side of
things goes.

Depending how things shake out for the uAPI rewrite, or verbs 2.0 or
whatever we are calling it today things may change.

[1] http://marc.info/?l=linux-rdma&m=147401714313831&w=2

-Denny

> Hi Matan, Leon, Jason,

> Apart from HFI1, is there any other concern?

> Or Patch is good to go?

> 

> 4.8 dates are close by (2 weeks) and there are two git trees involved

> (that might cause merge error to Linus) so if there are no issues, I

> would like to make request to Doug to consider it for 4.8 early on.

> 

> Parav

> 

> On Mon, Sep 12, 2016 at 10:37 AM, Leon Romanovsky <leon@kernel.org>

> wrote:

> > On Sun, Sep 11, 2016 at 11:52:35AM -0600, Jason Gunthorpe wrote:

> > > On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig

> > > wrote:

> > > > > > > I've posted some initial work toward a) a while ago, and

> > > > > > > once we

> > > > > 

> > > > > Did it get merged? Do you have a pointer?

> > > > 

> > > > http://www.spinics.net/lists/linux-rdma/msg31958.html

> > > 

> > > Right, I remember that. Certainly the right direction

> > > 

> > > > > However, everything under verbs is not straightforward. The

> > > > > files in

> > > > > userspace are not copies...

> > > > > 

> > > > > user:

> > > > > 

> > > > > struct ibv_query_device {

> > > > >        __u32 command;

> > > > >        __u16 in_words;

> > > > >        __u16 out_words;

> > > > >        __u64 response;

> > > > >        __u64 driver_data[0];

> > > > > };

> > > > > 

> > > > > kernel:

> > > > > 

> > > > > struct ib_uverbs_query_device {

> > > > >         __u64 response;

> > > > >         __u64 driver_data[0];

> > > > > };

> > > > 

> > > > We'll obviously need different strutures for the libibvers API

> > > > and the kernel interface in this case, and we'll need to figure

> > > > out

> > > > how to properly translate them.  I think a cast, plus compile

> > > > time

> > > > type checking ala BUILD_BUG_ON is the way to go.

> > > 

> > > I'm not sure I follow, which would I cast?

> > > 

> > > BUILD_BUG_ON(sizeof(ibv_query_device) ==

> > > sizeof(ib_uverbs_cmd_hdr) +

> > >              sizeof(ib_uverbs_query_device))

> > > 

> > > ?

> > > 

> > > > > I'm thinking the best way forward might be to use a script

> > > > > and

> > > > > transform userspace into:

> > > > > 

> > > > > struct ibv_query_device {

> > > > >   struct ib_uverbs_cmd_hdr hdr;

> > > > >   struct ib_uverbs_query_device cmd;

> > > > > };

> > > > 

> > > > That would break the users of the interface.

> > > 

> > > Sorry, I mean doing this inside rdma-plumbing. Since the change

> > > is ABI

> > > identical the modified libibverbs would still be binary

> > > compatible

> > > with all providers but not source compatible. Since all kernel

> > > supported providers are in rdma-plumbing we can add the '.cmd.'

> > > at the

> > > same time.

> > > 

> > > The kernel uapi header would stay the same.

> > > 

> > > > However automatically generating the user ABI from the kernel

> > > > one

> > > > might still be a good idea in the long run.

> > > 

> > > My preference would be to try and use the kernel headers

> > > directly.

> > 

> > I thought the same, especially after realizing that they are almost

> > copy/paste from the vendor *-abi.h files.

> > 

> > > 

> > > Jason

> --

> To unsubscribe from this list: send the line "unsubscribe linux-rdma" 

> in

> the body of a message to majordomo@vger.kernel.org

> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Sept. 19, 2016, 5 p.m. UTC | #28
Hi Denny,

On Mon, Sep 19, 2016 at 6:40 PM, Dalessandro, Dennis
<dennis.dalessandro@intel.com> wrote:
> On Wed, 2016-09-14 at 12:36 +0530, Parav Pandit wrote:
>> Hi Dennis,
>>
>> Do you know how would HFI1 driver would work along with rdma cgroup?
>
> Keep in mind HFI1 driver has two "modes" of operation. We support
> verbs, and would surely fall in line with whatever cgroups do for IB
> core.
Thanks for the feedback.

> For our psm interface, not sure how cgroups would come into play.
> Psm is designed to expose the hw to user and avoid the kernel when
> possible adding more kernel control is sort of contrary to that.
>
Yes, PSM is currently out of RDMA cgroup and in future we can take a
look on how things shape as subsystem if it does.

> Now that being said, Christoph recently made mention of maybe having a
> drivers/psm [1]. I really haven't had a chance to think about the
> implications of that, but maybe it's worth considering, after all we
> have two implementations, qib and hfi1. So anyway I'm not sure we need
> to be too concerned about cgroups right now as far as psm side of
> things goes.
>
o.k.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Sept. 21, 2016, 4:43 a.m. UTC | #29
Hi Leon,

On Fri, Sep 16, 2016 at 12:26 AM, Leon Romanovsky <leon@kernel.org> wrote:
> On Wed, Sep 14, 2016 at 12:36:19PM +0530, Parav Pandit wrote:
>> Hi Dennis,
>>
>> Do you know how would HFI1 driver would work along with rdma cgroup?
>>
>> Hi Matan, Leon, Jason,
>> Apart from HFI1, is there any other concern?
>> Or Patch is good to go?
>
> I didn't review it yet :(.
> Sorry
>

We have completed review from Tejun, Christoph.
HFI driver folks also provided feedback for Intel drivers.
Matan's also doesn't have any more comments.

If possible, if you can also review, it will be helpful.

I have some more changes unrelated to cgroup in same files in both the git tree.
Pushing them now either results into merge conflict later on for
Doug/Tejun, or requires rebase and resending patch.
If you can review, we can avoid such rework.

>>
>> 4.8 dates are close by (2 weeks) and there are two git trees involved
>> (that might cause merge error to Linus) so if there are no issues, I
>> would like to make request to Doug to consider it for 4.8 early on.
>>
>> Parav
>>
>> On Mon, Sep 12, 2016 at 10:37 AM, Leon Romanovsky <leon@kernel.org> wrote:
>> > On Sun, Sep 11, 2016 at 11:52:35AM -0600, Jason Gunthorpe wrote:
>> >> On Sun, Sep 11, 2016 at 07:24:45PM +0200, Christoph Hellwig wrote:
>> >> > > > > I've posted some initial work toward a) a while ago, and once we
>> >> > >
>> >> > > Did it get merged? Do you have a pointer?
>> >> >
>> >> > http://www.spinics.net/lists/linux-rdma/msg31958.html
>> >>
>> >> Right, I remember that. Certainly the right direction
>> >>
>> >> > > However, everything under verbs is not straightforward. The files in
>> >> > > userspace are not copies...
>> >> > >
>> >> > > user:
>> >> > >
>> >> > > struct ibv_query_device {
>> >> > >        __u32 command;
>> >> > >        __u16 in_words;
>> >> > >        __u16 out_words;
>> >> > >        __u64 response;
>> >> > >        __u64 driver_data[0];
>> >> > > };
>> >> > >
>> >> > > kernel:
>> >> > >
>> >> > > struct ib_uverbs_query_device {
>> >> > >         __u64 response;
>> >> > >         __u64 driver_data[0];
>> >> > > };
>> >> >
>> >> > We'll obviously need different strutures for the libibvers API
>> >> > and the kernel interface in this case, and we'll need to figure out
>> >> > how to properly translate them.  I think a cast, plus compile time
>> >> > type checking ala BUILD_BUG_ON is the way to go.
>> >>
>> >> I'm not sure I follow, which would I cast?
>> >>
>> >> BUILD_BUG_ON(sizeof(ibv_query_device) == sizeof(ib_uverbs_cmd_hdr) +
>> >>              sizeof(ib_uverbs_query_device))
>> >>
>> >> ?
>> >>
>> >> > > I'm thinking the best way forward might be to use a script and
>> >> > > transform userspace into:
>> >> > >
>> >> > > struct ibv_query_device {
>> >> > >   struct ib_uverbs_cmd_hdr hdr;
>> >> > >   struct ib_uverbs_query_device cmd;
>> >> > > };
>> >> >
>> >> > That would break the users of the interface.
>> >>
>> >> Sorry, I mean doing this inside rdma-plumbing. Since the change is ABI
>> >> identical the modified libibverbs would still be binary compatible
>> >> with all providers but not source compatible. Since all kernel
>> >> supported providers are in rdma-plumbing we can add the '.cmd.' at the
>> >> same time.
>> >>
>> >> The kernel uapi header would stay the same.
>> >>
>> >> > However automatically generating the user ABI from the kernel one
>> >> > might still be a good idea in the long run.
>> >>
>> >> My preference would be to try and use the kernel headers directly.
>> >
>> > I thought the same, especially after realizing that they are almost
>> > copy/paste from the vendor *-abi.h files.
>> >
>> >>
>> >> Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo Sept. 21, 2016, 2:26 p.m. UTC | #30
Hello, Parav.

On Wed, Sep 21, 2016 at 10:13:38AM +0530, Parav Pandit wrote:
> We have completed review from Tejun, Christoph.
> HFI driver folks also provided feedback for Intel drivers.
> Matan's also doesn't have any more comments.
> 
> If possible, if you can also review, it will be helpful.
> 
> I have some more changes unrelated to cgroup in same files in both the git tree.
> Pushing them now either results into merge conflict later on for
> Doug/Tejun, or requires rebase and resending patch.
> If you can review, we can avoid such rework.

My impression of the thread was that there doesn't seem to be enough
of consensus around how rdma resources should be defined.  Is that
part agreed upon now?

Thanks.
Parav Pandit Sept. 21, 2016, 4:02 p.m. UTC | #31
Hi Tejun,

On Wed, Sep 21, 2016 at 7:56 PM, Tejun Heo <tj@kernel.org> wrote:
> Hello, Parav.
>
> On Wed, Sep 21, 2016 at 10:13:38AM +0530, Parav Pandit wrote:
>> We have completed review from Tejun, Christoph.
>> HFI driver folks also provided feedback for Intel drivers.
>> Matan's also doesn't have any more comments.
>>
>> If possible, if you can also review, it will be helpful.
>>
>> I have some more changes unrelated to cgroup in same files in both the git tree.
>> Pushing them now either results into merge conflict later on for
>> Doug/Tejun, or requires rebase and resending patch.
>> If you can review, we can avoid such rework.
>
> My impression of the thread was that there doesn't seem to be enough
> of consensus around how rdma resources should be defined.  Is that
> part agreed upon now?
>

We ended up discussing few points on different thread [1].

There was confusion on how some non-rdma/non-IB drivers would work
with rdma cgroup from Matan.
Christoph explained how they don't fit in the rdma subsystem and
therefore its not prime target to addess.
Intel driver maintainer Denny also acknowledged same on [2].
IB compliant drivers of Intel support rdma cgroup as explained in [2].
With that usnic and Intel psm drivers falls out of rdma cgroup support
as they don't fit very well in the verbs definition.

[1] https://www.spinics.net/lists/linux-rdma/msg40340.html
[2] http://www.spinics.net/lists/linux-rdma/msg40717.html

I will wait for Leon's review comments if he has different view on architecture.
Back in April when I met face-to-face to Leon and Haggai, Leon was in
support to have kernel defined the rdma resources as suggested by
Christoph and Tejun instead of IB/RDMA subsystem.
I will wait for his comments if his views have changed with new uAPI
taking shape.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Oct. 4, 2016, 6:19 p.m. UTC | #32
Hi Doug,

I am still waiting for Leon to provide his comments if any on rdma cgroup.
From other email context, he was on vacation last week.
While we wait for his comments, I wanted to know your view of this
patchset in 4.9 merge window.

To summarize the discussion that happened in two threads.

[1] Ack by Tejun, asking for review from rdma list
[2] quick review by Christoph on patch-v11 (patch 12 has only typo corrections)
[3] Christoph's ack on architecture of rdma cgroup and fitting it with ABI
[4] My response on Matan's query on RSS indirection table
[5] Response from Intel on their driver support for Matan's query
[6] Christoph's point on architecture, which we are following in new
ABI and current ABI

I have reviewed recent patch [7] from Matan where I see IB verbs
objects are still handled through common path as suggested by
Christoph.

I do not see any issues with rdma cgroup patchset other than it requires rebase.
Am I missing something?
Can you please help me - What would be required to merge it to 4.9?

[1] https://lkml.org/lkml/2016/8/31/494
[2] https://lkml.org/lkml/2016/8/25/146
[3] https://lkml.org/lkml/2016/9/10/175
[4] https://lkml.org/lkml/2016/9/14/221
[5] https://lkml.org/lkml/2016/9/19/571
[6] http://www.spinics.net/lists/linux-rdma/msg40337.html
[7] email subject: [RFC ABI V4 0/7] SG-based RDMA ABI Proposal

Regards,
Parav Pandit

On Wed, Sep 21, 2016 at 9:32 PM, Parav Pandit <pandit.parav@gmail.com> wrote:
> Hi Tejun,
>
> On Wed, Sep 21, 2016 at 7:56 PM, Tejun Heo <tj@kernel.org> wrote:
>> Hello, Parav.
>>
>> On Wed, Sep 21, 2016 at 10:13:38AM +0530, Parav Pandit wrote:
>>> We have completed review from Tejun, Christoph.
>>> HFI driver folks also provided feedback for Intel drivers.
>>> Matan's also doesn't have any more comments.
>>>
>>> If possible, if you can also review, it will be helpful.
>>>
>>> I have some more changes unrelated to cgroup in same files in both the git tree.
>>> Pushing them now either results into merge conflict later on for
>>> Doug/Tejun, or requires rebase and resending patch.
>>> If you can review, we can avoid such rework.
>>
>> My impression of the thread was that there doesn't seem to be enough
>> of consensus around how rdma resources should be defined.  Is that
>> part agreed upon now?
>>
>
> We ended up discussing few points on different thread [1].
>
> There was confusion on how some non-rdma/non-IB drivers would work
> with rdma cgroup from Matan.
> Christoph explained how they don't fit in the rdma subsystem and
> therefore its not prime target to addess.
> Intel driver maintainer Denny also acknowledged same on [2].
> IB compliant drivers of Intel support rdma cgroup as explained in [2].
> With that usnic and Intel psm drivers falls out of rdma cgroup support
> as they don't fit very well in the verbs definition.
>
> [1] https://www.spinics.net/lists/linux-rdma/msg40340.html
> [2] http://www.spinics.net/lists/linux-rdma/msg40717.html
>
> I will wait for Leon's review comments if he has different view on architecture.
> Back in April when I met face-to-face to Leon and Haggai, Leon was in
> support to have kernel defined the rdma resources as suggested by
> Christoph and Tejun instead of IB/RDMA subsystem.
> I will wait for his comments if his views have changed with new uAPI
> taking shape.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Oct. 5, 2016, 6:37 a.m. UTC | #33
FYI, the patches look fine to me:

Acked-by: Christoph Hellwig <hch@lst.de>

but we're past the merge window for 4.9 now unfortunately.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Oct. 5, 2016, 11:22 a.m. UTC | #34
On Wed, Oct 05, 2016 at 08:37:35AM +0200, Christoph Hellwig wrote:
> FYI, the patches look fine to me:
>
> Acked-by: Christoph Hellwig <hch@lst.de>
>
> but we're past the merge window for 4.9 now unfortunately.

IMHO, it still can make it.

Thanks
Tejun Heo Oct. 5, 2016, 3:36 p.m. UTC | #35
Hello,

On Wed, Oct 05, 2016 at 02:22:57PM +0300, Leon Romanovsky wrote:
> On Wed, Oct 05, 2016 at 08:37:35AM +0200, Christoph Hellwig wrote:
> > FYI, the patches look fine to me:
> >
> > Acked-by: Christoph Hellwig <hch@lst.de>
> >
> > but we're past the merge window for 4.9 now unfortunately.
> 
> IMHO, it still can make it.

Most likely, we only have three / four days till rc1 opens, I think
it's too late.  Let's target the next one.

Thanks.
Parav Pandit Oct. 6, 2016, 12:55 p.m. UTC | #36
Hi Christoph,

On Wed, Oct 5, 2016 at 12:07 PM, Christoph Hellwig <hch@lst.de> wrote:
> FYI, the patches look fine to me:
>
> Acked-by: Christoph Hellwig <hch@lst.de>
>
Thanks a lot for review.

Parav
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Oct. 18, 2016, 8:15 p.m. UTC | #37
Hi Doug,

Leon has finished review as well in [7].
Christoph Acked too in [8].

Can you please advise whether
(1) I should rebase and resend PatchV12?
(2) If so for which branch - master/4.9 or?

Tejun and Christoph mentioned that it might be late for 4.9.
Can we atleast merge to linux-rdma tree, so that more features/changes
can be done on top of it?

How can we avoid merge conflict to Linus since this patchset is
applicable to two git trees. (cgroup and linux-rdma).
I was thinking to push through linux-rdma as it is currently going
through more changes, so resolving merge conflict would be simpler if
that happens.

Please provide the direction.

[7] https://lkml.org/lkml/2016/10/5/134
[8] https://lkml.org/lkml/2016/10/5/30

Regards,
Parav Pandit

On Tue, Oct 4, 2016 at 11:49 PM, Parav Pandit <pandit.parav@gmail.com> wrote:
> Hi Doug,
>
> I am still waiting for Leon to provide his comments if any on rdma cgroup.
> From other email context, he was on vacation last week.
> While we wait for his comments, I wanted to know your view of this
> patchset in 4.9 merge window.
>
> To summarize the discussion that happened in two threads.
>
> [1] Ack by Tejun, asking for review from rdma list
> [2] quick review by Christoph on patch-v11 (patch 12 has only typo corrections)
> [3] Christoph's ack on architecture of rdma cgroup and fitting it with ABI
> [4] My response on Matan's query on RSS indirection table
> [5] Response from Intel on their driver support for Matan's query
> [6] Christoph's point on architecture, which we are following in new
> ABI and current ABI
>
> I have reviewed recent patch [7] from Matan where I see IB verbs
> objects are still handled through common path as suggested by
> Christoph.
>
> I do not see any issues with rdma cgroup patchset other than it requires rebase.
> Am I missing something?
> Can you please help me - What would be required to merge it to 4.9?
>
> [1] https://lkml.org/lkml/2016/8/31/494
> [2] https://lkml.org/lkml/2016/8/25/146
> [3] https://lkml.org/lkml/2016/9/10/175
> [4] https://lkml.org/lkml/2016/9/14/221
> [5] https://lkml.org/lkml/2016/9/19/571
> [6] http://www.spinics.net/lists/linux-rdma/msg40337.html
> [7] email subject: [RFC ABI V4 0/7] SG-based RDMA ABI Proposal
>
> Regards,
> Parav Pandit
>
> On Wed, Sep 21, 2016 at 9:32 PM, Parav Pandit <pandit.parav@gmail.com> wrote:
>> Hi Tejun,
>>
>> On Wed, Sep 21, 2016 at 7:56 PM, Tejun Heo <tj@kernel.org> wrote:
>>> Hello, Parav.
>>>
>>> On Wed, Sep 21, 2016 at 10:13:38AM +0530, Parav Pandit wrote:
>>>> We have completed review from Tejun, Christoph.
>>>> HFI driver folks also provided feedback for Intel drivers.
>>>> Matan's also doesn't have any more comments.
>>>>
>>>> If possible, if you can also review, it will be helpful.
>>>>
>>>> I have some more changes unrelated to cgroup in same files in both the git tree.
>>>> Pushing them now either results into merge conflict later on for
>>>> Doug/Tejun, or requires rebase and resending patch.
>>>> If you can review, we can avoid such rework.
>>>
>>> My impression of the thread was that there doesn't seem to be enough
>>> of consensus around how rdma resources should be defined.  Is that
>>> part agreed upon now?
>>>
>>
>> We ended up discussing few points on different thread [1].
>>
>> There was confusion on how some non-rdma/non-IB drivers would work
>> with rdma cgroup from Matan.
>> Christoph explained how they don't fit in the rdma subsystem and
>> therefore its not prime target to addess.
>> Intel driver maintainer Denny also acknowledged same on [2].
>> IB compliant drivers of Intel support rdma cgroup as explained in [2].
>> With that usnic and Intel psm drivers falls out of rdma cgroup support
>> as they don't fit very well in the verbs definition.
>>
>> [1] https://www.spinics.net/lists/linux-rdma/msg40340.html
>> [2] http://www.spinics.net/lists/linux-rdma/msg40717.html
>>
>> I will wait for Leon's review comments if he has different view on architecture.
>> Back in April when I met face-to-face to Leon and Haggai, Leon was in
>> support to have kernel defined the rdma resources as suggested by
>> Christoph and Tejun instead of IB/RDMA subsystem.
>> I will wait for his comments if his views have changed with new uAPI
>> taking shape.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 0000000..6710e28
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,66 @@ 
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+	RDMACG_VERB_RESOURCE_UCTX,
+	RDMACG_VERB_RESOURCE_AH,
+	RDMACG_VERB_RESOURCE_PD,
+	RDMACG_VERB_RESOURCE_CQ,
+	RDMACG_VERB_RESOURCE_MR,
+	RDMACG_VERB_RESOURCE_MW,
+	RDMACG_VERB_RESOURCE_SRQ,
+	RDMACG_VERB_RESOURCE_QP,
+	RDMACG_VERB_RESOURCE_FLOW,
+	/*
+	 * add any hw specific resource here as RDMA_HW_RESOURCE_NAME
+	 */
+	RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * head to keep track of all resource pools
+	 * that belongs to this cgroup.
+	 */
+	struct list_head		rpools;
+};
+
+struct rdmacg_device {
+	struct list_head	dev_node;
+	struct list_head	rpools;
+	char			*name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index);
+void rdmacg_query_limit(struct rdmacg_device *device,
+			int *limits);
+
+#endif	/* CONFIG_CGROUP_RDMA */
+#endif	/* _CGROUP_RDMA_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..d0e597c 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@  SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/init/Kconfig b/init/Kconfig
index cac3f09..c7dc64b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1080,6 +1080,16 @@  config CGROUP_PIDS
 	  since the PIDs limit only affects a process's ability to fork, not to
 	  attach to a cgroup.
 
+config CGROUP_RDMA
+	bool "RDMA controller"
+	help
+	  Provides enforcement of RDMA resources defined by IB stack.
+	  It is fairly easy for consumers to exhaust RDMA resources, which
+	  can result into resource unavailability to other consumers.
+	  RDMA controller is designed to stop this from happening.
+	  Attaching processes with active RDMA resources to the cgroup
+	  hierarchy is allowed even if can cross the hierarchy's limit.
+
 config CGROUP_FREEZER
 	bool "Freezer controller"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e..d2b76d0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -67,6 +67,7 @@  obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
+obj-$(CONFIG_CGROUP_RDMA) += cgroup_rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_rdma.c b/kernel/cgroup_rdma.c
new file mode 100644
index 0000000..6ab9bd9
--- /dev/null
+++ b/kernel/cgroup_rdma.c
@@ -0,0 +1,664 @@ 
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+	RDMACG_RESOURCE_TYPE_MAX,
+	RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * resource table definition as to be seen by the user.
+ * Need to add entries to it when more resources are
+ * added/defined at IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+	[RDMACG_VERB_RESOURCE_UCTX]	= "uctx",
+	[RDMACG_VERB_RESOURCE_AH]	= "ah",
+	[RDMACG_VERB_RESOURCE_PD]	= "pd",
+	[RDMACG_VERB_RESOURCE_CQ]	= "cq",
+	[RDMACG_VERB_RESOURCE_MR]	= "mr",
+	[RDMACG_VERB_RESOURCE_MW]	= "mw",
+	[RDMACG_VERB_RESOURCE_SRQ]	= "srq",
+	[RDMACG_VERB_RESOURCE_QP]	= "qp",
+	[RDMACG_VERB_RESOURCE_FLOW]	= "flow",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+	int max;
+	int usage;
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+	struct rdmacg_device	*device;
+	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
+
+	struct list_head	cg_node;
+	struct list_head	dev_node;
+
+	/* count active user tasks of this pool */
+	u64			usage_sum;
+	/* total number counts which are set to max */
+	int			num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+	return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+			       int index, int new_max)
+{
+	if (new_max == S32_MAX) {
+		if (rpool->resources[index].max != S32_MAX)
+			rpool->num_max_cnt++;
+	} else {
+		if (rpool->resources[index].max == S32_MAX)
+			rpool->num_max_cnt--;
+	}
+	rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+	int i;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+		set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_del(&rpool->cg_node);
+	list_del(&rpool->dev_node);
+	kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device)
+
+{
+	struct rdmacg_resource_pool *pool;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(pool, &cg->rpools, cg_node)
+		if (pool->device == device)
+			return pool;
+
+	return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+	if (rpool)
+		return rpool;
+
+	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+	if (!rpool)
+		return ERR_PTR(-ENOMEM);
+
+	rpool->device = device;
+	set_all_resource_max_limit(rpool);
+
+	INIT_LIST_HEAD(&rpool->cg_node);
+	INIT_LIST_HEAD(&rpool->dev_node);
+	list_add_tail(&rpool->cg_node, &cg->rpools);
+	list_add_tail(&rpool->dev_node, &device->rpools);
+	return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+		   struct rdmacg_device *device,
+		   enum rdmacg_resource_type index)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+
+	/*
+	 * rpool cannot be null at this stage. Let kernel operate in case
+	 * if there a bug in IB stack or rdma controller, instead of crashing
+	 * the system.
+	 */
+	if (unlikely(!rpool)) {
+		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+		return;
+	}
+
+	rpool->resources[index].usage--;
+
+	/*
+	 * A negative count (or overflow) is invalid,
+	 * it indicates a bug in the rdma controller.
+	 */
+	WARN_ON_ONCE(rpool->resources[index].usage < 0);
+	rpool->usage_sum--;
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+}
+
+/**
+ * rdmacg_uncharge_hirerchy - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
+ *           stop uncharging
+ * @index: index of the resource to uncharge in cg in given resource pool
+ */
+static void rdmacg_uncharge_hirerchy(struct rdma_cgroup *cg,
+				     struct rdmacg_device *device,
+				     struct rdma_cgroup *stop_cg,
+				     enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *p;
+
+	mutex_lock(&rdmacg_mutex);
+
+	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+		uncharge_cg_locked(p, device, index);
+
+	mutex_unlock(&rdmacg_mutex);
+
+	css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index)
+{
+	if (index >= RDMACG_RESOURCE_MAX)
+		return;
+
+	rdmacg_uncharge_hirerchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns pointer to rdmacg for this resource when charging is successful.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *cg, *p;
+	struct rdmacg_resource_pool *rpool;
+	s64 new;
+	int ret = 0;
+
+	if (index >= RDMACG_RESOURCE_MAX)
+		return -EINVAL;
+
+	/*
+	 * hold on to css, as cgroup can be removed but resource
+	 * accounting happens on css.
+	 */
+	cg = get_current_rdmacg();
+
+	mutex_lock(&rdmacg_mutex);
+	for (p = cg; p; p = parent_rdmacg(p)) {
+		rpool = get_cg_rpool_locked(p, device);
+		if (IS_ERR_OR_NULL(rpool)) {
+			ret = PTR_ERR(rpool);
+			goto err;
+		} else {
+			new = rpool->resources[index].usage + 1;
+			if (new > rpool->resources[index].max) {
+				ret = -EAGAIN;
+				goto err;
+			} else {
+				rpool->resources[index].usage = new;
+				rpool->usage_sum++;
+			}
+		}
+	}
+	mutex_unlock(&rdmacg_mutex);
+
+	*rdmacg = cg;
+	return 0;
+
+err:
+	mutex_unlock(&rdmacg_mutex);
+	rdmacg_uncharge_hirerchy(cg, device, p, index);
+	return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * Returns 0 on success or EINVAL when table length given is beyond
+ * supported size.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+	INIT_LIST_HEAD(&device->dev_node);
+	INIT_LIST_HEAD(&device->rpools);
+
+	mutex_lock(&rdmacg_mutex);
+	list_add_tail(&device->dev_node, &rdmacg_devices);
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ *          controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool, *tmp;
+
+	/*
+	 * Synchronize with any active resource settings,
+	 * usage query happening via configfs.
+	 */
+	mutex_lock(&rdmacg_mutex);
+	list_del_init(&device->dev_node);
+
+	/*
+	 * Now that this device is off the cgroup list, its safe to free
+	 * all the rpool resources.
+	 */
+	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+		free_cg_rpool_locked(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+/**
+ * rdmacg_query_limit - query the resource limits that
+ * might have been configured by the user.
+ * @device: pointer to rdmacg device
+ * @limits: pointer to an array of limits where rdma cg will provide
+ *          the configured limits of the cgroup.
+ *
+ * This function honors resource limit in hierarchical way.
+ * In a cgroup hirarchy, it considers the limit of a controller which has
+ * smallest limit configured.
+ */
+void rdmacg_query_limit(struct rdmacg_device *device, int *limits)
+{
+	struct rdma_cgroup *cg, *p;
+	struct rdmacg_resource_pool *rpool;
+	int i;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+		limits[i] = S32_MAX;
+
+	cg = get_current_rdmacg();
+	/*
+	 * Check in hirerchy which pool get the least amount of
+	 * resource limits.
+	 */
+	mutex_lock(&rdmacg_mutex);
+	for (p = cg; p; p = parent_rdmacg(p)) {
+		rpool = find_cg_rpool_locked(cg, device);
+		if (!rpool)
+			continue;
+
+		for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+			limits[i] = min_t(int, limits[i],
+					rpool->resources[i].max);
+	}
+	mutex_unlock(&rdmacg_mutex);
+	css_put(&cg->css);
+}
+EXPORT_SYMBOL(rdmacg_query_limit);
+
+static int parse_resource(char *c, int *intval)
+{
+	substring_t argstr;
+	const char **table = &rdmacg_resource_names[0];
+	char *name, *value = c;
+	size_t len;
+	int ret, i = 0;
+
+	name = strsep(&value, "=");
+	if (!name || !value)
+		return -EINVAL;
+
+	len = strlen(value);
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		if (strcmp(table[i], name))
+			continue;
+
+		argstr.from = value;
+		argstr.to = value + len;
+
+		ret = match_int(&argstr, intval);
+		if (ret >= 0) {
+			if (*intval < 0)
+				break;
+			return i;
+		}
+		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+			*intval = S32_MAX;
+			return i;
+		}
+		break;
+	}
+	return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+			       int *new_limits, unsigned long *enables)
+{
+	char *c;
+	int err = -EINVAL;
+
+	/* parse resource options */
+	while ((c = strsep(&options, " ")) != NULL) {
+		int index, intval;
+
+		index = parse_resource(c, &intval);
+		if (index < 0)
+			goto err;
+
+		new_limits[index] = intval;
+		*enables |= BIT(index);
+	}
+	return 0;
+
+err:
+	return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+	struct rdmacg_device *device;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node)
+		if (!strcmp(name, device->name))
+			return device;
+
+	return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+				       char *buf, size_t nbytes, loff_t off)
+{
+	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+	const char *dev_name;
+	struct rdmacg_resource_pool *rpool;
+	struct rdmacg_device *device;
+	char *options = strstrip(buf);
+	int *new_limits;
+	unsigned long enables = 0;
+	int i = 0, ret = 0;
+
+	/* extract the device name first */
+	dev_name = strsep(&options, " ");
+	if (!dev_name) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+	if (!new_limits) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = rdmacg_parse_limits(options, new_limits, &enables);
+	if (ret)
+		goto parse_err;
+
+	/* acquire lock to synchronize with hot plug devices */
+	mutex_lock(&rdmacg_mutex);
+
+	device = rdmacg_get_device_locked(dev_name);
+	if (!device) {
+		ret = -ENODEV;
+		goto dev_err;
+	}
+
+	rpool = get_cg_rpool_locked(cg, device);
+	if (IS_ERR_OR_NULL(rpool)) {
+		ret = PTR_ERR(rpool);
+		goto dev_err;
+	}
+
+	/* now set the new limits of the rpool */
+	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+		set_resource_limit(rpool, i, new_limits[i]);
+
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+
+dev_err:
+	mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+	kfree(new_limits);
+
+err:
+	return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+			       struct rdmacg_resource_pool *rpool)
+{
+	enum rdmacg_file_type sf_type;
+	int i;
+	u32 value;
+
+	sf_type = seq_cft(sf)->private;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		seq_puts(sf, rdmacg_resource_names[i]);
+		seq_putc(sf, '=');
+		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+			if (rpool)
+				value = rpool->resources[i].max;
+			else
+				value = S32_MAX;
+		} else {
+			if (rpool)
+				value = rpool->resources[i].usage;
+		}
+
+		if (value == S32_MAX)
+			seq_puts(sf, RDMACG_MAX_STR);
+		else
+			seq_printf(sf, "%d", value);
+		seq_putc(sf, ' ');
+	}
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+	struct rdmacg_device *device;
+	struct rdmacg_resource_pool *rpool;
+	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node) {
+		seq_printf(sf, "%s ", device->name);
+
+		rpool = find_cg_rpool_locked(cg, device);
+		print_rpool_values(sf, rpool);
+
+		seq_putc(sf, '\n');
+	}
+
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+	{
+		.name = "max",
+		.write = rdmacg_resource_set_max,
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_MAX,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_STAT,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }	/* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct rdma_cgroup *cg;
+
+	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+	if (!cg)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&cg->rpools);
+	return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+
+	kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+	struct rdmacg_resource_pool *rpool;
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(rpool, &cg->rpools, cg_node)
+		set_all_resource_max_limit(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+	.css_alloc	= rdmacg_css_alloc,
+	.css_free	= rdmacg_css_free,
+	.css_offline	= rdmacg_css_offline,
+	.legacy_cftypes	= rdmacg_files,
+	.dfl_cftypes	= rdmacg_files,
+};