[1/5] IB/core: add a simple SRQ set per PD

Message ID: 20200317134030.152833-2-maxg@mellanox.com
State: Superseded
Series: nvmet-rdma/srpt: SRQ per completion vector

Commit Message

Max Gurtovoy March 17, 2020, 1:40 p.m. UTC
ULPs can use this API to create/destroy SRQs with the same
characteristics in order to implement logic that aims to save resources
without a significant performance penalty (e.g. create an SRQ per
completion vector and use shared receive buffers for multiple
controllers of the ULP).

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
---
 drivers/infiniband/core/Makefile  |  2 +-
 drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/verbs.c   |  4 ++
 include/rdma/ib_verbs.h           |  5 +++
 include/rdma/srq_set.h            | 18 +++++++++
 5 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/srq_set.c
 create mode 100644 include/rdma/srq_set.h
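
For context, a minimal sketch of how a ULP might consume this API (the
rdma_srq_* calls are the ones added by this patch; the surrounding ULP
code, the queue depth and qp_init_attr are illustrative only):

	/* Hypothetical ULP bring-up: one identical SRQ per completion
	 * vector of the device. */
	static int ulp_setup_srq_set(struct ib_device *ibdev, struct ib_pd *pd)
	{
		struct ib_srq_init_attr srq_attr = {
			.attr = {
				.max_wr  = 1024,	/* illustrative depth */
				.max_sge = 1,
			},
			.srq_type = IB_SRQT_BASIC,
		};

		return rdma_srq_set_init(pd, ibdev->num_comp_vectors, &srq_attr);
	}

	/* Each controller queue then borrows an SRQ for its QP ... */
	srq = rdma_srq_get(pd);
	if (!srq)
		return -ENOMEM;		/* set exhausted */
	qp_init_attr.srq = srq;
	...
	/* ... returns it at queue teardown, and finally drains the set. */
	rdma_srq_put(pd, srq);
	rdma_srq_set_destroy(pd);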

Comments

Leon Romanovsky March 17, 2020, 1:55 p.m. UTC | #1
On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
> ULPs can use this API to create/destroy SRQs with the same
> characteristics in order to implement logic that aims to save resources
> without a significant performance penalty (e.g. create an SRQ per
> completion vector and use shared receive buffers for multiple
> controllers of the ULP).
>
> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
> ---
>  drivers/infiniband/core/Makefile  |  2 +-
>  drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
>  drivers/infiniband/core/verbs.c   |  4 ++
>  include/rdma/ib_verbs.h           |  5 +++
>  include/rdma/srq_set.h            | 18 +++++++++
>  5 files changed, 106 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/core/srq_set.c
>  create mode 100644 include/rdma/srq_set.h
>
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index d1b14887..1d3eaec 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
>  				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
>  				multicast.o mad.o smi.o agent.o mad_rmpp.o \
>  				nldev.o restrack.o counters.o ib_core_uverbs.o \
> -				trace.o
> +				trace.o srq_set.o

Why did you call it "srq_set.c" and not "srq.c"?

>
>  ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>  ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
> diff --git a/drivers/infiniband/core/srq_set.c b/drivers/infiniband/core/srq_set.c
> new file mode 100644
> index 0000000..d143561
> --- /dev/null
> +++ b/drivers/infiniband/core/srq_set.c
> @@ -0,0 +1,78 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
> + */
> +
> +#include <rdma/srq_set.h>
> +
> +struct ib_srq *rdma_srq_get(struct ib_pd *pd)
> +{
> +	struct ib_srq *srq;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pd->srq_lock, flags);
> +	srq = list_first_entry_or_null(&pd->srqs, struct ib_srq, pd_entry);
> +	if (srq) {
> +		list_del(&srq->pd_entry);
> +		pd->srqs_used++;

I don't see any real usage of srqs_used.

> +	}
> +	spin_unlock_irqrestore(&pd->srq_lock, flags);
> +
> +	return srq;
> +}
> +EXPORT_SYMBOL(rdma_srq_get);
> +
> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pd->srq_lock, flags);
> +	list_add(&srq->pd_entry, &pd->srqs);
> +	pd->srqs_used--;
> +	spin_unlock_irqrestore(&pd->srq_lock, flags);
> +}
> +EXPORT_SYMBOL(rdma_srq_put);
> +
> +int rdma_srq_set_init(struct ib_pd *pd, int nr,
> +		struct ib_srq_init_attr *srq_attr)
> +{
> +	struct ib_srq *srq;
> +	unsigned long flags;
> +	int ret, i;
> +
> +	for (i = 0; i < nr; i++) {
> +		srq = ib_create_srq(pd, srq_attr);
> +		if (IS_ERR(srq)) {
> +			ret = PTR_ERR(srq);
> +			goto out;
> +		}
> +
> +		spin_lock_irqsave(&pd->srq_lock, flags);
> +		list_add_tail(&srq->pd_entry, &pd->srqs);
> +		spin_unlock_irqrestore(&pd->srq_lock, flags);
> +	}
> +
> +	return 0;
> +out:
> +	rdma_srq_set_destroy(pd);
> +	return ret;
> +}
> +EXPORT_SYMBOL(rdma_srq_set_init);
> +
> +void rdma_srq_set_destroy(struct ib_pd *pd)
> +{
> +	struct ib_srq *srq;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pd->srq_lock, flags);
> +	while (!list_empty(&pd->srqs)) {
> +		srq = list_first_entry(&pd->srqs, struct ib_srq, pd_entry);
> +		list_del(&srq->pd_entry);
> +
> +		spin_unlock_irqrestore(&pd->srq_lock, flags);
> +		ib_destroy_srq(srq);
> +		spin_lock_irqsave(&pd->srq_lock, flags);
> +	}
> +	spin_unlock_irqrestore(&pd->srq_lock, flags);
> +}
> +EXPORT_SYMBOL(rdma_srq_set_destroy);
> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> index e62c9df..6950abf 100644
> --- a/drivers/infiniband/core/verbs.c
> +++ b/drivers/infiniband/core/verbs.c
> @@ -272,6 +272,9 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
>  	pd->__internal_mr = NULL;
>  	atomic_set(&pd->usecnt, 0);
>  	pd->flags = flags;
> +	pd->srqs_used = 0;
> +	spin_lock_init(&pd->srq_lock);
> +	INIT_LIST_HEAD(&pd->srqs);
>
>  	pd->res.type = RDMA_RESTRACK_PD;
>  	rdma_restrack_set_task(&pd->res, caller);
> @@ -340,6 +343,7 @@ void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
>  		pd->__internal_mr = NULL;
>  	}
>
> +	WARN_ON_ONCE(pd->srqs_used > 0);

It can be achieved by WARN_ON_ONCE(!list_empty(&pd->srqs))

>  	/* uverbs manipulates usecnt with proper locking, while the kabi
>  	   requires the caller to guarantee we can't race here. */
>  	WARN_ON(atomic_read(&pd->usecnt));
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 1f779fa..fc8207d 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1517,6 +1517,10 @@ struct ib_pd {
>
>  	u32			unsafe_global_rkey;
>
> +	spinlock_t		srq_lock;
> +	int			srqs_used;
> +	struct list_head	srqs;
> +
>  	/*
>  	 * Implementation details of the RDMA core, don't use in drivers:
>  	 */
> @@ -1585,6 +1589,7 @@ struct ib_srq {
>  	void		       *srq_context;
>  	enum ib_srq_type	srq_type;
>  	atomic_t		usecnt;
> +	struct list_head	pd_entry; /* srq set entry */
>
>  	struct {
>  		struct ib_cq   *cq;
> diff --git a/include/rdma/srq_set.h b/include/rdma/srq_set.h
> new file mode 100644
> index 0000000..834c4c6
> --- /dev/null
> +++ b/include/rdma/srq_set.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */
> +/*
> + * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
> + */
> +
> +#ifndef _RDMA_SRQ_SET_H
> +#define _RDMA_SRQ_SET_H 1

"1" is not needed.

> +
> +#include <rdma/ib_verbs.h>
> +
> +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);

In the end, these are not get/put semantics but more like add/remove.

Thanks

> +
> +int rdma_srq_set_init(struct ib_pd *pd, int nr,
> +		struct ib_srq_init_attr *srq_attr);
> +void rdma_srq_set_destroy(struct ib_pd *pd);
> +
> +#endif /* _RDMA_SRQ_SET_H */
> --
> 1.8.3.1
>
Max Gurtovoy March 17, 2020, 4:37 p.m. UTC | #2
On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
> On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
>> ULPs can use this API to create/destroy SRQs with the same
>> characteristics in order to implement logic that aims to save resources
>> without a significant performance penalty (e.g. create an SRQ per
>> completion vector and use shared receive buffers for multiple
>> controllers of the ULP).
>>
>> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
>> ---
>>   drivers/infiniband/core/Makefile  |  2 +-
>>   drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
>>   drivers/infiniband/core/verbs.c   |  4 ++
>>   include/rdma/ib_verbs.h           |  5 +++
>>   include/rdma/srq_set.h            | 18 +++++++++
>>   5 files changed, 106 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/infiniband/core/srq_set.c
>>   create mode 100644 include/rdma/srq_set.h
>>
>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
>> index d1b14887..1d3eaec 100644
>> --- a/drivers/infiniband/core/Makefile
>> +++ b/drivers/infiniband/core/Makefile
>> @@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
>>   				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
>>   				multicast.o mad.o smi.o agent.o mad_rmpp.o \
>>   				nldev.o restrack.o counters.o ib_core_uverbs.o \
>> -				trace.o
>> +				trace.o srq_set.o
> Why did you call it "srq_set.c" and not "srq.c"?

because it's not an SRQ API but an SRQ-set API.


>>   ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>>   ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
>> diff --git a/drivers/infiniband/core/srq_set.c b/drivers/infiniband/core/srq_set.c
>> new file mode 100644
>> index 0000000..d143561
>> --- /dev/null
>> +++ b/drivers/infiniband/core/srq_set.c
>> @@ -0,0 +1,78 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/*
>> + * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
>> + */
>> +
>> +#include <rdma/srq_set.h>
>> +
>> +struct ib_srq *rdma_srq_get(struct ib_pd *pd)
>> +{
>> +	struct ib_srq *srq;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&pd->srq_lock, flags);
>> +	srq = list_first_entry_or_null(&pd->srqs, struct ib_srq, pd_entry);
>> +	if (srq) {
>> +		list_del(&srq->pd_entry);
>> +		pd->srqs_used++;
> I don't see any real usage of srqs_used.
>
>> +	}
>> +	spin_unlock_irqrestore(&pd->srq_lock, flags);
>> +
>> +	return srq;
>> +}
>> +EXPORT_SYMBOL(rdma_srq_get);
>> +
>> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq)
>> +{
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&pd->srq_lock, flags);
>> +	list_add(&srq->pd_entry, &pd->srqs);
>> +	pd->srqs_used--;
>> +	spin_unlock_irqrestore(&pd->srq_lock, flags);
>> +}
>> +EXPORT_SYMBOL(rdma_srq_put);
>> +
>> +int rdma_srq_set_init(struct ib_pd *pd, int nr,
>> +		struct ib_srq_init_attr *srq_attr)
>> +{
>> +	struct ib_srq *srq;
>> +	unsigned long flags;
>> +	int ret, i;
>> +
>> +	for (i = 0; i < nr; i++) {
>> +		srq = ib_create_srq(pd, srq_attr);
>> +		if (IS_ERR(srq)) {
>> +			ret = PTR_ERR(srq);
>> +			goto out;
>> +		}
>> +
>> +		spin_lock_irqsave(&pd->srq_lock, flags);
>> +		list_add_tail(&srq->pd_entry, &pd->srqs);
>> +		spin_unlock_irqrestore(&pd->srq_lock, flags);
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	rdma_srq_set_destroy(pd);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(rdma_srq_set_init);
>> +
>> +void rdma_srq_set_destroy(struct ib_pd *pd)
>> +{
>> +	struct ib_srq *srq;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&pd->srq_lock, flags);
>> +	while (!list_empty(&pd->srqs)) {
>> +		srq = list_first_entry(&pd->srqs, struct ib_srq, pd_entry);
>> +		list_del(&srq->pd_entry);
>> +
>> +		spin_unlock_irqrestore(&pd->srq_lock, flags);
>> +		ib_destroy_srq(srq);
>> +		spin_lock_irqsave(&pd->srq_lock, flags);
>> +	}
>> +	spin_unlock_irqrestore(&pd->srq_lock, flags);
>> +}
>> +EXPORT_SYMBOL(rdma_srq_set_destroy);
>> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
>> index e62c9df..6950abf 100644
>> --- a/drivers/infiniband/core/verbs.c
>> +++ b/drivers/infiniband/core/verbs.c
>> @@ -272,6 +272,9 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
>>   	pd->__internal_mr = NULL;
>>   	atomic_set(&pd->usecnt, 0);
>>   	pd->flags = flags;
>> +	pd->srqs_used = 0;
>> +	spin_lock_init(&pd->srq_lock);
>> +	INIT_LIST_HEAD(&pd->srqs);
>>
>>   	pd->res.type = RDMA_RESTRACK_PD;
>>   	rdma_restrack_set_task(&pd->res, caller);
>> @@ -340,6 +343,7 @@ void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
>>   		pd->__internal_mr = NULL;
>>   	}
>>
>> +	WARN_ON_ONCE(pd->srqs_used > 0);
> It can be achieved by WARN_ON_ONCE(!list_empty(&pd->srqs))

OK, I'll change it for v2.


>
>>   	/* uverbs manipulates usecnt with proper locking, while the kabi
>>   	   requires the caller to guarantee we can't race here. */
>>   	WARN_ON(atomic_read(&pd->usecnt));
>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
>> index 1f779fa..fc8207d 100644
>> --- a/include/rdma/ib_verbs.h
>> +++ b/include/rdma/ib_verbs.h
>> @@ -1517,6 +1517,10 @@ struct ib_pd {
>>
>>   	u32			unsafe_global_rkey;
>>
>> +	spinlock_t		srq_lock;
>> +	int			srqs_used;
>> +	struct list_head	srqs;
>> +
>>   	/*
>>   	 * Implementation details of the RDMA core, don't use in drivers:
>>   	 */
>> @@ -1585,6 +1589,7 @@ struct ib_srq {
>>   	void		       *srq_context;
>>   	enum ib_srq_type	srq_type;
>>   	atomic_t		usecnt;
>> +	struct list_head	pd_entry; /* srq set entry */
>>
>>   	struct {
>>   		struct ib_cq   *cq;
>> diff --git a/include/rdma/srq_set.h b/include/rdma/srq_set.h
>> new file mode 100644
>> index 0000000..834c4c6
>> --- /dev/null
>> +++ b/include/rdma/srq_set.h
>> @@ -0,0 +1,18 @@
>> +/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */
>> +/*
>> + * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
>> + */
>> +
>> +#ifndef _RDMA_SRQ_SET_H
>> +#define _RDMA_SRQ_SET_H 1
> "1" is not needed.

Agreed.


>
>> +
>> +#include <rdma/ib_verbs.h>
>> +
>> +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
>> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
> In the end, these are not get/put semantics but more like add/remove.

srq = rdma_srq_add ?

rdma_srq_remove(pd, srq) ?

Doesn't seem right to me.

Let's make it simple: to take an SRQ from the PD's set we'll use
rdma_srq_get, and to return it we'll use rdma_srq_put.

>
> Thanks
>
>> +
>> +int rdma_srq_set_init(struct ib_pd *pd, int nr,
>> +		struct ib_srq_init_attr *srq_attr);
>> +void rdma_srq_set_destroy(struct ib_pd *pd);
>> +
>> +#endif /* _RDMA_SRQ_SET_H */
>> --
>> 1.8.3.1
>>
Jason Gunthorpe March 17, 2020, 6:10 p.m. UTC | #3
On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:

> > > +#include <rdma/ib_verbs.h>
> > > +
> > > +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
> > > +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
> > In the end, these are not get/put semantics but more like add/remove.
> 
> srq = rdma_srq_add ?
> 
> rdma_srq_remove(pd, srq) ?
> 
> Doesn't seem right to me.
>
> Let's make it simple: to take an SRQ from the PD's set we'll use rdma_srq_get,
> and to return it we'll use rdma_srq_put.

Is there reference counting here? get/put should be restricted to
refcounting APIs, IMHO.

Jason
Max Gurtovoy March 17, 2020, 6:24 p.m. UTC | #4
On 3/17/2020 8:10 PM, Jason Gunthorpe wrote:
> On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>
>>>> +#include <rdma/ib_verbs.h>
>>>> +
>>>> +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
>>>> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
>>> In the end, these are not get/put semantics but more like add/remove.
>> srq = rdma_srq_add ?
>>
>> rdma_srq_remove(pd, srq) ?
>>
>> Doesn't seem right to me.
>>
>> Let's make it simple: to take an SRQ from the PD's set we'll use rdma_srq_get,
>> and to return it we'll use rdma_srq_put.
> Is there reference counting here? get/put should be restricted to
> refcounting APIs, IMHO.

I've added a counter (pd->srqs_used) that Leon asked to remove.

There is no call to kref get/put here.

Do you prefer that I change it to be an array in the PD:
"struct ib_srq **srqs;"?

And update the ib_alloc_pd API to take pd_attrs and allocate the array
during PD allocation?

>
> Jason
Jason Gunthorpe March 17, 2020, 6:43 p.m. UTC | #5
On Tue, Mar 17, 2020 at 08:24:30PM +0200, Max Gurtovoy wrote:
> 
> On 3/17/2020 8:10 PM, Jason Gunthorpe wrote:
> > On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
> > 
> > > > > +#include <rdma/ib_verbs.h>
> > > > > +
> > > > > +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
> > > > > +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
> > > > In the end, these are not get/put semantics but more like add/remove.
> > > srq = rdma_srq_add ?
> > > 
> > > rdma_srq_remove(pd, srq) ?
> > > 
> > > Doesn't seem right to me.
> > >
> > > Let's make it simple: to take an SRQ from the PD's set we'll use rdma_srq_get,
> > > and to return it we'll use rdma_srq_put.
> > Is there reference counting here? get/put should be restricted to
> > refcounting APIs, IMHO.
> 
> I've added a counter (pd->srqs_used) that Leon asked to remove.
> 
> There is no call to kref get/put here.

I didn't look closely; any kind of refcount scheme is reasonable, but
if add is supposed to create a new SRQ then that isn't 'get'.

> Do you prefer that I change it to be an array in the PD:
> "struct ib_srq **srqs;"?

Not particularly..

It actually feels a bit weird; should there be some NUMA-awareness
involved here, so that the SRQ whose memory is on the node that is
going to be polling it is returned?
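
For illustration, NUMA-aware selection could look roughly like the
following sketch (rdma_srq_get_node() and the numa_node field are
hypothetical and not part of this series; the pd->srqs list and
pd->srq_lock are the ones this patch adds):

	/* Hypothetical: prefer an SRQ whose buffers live on 'node'. */
	static struct ib_srq *rdma_srq_get_node(struct ib_pd *pd, int node)
	{
		struct ib_srq *srq, *found = NULL;
		unsigned long flags;

		spin_lock_irqsave(&pd->srq_lock, flags);
		list_for_each_entry(srq, &pd->srqs, pd_entry) {
			if (srq->numa_node == node) {	/* hypothetical field */
				found = srq;
				break;
			}
		}
		if (!found)	/* fall back to any free SRQ */
			found = list_first_entry_or_null(&pd->srqs,
							 struct ib_srq, pd_entry);
		if (found)
			list_del(&found->pd_entry);
		spin_unlock_irqrestore(&pd->srq_lock, flags);

		return found;
	}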

> And update the ib_alloc_pd API to take pd_attrs and allocate the array
> during PD allocation?

The API is a bit more composable if things can be done as follow-on
function calls. I don't like the giant multiplexor structs in general.
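
In other words (a hypothetical contrast; struct ib_pd_attrs and
ib_alloc_pd_with_attrs() do not exist anywhere):

	/* Multiplexed style: everything through one big attrs struct. */
	struct ib_pd_attrs pd_attrs = {
		.nr_srqs  = 8,
		.srq_attr = &srq_attr,
	};
	pd = ib_alloc_pd_with_attrs(dev, &pd_attrs);	/* hypothetical */

	/* Composable style: SRQ setup as an orthogonal follow-on call. */
	pd = ib_alloc_pd(dev, 0);
	ret = rdma_srq_set_init(pd, 8, &srq_attr);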

Jason
Leon Romanovsky March 17, 2020, 7:54 p.m. UTC | #6
On Tue, Mar 17, 2020 at 03:10:36PM -0300, Jason Gunthorpe wrote:
> On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>
> > > > +#include <rdma/ib_verbs.h>
> > > > +
> > > > +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
> > > > +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
> > > In the end, these are not get/put semantics but more like add/remove.
> >
> > srq = rdma_srq_add ?
> >
> > rdma_srq_remove(pd, srq) ?
> >
> > Doesn't seem right to me.
> >
> > Let's make it simple: to take an SRQ from the PD's set we'll use rdma_srq_get,
> > and to return it we'll use rdma_srq_put.
>
> Is there reference counting here? get/put should be restricted to
> refcounting APIs, IMHO.

No, there is no refcounting. Max introduced a counter which he
decreases and increases but makes no other use of, simply burning CPU
cycles.

Thanks

>
> Jason
Max Gurtovoy March 17, 2020, 9:56 p.m. UTC | #7
On 3/17/2020 8:43 PM, Jason Gunthorpe wrote:
> On Tue, Mar 17, 2020 at 08:24:30PM +0200, Max Gurtovoy wrote:
>> On 3/17/2020 8:10 PM, Jason Gunthorpe wrote:
>>> On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>>>
>>>>>> +#include <rdma/ib_verbs.h>
>>>>>> +
>>>>>> +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
>>>>>> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
>>>>> In the end, these are not get/put semantics but more like add/remove.
>>>> srq = rdma_srq_add ?
>>>>
>>>> rdma_srq_remove(pd, srq) ?
>>>>
>>>> Doesn't seem right to me.
>>>>
>>>> Let's make it simple: to take an SRQ from the PD's set we'll use rdma_srq_get,
>>>> and to return it we'll use rdma_srq_put.
>>> Is there reference counting here? get/put should be restricted to
>>> refcounting APIs, IMHO.
>> I've added a counter (pd->srqs_used) that Leon asked to remove.
>>
>> There is no call to kref get/put here.
> I didn't look closely; any kind of refcount scheme is reasonable, but
> if add is supposed to create a new SRQ then that isn't 'get'.

No, we don't create a new SRQ during the "get". We create a set using
"rdma_srq_set_init".

"get" will simply pull an SRQ from the set and "put" will push it back.

>
>> Do you prefer that I change it to be an array in the PD:
>> "struct ib_srq **srqs;"?
> Not particularly..
>
> It actually feels a bit weird; should there be some NUMA-awareness
> involved here, so that the SRQ whose memory is on the node that is
> going to be polling it is returned?

Maybe that will be the next improvement, but for now the receive
buffers are allocated by the ULP.

The idea is to spread the SRQs as much as possible, as we do for
QPs/CQs, to reach almost the same performance.

In the case of a single SRQ we can't reach good performance since many
cores are racing for one resource.

In the case of a regular RQ we allocate many buffers that are idle most
of the time.

If we spread SRQs across all the cores/vectors we have, we'll get great
performance while saving resources, which can be critical in MQ ULPs
such as NVMf/SRP targets that may serve hundreds of initiators with
hundreds of queues each.
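
As an illustration of that spreading (hypothetical ULP code: only
rdma_srq_get() comes from this series, ib_alloc_cq() is existing core
API, and struct ulp_queue / ULP_QUEUE_DEPTH are made up):

	/* Queue 'idx' polls a CQ on its own completion vector and attaches
	 * a shared receive queue borrowed from the per-PD set. */
	static int ulp_init_queue(struct ulp_queue *q, struct ib_device *dev,
				  struct ib_pd *pd, int idx)
	{
		q->cq = ib_alloc_cq(dev, q, ULP_QUEUE_DEPTH,
				    idx % dev->num_comp_vectors,
				    IB_POLL_SOFTIRQ);
		if (IS_ERR(q->cq))
			return PTR_ERR(q->cq);

		q->srq = rdma_srq_get(pd);	/* borrow, don't create */
		if (!q->srq) {
			ib_free_cq(q->cq);
			return -ENOMEM;
		}
		return 0;
	}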


>
>> And update the ib_alloc_pd API to take pd_attrs and allocate the array
>> during PD allocation?
> The API is a bit more composable if things can can be done as
> following function calls are done that way.. I don't like the giant
> multiplexor structs in general
>
> Jason
Leon Romanovsky March 18, 2020, 6:47 a.m. UTC | #8
On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>
> On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
> > On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
> > > ULPs can use this API to create/destroy SRQs with the same
> > > characteristics in order to implement logic that aims to save resources
> > > without a significant performance penalty (e.g. create an SRQ per
> > > completion vector and use shared receive buffers for multiple
> > > controllers of the ULP).
> > >
> > > Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
> > > ---
> > >   drivers/infiniband/core/Makefile  |  2 +-
> > >   drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
> > >   drivers/infiniband/core/verbs.c   |  4 ++
> > >   include/rdma/ib_verbs.h           |  5 +++
> > >   include/rdma/srq_set.h            | 18 +++++++++
> > >   5 files changed, 106 insertions(+), 1 deletion(-)
> > >   create mode 100644 drivers/infiniband/core/srq_set.c
> > >   create mode 100644 include/rdma/srq_set.h
> > >
> > > diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> > > index d1b14887..1d3eaec 100644
> > > --- a/drivers/infiniband/core/Makefile
> > > +++ b/drivers/infiniband/core/Makefile
> > > @@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
> > >   				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
> > >   				multicast.o mad.o smi.o agent.o mad_rmpp.o \
> > >   				nldev.o restrack.o counters.o ib_core_uverbs.o \
> > > -				trace.o
> > > +				trace.o srq_set.o
> > Why did you call it "srq_set.c" and not "srq.c"?
>
> because it's not an SRQ API but an SRQ-set API.

I would say that it is an SRQ-pool API and not an SRQ-set API.

Thanks
Max Gurtovoy March 18, 2020, 9:46 a.m. UTC | #9
On 3/18/2020 8:47 AM, Leon Romanovsky wrote:
> On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>> On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
>>> On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
>>>> ULPs can use this API to create/destroy SRQs with the same
>>>> characteristics in order to implement logic that aims to save resources
>>>> without a significant performance penalty (e.g. create an SRQ per
>>>> completion vector and use shared receive buffers for multiple
>>>> controllers of the ULP).
>>>>
>>>> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
>>>> ---
>>>>    drivers/infiniband/core/Makefile  |  2 +-
>>>>    drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
>>>>    drivers/infiniband/core/verbs.c   |  4 ++
>>>>    include/rdma/ib_verbs.h           |  5 +++
>>>>    include/rdma/srq_set.h            | 18 +++++++++
>>>>    5 files changed, 106 insertions(+), 1 deletion(-)
>>>>    create mode 100644 drivers/infiniband/core/srq_set.c
>>>>    create mode 100644 include/rdma/srq_set.h
>>>>
>>>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
>>>> index d1b14887..1d3eaec 100644
>>>> --- a/drivers/infiniband/core/Makefile
>>>> +++ b/drivers/infiniband/core/Makefile
>>>> @@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
>>>>    				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
>>>>    				multicast.o mad.o smi.o agent.o mad_rmpp.o \
>>>>    				nldev.o restrack.o counters.o ib_core_uverbs.o \
>>>> -				trace.o
>>>> +				trace.o srq_set.o
>>> Why did you call it "srq_set.c" and not "srq.c"?
>> because it's not an SRQ API but an SRQ-set API.
> I would say that it is an SRQ-pool API and not an SRQ-set API.

If you have some other idea for an API, please share with us.

I've created this API in the core layer to make life easier for the
ULPs, and we can see that it's very easy to add this feature to ULPs
and get a big benefit from it.

>
> Thanks
Leon Romanovsky March 18, 2020, 10:29 a.m. UTC | #10
On Wed, Mar 18, 2020 at 11:46:19AM +0200, Max Gurtovoy wrote:
>
> On 3/18/2020 8:47 AM, Leon Romanovsky wrote:
> > On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
> > > On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
> > > > On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
> > > > > ULPs can use this API to create/destroy SRQs with the same
> > > > > characteristics in order to implement logic that aims to save resources
> > > > > without a significant performance penalty (e.g. create an SRQ per
> > > > > completion vector and use shared receive buffers for multiple
> > > > > controllers of the ULP).
> > > > >
> > > > > Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
> > > > > ---
> > > > >    drivers/infiniband/core/Makefile  |  2 +-
> > > > >    drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
> > > > >    drivers/infiniband/core/verbs.c   |  4 ++
> > > > >    include/rdma/ib_verbs.h           |  5 +++
> > > > >    include/rdma/srq_set.h            | 18 +++++++++
> > > > >    5 files changed, 106 insertions(+), 1 deletion(-)
> > > > >    create mode 100644 drivers/infiniband/core/srq_set.c
> > > > >    create mode 100644 include/rdma/srq_set.h
> > > > >
> > > > > diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> > > > > index d1b14887..1d3eaec 100644
> > > > > --- a/drivers/infiniband/core/Makefile
> > > > > +++ b/drivers/infiniband/core/Makefile
> > > > > @@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
> > > > >    				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
> > > > >    				multicast.o mad.o smi.o agent.o mad_rmpp.o \
> > > > >    				nldev.o restrack.o counters.o ib_core_uverbs.o \
> > > > > -				trace.o
> > > > > +				trace.o srq_set.o
> > > > Why did you call it "srq_set.c" and not "srq.c"?
> > > because it's not an SRQ API but an SRQ-set API.
> > I would say that it is an SRQ-pool API and not an SRQ-set API.
>
> If you have some other idea for an API, please share with us.
>
> I've created this API in the core layer to make life easier for the ULPs,
> and we can see that it's very easy to add this feature to ULPs and get a
> big benefit from it.

No one here spoke against the feature; we just tried to understand the
rationale behind the *_set name and why you decided to use the "set"
term and not "pool", as was done for the MR pool.

So my suggestion is to name the file srq_pool.c and use
rdma_srq_pool_*() function names.

Thanks

>
> >
> > Thanks
Max Gurtovoy March 18, 2020, 10:39 a.m. UTC | #11
On 3/18/2020 12:29 PM, Leon Romanovsky wrote:
> On Wed, Mar 18, 2020 at 11:46:19AM +0200, Max Gurtovoy wrote:
>> On 3/18/2020 8:47 AM, Leon Romanovsky wrote:
>>> On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>>>> On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
>>>>> On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
>>>>>> ULPs can use this API to create/destroy SRQs with the same
>>>>>> characteristics in order to implement logic that aims to save resources
>>>>>> without a significant performance penalty (e.g. create an SRQ per
>>>>>> completion vector and use shared receive buffers for multiple
>>>>>> controllers of the ULP).
>>>>>>
>>>>>> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
>>>>>> ---
>>>>>>     drivers/infiniband/core/Makefile  |  2 +-
>>>>>>     drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
>>>>>>     drivers/infiniband/core/verbs.c   |  4 ++
>>>>>>     include/rdma/ib_verbs.h           |  5 +++
>>>>>>     include/rdma/srq_set.h            | 18 +++++++++
>>>>>>     5 files changed, 106 insertions(+), 1 deletion(-)
>>>>>>     create mode 100644 drivers/infiniband/core/srq_set.c
>>>>>>     create mode 100644 include/rdma/srq_set.h
>>>>>>
>>>>>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
>>>>>> index d1b14887..1d3eaec 100644
>>>>>> --- a/drivers/infiniband/core/Makefile
>>>>>> +++ b/drivers/infiniband/core/Makefile
>>>>>> @@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
>>>>>>     				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
>>>>>>     				multicast.o mad.o smi.o agent.o mad_rmpp.o \
>>>>>>     				nldev.o restrack.o counters.o ib_core_uverbs.o \
>>>>>> -				trace.o
>>>>>> +				trace.o srq_set.o
>>>>> Why did you call it "srq_set.c" and not "srq.c"?
>>>> because it's not an SRQ API but an SRQ-set API.
>>> I would say that it is an SRQ-pool API and not an SRQ-set API.
>> If you have some other idea for an API, please share with us.
>>
>> I've created this API in the core layer to make life easier for the ULPs,
>> and we can see that it's very easy to add this feature to ULPs and get a
>> big benefit from it.
> No one here spoke against the feature; we just tried to understand the
> rationale behind the *_set name and why you decided to use the "set"
> term and not "pool", as was done for the MR pool.

OK. But srq_pool was the name I used 2 years ago and you didn't like
it back then.

So I renamed it to srq_set.

>
> So my suggestion is to name the file srq_pool.c and use
> rdma_srq_pool_*() function names.

No problem.

>
> Thanks
>
>>> Thanks
Leon Romanovsky March 18, 2020, 10:46 a.m. UTC | #12
On Wed, Mar 18, 2020 at 12:39:44PM +0200, Max Gurtovoy wrote:
>
> On 3/18/2020 12:29 PM, Leon Romanovsky wrote:
> > On Wed, Mar 18, 2020 at 11:46:19AM +0200, Max Gurtovoy wrote:
> > > On 3/18/2020 8:47 AM, Leon Romanovsky wrote:
> > > > On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
> > > > > On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
> > > > > > On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
> > > > > > > ULPs can use this API to create/destroy SRQs with the same
> > > > > > > characteristics in order to implement logic that aims to save resources
> > > > > > > without a significant performance penalty (e.g. create an SRQ per
> > > > > > > completion vector and use shared receive buffers for multiple
> > > > > > > controllers of the ULP).
> > > > > > >
> > > > > > > Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
> > > > > > > ---
> > > > > > >     drivers/infiniband/core/Makefile  |  2 +-
> > > > > > >     drivers/infiniband/core/srq_set.c | 78 +++++++++++++++++++++++++++++++++++++++
> > > > > > >     drivers/infiniband/core/verbs.c   |  4 ++
> > > > > > >     include/rdma/ib_verbs.h           |  5 +++
> > > > > > >     include/rdma/srq_set.h            | 18 +++++++++
> > > > > > >     5 files changed, 106 insertions(+), 1 deletion(-)
> > > > > > >     create mode 100644 drivers/infiniband/core/srq_set.c
> > > > > > >     create mode 100644 include/rdma/srq_set.h
> > > > > > >
> > > > > > > diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> > > > > > > index d1b14887..1d3eaec 100644
> > > > > > > --- a/drivers/infiniband/core/Makefile
> > > > > > > +++ b/drivers/infiniband/core/Makefile
> > > > > > > @@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
> > > > > > >     				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
> > > > > > >     				multicast.o mad.o smi.o agent.o mad_rmpp.o \
> > > > > > >     				nldev.o restrack.o counters.o ib_core_uverbs.o \
> > > > > > > -				trace.o
> > > > > > > +				trace.o srq_set.o
> > > > > > Why did you call it "srq_set.c" and not "srq.c"?
> > > > > because it's not an SRQ API but an SRQ-set API.
> > > > I would say that it is an SRQ-pool API and not an SRQ-set API.
> > > If you have some other idea for an API, please share with us.
> > >
> > > I've created this API in the core layer to make life easier for the ULPs,
> > > and we can see that it's very easy to add this feature to ULPs and get a
> > > big benefit from it.
> > No one here spoke against the feature; we just tried to understand the
> > rationale behind the *_set name and why you decided to use the "set"
> > term and not "pool", as was done for the MR pool.
>
> OK. But srq_pool was the name I used 2 years ago and you didn't like it
> back then.

I don't like it today either, but I don't have a better name to suggest.

Thanks

Patch

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d1b14887..1d3eaec 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@  ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
 				nldev.o restrack.o counters.o ib_core_uverbs.o \
-				trace.o
+				trace.o srq_set.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/srq_set.c b/drivers/infiniband/core/srq_set.c
new file mode 100644
index 0000000..d143561
--- /dev/null
+++ b/drivers/infiniband/core/srq_set.c
@@ -0,0 +1,78 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/srq_set.h>
+
+struct ib_srq *rdma_srq_get(struct ib_pd *pd)
+{
+	struct ib_srq *srq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pd->srq_lock, flags);
+	srq = list_first_entry_or_null(&pd->srqs, struct ib_srq, pd_entry);
+	if (srq) {
+		list_del(&srq->pd_entry);
+		pd->srqs_used++;
+	}
+	spin_unlock_irqrestore(&pd->srq_lock, flags);
+
+	return srq;
+}
+EXPORT_SYMBOL(rdma_srq_get);
+
+void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pd->srq_lock, flags);
+	list_add(&srq->pd_entry, &pd->srqs);
+	pd->srqs_used--;
+	spin_unlock_irqrestore(&pd->srq_lock, flags);
+}
+EXPORT_SYMBOL(rdma_srq_put);
+
+int rdma_srq_set_init(struct ib_pd *pd, int nr,
+		struct ib_srq_init_attr *srq_attr)
+{
+	struct ib_srq *srq;
+	unsigned long flags;
+	int ret, i;
+
+	for (i = 0; i < nr; i++) {
+		srq = ib_create_srq(pd, srq_attr);
+		if (IS_ERR(srq)) {
+			ret = PTR_ERR(srq);
+			goto out;
+		}
+
+		spin_lock_irqsave(&pd->srq_lock, flags);
+		list_add_tail(&srq->pd_entry, &pd->srqs);
+		spin_unlock_irqrestore(&pd->srq_lock, flags);
+	}
+
+	return 0;
+out:
+	rdma_srq_set_destroy(pd);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_srq_set_init);
+
+void rdma_srq_set_destroy(struct ib_pd *pd)
+{
+	struct ib_srq *srq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pd->srq_lock, flags);
+	while (!list_empty(&pd->srqs)) {
+		srq = list_first_entry(&pd->srqs, struct ib_srq, pd_entry);
+		list_del(&srq->pd_entry);
+
+		spin_unlock_irqrestore(&pd->srq_lock, flags);
+		ib_destroy_srq(srq);
+		spin_lock_irqsave(&pd->srq_lock, flags);
+	}
+	spin_unlock_irqrestore(&pd->srq_lock, flags);
+}
+EXPORT_SYMBOL(rdma_srq_set_destroy);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index e62c9df..6950abf 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -272,6 +272,9 @@  struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 	pd->__internal_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
 	pd->flags = flags;
+	pd->srqs_used = 0;
+	spin_lock_init(&pd->srq_lock);
+	INIT_LIST_HEAD(&pd->srqs);
 
 	pd->res.type = RDMA_RESTRACK_PD;
 	rdma_restrack_set_task(&pd->res, caller);
@@ -340,6 +343,7 @@  void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
 		pd->__internal_mr = NULL;
 	}
 
+	WARN_ON_ONCE(pd->srqs_used > 0);
 	/* uverbs manipulates usecnt with proper locking, while the kabi
 	   requires the caller to guarantee we can't race here. */
 	WARN_ON(atomic_read(&pd->usecnt));
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1f779fa..fc8207d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1517,6 +1517,10 @@  struct ib_pd {
 
 	u32			unsafe_global_rkey;
 
+	spinlock_t		srq_lock;
+	int			srqs_used;
+	struct list_head	srqs;
+
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
@@ -1585,6 +1589,7 @@  struct ib_srq {
 	void		       *srq_context;
 	enum ib_srq_type	srq_type;
 	atomic_t		usecnt;
+	struct list_head	pd_entry; /* srq set entry */
 
 	struct {
 		struct ib_cq   *cq;
diff --git a/include/rdma/srq_set.h b/include/rdma/srq_set.h
new file mode 100644
index 0000000..834c4c6
--- /dev/null
+++ b/include/rdma/srq_set.h
@@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_SRQ_SET_H
+#define _RDMA_SRQ_SET_H 1
+
+#include <rdma/ib_verbs.h>
+
+struct ib_srq *rdma_srq_get(struct ib_pd *pd);
+void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
+
+int rdma_srq_set_init(struct ib_pd *pd, int nr,
+		struct ib_srq_init_attr *srq_attr);
+void rdma_srq_set_destroy(struct ib_pd *pd);
+
+#endif /* _RDMA_SRQ_SET_H */