Message ID: 20200317134030.152833-2-maxg@mellanox.com (mailing list archive)
State:      Superseded
Series:     nvmet-rdma/srpt: SRQ per completion vector
On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
> ULPs can use this API to create/destroy SRQs with the same
> characteristics, implementing logic aimed at saving resources without
> a significant performance penalty (e.g. create an SRQ per completion
> vector and use shared receive buffers for multiple controllers of the
> ULP).
>
> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
> ---
>  drivers/infiniband/core/Makefile  |  2 +-
>  drivers/infiniband/core/srq_set.c | 78 ++++++++++++++++++++++++++++++
>  drivers/infiniband/core/verbs.c   |  4 ++
>  include/rdma/ib_verbs.h           |  5 +++
>  include/rdma/srq_set.h            | 18 ++++++++
>  5 files changed, 106 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/core/srq_set.c
>  create mode 100644 include/rdma/srq_set.h
>
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
>  			roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
>  			multicast.o mad.o smi.o agent.o mad_rmpp.o \
>  			nldev.o restrack.o counters.o ib_core_uverbs.o \
> -			trace.o
> +			trace.o srq_set.o

Why did you call it "srq_set.c" and not "srq.c"?

> +struct ib_srq *rdma_srq_get(struct ib_pd *pd)
> +{
> +	struct ib_srq *srq;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pd->srq_lock, flags);
> +	srq = list_first_entry_or_null(&pd->srqs, struct ib_srq, pd_entry);
> +	if (srq) {
> +		list_del(&srq->pd_entry);
> +		pd->srqs_used++;

I don't see any real usage of srqs_used.

> +	WARN_ON_ONCE(pd->srqs_used > 0);

It can be achieved by WARN_ON_ONCE(!list_empty(&pd->srqs))

> +#ifndef _RDMA_SRQ_SET_H
> +#define _RDMA_SRQ_SET_H 1

"1" is not needed.

> +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);

In the end, it is not get/put semantics but more add/remove.

Thanks
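To make the proposed API concrete: a minimal sketch of how a ULP might
consume it, assuming the rdma_srq_set_*/rdma_srq_get/put interfaces
exactly as posted. The ULP-side function names, the queue depth, and the
choice of one SRQ per completion vector (taken from the commit message)
are illustrative, not part of the patch.

/* Hypothetical ULP code consuming the posted API. */
static int ulp_create_srq_set(struct ib_pd *pd, struct ib_device *dev)
{
	struct ib_srq_init_attr srq_attr = {
		.attr = {
			.max_wr  = 1024,	/* example queue depth */
			.max_sge = 1,
		},
	};

	/* One SRQ per completion vector, as the commit message suggests */
	return rdma_srq_set_init(pd, dev->num_comp_vectors, &srq_attr);
}

/* Each new QP borrows one SRQ from the PD's set ... */
static struct ib_srq *ulp_borrow_srq(struct ib_pd *pd)
{
	/* NULL means the set is exhausted; caller falls back or fails */
	return rdma_srq_get(pd);
}

/* ... and returns it before the set is destroyed */
static void ulp_return_srq(struct ib_pd *pd, struct ib_srq *srq)
{
	rdma_srq_put(pd, srq);
}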
On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
> On Tue, Mar 17, 2020 at 03:40:26PM +0200, Max Gurtovoy wrote:
>> -			trace.o
>> +			trace.o srq_set.o
> Why did you call it "srq_set.c" and not "srq.c"?

because it's not an SRQ API but an SRQ-set API.

>> +	WARN_ON_ONCE(pd->srqs_used > 0);
> It can be achieved by WARN_ON_ONCE(!list_empty(&pd->srqs))

ok, I'll change it for v2.

>> +#ifndef _RDMA_SRQ_SET_H
>> +#define _RDMA_SRQ_SET_H 1
> "1" is not needed.

agreed.

>> +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
>> +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
> In the end, it is not get/put semantics but more add/remove.

srq = rdma_srq_add()?

rdma_srq_remove(pd, srq)?

That doesn't seem right to me.

Let's keep it simple: to take an SRQ from the PD's set we use
rdma_srq_get, and to return it we use rdma_srq_put.

> Thanks
On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
> > > +struct ib_srq *rdma_srq_get(struct ib_pd *pd);
> > > +void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
> > In the end, it is not get/put semantics but more add/remove.
>
> srq = rdma_srq_add()?
>
> rdma_srq_remove(pd, srq)?
>
> That doesn't seem right to me.
>
> Let's keep it simple: to take an SRQ from the PD's set we use
> rdma_srq_get, and to return it we use rdma_srq_put.

Is there reference counting here? get/put should be restricted to
refcounting APIs, IMHO.

Jason
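For context, the convention Jason is invoking: in the kernel, get/put
names conventionally pair with a reference count, typically via kref.
A generic sketch (foo is a placeholder type, not anything in this
patch):

#include <linux/kref.h>
#include <linux/slab.h>

struct foo {
	struct kref ref;
	/* ... payload ... */
};

static void foo_release(struct kref *ref)
{
	/* called when the last reference is dropped */
	kfree(container_of(ref, struct foo, ref));
}

static struct foo *foo_get(struct foo *f)
{
	kref_get(&f->ref);	/* take an additional reference */
	return f;
}

static void foo_put(struct foo *f)
{
	kref_put(&f->ref, foo_release);	/* drop one reference */
}

The posted rdma_srq_get()/rdma_srq_put() instead borrow and return a
preallocated object from a list, which is what Leon and Jason are
pushing back on.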
On 3/17/2020 8:10 PM, Jason Gunthorpe wrote:
> On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>> Let's keep it simple: to take an SRQ from the PD's set we use
>> rdma_srq_get, and to return it we use rdma_srq_put.
> Is there reference counting here? get/put should be restricted to
> refcounting APIs, IMHO.

I added a counter (pd->srqs_used) that Leon asked me to remove. There
is no kref get/put call here.

Would you prefer that I change it to an array in the PD ("struct ib_srq
**srqs;") and update the ib_alloc_pd API to take pd_attrs and allocate
the array during PD allocation?

> Jason
On Tue, Mar 17, 2020 at 08:24:30PM +0200, Max Gurtovoy wrote:
> On 3/17/2020 8:10 PM, Jason Gunthorpe wrote:
> > Is there reference counting here? get/put should be restricted to
> > refcounting APIs, IMHO.
>
> I added a counter (pd->srqs_used) that Leon asked me to remove. There
> is no kref get/put call here.

I didn't look closely, any kind of refcount scheme is reasonable, but
if "add" is supposed to create a new srq then that isn't 'get'..

> Would you prefer that I change it to an array in the PD ("struct
> ib_srq **srqs;")?

Not particularly..

It actually feels a bit weird, should there be some NUMA-ness involved
here so that an SRQ with memory on the node that is going to be polling
it is returned?

> And update the ib_alloc_pd API to take pd_attrs and allocate the
> array during PD allocation?

The API is a bit more composable if things can be done as follow-on
function calls.. I don't like giant multiplexor structs in general

Jason
On Tue, Mar 17, 2020 at 03:10:36PM -0300, Jason Gunthorpe wrote:
> Is there reference counting here? get/put should be restricted to
> refcounting APIs, IMHO.

No, there is no refcounting. Max introduced a counter which he
increments and decrements but never otherwise uses, simply burning CPU
cycles.

Thanks

> Jason
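Concretely, the shape Leon is asking for would drop the counter and
leave only the list operations. A sketch of a possible v2, not the
posted code:

/* rdma_srq_get()/rdma_srq_put() with the unused srqs_used counter
 * dropped, as Leon suggests. The leak check in ib_dealloc_pd_user()
 * would then become WARN_ON_ONCE(!list_empty(&pd->srqs)), per his
 * earlier comment.
 */
struct ib_srq *rdma_srq_get(struct ib_pd *pd)
{
	struct ib_srq *srq;
	unsigned long flags;

	spin_lock_irqsave(&pd->srq_lock, flags);
	srq = list_first_entry_or_null(&pd->srqs, struct ib_srq, pd_entry);
	if (srq)
		list_del(&srq->pd_entry);
	spin_unlock_irqrestore(&pd->srq_lock, flags);

	return srq;
}

void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq)
{
	unsigned long flags;

	spin_lock_irqsave(&pd->srq_lock, flags);
	list_add(&srq->pd_entry, &pd->srqs);
	spin_unlock_irqrestore(&pd->srq_lock, flags);
}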
On 3/17/2020 8:43 PM, Jason Gunthorpe wrote:
> I didn't look closely, any kind of refcount scheme is reasonable, but
> if "add" is supposed to create a new srq then that isn't 'get'..

No, we don't create a new srq during the "get". We create a set using
rdma_srq_set_init; "get" simply pulls an srq from the set and "put"
pushes it back.

> It actually feels a bit weird, should there be some NUMA-ness
> involved here so that an SRQ with memory on the node that is going to
> be polling it is returned?

Maybe that can be the next improvement. For now the receive buffers are
allocated by the ULP anyway. The idea is to spread the SRQs as much as
possible, as we do for QPs/CQs, and reach almost the same performance.
With a single SRQ we can't reach good performance since many resources
and cores race for one resource. With a regular RQ we allocate many
buffers that are idle most of the time. If we spread SRQs across all
the cores/vectors we have, we get great performance while saving
resources, which can be critical for MQ ULPs such as NVMe-oF/SRP
targets that may serve hundreds of initiators with hundreds of queues
each.
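A purely hypothetical sketch of the NUMA-aware selection Jason hints
at; neither the numa_node field nor this helper exists in the posted
patch, which only has the pd_entry list:

/* Hypothetical NUMA-aware variant of rdma_srq_get(). Assumes a
 * srq->numa_node field recorded when the SRQ's buffers are set up,
 * which the posted patch does not have.
 */
struct ib_srq *rdma_srq_get_node(struct ib_pd *pd, int preferred_node)
{
	struct ib_srq *srq, *found = NULL;
	unsigned long flags;

	spin_lock_irqsave(&pd->srq_lock, flags);
	list_for_each_entry(srq, &pd->srqs, pd_entry) {
		if (srq->numa_node == preferred_node) {
			found = srq;
			break;
		}
	}
	if (!found)	/* fall back to any free SRQ */
		found = list_first_entry_or_null(&pd->srqs, struct ib_srq,
						 pd_entry);
	if (found)
		list_del(&found->pd_entry);
	spin_unlock_irqrestore(&pd->srq_lock, flags);

	return found;
}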
On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
> On 3/17/2020 3:55 PM, Leon Romanovsky wrote:
> > Why did you call it "srq_set.c" and not "srq.c"?
>
> because it's not an SRQ API but an SRQ-set API.

I would say that it is an SRQ-pool and not an SRQ-set API.

Thanks
On 3/18/2020 8:47 AM, Leon Romanovsky wrote:
> On Tue, Mar 17, 2020 at 06:37:57PM +0200, Max Gurtovoy wrote:
>> because it's not an SRQ API but an SRQ-set API.
> I would say that it is an SRQ-pool and not an SRQ-set API.

If you have some other idea for the API, please share it with us.

I created this API in the core layer to make the life of the ULPs
easier, and we can see that it's very easy to add this feature to ULPs
and get a big benefit from it.

> Thanks
On Wed, Mar 18, 2020 at 11:46:19AM +0200, Max Gurtovoy wrote:
> If you have some other idea for the API, please share it with us.
>
> I created this API in the core layer to make the life of the ULPs
> easier, and we can see that it's very easy to add this feature to
> ULPs and get a big benefit from it.

No one here argued against the feature; we tried to understand the
rationale behind the *_set name, and why you decided to use the "set"
term and not "pool", as was done for the MR pool.

So my suggestion is to name the file srq_pool.c and use
rdma_srq_pool_*() function names.

Thanks
On 3/18/2020 12:29 PM, Leon Romanovsky wrote:
> No one here argued against the feature; we tried to understand the
> rationale behind the *_set name, and why you decided to use the "set"
> term and not "pool", as was done for the MR pool.

Ok. But srq_pool was the name I used 2 years ago and you didn't like it
back then, so I renamed it to srq_set.

> So my suggestion is to name the file srq_pool.c and use
> rdma_srq_pool_*() function names.

No problem.

> Thanks
On Wed, Mar 18, 2020 at 12:39:44PM +0200, Max Gurtovoy wrote:
> Ok. But srq_pool was the name I used 2 years ago and you didn't like
> it back then.

I don't like it today either, but I don't have a better name to
suggest.

Thanks
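For comparison, the MR pool precedent both of them reference: the
existing helpers in include/rdma/mr_pool.h look roughly like the
following (signatures recalled from kernels of that era and may differ
in detail). Notably they too use get/put for a non-refcounted
borrow/return from a list, which is some precedent for Max's naming:

/* Approximate shape of the existing ib_mr_pool API that the proposed
 * srq_pool naming would mirror; not copied verbatim from the tree.
 */
struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list);
void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list,
		    struct ib_mr *mr);

int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
		    enum ib_mr_type type, u32 max_num_sg,
		    u32 max_num_meta_sg);
void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list);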
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d1b14887..1d3eaec 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 			roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 			multicast.o mad.o smi.o agent.o mad_rmpp.o \
 			nldev.o restrack.o counters.o ib_core_uverbs.o \
-			trace.o
+			trace.o srq_set.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/srq_set.c b/drivers/infiniband/core/srq_set.c
new file mode 100644
index 0000000..d143561
--- /dev/null
+++ b/drivers/infiniband/core/srq_set.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/srq_set.h>
+
+struct ib_srq *rdma_srq_get(struct ib_pd *pd)
+{
+	struct ib_srq *srq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pd->srq_lock, flags);
+	srq = list_first_entry_or_null(&pd->srqs, struct ib_srq, pd_entry);
+	if (srq) {
+		list_del(&srq->pd_entry);
+		pd->srqs_used++;
+	}
+	spin_unlock_irqrestore(&pd->srq_lock, flags);
+
+	return srq;
+}
+EXPORT_SYMBOL(rdma_srq_get);
+
+void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pd->srq_lock, flags);
+	list_add(&srq->pd_entry, &pd->srqs);
+	pd->srqs_used--;
+	spin_unlock_irqrestore(&pd->srq_lock, flags);
+}
+EXPORT_SYMBOL(rdma_srq_put);
+
+int rdma_srq_set_init(struct ib_pd *pd, int nr,
+		struct ib_srq_init_attr *srq_attr)
+{
+	struct ib_srq *srq;
+	unsigned long flags;
+	int ret, i;
+
+	for (i = 0; i < nr; i++) {
+		srq = ib_create_srq(pd, srq_attr);
+		if (IS_ERR(srq)) {
+			ret = PTR_ERR(srq);
+			goto out;
+		}
+
+		spin_lock_irqsave(&pd->srq_lock, flags);
+		list_add_tail(&srq->pd_entry, &pd->srqs);
+		spin_unlock_irqrestore(&pd->srq_lock, flags);
+	}
+
+	return 0;
+out:
+	rdma_srq_set_destroy(pd);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_srq_set_init);
+
+void rdma_srq_set_destroy(struct ib_pd *pd)
+{
+	struct ib_srq *srq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pd->srq_lock, flags);
+	while (!list_empty(&pd->srqs)) {
+		srq = list_first_entry(&pd->srqs, struct ib_srq, pd_entry);
+		list_del(&srq->pd_entry);
+
+		spin_unlock_irqrestore(&pd->srq_lock, flags);
+		ib_destroy_srq(srq);
+		spin_lock_irqsave(&pd->srq_lock, flags);
+	}
+	spin_unlock_irqrestore(&pd->srq_lock, flags);
+}
+EXPORT_SYMBOL(rdma_srq_set_destroy);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index e62c9df..6950abf 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -272,6 +272,9 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 	pd->__internal_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
 	pd->flags = flags;
+	pd->srqs_used = 0;
+	spin_lock_init(&pd->srq_lock);
+	INIT_LIST_HEAD(&pd->srqs);
 
 	pd->res.type = RDMA_RESTRACK_PD;
 	rdma_restrack_set_task(&pd->res, caller);
@@ -340,6 +343,7 @@ void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
 		pd->__internal_mr = NULL;
 	}
 
+	WARN_ON_ONCE(pd->srqs_used > 0);
 	/* uverbs manipulates usecnt with proper locking, while the kabi
 	   requires the caller to guarantee we can't race here. */
 	WARN_ON(atomic_read(&pd->usecnt));
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1f779fa..fc8207d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1517,6 +1517,10 @@ struct ib_pd {
 
 	u32			unsafe_global_rkey;
 
+	spinlock_t		srq_lock;
+	int			srqs_used;
+	struct list_head	srqs;
+
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
@@ -1585,6 +1589,7 @@ struct ib_srq {
 	void		       *srq_context;
 	enum ib_srq_type	srq_type;
 	atomic_t		usecnt;
+	struct list_head	pd_entry; /* srq set entry */
 
 	struct {
 		struct ib_cq   *cq;
diff --git a/include/rdma/srq_set.h b/include/rdma/srq_set.h
new file mode 100644
index 0000000..834c4c6
--- /dev/null
+++ b/include/rdma/srq_set.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_SRQ_SET_H
+#define _RDMA_SRQ_SET_H 1
+
+#include <rdma/ib_verbs.h>
+
+struct ib_srq *rdma_srq_get(struct ib_pd *pd);
+void rdma_srq_put(struct ib_pd *pd, struct ib_srq *srq);
+
+int rdma_srq_set_init(struct ib_pd *pd, int nr,
+		struct ib_srq_init_attr *srq_attr);
+void rdma_srq_set_destroy(struct ib_pd *pd);
+
+#endif /* _RDMA_SRQ_SET_H */
ULPs can use this API to create/destroy SRQs with the same
characteristics, implementing logic aimed at saving resources without a
significant performance penalty (e.g. create an SRQ per completion
vector and use shared receive buffers for multiple controllers of the
ULP).

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
---
 drivers/infiniband/core/Makefile  |  2 +-
 drivers/infiniband/core/srq_set.c | 78 ++++++++++++++++++++++++++++++
 drivers/infiniband/core/verbs.c   |  4 ++
 include/rdma/ib_verbs.h           |  5 +++
 include/rdma/srq_set.h            | 18 ++++++++
 5 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/srq_set.c
 create mode 100644 include/rdma/srq_set.h
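The WARN_ON checks in ib_dealloc_pd_user() imply a strict teardown
order for consumers of this API: every borrowed SRQ must be returned,
then the set destroyed, and only then the PD freed, since each
ib_create_srq() also holds a reference on the PD's usecnt. A sketch
assuming the posted API (the ULP function name is illustrative):

/* Teardown ordering implied by the posted patch. */
static void ulp_destroy(struct ib_pd *pd, struct ib_srq *srq)
{
	rdma_srq_put(pd, srq);		/* return each borrowed SRQ */
	rdma_srq_set_destroy(pd);	/* ib_destroy_srq() on every SRQ */
	ib_dealloc_pd(pd);		/* now safe: list empty, usecnt 0 */
}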