diff mbox series

[rdma-next] RDMA/rdmavt: Decouple QP and SGE lists allocations

Message ID c34a864803f9bbd33d3f856a6ba2dd595ab708a7.1620729033.git.leonro@nvidia.com (mailing list archive)
State Superseded
Headers show
Series [rdma-next] RDMA/rdmavt: Decouple QP and SGE lists allocations | expand

Commit Message

Leon Romanovsky May 11, 2021, 10:36 a.m. UTC
From: Leon Romanovsky <leonro@nvidia.com>

The rdmavt QP has fields that are needed for both the control and data
paths. Such a mixed declaration led to a very specific allocation flow
with kzalloc_node and the SGE list embedded into struct rvt_qp.

This patch separates QP creation into two parts: regular memory allocation
for the control path and specific code for the SGE list, while access to
the latter is performed through a dereferenced pointer.

Such pointer and its context are expected to be in the cache, so
performance difference is expected to be negligible, if any exists.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
Hi,

This change is needed to convert QP to core allocation scheme. In that
scheme QP is allocated outside of the driver and size of such allocation
is constant and can be calculated at the compile time.

Thanks
---
 drivers/infiniband/sw/rdmavt/qp.c | 13 ++++++++-----
 include/rdma/rdmavt_qp.h          |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

Comments

Haakon Bugge May 11, 2021, 10:59 a.m. UTC | #1
> On 11 May 2021, at 12:36, Leon Romanovsky <leon@kernel.org> wrote:
> 
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> The rdmavt QP has fields that are both needed for the control and data
> path. Such mixed declaration caused to the very specific allocation flow
> with kzalloc_node and SGE list embedded into the struct rvt_qp.
> 
> This patch separates QP creation to two: regular memory allocation for
> the control path and specific code for the SGE list, while the access to
> the later is performed through derefenced pointer.
> 
> Such pointer and its context are expected to be in the cache, so
> performance difference is expected to be negligible, if any exists.
> 
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> Hi,
> 
> This change is needed to convert QP to core allocation scheme. In that
> scheme QP is allocated outside of the driver and size of such allocation
> is constant and can be calculated at the compile time.
> 
> Thanks
> ---
> drivers/infiniband/sw/rdmavt/qp.c | 13 ++++++++-----
> include/rdma/rdmavt_qp.h          |  2 +-
> 2 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
> index 9d13db68283c..4522071fc220 100644
> --- a/drivers/infiniband/sw/rdmavt/qp.c
> +++ b/drivers/infiniband/sw/rdmavt/qp.c
> @@ -1077,7 +1077,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
> 	int err;
> 	struct rvt_swqe *swq = NULL;
> 	size_t sz;
> -	size_t sg_list_sz;
> +	size_t sg_list_sz = 0;
> 	struct ib_qp *ret = ERR_PTR(-ENOMEM);
> 	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
> 	void *priv = NULL;
> @@ -1125,8 +1125,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
> 		if (!swq)
> 			return ERR_PTR(-ENOMEM);
> 
> -		sz = sizeof(*qp);
> -		sg_list_sz = 0;
> 		if (init_attr->srq) {
> 			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
> 
> @@ -1136,10 +1134,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
> 		} else if (init_attr->cap.max_recv_sge > 1)
> 			sg_list_sz = sizeof(*qp->r_sg_list) *
> 				(init_attr->cap.max_recv_sge - 1);
> -		qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL,
> -				  rdi->dparms.node);
> +		qp = kzalloc(sizeof(*qp), GFP_KERNEL);

Why not kzalloc_node() here?


Thxs, Håkon

> 		if (!qp)
> 			goto bail_swq;
> +		qp->r_sg_list =
> +			kzalloc_node(sg_list_sz, GFP_KERNEL, rdi->dparms.node);
> +		if (!qp->r_sg_list)
> +			goto bail_qp;
> 		qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
> 
> 		RCU_INIT_POINTER(qp->next, NULL);
> @@ -1327,6 +1328,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
> 
> bail_qp:
> 	kfree(qp->s_ack_queue);
> +	kfree(qp->r_sg_list);
> 	kfree(qp);
> 
> bail_swq:
> @@ -1761,6 +1763,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
> 	kvfree(qp->r_rq.kwq);
> 	rdi->driver_f.qp_priv_free(rdi, qp);
> 	kfree(qp->s_ack_queue);
> +	kfree(qp->r_sg_list);
> 	rdma_destroy_ah_attr(&qp->remote_ah_attr);
> 	rdma_destroy_ah_attr(&qp->alt_ah_attr);
> 	free_ud_wq_attr(qp);
> diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
> index 8275954f5ce6..2e58d5e6ac0e 100644
> --- a/include/rdma/rdmavt_qp.h
> +++ b/include/rdma/rdmavt_qp.h
> @@ -444,7 +444,7 @@ struct rvt_qp {
> 	/*
> 	 * This sge list MUST be last. Do not add anything below here.
> 	 */
> -	struct rvt_sge r_sg_list[] /* verified SGEs */
> +	struct rvt_sge *r_sg_list /* verified SGEs */
> 		____cacheline_aligned_in_smp;
> };
> 
> -- 
> 2.31.1
>
Dennis Dalessandro May 11, 2021, 12:26 p.m. UTC | #2
On 5/11/21 6:36 AM, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> The rdmavt QP has fields that are both needed for the control and data
> path. Such mixed declaration caused to the very specific allocation flow
> with kzalloc_node and SGE list embedded into the struct rvt_qp.
> 
> This patch separates QP creation to two: regular memory allocation for
> the control path and specific code for the SGE list, while the access to
> the later is performed through derefenced pointer.
> 
> Such pointer and its context are expected to be in the cache, so
> performance difference is expected to be negligible, if any exists.
> 
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> Hi,
> 
> This change is needed to convert QP to core allocation scheme. In that
> scheme QP is allocated outside of the driver and size of such allocation
> is constant and can be calculated at the compile time.

Thanks Leon, we'll get this put through our testing.

-Denny
Leon Romanovsky May 11, 2021, 12:34 p.m. UTC | #3
On Tue, May 11, 2021 at 10:59:52AM +0000, Haakon Bugge wrote:
> 
> 
> > On 11 May 2021, at 12:36, Leon Romanovsky <leon@kernel.org> wrote:
> > 
> > From: Leon Romanovsky <leonro@nvidia.com>
> > 
> > The rdmavt QP has fields that are both needed for the control and data
> > path. Such mixed declaration caused to the very specific allocation flow
> > with kzalloc_node and SGE list embedded into the struct rvt_qp.
> > 
> > This patch separates QP creation to two: regular memory allocation for
> > the control path and specific code for the SGE list, while the access to
> > the later is performed through derefenced pointer.
> > 
> > Such pointer and its context are expected to be in the cache, so
> > performance difference is expected to be negligible, if any exists.
> > 
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > Hi,
> > 
> > This change is needed to convert QP to core allocation scheme. In that
> > scheme QP is allocated outside of the driver and size of such allocation
> > is constant and can be calculated at the compile time.
> > 
> > Thanks
> > ---
> > drivers/infiniband/sw/rdmavt/qp.c | 13 ++++++++-----
> > include/rdma/rdmavt_qp.h          |  2 +-
> > 2 files changed, 9 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
> > index 9d13db68283c..4522071fc220 100644
> > --- a/drivers/infiniband/sw/rdmavt/qp.c
> > +++ b/drivers/infiniband/sw/rdmavt/qp.c
> > @@ -1077,7 +1077,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
> > 	int err;
> > 	struct rvt_swqe *swq = NULL;
> > 	size_t sz;
> > -	size_t sg_list_sz;
> > +	size_t sg_list_sz = 0;
> > 	struct ib_qp *ret = ERR_PTR(-ENOMEM);
> > 	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
> > 	void *priv = NULL;
> > @@ -1125,8 +1125,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
> > 		if (!swq)
> > 			return ERR_PTR(-ENOMEM);
> > 
> > -		sz = sizeof(*qp);
> > -		sg_list_sz = 0;
> > 		if (init_attr->srq) {
> > 			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
> > 
> > @@ -1136,10 +1134,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
> > 		} else if (init_attr->cap.max_recv_sge > 1)
> > 			sg_list_sz = sizeof(*qp->r_sg_list) *
> > 				(init_attr->cap.max_recv_sge - 1);
> > -		qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL,
> > -				  rdi->dparms.node);
> > +		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
> 
> Why not kzalloc_node() here?

The idea is to delete this kzalloc later in next patch, because all
drivers are doing same thing "qp = kzalloc(sizeof(*qp), GFP_KERNEL);".

Thanks
Leon Romanovsky May 11, 2021, 12:34 p.m. UTC | #4
On Tue, May 11, 2021 at 08:26:43AM -0400, Dennis Dalessandro wrote:
> On 5/11/21 6:36 AM, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> > 
> > The rdmavt QP has fields that are both needed for the control and data
> > path. Such mixed declaration caused to the very specific allocation flow
> > with kzalloc_node and SGE list embedded into the struct rvt_qp.
> > 
> > This patch separates QP creation to two: regular memory allocation for
> > the control path and specific code for the SGE list, while the access to
> > the later is performed through derefenced pointer.
> > 
> > Such pointer and its context are expected to be in the cache, so
> > performance difference is expected to be negligible, if any exists.
> > 
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > Hi,
> > 
> > This change is needed to convert QP to core allocation scheme. In that
> > scheme QP is allocated outside of the driver and size of such allocation
> > is constant and can be calculated at the compile time.
> 
> Thanks Leon, we'll get this put through our testing.

Thanks a lot.

> 
> -Denny
Marciniszyn, Mike May 11, 2021, 7:15 p.m. UTC | #5
> >
> > Why not kzalloc_node() here?

I agree here.

Other allocations that have been promoted to the core have lost the node attribute in the allocation.

For the rdmavt based drivers and especially with the QP, there are performance implications.

I have no issue moving the allocation, as long as the node centric allocation is preserved.

I will test the patch to make sure there is nothing else lurking.

Mike
Leon Romanovsky May 11, 2021, 7:27 p.m. UTC | #6
On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote:
> > >
> > > Why not kzalloc_node() here?
> 
> I agree here.
> 
> Other allocations that have been promoted to the core have lost the node attribute in the allocation.

Did you notice any performance degradation?

Thanks
Marciniszyn, Mike May 11, 2021, 7:39 p.m. UTC | #7
> 
> On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote:
> > > >
> > > > Why not kzalloc_node() here?
> >
> > I agree here.
> >
> > Other allocations that have been promoted to the core have lost the node
> attribute in the allocation.
> 
> Did you notice any performance degradation?
> 

For the QP, we most certainly will.

In any case, the promotion should address not losing the node.

The allocation gets the ib_device, and it would seem to hard to add method of determining the node.

Mike
Dennis Dalessandro May 12, 2021, 4:08 a.m. UTC | #8
On 5/11/21 3:27 PM, Leon Romanovsky wrote:
> On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote:
>>>>
>>>> Why not kzalloc_node() here?
>>
>> I agree here.
>>
>> Other allocations that have been promoted to the core have lost the node attribute in the allocation.
> 
> Did you notice any performance degradation?
> 

So what's the motivation to change it from the way it was originally 
designed? It seems to me if the original implementation went to the 
trouble to allocate the memory on the local node, refactoring the code 
should respect that.

-Denny
Leon Romanovsky May 12, 2021, 12:13 p.m. UTC | #9
On Wed, May 12, 2021 at 12:08:59AM -0400, Dennis Dalessandro wrote:
> 
> On 5/11/21 3:27 PM, Leon Romanovsky wrote:
> > On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote:
> > > > > 
> > > > > Why not kzalloc_node() here?
> > > 
> > > I agree here.
> > > 
> > > Other allocations that have been promoted to the core have lost the node attribute in the allocation.
> > 
> > Did you notice any performance degradation?
> > 
> 
> So what's the motivation to change it from the way it was originally
> designed? It seems to me if the original implementation went to the trouble
> to allocate the memory on the local node, refactoring the code should
> respect that.

I have no problem to make rdma_zalloc_*() node aware, but would like to get
real performance justification. My assumption is that rdmavt use kzalloc_node
for the control plane based on some internal performance testing and we finally
can see the difference between kzalloc and kzalloc_node, am I right?

Is the claim of performance degradation backed by data?

The main reason (maybe I'm wrong here) is to avoid _node() allocators
because they increase chances of memory allocation failure due to not
doing fallback in case node memory is depleted.

Again, I'm suggesting to do plain kzalloc() for control part of QP.

Thanks

> 
> -Denny
Marciniszyn, Mike May 12, 2021, 12:23 p.m. UTC | #10
> -	struct rvt_sge r_sg_list[] /* verified SGEs */
> +	struct rvt_sge *r_sg_list /* verified SGEs */
>  		____cacheline_aligned_in_smp;
>  };
> 

Since this has been made an independent allocation, r_sg_list becomes a read-mostly pointer and should be moved up in rvt_qp to the other 64-bit fields around timeout_jiffies.

The cacheline can then be dropped for this field.

Mike
Marciniszyn, Mike May 12, 2021, 12:25 p.m. UTC | #11
> > Thanks Leon, we'll get this put through our testing.
> 
> Thanks a lot.
> 
> >

The patch as is passed all our functional testing.

Mike
Dennis Dalessandro May 12, 2021, 12:45 p.m. UTC | #12
On 5/12/21 8:13 AM, Leon Romanovsky wrote:
> On Wed, May 12, 2021 at 12:08:59AM -0400, Dennis Dalessandro wrote:
>>
>> On 5/11/21 3:27 PM, Leon Romanovsky wrote:
>>> On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote:
>>>>>>
>>>>>> Why not kzalloc_node() here?
>>>>
>>>> I agree here.
>>>>
>>>> Other allocations that have been promoted to the core have lost the node attribute in the allocation.
>>>
>>> Did you notice any performance degradation?
>>>
>>
>> So what's the motivation to change it from the way it was originally
>> designed? It seems to me if the original implementation went to the trouble
>> to allocate the memory on the local node, refactoring the code should
>> respect that.
> 
> I have no problem to make rdma_zalloc_*() node aware, but would like to get
> real performance justification. My assumption is that rdmavt use kzalloc_node
> for the control plane based on some internal performance testing and we finally
> can see the difference between kzalloc and kzalloc_node, am I right?
> 
> Is the claim of performance degradation backed by data?

Yes, in the past. I don't have access anymore now that I'm not with 
Intel. It probably would not have been publishable anyway.

> The main reason (maybe I'm wrong here) is to avoid _node() allocators
> because they increase chances of memory allocation failure due to not
> doing fallback in case node memory is depleted.

Agreed. It's a trade-off that was deemed acceptable.

> Again, I'm suggesting to do plain kzalloc() for control part of QP.

Now I don't recall data for that specifically, but to be on the safe 
side I would not want to risk a performance regression.

-Denny
Leon Romanovsky May 12, 2021, 12:50 p.m. UTC | #13
On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote:
> > > Thanks Leon, we'll get this put through our testing.
> > 
> > Thanks a lot.
> > 
> > >
> 
> The patch as is passed all our functional testing.

Thanks Mike,

Can I ask you to perform a performance comparison between this patch and
the following?

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 4522071fc220..d0e0f7cbe17a 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1134,7 +1134,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                } else if (init_attr->cap.max_recv_sge > 1)
                        sg_list_sz = sizeof(*qp->r_sg_list) *
                                (init_attr->cap.max_recv_sge - 1);
-               qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+               qp = kzalloc_node(sizeof(*qp), GFP_KERNEL, rdi->dparms.node);
                if (!qp)
                        goto bail_swq;
                qp->r_sg_list =


Thanks

> 
> Mike
Dennis Dalessandro May 13, 2021, 7:03 p.m. UTC | #14
On 5/12/21 8:50 AM, Leon Romanovsky wrote:
> On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote:
>>>> Thanks Leon, we'll get this put through our testing.
>>>
>>> Thanks a lot.
>>>
>>>>
>>
>> The patch as is passed all our functional testing.
> 
> Thanks Mike,
> 
> Can I ask you to perform a performance comparison between this patch and
> the following?

We have years of performance data with the code the way it is. Please 
maintain the original functionality of the code when moving things into 
the core unless there is a compelling reason to change. That is not the 
case here.

> 
> diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
> index 4522071fc220..d0e0f7cbe17a 100644
> --- a/drivers/infiniband/sw/rdmavt/qp.c
> +++ b/drivers/infiniband/sw/rdmavt/qp.c
> @@ -1134,7 +1134,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
>                  } else if (init_attr->cap.max_recv_sge > 1)
>                          sg_list_sz = sizeof(*qp->r_sg_list) *
>                                  (init_attr->cap.max_recv_sge - 1);
> -               qp = kzalloc(sizeof(*qp), GFP_KERNEL);
> +               qp = kzalloc_node(sizeof(*qp), GFP_KERNEL, rdi->dparms.node);
>                  if (!qp)
>                          goto bail_swq;
>                  qp->r_sg_list =
> 
> 

-Denny
Jason Gunthorpe May 13, 2021, 7:15 p.m. UTC | #15
On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote:
> On 5/12/21 8:50 AM, Leon Romanovsky wrote:
> > On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote:
> > > > > Thanks Leon, we'll get this put through our testing.
> > > > 
> > > > Thanks a lot.
> > > > 
> > > > > 
> > > 
> > > The patch as is passed all our functional testing.
> > 
> > Thanks Mike,
> > 
> > Can I ask you to perform a performance comparison between this patch and
> > the following?
> 
> We have years of performance data with the code the way it is. Please
> maintain the original functionality of the code when moving things into the
> core unless there is a compelling reason to change. That is not the case
> here.

Well, making the core do node allocations for metadata on every driver
is a pretty big thing to ask for with no data.

Jason
Dennis Dalessandro May 13, 2021, 7:31 p.m. UTC | #16
On 5/13/21 3:15 PM, Jason Gunthorpe wrote:
> On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote:
>> On 5/12/21 8:50 AM, Leon Romanovsky wrote:
>>> On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote:
>>>>>> Thanks Leon, we'll get this put through our testing.
>>>>>
>>>>> Thanks a lot.
>>>>>
>>>>>>
>>>>
>>>> The patch as is passed all our functional testing.
>>>
>>> Thanks Mike,
>>>
>>> Can I ask you to perform a performance comparison between this patch and
>>> the following?
>>
>> We have years of performance data with the code the way it is. Please
>> maintain the original functionality of the code when moving things into the
>> core unless there is a compelling reason to change. That is not the case
>> here.
> 
> Well, making the core do node allocations for metadata on every driver
> is a pretty big thing to ask for with no data.

Can't you just make the call into the core take a flag for this? You are 
  looking to make a change to key behavior without any clear reason that 
I can see for why it needs to be that way. If there is a good reason, 
please explain so we can understand.

I would think the person authoring the patch should be responsible to 
prove their patch doesn't cause a regression. We are telling you it did 
make a difference when the code was first written, probably like 6 years 
ago. At the very least no one had an issue with this code 4 years ago 
the last time it was touched (by Leon btw). Nor issues with the other 
uses of the call.

We can have our performance experts look at it but it's going to take 
some time as it's nothing that has been on anyone's radar.

-Denny
Jason Gunthorpe May 14, 2021, 1:02 p.m. UTC | #17
On Thu, May 13, 2021 at 03:31:48PM -0400, Dennis Dalessandro wrote:
> On 5/13/21 3:15 PM, Jason Gunthorpe wrote:
> > On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote:
> > > On 5/12/21 8:50 AM, Leon Romanovsky wrote:
> > > > On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote:
> > > > > > > Thanks Leon, we'll get this put through our testing.
> > > > > > 
> > > > > > Thanks a lot.
> > > > > > 
> > > > > > > 
> > > > > 
> > > > > The patch as is passed all our functional testing.
> > > > 
> > > > Thanks Mike,
> > > > 
> > > > Can I ask you to perform a performance comparison between this patch and
> > > > the following?
> > > 
> > > We have years of performance data with the code the way it is. Please
> > > maintain the original functionality of the code when moving things into the
> > > core unless there is a compelling reason to change. That is not the case
> > > here.
> > 
> > Well, making the core do node allocations for metadata on every driver
> > is a pretty big thing to ask for with no data.
> 
> Can't you just make the call into the core take a flag for this? You are
> looking to make a change to key behavior without any clear reason that I can
> see for why it needs to be that way. If there is a good reason, please
> explain so we can understand.

The lifetime model of all this data is messed up, there are a bunch of
little bugs on the error paths, and we can't have a proper refcount
lifetime module when this code really wants to have it.

IMHO if hfi1 has a performance need here it should chain a sub
allocation since promoting node awareness to the core code looks
not nice..

These are not supposed to be performance sensitive data structures,
they haven't even been organized for cache locality or anything.

> I would think the person authoring the patch should be responsible to prove
> their patch doesn't cause a regression.

I'm more interested in this argument as it applied to functional
regressions. Performance is always shifting around and a win for a
node specific allocation seems highly situational to me. I half wonder
if all the node allocation in this driver is just some copy and
paste.

Jason
Dennis Dalessandro May 14, 2021, 2:07 p.m. UTC | #18
On 5/14/21 9:02 AM, Jason Gunthorpe wrote:
> On Thu, May 13, 2021 at 03:31:48PM -0400, Dennis Dalessandro wrote:
>> On 5/13/21 3:15 PM, Jason Gunthorpe wrote:
>>> On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote:
>>>> On 5/12/21 8:50 AM, Leon Romanovsky wrote:
>>>>> On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote:
>>>>>>>> Thanks Leon, we'll get this put through our testing.
>>>>>>>
>>>>>>> Thanks a lot.
>>>>>>>
>>>>>>>>
>>>>>>
>>>>>> The patch as is passed all our functional testing.
>>>>>
>>>>> Thanks Mike,
>>>>>
>>>>> Can I ask you to perform a performance comparison between this patch and
>>>>> the following?
>>>>
>>>> We have years of performance data with the code the way it is. Please
>>>> maintain the original functionality of the code when moving things into the
>>>> core unless there is a compelling reason to change. That is not the case
>>>> here.
>>>
>>> Well, making the core do node allocations for metadata on every driver
>>> is a pretty big thing to ask for with no data.
>>
>> Can't you just make the call into the core take a flag for this? You are
>> looking to make a change to key behavior without any clear reason that I can
>> see for why it needs to be that way. If there is a good reason, please
>> explain so we can understand.
> 
> The lifetime model of all this data is messed up, there are a bunch of
> little bugs on the error paths, and we can't have a proper refcount
> lifetime module when this code really wants to have it.
> 
> IMHO if hf1 has a performance need here it should chain a sub
> allocation since promoting node awareness to the core code looks
> not nice..

That's part of what I want to understand. Why is it "not nice"? Is it 
because there is only 1 driver that needs it or something else.

As far as chaining a sub allocation, I'm not sure I follow. Isn't that 
kinda what Leon is doing here? Or will do, in other words move the qp 
allocation to the core and leave the SGE allocation in the driver per 
node. I can't say for any certainty one way or the other this is OK. I 
just know it would really suck to end up with a performance regression 
for something that was easily avoided by not changing the code behavior. 
A regression in code that has been this way since day 1 would be really 
bad. I'd just really rather not take that chance.

> These are not supposed to be performance sensitive data structures,
> they haven't even been organized for cache locality or anything.
> 
>> I would think the person authoring the patch should be responsible to prove
>> their patch doesn't cause a regression.
> 
> I'm more interested in this argument as it applied to functional
> regressions. Performance is always shifting around and a win for a
> node specific allocation seems highly situational to me. I half wonder
> if all the node allocation in this driver is just some copy and
> paste.

I think prove is too strong of a word. Should have said do what is 
reasonably necessary to ensure their patch doesn't cause a regression. 
Whether that's running their own tests or taking the advice from the 
folks who wrote the initial code or even other non-biased review 
opinions, etc. I certainly don't expect Leon to throw some HFIs in a 
machine and do a performance evaluation.

I think this is the exact opposite of copy/paste. When we wrote this 
code originally there was a ton of work that went into how data 
structures were aligned and organized, as well as an examination of 
allocations and per node allocations were found to be important. If you 
look at the original qib code in v4.5, before we did rdmavt, the 
allocation was not per node. We specifically changed that in v4.6 when 
we put in rdmavt. In v4.3 when hfi1 went into staging it was not using 
the per node variant either (because that was copy/paste).

I would love to be able to go back in our code reviews and bug tracking 
and tell you exactly why this line of code was changed to be per node. 
Unfortunately that level of information has not passed on to Cornelis.

-Denny
Jason Gunthorpe May 14, 2021, 2:35 p.m. UTC | #19
On Fri, May 14, 2021 at 10:07:43AM -0400, Dennis Dalessandro wrote:
> > IMHO if hf1 has a performance need here it should chain a sub
> > allocation since promoting node awareness to the core code looks
> > not nice..
> 
> That's part of what I want to understand. Why is it "not nice"? Is it
> because there is only 1 driver that needs it or something else.

Because node allocation is extremely situational. Currently the kernel
has some tunable automatic heuristic, overriding it should only be
done in cases where the code knows absolutely that a node is the
correct thing, for instance because an IRQ pinned to a specific node
is the main consumer of the data.

hfi1 might have some situation where putting the QP on the device's
node makes sense, while another driver might work better with the QP
on the user thread that owns it. Who knows, it depends on the access
pattern.

How do I sort this out in a generic way without making a big mess?

And why are you so sure that node allocation is the right thing for
hfi?? The interrupts can be rebalanced by userspace and user threads
can be pinned to other nodes. Why is choosing the device node
unconditionally the right choice?

This feels like something that was designed to benifit a very
constrained use case and harm everything else.

> As far as chaining a sub allocation, I'm not sure I follow. Isn't that kinda
> what Leon is doing here? Or will do, in other words move the qp allocation
> to the core and leave the SGE allocation in the driver per node. I can't say
> for any certainty one way or the other this is OK. I just know it would
> really suck to end up with a performance regression for something that was
> easily avoided by not changing the code behavior. A regression in code that
> has been this way since day 1 would be really bad. I'd just really rather
> not take that chance.

It means put the data you know is performance sensitive in a struct
and then allocate that struct and related on the node that is
guarenteed to be touching that data. For instance if you have a pinned
workqueue or IRQ or something.

The core stuff in ib_qp is not performance sensitive and has no
obvious node affinity since it relates primarily to simple control
stuff.

> I would love to be able to go back in our code reviews and bug tracking and
> tell you exactly why this line of code was changed to be per node.
> Unfortunately that level of information has not passed on to Cornelis.

Wow, that is remarkable

Jason
Marciniszyn, Mike May 14, 2021, 3 p.m. UTC | #20
> The core stuff in ib_qp is not performance sensitive and has no obvious node
> affinity since it relates primarily to simple control stuff.
> 

The current rvt_qp "inherits" from ib_qp, so the fields in the "control" stuff are performance critical especially for receive processing and have historically live in the same allocation.

I would in no way call these fields "simple control stuff".    The rvt_qp structure is tuned to optimize receive processing and the NUMA locality is part of that tuning.

We could separate out the allocation, but that misses promoting fields from rvt_qp that may indeed be common into the core.

I know that we use the qpn from ib_qp and there may be other fields in the critical path.

Mike
Jason Gunthorpe May 14, 2021, 3:02 p.m. UTC | #21
On Fri, May 14, 2021 at 03:00:37PM +0000, Marciniszyn, Mike wrote:
> > The core stuff in ib_qp is not performance sensitive and has no obvious node
> > affinity since it relates primarily to simple control stuff.
> > 
> 
> The current rvt_qp "inherits" from ib_qp, so the fields in the
> "control" stuff are performance critical especially for receive
> processing and have historically live in the same allocation.

This is why I said "core stuff in ib_qp" if drivers are adding
performance stuff to their own structs then that is the driver's
responsibility to handle.

> I would in no way call these fields "simple control stuff".  The
> rvt_qp structure is tuned to optimize receive processing and the
> NUMA locality is part of that tuning.
> 
> We could separate out the allocation, but that misses promoting
> fields from rvt_qp that may indeed be common into the core.
> 
> I know that we use the qpn from ib_qp and there may be other fields
> in the critical path.

I wouldn't worry about 32 bits when tuning for performance

Jason
Leon Romanovsky May 16, 2021, 10:56 a.m. UTC | #22
On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote:
> On 5/12/21 8:50 AM, Leon Romanovsky wrote:
> > On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote:
> > > > > Thanks Leon, we'll get this put through our testing.
> > > > 
> > > > Thanks a lot.
> > > > 
> > > > > 
> > > 
> > > The patch as is passed all our functional testing.
> > 
> > Thanks Mike,
> > 
> > Can I ask you to perform a performance comparison between this patch and
> > the following?
> 
> We have years of performance data with the code the way it is. Please
> maintain the original functionality of the code when moving things into the
> core unless there is a compelling reason to change. That is not the case
> here.

Sorry for not being responsive.

In addition to already said in parallel thread, this change keeps the
functionality except static node. I'm curious to finally see the difference
between these two allocations and it is very unlikely we will see any.

For example, this QP can be associated with application that runs on
different node than rdi->dparms.node. Will we see performance degradation?

Thanks
Leon Romanovsky May 19, 2021, 7:50 a.m. UTC | #23
On Fri, May 14, 2021 at 12:02:37PM -0300, Jason Gunthorpe wrote:
> On Fri, May 14, 2021 at 03:00:37PM +0000, Marciniszyn, Mike wrote:
> > > The core stuff in ib_qp is not performance sensitive and has no obvious node
> > > affinity since it relates primarily to simple control stuff.
> > > 
> > 
> > The current rvt_qp "inherits" from ib_qp, so the fields in the
> > "control" stuff are performance critical especially for receive
> > processing and have historically live in the same allocation.
> 
> This is why I said "core stuff in ib_qp" if drivers are adding
> performance stuff to their own structs then that is the driver's
> responsibility to handle.

Can I learn from this response that node aware allocation is not needed,
and this patch can go as is.

Thanks
Dennis Dalessandro May 19, 2021, 11:56 a.m. UTC | #24
On 5/19/21 3:50 AM, Leon Romanovsky wrote:
> On Fri, May 14, 2021 at 12:02:37PM -0300, Jason Gunthorpe wrote:
>> On Fri, May 14, 2021 at 03:00:37PM +0000, Marciniszyn, Mike wrote:
>>>> The core stuff in ib_qp is not performance sensitive and has no obvious node
>>>> affinity since it relates primarily to simple control stuff.
>>>>
>>>
>>> The current rvt_qp "inherits" from ib_qp, so the fields in the
>>> "control" stuff are performance critical especially for receive
>>> processing and have historically live in the same allocation.
>>
>> This is why I said "core stuff in ib_qp" if drivers are adding
>> performance stuff to their own structs then that is the driver's
>> responsibility to handle.
> 
> Can I learn from this response that node aware allocation is not needed,
> and this patch can go as is.

It's pretty clearly a NAK from us. The code was purposely written this 
way, it was done years ago (from day 1 basically), changing it now is 
concerning for performance.

Perhaps the code can be enhanced to move more stuff into the driver's 
own structs as Jason points out, but that should happen first. For now I 
still don't understand why the core can't optionally make the allocation 
per node.

-Denny
Jason Gunthorpe May 19, 2021, 6:29 p.m. UTC | #25
On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote:

> Perhaps the code can be enhanced to move more stuff into the driver's own
> structs as Jason points out, but that should happen first. For now I still
> don't understand why the core can't optionally make the allocation per node.

Because I think it is wrong in the general case to assign all
allocations to a single node?

Jason
Dennis Dalessandro May 19, 2021, 7:49 p.m. UTC | #26
On 5/19/21 2:29 PM, Jason Gunthorpe wrote:
> On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote:
> 
>> Perhaps the code can be enhanced to move more stuff into the driver's own
>> structs as Jason points out, but that should happen first. For now I still
>> don't understand why the core can't optionally make the allocation per node.
> 
> Because I think it is wrong in the general case to assign all
> allocations to a single node?

If by general case you mean for all drivers, sure, totally agree. We 
aren't talking about all drivers though, just the particular case of rdmavt.

-Denny
Jason Gunthorpe May 19, 2021, 8:26 p.m. UTC | #27
On Wed, May 19, 2021 at 03:49:31PM -0400, Dennis Dalessandro wrote:
> On 5/19/21 2:29 PM, Jason Gunthorpe wrote:
> > On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote:
> > 
> > > Perhaps the code can be enhanced to move more stuff into the driver's own
> > > structs as Jason points out, but that should happen first. For now I still
> > > don't understand why the core can't optionally make the allocation per node.
> > 
> > Because I think it is wrong in the general case to assign all
> > allocations to a single node?
> 
> If by general case you mean for all drivers, sure, totally agree. We aren't
> talking about all drivers though, just the particular case of rdmavt.

I think it is wrong for rdmavt too and your benchmarks have focused on
a specific case with process/thread affinities that can actually
benefit from it.

I don't want to encourage other drivers to do the same thing.

The correct thing to do today in 2021 is to use the standard NUMA
memory policy on already node-affine threads. The memory policy goes
into the kernel and normal non-_node allocations will obey it. When
combined with an appropriate node-affine HCA this will work as you are
expecting right now.

However you can't do anything like that while the kernel has the _node
annotations, that overrides the NUMA memory policy and breaks the
policy system!

The *only* reason to override the node behavior in the kernel is if
the kernel knows with high certainty that allocations are only going
to be touched by certain CPUs, such as because it knows that the
allocation is substantially for use in a CPU pinned irq/workqeueue or
accessed via DMA from a node affine DMA device.

None of these seem true for the QP struct.

Especially since for RDMA all of the above is highly situational. The
IRQ/WQ processing anything in RDMA should be tied to the comp_vector,
so without knowing that information you simply can't do anything
correct at allocation time. 

The idea of allocating every to the HW's node is simply not correct
design. I will grant you it may have made sense ages ago before the
NUMA stuff was more completed, but today it does not and you'd be
better to remove it all and use memory policy properly than insist we
keep it around forever.

Jason
Dennis Dalessandro May 20, 2021, 10:02 p.m. UTC | #28
On 5/19/21 4:26 PM, Jason Gunthorpe wrote:
> On Wed, May 19, 2021 at 03:49:31PM -0400, Dennis Dalessandro wrote:
>> On 5/19/21 2:29 PM, Jason Gunthorpe wrote:
>>> On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote:
>>>
>>>> Perhaps the code can be enhanced to move more stuff into the driver's own
>>>> structs as Jason points out, but that should happen first. For now I still
>>>> don't understand why the core can't optionally make the allocation per node.
>>>
>>> Because I think it is wrong in the general case to assign all
>>> allocations to a single node?
>>
>> If by general case you mean for all drivers, sure, totally agree. We aren't
>> talking about all drivers though, just the particular case of rdmavt.
> 
> I think it is wrong for rdmavt too and your benchmarks have focused on
> a specific case with process/thread affinities that can actually
> benefit from it.
> 
> I don't want to encourage other drivers to do the same thing.

I would imagine they would get the same push back we are getting here. I 
don't think this would encourage anyone honestly.

> The correct thing to do today in 2021 is to use the standard NUMA
> memory policy on already node-affine threads. The memory policy goes
> into the kernel and normal non-_node allocations will obey it. When
> combined with an appropriate node-affine HCA this will work as you are
> expecting right now.

So we shouldn't see any issue in the normal case is what you are saying. 
I'd like to believe that, proving it is not easy though.

> However you can't do anything like that while the kernel has the _node
> annotations, that overrides the NUMA memory policy and breaks the
> policy system!

Does our driver doing this break the entire system? I'm not sure how 
that's possible. Is there an effort to get rid of these per node 
allocations so ultimately we won't have a choice at some point?

> The *only* reason to override the node behavior in the kernel is if
> the kernel knows with high certainty that allocations are only going
> to be touched by certain CPUs, such as because it knows that the
> allocation is substantially for use in a CPU pinned irq/workqeueue or
> accessed via DMA from a node affine DMA device.
> 
> None of these seem true for the QP struct.

Will let Mike M respond about that. He's got a much better handle on the 
implications.

> Especially since for RDMA all of the above is highly situational. The
> IRQ/WQ processing anything in RDMA should be tied to the comp_vector,
> so without knowing that information you simply can't do anything
> correct at allocation time.

I don't think that's true for our case. The comp_vector may in some 
cases be the right thing to dictate where memory should be, in our case 
I don't think that's true all the time.

> The idea of allocating every to the HW's node is simply not correct
> design. I will grant you it may have made sense ages ago before the
> NUMA stuff was more completed, but today it does not and you'd be
> better to remove it all and use memory policy properly than insist we
> keep it around forever.

Not insisting anything. If the trend is to remove these sort of 
allocations and other drivers are no longer doing this "not correct 
design" we are certainly open to change. We just want to understand the 
impact first rather than being strong armed into accepting a performance 
regression just so Leon can refactor some code.

-Denny
Leon Romanovsky May 21, 2021, 6:29 a.m. UTC | #29
On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote:
> On 5/19/21 4:26 PM, Jason Gunthorpe wrote:
> > On Wed, May 19, 2021 at 03:49:31PM -0400, Dennis Dalessandro wrote:
> > > On 5/19/21 2:29 PM, Jason Gunthorpe wrote:
> > > > On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote:

<...>

> > Especially since for RDMA all of the above is highly situational. The
> > IRQ/WQ processing anything in RDMA should be tied to the comp_vector,
> > so without knowing that information you simply can't do anything
> > correct at allocation time.
> 
> I don't think that's true for our case. The comp_vector may in some cases be
> the right thing to dictate where memory should be, in our case I don't think
> that's true all the time.

In verbs world, the comp_vector is always the right thing to dictate
node policy. We can argue if it works correctly or not.

https://www.rdmamojo.com/2012/11/03/ibv_create_cq/
comp_vector:
 MSI-X completion vector that will be used for signaling Completion events.
 If the IRQ affinity masks of these interrupts have been configured to spread
 each MSI-X interrupt to be handled by a different core, this parameter can be
 used to spread the completion workload over multiple cores.

> 
> > The idea of allocating every to the HW's node is simply not correct
> > design. I will grant you it may have made sense ages ago before the
> > NUMA stuff was more completed, but today it does not and you'd be
> > better to remove it all and use memory policy properly than insist we
> > keep it around forever.
> 
> Not insisting anything. If the trend is to remove these sort of allocations
> and other drivers are no longer doing this "not correct design" we are
> certainly open to change. We just want to understand the impact first rather
> than being strong armed into accepting a performance regression just so Leon
> can refactor some code.

It is hard to talk without data.

Thanks

> 
> -Denny
Jason Gunthorpe May 25, 2021, 1:13 p.m. UTC | #30
On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote:

> > I don't want to encourage other drivers to do the same thing.
> 
> I would imagine they would get the same push back we are getting here. I
> don't think this would encourage anyone honestly.

Then we are back to making infrastructure that is only useful for one,
arguably wrong, driver.
 
> > The correct thing to do today in 2021 is to use the standard NUMA
> > memory policy on already node-affine threads. The memory policy goes
> > into the kernel and normal non-_node allocations will obey it. When
> > combined with an appropriate node-affine HCA this will work as you are
> > expecting right now.
> 
> So we shouldn't see any issue in the normal case is what you are
> saying. I'd like to believe that, proving it is not easy though.

Well, I said you have to setup the userspace properly, I'm not sure it
just works out of the box.

> > However you can't do anything like that while the kernel has the _node
> > annotations, that overrides the NUMA memory policy and breaks the
> > policy system!
> 
> Does our driver doing this break the entire system? I'm not sure how that's
> possible. 

It breaks your driver part of it, and if we lift it to the core code
then it breaks all drivers, so it is a hard no-go.

> Is there an effort to get rid of these per node allocations so
> ultimately we won't have a choice at some point?

Unlikely, subtle stuff like this will just be left broken in drivers
nobody cares about..

Jason
Dennis Dalessandro May 25, 2021, 2:10 p.m. UTC | #31
On 5/25/21 9:13 AM, Jason Gunthorpe wrote:
> On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote:
> 
>>> I don't want to encourage other drivers to do the same thing.
>>
>> I would imagine they would get the same push back we are getting here. I
>> don't think this would encourage anyone honestly.
> 
> Then we are back to making infrastructure that is only useful for one,
> arguably wrong, driver.

That's just it, you argue that it's wrong. We don't agree that it's 
wrong. In fact you said previously:

"
The *only* reason to override the node behavior in the kernel is if
the kernel knows with high certainty that allocations are only going
to be touched by certain CPUs, such as because it knows that the
allocation is substantially for use in a CPU pinned irq/workqeueue or
accessed via DMA from a node affine DMA device.
"

Well, that's pretty much why we are doing this.

>>> The correct thing to do today in 2021 is to use the standard NUMA
>>> memory policy on already node-affine threads. The memory policy goes
>>> into the kernel and normal non-_node allocations will obey it. When
>>> combined with an appropriate node-affine HCA this will work as you are
>>> expecting right now.
>>
>> So we shouldn't see any issue in the normal case is what you are
>> saying. I'd like to believe that, proving it is not easy though.
> 
> Well, I said you have to setup the userspace properly, I'm not sure it
> just works out of the box.

I'm going to go out on a limb and assume it will not just work out of 
the box.

>>> However you can't do anything like that while the kernel has the _node
>>> annotations, that overrides the NUMA memory policy and breaks the
>>> policy system!
>>
>> Does our driver doing this break the entire system? I'm not sure how that's
>> possible.
> 
> It breaks your driver part of it, and if we lift it to the core code
> then it breaks all drivers, so it is a hard no-go.
> 
>> Is there an effort to get rid of these per node allocations so
>> ultimately we won't have a choice at some point?
> 
> Unlikely, subtle stuff like this will just be left broken in drivers
> nobody cares about..

If it's not that big of a deal then what's the problem? Again, you keep 
saying it's broken. I'm still not seeing a compelling reason that tells 
me it is in fact broken. This is the way we get best performance which 
for the RDMA subsystem should pretty much trump everything except security.

All this being said, philosophical and theoretical arguments aren't 
going to get us anywhere productive. Things could certainly be different 
performance wise half a decade later after the code originally went in.

We are already mid 5.13 cycle. So the earliest this could be queued up 
to go in is 5.14. Can this wait one more cycle? If we can't get it 
tested/proven to make a difference mid 5.14, we will drop the objection 
and Leon's patch can go ahead in for 5.15. Fair compromise?

-Denny
Jason Gunthorpe May 25, 2021, 2:20 p.m. UTC | #32
On Tue, May 25, 2021 at 10:10:47AM -0400, Dennis Dalessandro wrote:
> On 5/25/21 9:13 AM, Jason Gunthorpe wrote:
> > On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote:
> > 
> > > > I don't want to encourage other drivers to do the same thing.
> > > 
> > > I would imagine they would get the same push back we are getting here. I
> > > don't think this would encourage anyone honestly.
> > 
> > Then we are back to making infrastructure that is only useful for one,
> > arguably wrong, driver.
> 
> That's just it, you argue that it's wrong. We don't agree that it's wrong.
> In fact you said previously:

You haven't presented a single shred of anything to substantiate this
disagreement beyoned "we have ancient benchmarks we can't reproduce"

Not even a hand wavey logical argument why it could matter.

> "
> The *only* reason to override the node behavior in the kernel is if
> the kernel knows with high certainty that allocations are only going
> to be touched by certain CPUs, such as because it knows that the
> allocation is substantially for use in a CPU pinned irq/workqeueue or
> accessed via DMA from a node affine DMA device.
> "
> 
> Well, that's pretty much why we are doing this.

Huh?I don't see DMA from the qp struct and as I said any MSI affinity
should be driven by the comp_vector, so no, I don't think that is what
HFI is doing at all.

> We are already mid 5.13 cycle. So the earliest this could be queued up to go
> in is 5.14. Can this wait one more cycle? If we can't get it tested/proven
> to make a difference mid 5.14, we will drop the objection and Leon's patch
> can go ahead in for 5.15. Fair compromise?

Fine, but the main question is if you can use normal memory policy
settings, not this.

Jason
Dennis Dalessandro May 25, 2021, 2:29 p.m. UTC | #33
On 5/25/21 10:20 AM, Jason Gunthorpe wrote:
>> We are already mid 5.13 cycle. So the earliest this could be queued up to go
>> in is 5.14. Can this wait one more cycle? If we can't get it tested/proven
>> to make a difference mid 5.14, we will drop the objection and Leon's patch
>> can go ahead in for 5.15. Fair compromise?
> 
> Fine, but the main question is if you can use normal memory policy
> settings, not this.

Agreed.

-Denny
Leon Romanovsky June 2, 2021, 4:33 a.m. UTC | #34
On Tue, May 25, 2021 at 10:10:47AM -0400, Dennis Dalessandro wrote:
> On 5/25/21 9:13 AM, Jason Gunthorpe wrote:
> > On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote:

<...>

> We are already mid 5.13 cycle. So the earliest this could be queued up to go
> in is 5.14. Can this wait one more cycle? If we can't get it tested/proven
> to make a difference mid 5.14, we will drop the objection and Leon's patch
> can go ahead in for 5.15. Fair compromise?

I sent this patch as early as I could to make sure that it won't
jeopardize the restrack QP flow fixes. Delaying one more cycle means
that QP conversion will be delayed too which is needed to close the race
between netlink query QP call and simultaneous ibv_destroy_qp() call.

Thanks

> 
> -Denny
> 
> 
> 
>
Marciniszyn, Mike June 28, 2021, 9:59 p.m. UTC | #35
>
> Fine, but the main question is if you can use normal memory policy settings, not
> this.
>
> Jason

Our performance team has gotten some preliminary data on AMD platforms.

I prepared a kernel that will allocate the QP using the "local" numa node (as currently done) and an allocation that intentionally allocates on the opposite socket based on a module parameter, and our internal tests were executed with progressively larger queue pair counts.

In the second case on 64 core/socket AMD platforms, we are seeing with the intentionally opposite allocation, latency dropped ~6-7% and BW dropped ~13% on high queue count perftest.

SKX impact is minimal if any, but we need to look at legacy Intel chips that preceded SKX.   We are still reviewing the data and expanding the test to older chips.

Our theory is the hfi1 interrupt receive processing is fetching cachelines between the sockets causing the slowdown.   The receive processing is critical for hfi1 (and qib before that).    This is a heavily tuned code path.

To answer some of the pending questions posed before, the mempolicy looks to be a process relative control and does not apply to our QP allocation where the struct rvt_qp is in the kernel.  It certainly does not apply to kernel ULPs such as those created by say Lustre, ipoib, SRP, iSer, and NFS RDMA.

We do support comp_vector stuff, but that distributes completion processing.  Completions are triggered in our receive processing but to a much less extent based on ULP choices and packet type.    From a strategy standpoint, the code assumes distribution of kernel receive interrupt processing is vectored either by irqbalance or by explicit user mode scripting to spread RC QP receive processing across CPUs on the local socket.

Mike
External recipient
Jason Gunthorpe June 28, 2021, 11:19 p.m. UTC | #36
On Mon, Jun 28, 2021 at 09:59:48PM +0000, Marciniszyn, Mike wrote:

> To answer some of the pending questions posed before, the mempolicy
> looks to be a process relative control and does not apply to our QP
> allocation where the struct rvt_qp is in the kernel.

I think mempolicy is per task, which is a thread, and it propagates
into kernel allocations made under that task's current

> It certainly does not apply to kernel ULPs such as those created by
> say Lustre, ipoib, SRP, iSer, and NFS RDMA.

These don't use uverbs, so a uverbs change is not relavent.
 
> We do support comp_vector stuff, but that distributes completion
> processing.  Completions are triggered in our receive processing but
> to a much less extent based on ULP choices and packet type.  From a
> strategy standpoint, the code assumes distribution of kernel receive
> interrupt processing is vectored either by irqbalance or by explicit
> user mode scripting to spread RC QP receive processing across CPUs
> on the local socket.

And there you go, it should be allocating the memory based on the NUMA
affinity of the IRQ that it is going to assign to touch the memory.

And the CPU threads that are triggering this should be affine to the
same socket as well, otherwise you just get bouncing in another area.

Overall I think you get the same configuration if you just let the
normal policy stuff do its work, and it might be less fragile to boot.

I certainly object to this idea that the driver assumes userspace will
never move its IRQs off the local because it has wrongly hardwired a
numa locality to the wrong object.

Jason
Leon Romanovsky July 4, 2021, 6:34 a.m. UTC | #37
On Mon, Jun 28, 2021 at 08:19:34PM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 28, 2021 at 09:59:48PM +0000, Marciniszyn, Mike wrote:

<...>

> I certainly object to this idea that the driver assumes userspace will
> never move its IRQs off the local because it has wrongly hardwired a
> numa locality to the wrong object.

It makes me wonder how should we progress with my patch.

Thanks

> 
> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 9d13db68283c..4522071fc220 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1077,7 +1077,7 @@  struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 	int err;
 	struct rvt_swqe *swq = NULL;
 	size_t sz;
-	size_t sg_list_sz;
+	size_t sg_list_sz = 0;
 	struct ib_qp *ret = ERR_PTR(-ENOMEM);
 	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
 	void *priv = NULL;
@@ -1125,8 +1125,6 @@  struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 		if (!swq)
 			return ERR_PTR(-ENOMEM);
 
-		sz = sizeof(*qp);
-		sg_list_sz = 0;
 		if (init_attr->srq) {
 			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
 
@@ -1136,10 +1134,13 @@  struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 		} else if (init_attr->cap.max_recv_sge > 1)
 			sg_list_sz = sizeof(*qp->r_sg_list) *
 				(init_attr->cap.max_recv_sge - 1);
-		qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL,
-				  rdi->dparms.node);
+		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 		if (!qp)
 			goto bail_swq;
+		qp->r_sg_list =
+			kzalloc_node(sg_list_sz, GFP_KERNEL, rdi->dparms.node);
+		if (!qp->r_sg_list)
+			goto bail_qp;
 		qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
 
 		RCU_INIT_POINTER(qp->next, NULL);
@@ -1327,6 +1328,7 @@  struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 
 bail_qp:
 	kfree(qp->s_ack_queue);
+	kfree(qp->r_sg_list);
 	kfree(qp);
 
 bail_swq:
@@ -1761,6 +1763,7 @@  int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	kvfree(qp->r_rq.kwq);
 	rdi->driver_f.qp_priv_free(rdi, qp);
 	kfree(qp->s_ack_queue);
+	kfree(qp->r_sg_list);
 	rdma_destroy_ah_attr(&qp->remote_ah_attr);
 	rdma_destroy_ah_attr(&qp->alt_ah_attr);
 	free_ud_wq_attr(qp);
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 8275954f5ce6..2e58d5e6ac0e 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -444,7 +444,7 @@  struct rvt_qp {
 	/*
 	 * This sge list MUST be last. Do not add anything below here.
 	 */
-	struct rvt_sge r_sg_list[] /* verified SGEs */
+	struct rvt_sge *r_sg_list /* verified SGEs */
 		____cacheline_aligned_in_smp;
 };