Message ID | c34a864803f9bbd33d3f856a6ba2dd595ab708a7.1620729033.git.leonro@nvidia.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [rdma-next] RDMA/rdmavt: Decouple QP and SGE lists allocations | expand |
> On 11 May 2021, at 12:36, Leon Romanovsky <leon@kernel.org> wrote: > > From: Leon Romanovsky <leonro@nvidia.com> > > The rdmavt QP has fields that are both needed for the control and data > path. Such mixed declaration caused to the very specific allocation flow > with kzalloc_node and SGE list embedded into the struct rvt_qp. > > This patch separates QP creation to two: regular memory allocation for > the control path and specific code for the SGE list, while the access to > the later is performed through derefenced pointer. > > Such pointer and its context are expected to be in the cache, so > performance difference is expected to be negligible, if any exists. > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > Hi, > > This change is needed to convert QP to core allocation scheme. In that > scheme QP is allocated outside of the driver and size of such allocation > is constant and can be calculated at the compile time. > > Thanks > --- > drivers/infiniband/sw/rdmavt/qp.c | 13 ++++++++----- > include/rdma/rdmavt_qp.h | 2 +- > 2 files changed, 9 insertions(+), 6 deletions(-) > > diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c > index 9d13db68283c..4522071fc220 100644 > --- a/drivers/infiniband/sw/rdmavt/qp.c > +++ b/drivers/infiniband/sw/rdmavt/qp.c > @@ -1077,7 +1077,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > int err; > struct rvt_swqe *swq = NULL; > size_t sz; > - size_t sg_list_sz; > + size_t sg_list_sz = 0; > struct ib_qp *ret = ERR_PTR(-ENOMEM); > struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); > void *priv = NULL; > @@ -1125,8 +1125,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > if (!swq) > return ERR_PTR(-ENOMEM); > > - sz = sizeof(*qp); > - sg_list_sz = 0; > if (init_attr->srq) { > struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq); > > @@ -1136,10 +1134,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > } else if (init_attr->cap.max_recv_sge > 1) > sg_list_sz = 
sizeof(*qp->r_sg_list) * > (init_attr->cap.max_recv_sge - 1); > - qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL, > - rdi->dparms.node); > + qp = kzalloc(sizeof(*qp), GFP_KERNEL); Why not kzalloc_node() here? Thxs, HÃ¥kon > if (!qp) > goto bail_swq; > + qp->r_sg_list = > + kzalloc_node(sg_list_sz, GFP_KERNEL, rdi->dparms.node); > + if (!qp->r_sg_list) > + goto bail_qp; > qp->allowed_ops = get_allowed_ops(init_attr->qp_type); > > RCU_INIT_POINTER(qp->next, NULL); > @@ -1327,6 +1328,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > > bail_qp: > kfree(qp->s_ack_queue); > + kfree(qp->r_sg_list); > kfree(qp); > > bail_swq: > @@ -1761,6 +1763,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) > kvfree(qp->r_rq.kwq); > rdi->driver_f.qp_priv_free(rdi, qp); > kfree(qp->s_ack_queue); > + kfree(qp->r_sg_list); > rdma_destroy_ah_attr(&qp->remote_ah_attr); > rdma_destroy_ah_attr(&qp->alt_ah_attr); > free_ud_wq_attr(qp); > diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h > index 8275954f5ce6..2e58d5e6ac0e 100644 > --- a/include/rdma/rdmavt_qp.h > +++ b/include/rdma/rdmavt_qp.h > @@ -444,7 +444,7 @@ struct rvt_qp { > /* > * This sge list MUST be last. Do not add anything below here. > */ > - struct rvt_sge r_sg_list[] /* verified SGEs */ > + struct rvt_sge *r_sg_list /* verified SGEs */ > ____cacheline_aligned_in_smp; > }; > > -- > 2.31.1 >
On 5/11/21 6:36 AM, Leon Romanovsky wrote: > From: Leon Romanovsky <leonro@nvidia.com> > > The rdmavt QP has fields that are both needed for the control and data > path. Such mixed declaration caused to the very specific allocation flow > with kzalloc_node and SGE list embedded into the struct rvt_qp. > > This patch separates QP creation to two: regular memory allocation for > the control path and specific code for the SGE list, while the access to > the later is performed through derefenced pointer. > > Such pointer and its context are expected to be in the cache, so > performance difference is expected to be negligible, if any exists. > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > Hi, > > This change is needed to convert QP to core allocation scheme. In that > scheme QP is allocated outside of the driver and size of such allocation > is constant and can be calculated at the compile time. Thanks Leon, we'll get this put through our testing. -Denny
On Tue, May 11, 2021 at 10:59:52AM +0000, Haakon Bugge wrote: > > > > On 11 May 2021, at 12:36, Leon Romanovsky <leon@kernel.org> wrote: > > > > From: Leon Romanovsky <leonro@nvidia.com> > > > > The rdmavt QP has fields that are both needed for the control and data > > path. Such mixed declaration caused to the very specific allocation flow > > with kzalloc_node and SGE list embedded into the struct rvt_qp. > > > > This patch separates QP creation to two: regular memory allocation for > > the control path and specific code for the SGE list, while the access to > > the later is performed through derefenced pointer. > > > > Such pointer and its context are expected to be in the cache, so > > performance difference is expected to be negligible, if any exists. > > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > > --- > > Hi, > > > > This change is needed to convert QP to core allocation scheme. In that > > scheme QP is allocated outside of the driver and size of such allocation > > is constant and can be calculated at the compile time. 
> > > > Thanks > > --- > > drivers/infiniband/sw/rdmavt/qp.c | 13 ++++++++----- > > include/rdma/rdmavt_qp.h | 2 +- > > 2 files changed, 9 insertions(+), 6 deletions(-) > > > > diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c > > index 9d13db68283c..4522071fc220 100644 > > --- a/drivers/infiniband/sw/rdmavt/qp.c > > +++ b/drivers/infiniband/sw/rdmavt/qp.c > > @@ -1077,7 +1077,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > > int err; > > struct rvt_swqe *swq = NULL; > > size_t sz; > > - size_t sg_list_sz; > > + size_t sg_list_sz = 0; > > struct ib_qp *ret = ERR_PTR(-ENOMEM); > > struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); > > void *priv = NULL; > > @@ -1125,8 +1125,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > > if (!swq) > > return ERR_PTR(-ENOMEM); > > > > - sz = sizeof(*qp); > > - sg_list_sz = 0; > > if (init_attr->srq) { > > struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq); > > > > @@ -1136,10 +1134,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > > } else if (init_attr->cap.max_recv_sge > 1) > > sg_list_sz = sizeof(*qp->r_sg_list) * > > (init_attr->cap.max_recv_sge - 1); > > - qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL, > > - rdi->dparms.node); > > + qp = kzalloc(sizeof(*qp), GFP_KERNEL); > > Why not kzalloc_node() here? The idea is to delete this kzalloc later in next patch, because all drivers are doing same thing "qp = kzalloc(sizeof(*qp), GFP_KERNEL);". Thanks
On Tue, May 11, 2021 at 08:26:43AM -0400, Dennis Dalessandro wrote: > On 5/11/21 6:36 AM, Leon Romanovsky wrote: > > From: Leon Romanovsky <leonro@nvidia.com> > > > > The rdmavt QP has fields that are both needed for the control and data > > path. Such mixed declaration caused to the very specific allocation flow > > with kzalloc_node and SGE list embedded into the struct rvt_qp. > > > > This patch separates QP creation to two: regular memory allocation for > > the control path and specific code for the SGE list, while the access to > > the later is performed through derefenced pointer. > > > > Such pointer and its context are expected to be in the cache, so > > performance difference is expected to be negligible, if any exists. > > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > > --- > > Hi, > > > > This change is needed to convert QP to core allocation scheme. In that > > scheme QP is allocated outside of the driver and size of such allocation > > is constant and can be calculated at the compile time. > > Thanks Leon, we'll get this put through our testing. Thanks a lot. > > -Denny
> > > > Why not kzalloc_node() here? I agree here. Other allocations that have been promoted to the core have lost the node attribute in the allocation. For the rdmavt based drivers and especially with the QP, there are performance implications. I have no issue moving the allocation, as long as the node centric allocation is preserved. I will test the patch to make sure there is nothing else lurking. Mike
On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote: > > > > > > Why not kzalloc_node() here? > > I agree here. > > Other allocations that have been promoted to the core have lost the node attribute in the allocation. Did you notice any performance degradation? Thanks
> > On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote: > > > > > > > > Why not kzalloc_node() here? > > > > I agree here. > > > > Other allocations that have been promoted to the core have lost the node > attribute in the allocation. > > Did you notice any performance degradation? > For the QP, we most certainly will. In any case, the promotion should address not losing the node. The allocation gets the ib_device, and it would seem to hard to add method of determining the node. Mike
On 5/11/21 3:27 PM, Leon Romanovsky wrote: > On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote: >>>> >>>> Why not kzalloc_node() here? >> >> I agree here. >> >> Other allocations that have been promoted to the core have lost the node attribute in the allocation. > > Did you notice any performance degradation? > So what's the motivation to change it from the way it was originally designed? It seems to me if the original implementation went to the trouble to allocate the memory on the local node, refactoring the code should respect that. -Denny
On Wed, May 12, 2021 at 12:08:59AM -0400, Dennis Dalessandro wrote: > > On 5/11/21 3:27 PM, Leon Romanovsky wrote: > > On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote: > > > > > > > > > > Why not kzalloc_node() here? > > > > > > I agree here. > > > > > > Other allocations that have been promoted to the core have lost the node attribute in the allocation. > > > > Did you notice any performance degradation? > > > > So what's the motivation to change it from the way it was originally > designed? It seems to me if the original implementation went to the trouble > to allocate the memory on the local node, refactoring the code should > respect that. I have no problem to make rdma_zalloc_*() node aware, but would like to get real performance justification. My assumption is that rdmavt use kzalloc_node for the control plane based on some internal performance testing and we finally can see the difference between kzalloc and kzalloc_node, am I right? Is the claim of performance degradation backed by data? The main reason (maybe I'm wrong here) is to avoid _node() allocators because they increase chances of memory allocation failure due to not doing fallback in case node memory is depleted. Again, I'm suggesting to do plain kzalloc() for control part of QP. Thanks > > -Denny
> - struct rvt_sge r_sg_list[] /* verified SGEs */ > + struct rvt_sge *r_sg_list /* verified SGEs */ > ____cacheline_aligned_in_smp; > }; > Since since has been made an independent allocation, r_sg_list becomes a read-mostly pointer and should be moved up in rvt_qp to other 64 bit fields around timeout_jiffies. The cacheline can then be dropped for this field. Mike
> > Thanks Leon, we'll get this put through our testing. > > Thanks a lot. > > > The patch as is passed all our functional testing. Mike
On 5/12/21 8:13 AM, Leon Romanovsky wrote: > On Wed, May 12, 2021 at 12:08:59AM -0400, Dennis Dalessandro wrote: >> >> On 5/11/21 3:27 PM, Leon Romanovsky wrote: >>> On Tue, May 11, 2021 at 07:15:09PM +0000, Marciniszyn, Mike wrote: >>>>>> >>>>>> Why not kzalloc_node() here? >>>> >>>> I agree here. >>>> >>>> Other allocations that have been promoted to the core have lost the node attribute in the allocation. >>> >>> Did you notice any performance degradation? >>> >> >> So what's the motivation to change it from the way it was originally >> designed? It seems to me if the original implementation went to the trouble >> to allocate the memory on the local node, refactoring the code should >> respect that. > > I have no problem to make rdma_zalloc_*() node aware, but would like to get > real performance justification. My assumption is that rdmavt use kzalloc_node > for the control plane based on some internal performance testing and we finally > can see the difference between kzalloc and kzalloc_node, am I right? > > Is the claim of performance degradation backed by data? Yes, in the past. I don't have access anymore now that I'm not with Intel. It probably would not have been publishable anyway. > The main reason (maybe I'm wrong here) is to avoid _node() allocators > because they increase chances of memory allocation failure due to not > doing fallback in case node memory is depleted. Agreed. It's a trade-off that was deemed acceptable. > Again, I'm suggesting to do plain kzalloc() for control part of QP. Now I don't recall data for that specifically, but to be on the safe side I would not want to risk a performance regression. -Denny
On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote: > > > Thanks Leon, we'll get this put through our testing. > > > > Thanks a lot. > > > > > > > The patch as is passed all our functional testing. Thanks Mike, Can I ask you to perform a performance comparison between this patch and the following? diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 4522071fc220..d0e0f7cbe17a 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1134,7 +1134,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, } else if (init_attr->cap.max_recv_sge > 1) sg_list_sz = sizeof(*qp->r_sg_list) * (init_attr->cap.max_recv_sge - 1); - qp = kzalloc(sizeof(*qp), GFP_KERNEL); + qp = kzalloc_node(sizeof(*qp), GFP_KERNEL, rdi->dparms.node); if (!qp) goto bail_swq; qp->r_sg_list = Thanks > > Mike
On 5/12/21 8:50 AM, Leon Romanovsky wrote: > On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote: >>>> Thanks Leon, we'll get this put through our testing. >>> >>> Thanks a lot. >>> >>>> >> >> The patch as is passed all our functional testing. > > Thanks Mike, > > Can I ask you to perform a performance comparison between this patch and > the following? We have years of performance data with the code the way it is. Please maintain the original functionality of the code when moving things into the core unless there is a compelling reason to change. That is not the case here. > > diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c > index 4522071fc220..d0e0f7cbe17a 100644 > --- a/drivers/infiniband/sw/rdmavt/qp.c > +++ b/drivers/infiniband/sw/rdmavt/qp.c > @@ -1134,7 +1134,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > } else if (init_attr->cap.max_recv_sge > 1) > sg_list_sz = sizeof(*qp->r_sg_list) * > (init_attr->cap.max_recv_sge - 1); > - qp = kzalloc(sizeof(*qp), GFP_KERNEL); > + qp = kzalloc_node(sizeof(*qp), GFP_KERNEL, rdi->dparms.node); > if (!qp) > goto bail_swq; > qp->r_sg_list = > > -Denny
On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote: > On 5/12/21 8:50 AM, Leon Romanovsky wrote: > > On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote: > > > > > Thanks Leon, we'll get this put through our testing. > > > > > > > > Thanks a lot. > > > > > > > > > > > > > > > The patch as is passed all our functional testing. > > > > Thanks Mike, > > > > Can I ask you to perform a performance comparison between this patch and > > the following? > > We have years of performance data with the code the way it is. Please > maintain the original functionality of the code when moving things into the > core unless there is a compelling reason to change. That is not the case > here. Well, making the core do node allocations for metadata on every driver is a pretty big thing to ask for with no data. Jason
On 5/13/21 3:15 PM, Jason Gunthorpe wrote: > On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote: >> On 5/12/21 8:50 AM, Leon Romanovsky wrote: >>> On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote: >>>>>> Thanks Leon, we'll get this put through our testing. >>>>> >>>>> Thanks a lot. >>>>> >>>>>> >>>> >>>> The patch as is passed all our functional testing. >>> >>> Thanks Mike, >>> >>> Can I ask you to perform a performance comparison between this patch and >>> the following? >> >> We have years of performance data with the code the way it is. Please >> maintain the original functionality of the code when moving things into the >> core unless there is a compelling reason to change. That is not the case >> here. > > Well, making the core do node allocations for metadata on every driver > is a pretty big thing to ask for with no data. Can't you just make the call into the core take a flag for this? You are looking to make a change to key behavior without any clear reason that I can see for why it needs to be that way. If there is a good reason, please explain so we can understand. I would think the person authoring the patch should be responsible to prove their patch doesn't cause a regression. We are telling you it did make a difference when the code was first written, probably like 6 years ago. At the very least no one had an issue with this code 4 years ago the last time it was touched (by Leon btw). Nor issues with the other uses of the call. We can have our performance experts look at it but it's going to take some time as it's nothing that has been on anyone's radar. -Denny
On Thu, May 13, 2021 at 03:31:48PM -0400, Dennis Dalessandro wrote: > On 5/13/21 3:15 PM, Jason Gunthorpe wrote: > > On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote: > > > On 5/12/21 8:50 AM, Leon Romanovsky wrote: > > > > On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote: > > > > > > > Thanks Leon, we'll get this put through our testing. > > > > > > > > > > > > Thanks a lot. > > > > > > > > > > > > > > > > > > > > > > > The patch as is passed all our functional testing. > > > > > > > > Thanks Mike, > > > > > > > > Can I ask you to perform a performance comparison between this patch and > > > > the following? > > > > > > We have years of performance data with the code the way it is. Please > > > maintain the original functionality of the code when moving things into the > > > core unless there is a compelling reason to change. That is not the case > > > here. > > > > Well, making the core do node allocations for metadata on every driver > > is a pretty big thing to ask for with no data. > > Can't you just make the call into the core take a flag for this? You are > looking to make a change to key behavior without any clear reason that I can > see for why it needs to be that way. If there is a good reason, please > explain so we can understand. The lifetime model of all this data is messed up, there are a bunch of little bugs on the error paths, and we can't have a proper refcount lifetime module when this code really wants to have it. IMHO if hf1 has a performance need here it should chain a sub allocation since promoting node awareness to the core code looks not nice.. These are not supposed to be performance sensitive data structures, they haven't even been organized for cache locality or anything. > I would think the person authoring the patch should be responsible to prove > their patch doesn't cause a regression. I'm more interested in this argument as it applied to functional regressions. 
Performance is always shifting around and a win for a node specific allocation seems highly situational to me. I half wonder if all the node allocation in this driver is just some copy and paste. Jason
On 5/14/21 9:02 AM, Jason Gunthorpe wrote: > On Thu, May 13, 2021 at 03:31:48PM -0400, Dennis Dalessandro wrote: >> On 5/13/21 3:15 PM, Jason Gunthorpe wrote: >>> On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote: >>>> On 5/12/21 8:50 AM, Leon Romanovsky wrote: >>>>> On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote: >>>>>>>> Thanks Leon, we'll get this put through our testing. >>>>>>> >>>>>>> Thanks a lot. >>>>>>> >>>>>>>> >>>>>> >>>>>> The patch as is passed all our functional testing. >>>>> >>>>> Thanks Mike, >>>>> >>>>> Can I ask you to perform a performance comparison between this patch and >>>>> the following? >>>> >>>> We have years of performance data with the code the way it is. Please >>>> maintain the original functionality of the code when moving things into the >>>> core unless there is a compelling reason to change. That is not the case >>>> here. >>> >>> Well, making the core do node allocations for metadata on every driver >>> is a pretty big thing to ask for with no data. >> >> Can't you just make the call into the core take a flag for this? You are >> looking to make a change to key behavior without any clear reason that I can >> see for why it needs to be that way. If there is a good reason, please >> explain so we can understand. > > The lifetime model of all this data is messed up, there are a bunch of > little bugs on the error paths, and we can't have a proper refcount > lifetime module when this code really wants to have it. > > IMHO if hf1 has a performance need here it should chain a sub > allocation since promoting node awareness to the core code looks > not nice.. That's part of what I want to understand. Why is it "not nice"? Is it because there is only 1 driver that needs it or something else. As far as chaining a sub allocation, I'm not sure I follow. Isn't that kinda what Leon is doing here? 
Or will do, in other words move the qp allocation to the core and leave the SGE allocation in the driver per node. I can't say for any certainty one way or the other this is OK. I just know it would really suck to end up with a performance regression for something that was easily avoided by not changing the code behavior. A regression in code that has been this way since day 1 would be really bad. I'd just really rather not take that chance. > These are not supposed to be performance sensitive data structures, > they haven't even been organized for cache locality or anything. > >> I would think the person authoring the patch should be responsible to prove >> their patch doesn't cause a regression. > > I'm more interested in this argument as it applied to functional > regressions. Performance is always shifting around and a win for a > node specific allocation seems highly situational to me. I half wonder > if all the node allocation in this driver is just some copy and > paste. I think prove is too strong of a word. Should have said do what is reasonably necessary to ensure their patch doesn't cause a regression. Whether that's running their own tests or taking the advice from the folks who wrote the initial code or even other non-biased review opinions, etc. I certainly don't expect Leon to throw some HFIs in a machine and do a performance evaluation. I think this is the exact opposite of copy/paste. When we wrote this code originally there was a ton of work that went into how data structures were aligned and organized, as well as an examination of allocations and per node allocations were found to be important. If you look at the original qib code in v4.5, before we did rdmavt, the allocation was not per node. We specifically changed that in v4.6 when we put in rdmavt. In v4.3 when hfi1 went into staging it was not using the per node variant either (because that was copy/paste). 
I would love to be able to go back in our code reviews and bug tracking and tell you exactly why this line of code was changed to be per node. Unfortunately that level of information has not passed on to Cornelis. -Denny
On Fri, May 14, 2021 at 10:07:43AM -0400, Dennis Dalessandro wrote: > > IMHO if hf1 has a performance need here it should chain a sub > > allocation since promoting node awareness to the core code looks > > not nice.. > > That's part of what I want to understand. Why is it "not nice"? Is it > because there is only 1 driver that needs it or something else. Because node allocation is extremely situational. Currently the kernel has some tunable automatic heuristic, overriding it should only be done in cases where the code knows absolutely that a node is the correct thing, for instance because an IRQ pinned to a specific node is the main consumer of the data. hfi1 might have some situation where putting the QP on the device's node makes sense, while another driver might work better with the QP on the user thread that owns it. Who knows, it depends on the access pattern. How do I sort this out in a generic way without making a big mess? And why are you so sure that node allocation is the right thing for hfi?? The interrupts can be rebalanced by userspace and user threads can be pinned to other nodes. Why is choosing the device node unconditionally the right choice? This feels like something that was designed to benifit a very constrained use case and harm everything else. > As far as chaining a sub allocation, I'm not sure I follow. Isn't that kinda > what Leon is doing here? Or will do, in other words move the qp allocation > to the core and leave the SGE allocation in the driver per node. I can't say > for any certainty one way or the other this is OK. I just know it would > really suck to end up with a performance regression for something that was > easily avoided by not changing the code behavior. A regression in code that > has been this way since day 1 would be really bad. I'd just really rather > not take that chance. 
It means put the data you know is performance sensitive in a struct and then allocate that struct and related on the node that is guarenteed to be touching that data. For instance if you have a pinned workqueue or IRQ or something. The core stuff in ib_qp is not performance sensitive and has no obvious node affinity since it relates primarily to simple control stuff. > I would love to be able to go back in our code reviews and bug tracking and > tell you exactly why this line of code was changed to be per node. > Unfortunately that level of information has not passed on to Cornelis. Wow, that is remarkable Jason
> The core stuff in ib_qp is not performance sensitive and has no obvious node > affinity since it relates primarily to simple control stuff. > The current rvt_qp "inherits" from ib_qp, so the fields in the "control" stuff are performance critical especially for receive processing and have historically live in the same allocation. I would in no way call these fields "simple control stuff". The rvt_qp structure is tuned to optimize receive processing and the NUMA locality is part of that tuning. We could separate out the allocation, but that misses promoting fields from rvt_qp that may indeed be common into the core. I know that we use the qpn from ib_qp and there may be other fields in the critical path. Mike
On Fri, May 14, 2021 at 03:00:37PM +0000, Marciniszyn, Mike wrote: > > The core stuff in ib_qp is not performance sensitive and has no obvious node > > affinity since it relates primarily to simple control stuff. > > > > The current rvt_qp "inherits" from ib_qp, so the fields in the > "control" stuff are performance critical especially for receive > processing and have historically live in the same allocation. This is why I said "core stuff in ib_qp" if drivers are adding performance stuff to their own structs then that is the driver's responsibility to handle. > I would in no way call these fields "simple control stuff". The > rvt_qp structure is tuned to optimize receive processing and the > NUMA locality is part of that tuning. > > We could separate out the allocation, but that misses promoting > fields from rvt_qp that may indeed be common into the core. > > I know that we use the qpn from ib_qp and there may be other fields > in the critical path. I wouldn't worry about 32 bits when tuning for performance Jason
On Thu, May 13, 2021 at 03:03:43PM -0400, Dennis Dalessandro wrote: > On 5/12/21 8:50 AM, Leon Romanovsky wrote: > > On Wed, May 12, 2021 at 12:25:15PM +0000, Marciniszyn, Mike wrote: > > > > > Thanks Leon, we'll get this put through our testing. > > > > > > > > Thanks a lot. > > > > > > > > > > > > > > > The patch as is passed all our functional testing. > > > > Thanks Mike, > > > > Can I ask you to perform a performance comparison between this patch and > > the following? > > We have years of performance data with the code the way it is. Please > maintain the original functionality of the code when moving things into the > core unless there is a compelling reason to change. That is not the case > here. Sorry for not being responsive. In addition to already said in parallel thread, this change keeps the functionality except static node. I'm curious to finally see the difference between these two allocations and it is very unlilkely we will see any. For example, this QP can be associated with application that runs on different node than rdi->dparms.node. Will we see performance degradation? Thanks
On Fri, May 14, 2021 at 12:02:37PM -0300, Jason Gunthorpe wrote: > On Fri, May 14, 2021 at 03:00:37PM +0000, Marciniszyn, Mike wrote: > > > The core stuff in ib_qp is not performance sensitive and has no obvious node > > > affinity since it relates primarily to simple control stuff. > > > > > > > The current rvt_qp "inherits" from ib_qp, so the fields in the > > "control" stuff are performance critical especially for receive > > processing and have historically live in the same allocation. > > This is why I said "core stuff in ib_qp" if drivers are adding > performance stuff to their own structs then that is the driver's > responsibility to handle. Can I learn from this response that node aware allocation is not needed, and this patch can go as is. Thanks
On 5/19/21 3:50 AM, Leon Romanovsky wrote: > On Fri, May 14, 2021 at 12:02:37PM -0300, Jason Gunthorpe wrote: >> On Fri, May 14, 2021 at 03:00:37PM +0000, Marciniszyn, Mike wrote: >>>> The core stuff in ib_qp is not performance sensitive and has no obvious node >>>> affinity since it relates primarily to simple control stuff. >>>> >>> >>> The current rvt_qp "inherits" from ib_qp, so the fields in the >>> "control" stuff are performance critical especially for receive >>> processing and have historically live in the same allocation. >> >> This is why I said "core stuff in ib_qp" if drivers are adding >> performance stuff to their own structs then that is the driver's >> responsibility to handle. > > Can I learn from this response that node aware allocation is not needed, > and this patch can go as is. It's pretty clearly a NAK from us. The code was purposely written this way, it was done years ago (from day 1 basically), changing it now is concerning for performance. Perhaps the code can be enhanced to move more stuff into the driver's own structs as Jason points out, but that should happen first. For now I still don't understand why the core can't optionally make the allocation per node. -Denny
On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote: > Perhaps the code can be enhanced to move more stuff into the driver's own > structs as Jason points out, but that should happen first. For now I still > don't understand why the core can't optionally make the allocation per node. Because I think it is wrong in the general case to assign all allocations to a single node? Jason
On 5/19/21 2:29 PM, Jason Gunthorpe wrote: > On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote: > >> Perhaps the code can be enhanced to move more stuff into the driver's own >> structs as Jason points out, but that should happen first. For now I still >> don't understand why the core can't optionally make the allocation per node. > > Because I think it is wrong in the general case to assign all > allocations to a single node? If by general case you mean for all drivers, sure, totally agree. We aren't talking about all drivers though, just the particular case of rdmavt. -Denny
On Wed, May 19, 2021 at 03:49:31PM -0400, Dennis Dalessandro wrote: > On 5/19/21 2:29 PM, Jason Gunthorpe wrote: > > On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote: > > > > > Perhaps the code can be enhanced to move more stuff into the driver's own > > > structs as Jason points out, but that should happen first. For now I still > > > don't understand why the core can't optionally make the allocation per node. > > > > Because I think it is wrong in the general case to assign all > > allocations to a single node? > > If by general case you mean for all drivers, sure, totally agree. We aren't > talking about all drivers though, just the particular case of rdmavt. I think it is wrong for rdmavt too and your benchmarks have focused on a specific case with process/thread affinities that can actually benefit from it. I don't want to encourage other drivers to do the same thing. The correct thing to do today in 2021 is to use the standard NUMA memory policy on already node-affine threads. The memory policy goes into the kernel and normal non-_node allocations will obey it. When combined with an appropriate node-affine HCA this will work as you are expecting right now. However you can't do anything like that while the kernel has the _node annotations, that overrides the NUMA memory policy and breaks the policy system! The *only* reason to override the node behavior in the kernel is if the kernel knows with high certainty that allocations are only going to be touched by certain CPUs, such as because it knows that the allocation is substantially for use in a CPU pinned irq/workqueue or accessed via DMA from a node affine DMA device. None of these seem true for the QP struct. Especially since for RDMA all of the above is highly situational. The IRQ/WQ processing anything in RDMA should be tied to the comp_vector, so without knowing that information you simply can't do anything correct at allocation time. 
The idea of allocating everything to the HW's node is simply not correct design. I will grant you it may have made sense ages ago before the NUMA stuff was more complete, but today it does not and you'd be better to remove it all and use memory policy properly than insist we keep it around forever. Jason
On 5/19/21 4:26 PM, Jason Gunthorpe wrote: > On Wed, May 19, 2021 at 03:49:31PM -0400, Dennis Dalessandro wrote: >> On 5/19/21 2:29 PM, Jason Gunthorpe wrote: >>> On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote: >>> >>>> Perhaps the code can be enhanced to move more stuff into the driver's own >>>> structs as Jason points out, but that should happen first. For now I still >>>> don't understand why the core can't optionally make the allocation per node. >>> >>> Because I think it is wrong in the general case to assign all >>> allocations to a single node? >> >> If by general case you mean for all drivers, sure, totally agree. We aren't >> talking about all drivers though, just the particular case of rdmavt. > > I think it is wrong for rdmavt too and your benchmarks have focused on > a specific case with process/thread affinities that can actually > benefit from it. > > I don't want to encourage other drivers to do the same thing. I would imagine they would get the same push back we are getting here. I don't think this would encourage anyone honestly. > The correct thing to do today in 2021 is to use the standard NUMA > memory policy on already node-affine threads. The memory policy goes > into the kernel and normal non-_node allocations will obey it. When > combined with an appropriate node-affine HCA this will work as you are > expecting right now. So we shouldn't see any issue in the normal case is what you are saying. I'd like to believe that, proving it is not easy though. > However you can't do anything like that while the kernel has the _node > annotations, that overrides the NUMA memory policy and breaks the > policy system! Does our driver doing this break the entire system? I'm not sure how that's possible. Is there an effort to get rid of these per node allocations so ultimately we won't have a choice at some point? 
> The *only* reason to override the node behavior in the kernel is if > the kernel knows with high certainty that allocations are only going > to be touched by certain CPUs, such as because it knows that the > allocation is substantially for use in a CPU pinned irq/workqeueue or > accessed via DMA from a node affine DMA device. > > None of these seem true for the QP struct. Will let Mike M respond about that. He's got a much better handle on the implications. > Especially since for RDMA all of the above is highly situational. The > IRQ/WQ processing anything in RDMA should be tied to the comp_vector, > so without knowing that information you simply can't do anything > correct at allocation time. I don't think that's true for our case. The comp_vector may in some cases be the right thing to dictate where memory should be, in our case I don't think that's true all the time. > The idea of allocating every to the HW's node is simply not correct > design. I will grant you it may have made sense ages ago before the > NUMA stuff was more completed, but today it does not and you'd be > better to remove it all and use memory policy properly than insist we > keep it around forever. Not insisting anything. If the trend is to remove these sort of allocations and other drivers are no longer doing this "not correct design" we are certainly open to change. We just want to understand the impact first rather than being strong armed into accepting a performance regression just so Leon can refactor some code. -Denny
On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote: > On 5/19/21 4:26 PM, Jason Gunthorpe wrote: > > On Wed, May 19, 2021 at 03:49:31PM -0400, Dennis Dalessandro wrote: > > > On 5/19/21 2:29 PM, Jason Gunthorpe wrote: > > > > On Wed, May 19, 2021 at 07:56:32AM -0400, Dennis Dalessandro wrote: <...> > > Especially since for RDMA all of the above is highly situational. The > > IRQ/WQ processing anything in RDMA should be tied to the comp_vector, > > so without knowing that information you simply can't do anything > > correct at allocation time. > > I don't think that's true for our case. The comp_vector may in some cases be > the right thing to dictate where memory should be, in our case I don't think > that's true all the time. In verbs world, the comp_vector is always the right thing to dictate node policy. We can argue if it works correctly or not. https://www.rdmamojo.com/2012/11/03/ibv_create_cq/ comp_vector: MSI-X completion vector that will be used for signaling Completion events. If the IRQ affinity masks of these interrupts have been configured to spread each MSI-X interrupt to be handled by a different core, this parameter can be used to spread the completion workload over multiple cores. > > > The idea of allocating every to the HW's node is simply not correct > > design. I will grant you it may have made sense ages ago before the > > NUMA stuff was more completed, but today it does not and you'd be > > better to remove it all and use memory policy properly than insist we > > keep it around forever. > > Not insisting anything. If the trend is to remove these sort of allocations > and other drivers are no longer doing this "not correct design" we are > certainly open to change. We just want to understand the impact first rather > than being strong armed into accepting a performance regression just so Leon > can refactor some code. It is hard to talk without data. Thanks > > -Denny
On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote: > > I don't want to encourage other drivers to do the same thing. > > I would imagine they would get the same push back we are getting here. I > don't think this would encourage anyone honestly. Then we are back to making infrastructure that is only useful for one, arguably wrong, driver. > > The correct thing to do today in 2021 is to use the standard NUMA > > memory policy on already node-affine threads. The memory policy goes > > into the kernel and normal non-_node allocations will obey it. When > > combined with an appropriate node-affine HCA this will work as you are > > expecting right now. > > So we shouldn't see any issue in the normal case is what you are > saying. I'd like to believe that, proving it is not easy though. Well, I said you have to setup the userspace properly, I'm not sure it just works out of the box. > > However you can't do anything like that while the kernel has the _node > > annotations, that overrides the NUMA memory policy and breaks the > > policy system! > > Does our driver doing this break the entire system? I'm not sure how that's > possible. It breaks your driver part of it, and if we lift it to the core code then it breaks all drivers, so it is a hard no-go. > Is there an effort to get rid of these per node allocations so > ultimately we won't have a choice at some point? Unlikely, subtle stuff like this will just be left broken in drivers nobody cares about.. Jason
On 5/25/21 9:13 AM, Jason Gunthorpe wrote: > On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote: > >>> I don't want to encourage other drivers to do the same thing. >> >> I would imagine they would get the same push back we are getting here. I >> don't think this would encourage anyone honestly. > > Then we are back to making infrastructure that is only useful for one, > arguably wrong, driver. That's just it, you argue that it's wrong. We don't agree that it's wrong. In fact you said previously: " The *only* reason to override the node behavior in the kernel is if the kernel knows with high certainty that allocations are only going to be touched by certain CPUs, such as because it knows that the allocation is substantially for use in a CPU pinned irq/workqeueue or accessed via DMA from a node affine DMA device. " Well, that's pretty much why we are doing this. >>> The correct thing to do today in 2021 is to use the standard NUMA >>> memory policy on already node-affine threads. The memory policy goes >>> into the kernel and normal non-_node allocations will obey it. When >>> combined with an appropriate node-affine HCA this will work as you are >>> expecting right now. >> >> So we shouldn't see any issue in the normal case is what you are >> saying. I'd like to believe that, proving it is not easy though. > > Well, I said you have to setup the userspace properly, I'm not sure it > just works out of the box. I'm going to go out on a limb and assume it will not just work out of the box. >>> However you can't do anything like that while the kernel has the _node >>> annotations, that overrides the NUMA memory policy and breaks the >>> policy system! >> >> Does our driver doing this break the entire system? I'm not sure how that's >> possible. > > It breaks your driver part of it, and if we lift it to the core code > then it breaks all drivers, so it is a hard no-go. 
> >> Is there an effort to get rid of these per node allocations so >> ultimately we won't have a choice at some point? > > Unlikely, subtle stuff like this will just be left broken in drivers > nobody cares about.. If it's not that big of a deal then what's the problem? Again, you keep saying it's broken. I'm still not seeing a compelling reason that tells me it is in fact broken. This is the way we get best performance which for the RDMA subsystem should pretty much trump everything except security. All this being said, philosophical and theoretical arguments aren't going to get us anywhere productive. Things could certainly be different performance wise half a decade later after the code originally went in. We are already mid 5.13 cycle. So the earliest this could be queued up to go in is 5.14. Can this wait one more cycle? If we can't get it tested/proven to make a difference mid 5.14, we will drop the objection and Leon's patch can go ahead in for 5.15. Fair compromise? -Denny
On Tue, May 25, 2021 at 10:10:47AM -0400, Dennis Dalessandro wrote: > On 5/25/21 9:13 AM, Jason Gunthorpe wrote: > > On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote: > > > > > > I don't want to encourage other drivers to do the same thing. > > > > > > I would imagine they would get the same push back we are getting here. I > > > don't think this would encourage anyone honestly. > > > > Then we are back to making infrastructure that is only useful for one, > > arguably wrong, driver. > > That's just it, you argue that it's wrong. We don't agree that it's wrong. > In fact you said previously: You haven't presented a single shred of anything to substantiate this disagreement beyond "we have ancient benchmarks we can't reproduce" Not even a hand-wavy logical argument why it could matter. > " > The *only* reason to override the node behavior in the kernel is if > the kernel knows with high certainty that allocations are only going > to be touched by certain CPUs, such as because it knows that the > allocation is substantially for use in a CPU pinned irq/workqeueue or > accessed via DMA from a node affine DMA device. > " > > Well, that's pretty much why we are doing this. Huh? I don't see DMA from the qp struct and as I said any MSI affinity should be driven by the comp_vector, so no, I don't think that is what HFI is doing at all. > We are already mid 5.13 cycle. So the earliest this could be queued up to go > in is 5.14. Can this wait one more cycle? If we can't get it tested/proven > to make a difference mid 5.14, we will drop the objection and Leon's patch > can go ahead in for 5.15. Fair compromise? Fine, but the main question is if you can use normal memory policy settings, not this. Jason
On 5/25/21 10:20 AM, Jason Gunthorpe wrote: >> We are already mid 5.13 cycle. So the earliest this could be queued up to go >> in is 5.14. Can this wait one more cycle? If we can't get it tested/proven >> to make a difference mid 5.14, we will drop the objection and Leon's patch >> can go ahead in for 5.15. Fair compromise? > > Fine, but the main question is if you can use normal memory policy > settings, not this. Agreed. -Denny
On Tue, May 25, 2021 at 10:10:47AM -0400, Dennis Dalessandro wrote: > On 5/25/21 9:13 AM, Jason Gunthorpe wrote: > > On Thu, May 20, 2021 at 06:02:09PM -0400, Dennis Dalessandro wrote: <...> > We are already mid 5.13 cycle. So the earliest this could be queued up to go > in is 5.14. Can this wait one more cycle? If we can't get it tested/proven > to make a difference mid 5.14, we will drop the objection and Leon's patch > can go ahead in for 5.15. Fair compromise? I sent this patch as early as I could to make sure that it won't jeopardize the restrack QP flow fixes. Delaying one more cycle means that QP conversion will be delayed too which is needed to close the race between netlink query QP call and simultaneous ibv_destroy_qp() call. Thanks > > -Denny > > > >
> > Fine, but the main question is if you can use normal memory policy settings, not > this. > > Jason Our performance team has gotten some preliminary data on AMD platforms. I prepared a kernel that will allocate the QP using the "local" numa node (as currently done) and an allocation that intentionally allocates on the opposite socket based on a module parameter and our internal tests were executed with progressively larger queue pair counts. In the second case on 64 core/socket AMD platforms, we are seeing with the intentionally opposite allocation, latency dropped ~6-7% and BW dropped ~13% on high queue count perftest. SKX impact is minimal if any, but we need to look at legacy Intel chips that preceded SKX. We are still reviewing the data and expanding the test to older chips. Our theory is the hfi1 interrupt receive processing is fetching cachelines between the sockets causing the slowdown. The receive processing is critical for hfi1 (and qib before that). This is a heavily tuned code path. To answer some of the pending questions posed before, the mempolicy looks to be a process relative control and does not apply to our QP allocation where the struct rvt_qp is in the kernel. It certainly does not apply to kernel ULPs such as those created by say Lustre, ipoib, SRP, iSer, and NFS RDMA. We do support comp_vector stuff, but that distributes completion processing. Completions are triggered in our receive processing but to a much less extent based on ULP choices and packet type. From a strategy standpoint, the code assumes distribution of kernel receive interrupt processing is vectored either by irqbalance or by explicit user mode scripting to spread RC QP receive processing across CPUs on the local socket. Mike External recipient
On Mon, Jun 28, 2021 at 09:59:48PM +0000, Marciniszyn, Mike wrote: > To answer some of the pending questions posed before, the mempolicy > looks to be a process relative control and does not apply to our QP > allocation where the struct rvt_qp is in the kernel. I think mempolicy is per task, which is a thread, and it propagates into kernel allocations made under that task's current > It certainly does not apply to kernel ULPs such as those created by > say Lustre, ipoib, SRP, iSer, and NFS RDMA. These don't use uverbs, so a uverbs change is not relevant. > We do support comp_vector stuff, but that distributes completion > processing. Completions are triggered in our receive processing but > to a much less extent based on ULP choices and packet type. From a > strategy standpoint, the code assumes distribution of kernel receive > interrupt processing is vectored either by irqbalance or by explicit > user mode scripting to spread RC QP receive processing across CPUs > on the local socket. And there you go, it should be allocating the memory based on the NUMA affinity of the IRQ that it is going to assign to touch the memory. And the CPU threads that are triggering this should be affine to the same socket as well, otherwise you just get bouncing in another area. Overall I think you get the same configuration if you just let the normal policy stuff do its work, and it might be less fragile to boot. I certainly object to this idea that the driver assumes userspace will never move its IRQs off the local because it has wrongly hardwired a numa locality to the wrong object. Jason
On Mon, Jun 28, 2021 at 08:19:34PM -0300, Jason Gunthorpe wrote: > On Mon, Jun 28, 2021 at 09:59:48PM +0000, Marciniszyn, Mike wrote: <...> > I certainly object to this idea that the driver assumes userspace will > never move its IRQs off the local because it has wrongly hardwired a > numa locality to the wrong object. It makes me wonder how should we progress with my patch. Thanks > > Jason
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 9d13db68283c..4522071fc220 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1077,7 +1077,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, int err; struct rvt_swqe *swq = NULL; size_t sz; - size_t sg_list_sz; + size_t sg_list_sz = 0; struct ib_qp *ret = ERR_PTR(-ENOMEM); struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); void *priv = NULL; @@ -1125,8 +1125,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, if (!swq) return ERR_PTR(-ENOMEM); - sz = sizeof(*qp); - sg_list_sz = 0; if (init_attr->srq) { struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq); @@ -1136,10 +1134,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, } else if (init_attr->cap.max_recv_sge > 1) sg_list_sz = sizeof(*qp->r_sg_list) * (init_attr->cap.max_recv_sge - 1); - qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL, - rdi->dparms.node); + qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) goto bail_swq; + qp->r_sg_list = + kzalloc_node(sg_list_sz, GFP_KERNEL, rdi->dparms.node); + if (!qp->r_sg_list) + goto bail_qp; qp->allowed_ops = get_allowed_ops(init_attr->qp_type); RCU_INIT_POINTER(qp->next, NULL); @@ -1327,6 +1328,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, bail_qp: kfree(qp->s_ack_queue); + kfree(qp->r_sg_list); kfree(qp); bail_swq: @@ -1761,6 +1763,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) kvfree(qp->r_rq.kwq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); + kfree(qp->r_sg_list); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); free_ud_wq_attr(qp); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 8275954f5ce6..2e58d5e6ac0e 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -444,7 +444,7 @@ struct rvt_qp { /* * This sge list MUST be last. Do not add anything below here. 
*/ - struct rvt_sge r_sg_list[] /* verified SGEs */ + struct rvt_sge *r_sg_list /* verified SGEs */ ____cacheline_aligned_in_smp; };