
[for-next] RDMA/rxe: Fix incorrect fencing

Message ID 20220522223345.9889-1-rpearsonhpe@gmail.com (mailing list archive)
State Superseded

Commit Message

Bob Pearson May 22, 2022, 10:33 p.m. UTC
Currently the rxe driver checks if any previous operation
is not complete to determine if a fence wait is required.
This is not correct. For a regular fence only previous
read or atomic operations must be complete while for a local
invalidate fence all previous operations must be complete.
This patch corrects this behavior.

Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)


base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a

Comments

Haris Iqbal May 22, 2022, 11:59 p.m. UTC | #1
On Mon, May 23, 2022 at 12:36 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>
> Currently the rxe driver checks if any previous operation
> is not complete to determine if a fence wait is required.
> This is not correct. For a regular fence only previous
> read or atomic operations must be complete while for a local
> invalidate fence all previous operations must be complete.
> This patch corrects this behavior.
>
> Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
>  1 file changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> index ae5fbc79dd5c..f36263855a45 100644
> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> @@ -163,16 +163,41 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
>                      (wqe->state != wqe_state_processing)))
>                 return NULL;
>
> -       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
> -                                                    (index != cons))) {
> -               qp->req.wait_fence = 1;
> -               return NULL;
> -       }
> -
>         wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
>         return wqe;
>  }
>
> +/**
> + * rxe_wqe_is_fenced - check if next wqe is fenced
> + * @qp: the queue pair
> + * @wqe: the next wqe
> + *
> + * Returns: 1 if wqe is fenced (needs to wait)
> + *         0 if wqe is good to go
> + */
> +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
> +{
> +       unsigned int cons;
> +
> +       if (!(wqe->wr.send_flags & IB_SEND_FENCE))
> +               return 0;
> +
> +       cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
> +
> +       /* Local invalidate fence (LIF) see IBA 10.6.5.1
> +        * Requires ALL previous operations on the send queue
> +        * are complete.
> +        */
> +       if (wqe->wr.opcode == IB_WR_LOCAL_INV)
> +               return qp->req.wqe_index != cons;


Do I understand correctly that according to this code a wr with opcode
IB_WR_LOCAL_INV needs to have the IB_SEND_FENCE flag also set for this
to work?

If that is the desired behaviour, can you point out where in the spec
this is mentioned?

Thanks.


> +
> +       /* Fence see IBA 10.8.3.3
> +        * Requires that all previous read and atomic operations
> +        * are complete.
> +        */
> +       return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
> +}
> +
>  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
>  {
>         switch (opcode) {
> @@ -636,6 +661,11 @@ int rxe_requester(void *arg)
>         if (unlikely(!wqe))
>                 goto exit;
>
> +       if (rxe_wqe_is_fenced(qp, wqe)) {
> +               qp->req.wait_fence = 1;
> +               goto exit;
> +       }
> +
>         if (wqe->mask & WR_LOCAL_OP_MASK) {
>                 ret = rxe_do_local_ops(qp, wqe);
>                 if (unlikely(ret))
>
> base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
> --
> 2.34.1
>
Bob Pearson May 23, 2022, 3:51 a.m. UTC | #2
On 5/22/22 18:59, Haris Iqbal wrote:
> On Mon, May 23, 2022 at 12:36 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>>
>> Currently the rxe driver checks if any previous operation
>> is not complete to determine if a fence wait is required.
>> This is not correct. For a regular fence only previous
>> read or atomic operations must be complete while for a local
>> invalidate fence all previous operations must be complete.
>> This patch corrects this behavior.
>>
>> Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>> ---
>>  drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
>>  1 file changed, 36 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
>> index ae5fbc79dd5c..f36263855a45 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_req.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
>> @@ -163,16 +163,41 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
>>                      (wqe->state != wqe_state_processing)))
>>                 return NULL;
>>
>> -       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
>> -                                                    (index != cons))) {
>> -               qp->req.wait_fence = 1;
>> -               return NULL;
>> -       }
>> -
>>         wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
>>         return wqe;
>>  }
>>
>> +/**
>> + * rxe_wqe_is_fenced - check if next wqe is fenced
>> + * @qp: the queue pair
>> + * @wqe: the next wqe
>> + *
>> + * Returns: 1 if wqe is fenced (needs to wait)
>> + *         0 if wqe is good to go
>> + */
>> +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
>> +{
>> +       unsigned int cons;
>> +
>> +       if (!(wqe->wr.send_flags & IB_SEND_FENCE))
>> +               return 0;
>> +
>> +       cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
>> +
>> +       /* Local invalidate fence (LIF) see IBA 10.6.5.1
>> +        * Requires ALL previous operations on the send queue
>> +        * are complete.
>> +        */
>> +       if (wqe->wr.opcode == IB_WR_LOCAL_INV)
>> +               return qp->req.wqe_index != cons;
> 
> 
> Do I understand correctly that according to this code a wr with opcode
> IB_WR_LOCAL_INV needs to have the IB_SEND_FENCE flag also set for this
> to work?
>
> If that is the desired behaviour, can you point out where in the spec
> this is mentioned?

According to the IBA, "Local invalidate fence" (LIF) and the regular fence
behave differently. (See the referenced sections in the IBA.) For a local
invalidate operation the fence bit fences all previous operations. That was
the old behavior, which made no distinction between local invalidate and
other operations. The change here affects the other operations with a
regular fence, which should only require previous read and atomic
operations to be complete.

Not sure what you mean by 'also'. Per the IBA, if LIF is set then you have
strict invalidate ordering; if not, you have relaxed ordering. The kernel
verbs API only has one fence bit and does not have a separate LIF bit, so I
am interpreting them to share the one bit.
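
To make that concrete, here is a minimal (untested) sketch of how a
kernel ULP could ask for LIF semantics with the shared bit; the helper
name is made up for illustration:

#include <rdma/ib_verbs.h>

/* post a local invalidate that fences ALL previous send queue operations */
static int post_fenced_local_inv(struct ib_qp *qp, u32 rkey)
{
	const struct ib_send_wr *bad_wr;
	struct ib_send_wr wr = {
		.opcode			= IB_WR_LOCAL_INV,
		.send_flags		= IB_SEND_FENCE | IB_SEND_SIGNALED,
		.ex.invalidate_rkey	= rkey,
	};

	return ib_post_send(qp, &wr, &bad_wr);
}

Posting the same wr without IB_SEND_FENCE gives the relaxed invalidate
ordering instead.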

Bob
> 
> Thanks.
> 
> 
>> +
>> +       /* Fence see IBA 10.8.3.3
>> +        * Requires that all previous read and atomic operations
>> +        * are complete.
>> +        */
>> +       return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
>> +}
>> +
>>  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
>>  {
>>         switch (opcode) {
>> @@ -636,6 +661,11 @@ int rxe_requester(void *arg)
>>         if (unlikely(!wqe))
>>                 goto exit;
>>
>> +       if (rxe_wqe_is_fenced(qp, wqe)) {
>> +               qp->req.wait_fence = 1;
>> +               goto exit;
>> +       }
>> +
>>         if (wqe->mask & WR_LOCAL_OP_MASK) {
>>                 ret = rxe_do_local_ops(qp, wqe);
>>                 if (unlikely(ret))
>>
>> base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
>> --
>> 2.34.1
>>
Haris Iqbal May 23, 2022, 8:05 a.m. UTC | #3
On Mon, May 23, 2022 at 5:51 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>
> On 5/22/22 18:59, Haris Iqbal wrote:
> > On Mon, May 23, 2022 at 12:36 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
> >>
> >> Currently the rxe driver checks if any previous operation
> >> is not complete to determine if a fence wait is required.
> >> This is not correct. For a regular fence only previous
> >> read or atomic operations must be complete while for a local
> >> invalidate fence all previous operations must be complete.
> >> This patch corrects this behavior.
> >>
> >> Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
> >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> >> ---
> >>  drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
> >>  1 file changed, 36 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> >> index ae5fbc79dd5c..f36263855a45 100644
> >> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> >> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> >> @@ -163,16 +163,41 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
> >>                      (wqe->state != wqe_state_processing)))
> >>                 return NULL;
> >>
> >> -       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
> >> -                                                    (index != cons))) {
> >> -               qp->req.wait_fence = 1;
> >> -               return NULL;
> >> -       }
> >> -
> >>         wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
> >>         return wqe;
> >>  }
> >>
> >> +/**
> >> + * rxe_wqe_is_fenced - check if next wqe is fenced
> >> + * @qp: the queue pair
> >> + * @wqe: the next wqe
> >> + *
> >> + * Returns: 1 if wqe is fenced (needs to wait)
> >> + *         0 if wqe is good to go
> >> + */
> >> +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
> >> +{
> >> +       unsigned int cons;
> >> +
> >> +       if (!(wqe->wr.send_flags & IB_SEND_FENCE))
> >> +               return 0;
> >> +
> >> +       cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
> >> +
> >> +       /* Local invalidate fence (LIF) see IBA 10.6.5.1
> >> +        * Requires ALL previous operations on the send queue
> >> +        * are complete.
> >> +        */
> >> +       if (wqe->wr.opcode == IB_WR_LOCAL_INV)
> >> +               return qp->req.wqe_index != cons;
> >
> >
> > Do I understand correctly that according to this code a wr with opcode
> > IB_WR_LOCAL_INV needs to have the IB_SEND_FENCE flag also set for this
> > to work?
> >
> > If that is the desired behaviour, can you point out where in the spec
> > this is mentioned?
>
> According to the IBA, "Local invalidate fence" (LIF) and the regular fence
> behave differently. (See the referenced sections in the IBA.) For a local
> invalidate operation the fence bit fences all previous operations. That was
> the old behavior, which made no distinction between local invalidate and
> other operations. The change here affects the other operations with a
> regular fence, which should only require previous read and atomic
> operations to be complete.
>
> Not sure what you mean by 'also'. Per the IBA, if LIF is set then you have
> strict invalidate ordering; if not, you have relaxed ordering. The kernel
> verbs API only has one fence bit and does not have a separate LIF bit, so I
> am interpreting them to share the one bit.

I see. Now I understand. Thanks for the explanation.

I do have a follow-up question. For an IB_WR_LOCAL_INV wr, the absence
of the fence bit means relaxed ordering. This would mean that the
completion for that wr must take place "before any subsequent WQE has
begun execution". From what I understand, multiple rxe_requester
instances can run in parallel and pick up wqes and execute them. How is
the relaxed ordering criterion fulfilled?

>
> Bob
> >
> > Thanks.
> >
> >
> >> +
> >> +       /* Fence see IBA 10.8.3.3
> >> +        * Requires that all previous read and atomic operations
> >> +        * are complete.
> >> +        */
> >> +       return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
> >> +}
> >> +
> >>  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
> >>  {
> >>         switch (opcode) {
> >> @@ -636,6 +661,11 @@ int rxe_requester(void *arg)
> >>         if (unlikely(!wqe))
> >>                 goto exit;
> >>
> >> +       if (rxe_wqe_is_fenced(qp, wqe)) {
> >> +               qp->req.wait_fence = 1;
> >> +               goto exit;
> >> +       }
> >> +
> >>         if (wqe->mask & WR_LOCAL_OP_MASK) {
> >>                 ret = rxe_do_local_ops(qp, wqe);
> >>                 if (unlikely(ret))
> >>
> >> base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
> >> --
> >> 2.34.1
> >>
>
Bob Pearson May 23, 2022, 6:22 p.m. UTC | #4
On 5/23/22 03:05, Haris Iqbal wrote:
> On Mon, May 23, 2022 at 5:51 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>>
>> On 5/22/22 18:59, Haris Iqbal wrote:
>>> On Mon, May 23, 2022 at 12:36 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>>>>
>>>> Currently the rxe driver checks if any previous operation
>>>> is not complete to determine if a fence wait is required.
>>>> This is not correct. For a regular fence only previous
>>>> read or atomic operations must be complete while for a local
>>>> invalidate fence all previous operations must be complete.
>>>> This patch corrects this behavior.
>>>>
>>>> Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
>>>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>>>> ---
>>>>  drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
>>>>  1 file changed, 36 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
>>>> index ae5fbc79dd5c..f36263855a45 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_req.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
>>>> @@ -163,16 +163,41 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
>>>>                      (wqe->state != wqe_state_processing)))
>>>>                 return NULL;
>>>>
>>>> -       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
>>>> -                                                    (index != cons))) {
>>>> -               qp->req.wait_fence = 1;
>>>> -               return NULL;
>>>> -       }
>>>> -
>>>>         wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
>>>>         return wqe;
>>>>  }
>>>>
>>>> +/**
>>>> + * rxe_wqe_is_fenced - check if next wqe is fenced
>>>> + * @qp: the queue pair
>>>> + * @wqe: the next wqe
>>>> + *
>>>> + * Returns: 1 if wqe is fenced (needs to wait)
>>>> + *         0 if wqe is good to go
>>>> + */
>>>> +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
>>>> +{
>>>> +       unsigned int cons;
>>>> +
>>>> +       if (!(wqe->wr.send_flags & IB_SEND_FENCE))
>>>> +               return 0;
>>>> +
>>>> +       cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
>>>> +
>>>> +       /* Local invalidate fence (LIF) see IBA 10.6.5.1
>>>> +        * Requires ALL previous operations on the send queue
>>>> +        * are complete.
>>>> +        */
>>>> +       if (wqe->wr.opcode == IB_WR_LOCAL_INV)
>>>> +               return qp->req.wqe_index != cons;
>>>
>>>
>>> Do I understand correctly that according to this code a wr with opcode
>>> IB_WR_LOCAL_INV needs to have the IB_SEND_FENCE flag also set for this
>>> to work?
>>>
>>> If that is the desired behaviour, can you point out where in the spec
>>> this is mentioned?
>>
>> According to the IBA, "Local invalidate fence" (LIF) and the regular fence
>> behave differently. (See the referenced sections in the IBA.) For a local
>> invalidate operation the fence bit fences all previous operations. That was
>> the old behavior, which made no distinction between local invalidate and
>> other operations. The change here affects the other operations with a
>> regular fence, which should only require previous read and atomic
>> operations to be complete.
>>
>> Not sure what you mean by 'also'. Per the IBA, if LIF is set then you have
>> strict invalidate ordering; if not, you have relaxed ordering. The kernel
>> verbs API only has one fence bit and does not have a separate LIF bit, so I
>> am interpreting them to share the one bit.
> 
> I see. Now I understand. Thanks for the explanation.
> 
> I do have a follow-up question. For an IB_WR_LOCAL_INV wr, the absence
> of the fence bit means relaxed ordering. This would mean that the
> completion for that wr must take place "before any subsequent WQE has
> begun execution". From what I understand, multiple rxe_requester
> instances can run in parallel and pick up wqes and execute them. How is
> the relaxed ordering criterion fulfilled?

The requester is a tasklet. There is one tasklet instance per QP. A
given tasklet can only run on one cpu at a time, so not in parallel.
The tasklets, on whichever cpus they run, each execute a single send
queue in order.
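
Roughly, the per-QP serialization looks like this (an illustrative
sketch with made-up names, not the actual rxe code):

#include <linux/interrupt.h>

struct foo_qp {
	struct tasklet_struct req_task;		/* one instance per QP */
};

static void foo_requester(struct tasklet_struct *t)
{
	struct foo_qp *qp = from_tasklet(qp, t, req_task);

	/* drain qp's send queue in order; the tasklet core guarantees
	 * this callback never runs concurrently with itself
	 */
	pr_debug("requester running for qp %p\n", qp);
}

static void foo_qp_init(struct foo_qp *qp)
{
	tasklet_setup(&qp->req_task, foo_requester);
}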
> 
>>
>> Bob
>>>
>>> Thanks.
>>>
>>>
>>>> +
>>>> +       /* Fence see IBA 10.8.3.3
>>>> +        * Requires that all previous read and atomic operations
>>>> +        * are complete.
>>>> +        */
>>>> +       return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
>>>> +}
>>>> +
>>>>  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
>>>>  {
>>>>         switch (opcode) {
>>>> @@ -636,6 +661,11 @@ int rxe_requester(void *arg)
>>>>         if (unlikely(!wqe))
>>>>                 goto exit;
>>>>
>>>> +       if (rxe_wqe_is_fenced(qp, wqe)) {
>>>> +               qp->req.wait_fence = 1;
>>>> +               goto exit;
>>>> +       }
>>>> +
>>>>         if (wqe->mask & WR_LOCAL_OP_MASK) {
>>>>                 ret = rxe_do_local_ops(qp, wqe);
>>>>                 if (unlikely(ret))
>>>>
>>>> base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
>>>> --
>>>> 2.34.1
>>>>
>>
Haris Iqbal May 24, 2022, 10:28 a.m. UTC | #5
On Mon, May 23, 2022 at 8:22 PM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>
> On 5/23/22 03:05, Haris Iqbal wrote:
> > On Mon, May 23, 2022 at 5:51 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
> >>
> >> On 5/22/22 18:59, Haris Iqbal wrote:
> >>> On Mon, May 23, 2022 at 12:36 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
> >>>>
> >>>> Currently the rxe driver checks if any previous operation
> >>>> is not complete to determine if a fence wait is required.
> >>>> This is not correct. For a regular fence only previous
> >>>> read or atomic operations must be complete while for a local
> >>>> invalidate fence all previous operations must be complete.
> >>>> This patch corrects this behavior.
> >>>>
> >>>> Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
> >>>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> >>>> ---
> >>>>  drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
> >>>>  1 file changed, 36 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> >>>> index ae5fbc79dd5c..f36263855a45 100644
> >>>> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> >>>> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> >>>> @@ -163,16 +163,41 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
> >>>>                      (wqe->state != wqe_state_processing)))
> >>>>                 return NULL;
> >>>>
> >>>> -       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
> >>>> -                                                    (index != cons))) {
> >>>> -               qp->req.wait_fence = 1;
> >>>> -               return NULL;
> >>>> -       }
> >>>> -
> >>>>         wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
> >>>>         return wqe;
> >>>>  }
> >>>>
> >>>> +/**
> >>>> + * rxe_wqe_is_fenced - check if next wqe is fenced
> >>>> + * @qp: the queue pair
> >>>> + * @wqe: the next wqe
> >>>> + *
> >>>> + * Returns: 1 if wqe is fenced (needs to wait)
> >>>> + *         0 if wqe is good to go
> >>>> + */
> >>>> +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
> >>>> +{
> >>>> +       unsigned int cons;
> >>>> +
> >>>> +       if (!(wqe->wr.send_flags & IB_SEND_FENCE))
> >>>> +               return 0;
> >>>> +
> >>>> +       cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
> >>>> +
> >>>> +       /* Local invalidate fence (LIF) see IBA 10.6.5.1
> >>>> +        * Requires ALL previous operations on the send queue
> >>>> +        * are complete.
> >>>> +        */
> >>>> +       if (wqe->wr.opcode == IB_WR_LOCAL_INV)
> >>>> +               return qp->req.wqe_index != cons;
> >>>
> >>>
> >>> Do I understand correctly that according to this code a wr with opcode
> >>> IB_WR_LOCAL_INV needs to have the IB_SEND_FENCE flag also set for this
> >>> to work?
> >>>
> >>> If that is the desired behaviour, can you point out where in the spec
> >>> this is mentioned?
> >>
> >> According to the IBA, "Local invalidate fence" (LIF) and the regular fence
> >> behave differently. (See the referenced sections in the IBA.) For a local
> >> invalidate operation the fence bit fences all previous operations. That was
> >> the old behavior, which made no distinction between local invalidate and
> >> other operations. The change here affects the other operations with a
> >> regular fence, which should only require previous read and atomic
> >> operations to be complete.
> >>
> >> Not sure what you mean by 'also'. Per the IBA, if LIF is set then you have
> >> strict invalidate ordering; if not, you have relaxed ordering. The kernel
> >> verbs API only has one fence bit and does not have a separate LIF bit, so I
> >> am interpreting them to share the one bit.
> >
> > I see. Now I understand. Thanks for the explanation.
> >
> > I do have a follow-up question. For an IB_WR_LOCAL_INV wr, the absence
> > of the fence bit means relaxed ordering. This would mean that the
> > completion for that wr must take place "before any subsequent WQE has
> > begun execution". From what I understand, multiple rxe_requester
> > instances can run in parallel and pick up wqes and execute them. How is
> > the relaxed ordering criterion fulfilled?
>
> The requester is a tasklet. There is one tasklet instance per QP. A
> given tasklet can only run on one cpu at a time, so not in parallel.
> The tasklets, on whichever cpus they run, each execute a single send
> queue in order.

I see. So, according to the function rxe_run_task, it will run the
tasklet only if "sched" is set to 1. Otherwise, it is going to run
the function rxe_do_task directly, which calls task->func directly.

I can see places that call rxe_run_task with sched = 0, but they are
few. I did not look into all the execution paths through which these
can be hit, but was wondering if there is a way to make sure that such
calls to rxe_run_task with sched = 0 do not happen in parallel?



> >
> >>
> >> Bob
> >>>
> >>> Thanks.
> >>>
> >>>
> >>>> +
> >>>> +       /* Fence see IBA 10.8.3.3
> >>>> +        * Requires that all previous read and atomic operations
> >>>> +        * are complete.
> >>>> +        */
> >>>> +       return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
> >>>> +}
> >>>> +
> >>>>  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
> >>>>  {
> >>>>         switch (opcode) {
> >>>> @@ -636,6 +661,11 @@ int rxe_requester(void *arg)
> >>>>         if (unlikely(!wqe))
> >>>>                 goto exit;
> >>>>
> >>>> +       if (rxe_wqe_is_fenced(qp, wqe)) {
> >>>> +               qp->req.wait_fence = 1;
> >>>> +               goto exit;
> >>>> +       }
> >>>> +
> >>>>         if (wqe->mask & WR_LOCAL_OP_MASK) {
> >>>>                 ret = rxe_do_local_ops(qp, wqe);
> >>>>                 if (unlikely(ret))
> >>>>
> >>>> base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
> >>>> --
> >>>> 2.34.1
> >>>>
> >>
>
Bob Pearson May 24, 2022, 6:20 p.m. UTC | #6
On 5/24/22 05:28, Haris Iqbal wrote:
> On Mon, May 23, 2022 at 8:22 PM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>>
>> On 5/23/22 03:05, Haris Iqbal wrote:
>>> On Mon, May 23, 2022 at 5:51 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>>>>
>>>> On 5/22/22 18:59, Haris Iqbal wrote:
>>>>> On Mon, May 23, 2022 at 12:36 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>>>>>>
>>>>>> Currently the rxe driver checks if any previous operation
>>>>>> is not complete to determine if a fence wait is required.
>>>>>> This is not correct. For a regular fence only previous
>>>>>> read or atomic operations must be complete while for a local
>>>>>> invalidate fence all previous operations must be complete.
>>>>>> This patch corrects this behavior.
>>>>>>
>>>>>> Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
>>>>>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>>>>>> ---
>>>>>>  drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
>>>>>>  1 file changed, 36 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
>>>>>> index ae5fbc79dd5c..f36263855a45 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_req.c
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
>>>>>> @@ -163,16 +163,41 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
>>>>>>                      (wqe->state != wqe_state_processing)))
>>>>>>                 return NULL;
>>>>>>
>>>>>> -       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
>>>>>> -                                                    (index != cons))) {
>>>>>> -               qp->req.wait_fence = 1;
>>>>>> -               return NULL;
>>>>>> -       }
>>>>>> -
>>>>>>         wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
>>>>>>         return wqe;
>>>>>>  }
>>>>>>
>>>>>> +/**
>>>>>> + * rxe_wqe_is_fenced - check if next wqe is fenced
>>>>>> + * @qp: the queue pair
>>>>>> + * @wqe: the next wqe
>>>>>> + *
>>>>>> + * Returns: 1 if wqe is fenced (needs to wait)
>>>>>> + *         0 if wqe is good to go
>>>>>> + */
>>>>>> +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
>>>>>> +{
>>>>>> +       unsigned int cons;
>>>>>> +
>>>>>> +       if (!(wqe->wr.send_flags & IB_SEND_FENCE))
>>>>>> +               return 0;
>>>>>> +
>>>>>> +       cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
>>>>>> +
>>>>>> +       /* Local invalidate fence (LIF) see IBA 10.6.5.1
>>>>>> +        * Requires ALL previous operations on the send queue
>>>>>> +        * are complete.
>>>>>> +        */
>>>>>> +       if (wqe->wr.opcode == IB_WR_LOCAL_INV)
>>>>>> +               return qp->req.wqe_index != cons;
>>>>>
>>>>>
>>>>> Do I understand correctly that according to this code a wr with opcode
>>>>> IB_WR_LOCAL_INV needs to have the IB_SEND_FENCE flag also set for this
>>>>> to work?
>>>>>
>>>>> If that is the desired behaviour, can you point out where in the spec
>>>>> this is mentioned?
>>>>
>>>> According to the IBA, "Local invalidate fence" (LIF) and the regular fence
>>>> behave differently. (See the referenced sections in the IBA.) For a local
>>>> invalidate operation the fence bit fences all previous operations. That was
>>>> the old behavior, which made no distinction between local invalidate and
>>>> other operations. The change here affects the other operations with a
>>>> regular fence, which should only require previous read and atomic
>>>> operations to be complete.
>>>>
>>>> Not sure what you mean by 'also'. Per the IBA, if LIF is set then you have
>>>> strict invalidate ordering; if not, you have relaxed ordering. The kernel
>>>> verbs API only has one fence bit and does not have a separate LIF bit, so I
>>>> am interpreting them to share the one bit.
>>>
>>> I see. Now I understand. Thanks for the explanation.
>>>
>>> I do have a follow-up question. For an IB_WR_LOCAL_INV wr, the absence
>>> of the fence bit means relaxed ordering. This would mean that the
>>> completion for that wr must take place "before any subsequent WQE has
>>> begun execution". From what I understand, multiple rxe_requester
>>> instances can run in parallel and pick up wqes and execute them. How is
>>> the relaxed ordering criterion fulfilled?
>>
>> The requester is a tasklet. There is one tasklet instance per QP. A
>> given tasklet can only run on one cpu at a time, so not in parallel.
>> The tasklets, on whichever cpus they run, each execute a single send
>> queue in order.
> 
> I see. So, according to the function rxe_run_task, it will run the
> tasklet only if "sched" is set to 1. Otherwise, it is going to run
> the function rxe_do_task directly, which calls task->func directly.
>
> I can see places that call rxe_run_task with sched = 0, but they are
> few. I did not look into all the execution paths through which these
> can be hit, but was wondering if there is a way to make sure that such
> calls to rxe_run_task with sched = 0 do not happen in parallel?

It's a little more complicated than that. rxe_run_task(task, sched)
runs the task as a tasklet if sched is nonzero. If it is zero, it
runs on the current thread, but it carefully locks and only runs the
subroutine if the tasklet is not already running; otherwise it marks the
task as needing to be continued and leaves it running in the tasklet.
The net effect is that either form of the call will cause the tasklet
code to run. When you schedule a tasklet it normally runs on the same
cpu as the calling code, so there isn't a lot of difference between
sched and non-sched. The sched version leaves two threads able to run
on the cpu; the non-sched version just has the original thread.

__rxe_do_task() is also used and just calls the subroutine directly.
It can only be used if you know that the tasklet is disabled, since the
tasklet code is not re-entrant and should never be running twice at the
same time. It is used in rxe_qp.c during setup, reset and cleanup of
queue pairs.
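
A stripped-down sketch of that pattern (names invented here; the real
state machine lives in rxe_task.c and handles a few more cases):

#include <linux/interrupt.h>
#include <linux/spinlock.h>

enum { TASK_IDLE, TASK_BUSY, TASK_ARMED };

struct task {
	spinlock_t		lock;
	int			state;
	struct tasklet_struct	tasklet;	/* its callback runs do_task() too */
	int			(*func)(void *arg);
	void			*arg;
};

static void do_task(struct task *task)
{
	bool again;

	spin_lock_bh(&task->lock);
	if (task->state != TASK_IDLE) {
		/* already running somewhere: ask that instance to loop */
		task->state = TASK_ARMED;
		spin_unlock_bh(&task->lock);
		return;
	}
	task->state = TASK_BUSY;
	spin_unlock_bh(&task->lock);

	do {
		task->func(task->arg);	/* the non re-entrant subroutine */

		spin_lock_bh(&task->lock);
		again = (task->state == TASK_ARMED);
		task->state = again ? TASK_BUSY : TASK_IDLE;
		spin_unlock_bh(&task->lock);
	} while (again);
}

static void run_task(struct task *task, int sched)
{
	if (sched)
		tasklet_schedule(&task->tasklet);	/* defer to tasklet */
	else
		do_task(task);				/* run inline, guarded */
}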

Bob
> 
> 
> 
>>>
>>>>
>>>> Bob
>>>>>
>>>>> Thanks.
>>>>>
>>>>>
>>>>>> +
>>>>>> +       /* Fence see IBA 10.8.3.3
>>>>>> +        * Requires that all previous read and atomic operations
>>>>>> +        * are complete.
>>>>>> +        */
>>>>>> +       return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
>>>>>> +}
>>>>>> +
>>>>>>  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
>>>>>>  {
>>>>>>         switch (opcode) {
>>>>>> @@ -636,6 +661,11 @@ int rxe_requester(void *arg)
>>>>>>         if (unlikely(!wqe))
>>>>>>                 goto exit;
>>>>>>
>>>>>> +       if (rxe_wqe_is_fenced(qp, wqe)) {
>>>>>> +               qp->req.wait_fence = 1;
>>>>>> +               goto exit;
>>>>>> +       }
>>>>>> +
>>>>>>         if (wqe->mask & WR_LOCAL_OP_MASK) {
>>>>>>                 ret = rxe_do_local_ops(qp, wqe);
>>>>>>                 if (unlikely(ret))
>>>>>>
>>>>>> base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
>>>>>> --
>>>>>> 2.34.1
>>>>>>
>>>>
>>
Haris Iqbal May 27, 2022, 10:18 a.m. UTC | #7
On Tue, May 24, 2022 at 8:20 PM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>
> On 5/24/22 05:28, Haris Iqbal wrote:
> > On Mon, May 23, 2022 at 8:22 PM Bob Pearson <rpearsonhpe@gmail.com> wrote:
> >>
> >> On 5/23/22 03:05, Haris Iqbal wrote:
> >>> On Mon, May 23, 2022 at 5:51 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
> >>>>
> >>>> On 5/22/22 18:59, Haris Iqbal wrote:
> >>>>> On Mon, May 23, 2022 at 12:36 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
> >>>>>>
> >>>>>> Currently the rxe driver checks if any previous operation
> >>>>>> is not complete to determine if a fence wait is required.
> >>>>>> This is not correct. For a regular fence only previous
> >>>>>> read or atomic operations must be complete while for a local
> >>>>>> invalidate fence all previous operations must be complete.
> >>>>>> This patch corrects this behavior.
> >>>>>>
> >>>>>> Fixes: 8700e3e7c4857 ("Soft RoCE (RXE) - The software RoCE driver")
> >>>>>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> >>>>>> ---
> >>>>>>  drivers/infiniband/sw/rxe/rxe_req.c | 42 ++++++++++++++++++++++++-----
> >>>>>>  1 file changed, 36 insertions(+), 6 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> >>>>>> index ae5fbc79dd5c..f36263855a45 100644
> >>>>>> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> >>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> >>>>>> @@ -163,16 +163,41 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
> >>>>>>                      (wqe->state != wqe_state_processing)))
> >>>>>>                 return NULL;
> >>>>>>
> >>>>>> -       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
> >>>>>> -                                                    (index != cons))) {
> >>>>>> -               qp->req.wait_fence = 1;
> >>>>>> -               return NULL;
> >>>>>> -       }
> >>>>>> -
> >>>>>>         wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
> >>>>>>         return wqe;
> >>>>>>  }
> >>>>>>
> >>>>>> +/**
> >>>>>> + * rxe_wqe_is_fenced - check if next wqe is fenced
> >>>>>> + * @qp: the queue pair
> >>>>>> + * @wqe: the next wqe
> >>>>>> + *
> >>>>>> + * Returns: 1 if wqe is fenced (needs to wait)
> >>>>>> + *         0 if wqe is good to go
> >>>>>> + */
> >>>>>> +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
> >>>>>> +{
> >>>>>> +       unsigned int cons;
> >>>>>> +
> >>>>>> +       if (!(wqe->wr.send_flags & IB_SEND_FENCE))
> >>>>>> +               return 0;
> >>>>>> +
> >>>>>> +       cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
> >>>>>> +
> >>>>>> +       /* Local invalidate fence (LIF) see IBA 10.6.5.1
> >>>>>> +        * Requires ALL previous operations on the send queue
> >>>>>> +        * are complete.
> >>>>>> +        */
> >>>>>> +       if (wqe->wr.opcode == IB_WR_LOCAL_INV)
> >>>>>> +               return qp->req.wqe_index != cons;
> >>>>>
> >>>>>
> >>>>> Do I understand correctly that according to this code a wr with opcode
> >>>>> IB_WR_LOCAL_INV needs to have the IB_SEND_FENCE flag also set for this
> >>>>> to work?
> >>>>>
> >>>>> If that is the desired behaviour, can you point out where in the spec
> >>>>> this is mentioned?
> >>>>
> >>>> According to the IBA, "Local invalidate fence" (LIF) and the regular fence
> >>>> behave differently. (See the referenced sections in the IBA.) For a local
> >>>> invalidate operation the fence bit fences all previous operations. That was
> >>>> the old behavior, which made no distinction between local invalidate and
> >>>> other operations. The change here affects the other operations with a
> >>>> regular fence, which should only require previous read and atomic
> >>>> operations to be complete.
> >>>>
> >>>> Not sure what you mean by 'also'. Per the IBA, if LIF is set then you have
> >>>> strict invalidate ordering; if not, you have relaxed ordering. The kernel
> >>>> verbs API only has one fence bit and does not have a separate LIF bit, so I
> >>>> am interpreting them to share the one bit.
> >>>
> >>> I see. Now I understand. Thanks for the explanation.
> >>>
> >>> I do have a follow-up question. For an IB_WR_LOCAL_INV wr, the absence
> >>> of the fence bit means relaxed ordering. This would mean that the
> >>> completion for that wr must take place "before any subsequent WQE has
> >>> begun execution". From what I understand, multiple rxe_requester
> >>> instances can run in parallel and pick up wqes and execute them. How is
> >>> the relaxed ordering criterion fulfilled?
> >>
> >> The requester is a tasklet. There is one tasklet instance per QP. A
> >> given tasklet can only run on one cpu at a time, so not in parallel.
> >> The tasklets, on whichever cpus they run, each execute a single send
> >> queue in order.
> >
> > I see. So, according to the function rxe_run_task, it will run the
> > tasklet only if "sched" is set to 1. Otherwise, it is going to run
> > the function rxe_do_task directly, which calls task->func directly.
> >
> > I can see places that call rxe_run_task with sched = 0, but they are
> > few. I did not look into all the execution paths through which these
> > can be hit, but was wondering if there is a way to make sure that such
> > calls to rxe_run_task with sched = 0 do not happen in parallel?
>
> It's a little more complicated than that. rxe_run_task(task, sched)
> runs the task as a tasklet if sched is nonzero. If it is zero, it
> runs on the current thread, but it carefully locks and only runs the
> subroutine if the tasklet is not already running; otherwise it marks the
> task as needing to be continued and leaves it running in the tasklet.
> The net effect is that either form of the call will cause the tasklet
> code to run. When you schedule a tasklet it normally runs on the same
> cpu as the calling code, so there isn't a lot of difference between
> sched and non-sched. The sched version leaves two threads able to run
> on the cpu; the non-sched version just has the original thread.
>
> __rxe_do_task() is also used and just calls the subroutine directly.
> It can only be used if you know that the tasklet is disabled, since the
> tasklet code is not re-entrant and should never be running twice at the
> same time. It is used in rxe_qp.c during setup, reset and cleanup of
> queue pairs.
>
> Bob

I see. Thanks for the responses.


> >
> >
> >
> >>>
> >>>>
> >>>> Bob
> >>>>>
> >>>>> Thanks.
> >>>>>
> >>>>>
> >>>>>> +
> >>>>>> +       /* Fence see IBA 10.8.3.3
> >>>>>> +        * Requires that all previous read and atomic operations
> >>>>>> +        * are complete.
> >>>>>> +        */
> >>>>>> +       return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
> >>>>>> +}
> >>>>>> +
> >>>>>>  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
> >>>>>>  {
> >>>>>>         switch (opcode) {
> >>>>>> @@ -636,6 +661,11 @@ int rxe_requester(void *arg)
> >>>>>>         if (unlikely(!wqe))
> >>>>>>                 goto exit;
> >>>>>>
> >>>>>> +       if (rxe_wqe_is_fenced(qp, wqe)) {
> >>>>>> +               qp->req.wait_fence = 1;
> >>>>>> +               goto exit;
> >>>>>> +       }
> >>>>>> +
> >>>>>>         if (wqe->mask & WR_LOCAL_OP_MASK) {
> >>>>>>                 ret = rxe_do_local_ops(qp, wqe);
> >>>>>>                 if (unlikely(ret))
> >>>>>>
> >>>>>> base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
> >>>>>> --
> >>>>>> 2.34.1
> >>>>>>
> >>>>
> >>
>

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index ae5fbc79dd5c..f36263855a45 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -163,16 +163,41 @@  static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
 		     (wqe->state != wqe_state_processing)))
 		return NULL;
 
-	if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
-						     (index != cons))) {
-		qp->req.wait_fence = 1;
-		return NULL;
-	}
-
 	wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
 	return wqe;
 }
 
+/**
+ * rxe_wqe_is_fenced - check if next wqe is fenced
+ * @qp: the queue pair
+ * @wqe: the next wqe
+ *
+ * Returns: 1 if wqe is fenced (needs to wait)
+ *	    0 if wqe is good to go
+ */
+static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+	unsigned int cons;
+
+	if (!(wqe->wr.send_flags & IB_SEND_FENCE))
+		return 0;
+
+	cons = queue_get_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
+
+	/* Local invalidate fence (LIF) see IBA 10.6.5.1
+	 * Requires ALL previous operations on the send queue
+	 * are complete.
+	 */
+	if (wqe->wr.opcode == IB_WR_LOCAL_INV)
+		return qp->req.wqe_index != cons;
+
+	/* Fence see IBA 10.8.3.3
+	 * Requires that all previous read and atomic operations
+	 * are complete.
+	 */
+	return atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
+}
+
 static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
 {
 	switch (opcode) {
@@ -636,6 +661,11 @@  int rxe_requester(void *arg)
 	if (unlikely(!wqe))
 		goto exit;
 
+	if (rxe_wqe_is_fenced(qp, wqe)) {
+		qp->req.wait_fence = 1;
+		goto exit;
+	}
+
 	if (wqe->mask & WR_LOCAL_OP_MASK) {
 		ret = rxe_do_local_ops(qp, wqe);
 		if (unlikely(ret))