diff mbox series

[[PATCH,v2,for-next] ] RDMA/siw: Fix SQ/RQ drain logic

Message ID 20191002143858.4550-1-bmt@zurich.ibm.com (mailing list archive)
State Superseded
Headers show
Series [[PATCH,v2,for-next] ] RDMA/siw: Fix SQ/RQ drain logic | expand

Commit Message

Bernard Metzler Oct. 2, 2019, 2:38 p.m. UTC
Storage ULPs (e.g. iSER & NVMeOF) use ib_drain_qp() to
drain QP/CQ. Current SIW's own drain routines do not properly
wait until all SQ/RQ elements are completed and reaped
from the CQ. This may cause touch after free issues.
New logic relies on generic __ib_drain_sq()/__ib_drain_rq()
posting a final work request, which SIW immediately flushes
to CQ.

Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
Signed-off-by: Krishnamraju Eraparaju <krishna2@chelsio.com>
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
---
v1 -> v2:
- Accept SQ and RQ work requests, if QP is in ERROR
  state. In that case, immediately flush WR's to CQ.
  This already provides needed functionality to
  support ib_drain_sq()/ib_drain_rq() without extra
  state checking in the fast path.

 drivers/infiniband/sw/siw/siw_main.c  |  20 -----
 drivers/infiniband/sw/siw/siw_verbs.c | 103 +++++++++++++++++++++-----
 2 files changed, 86 insertions(+), 37 deletions(-)

Comments

Leon Romanovsky Oct. 2, 2019, 3:47 p.m. UTC | #1
On Wed, Oct 02, 2019 at 04:38:58PM +0200, Bernard Metzler wrote:
> Storage ULPs (e.g. iSER & NVMeOF) use ib_drain_qp() to
> drain QP/CQ. Current SIW's own drain routines do not properly
> wait until all SQ/RQ elements are completed and reaped
> from the CQ. This may cause touch after free issues.
> New logic relies on generic __ib_drain_sq()/__ib_drain_rq()
> posting a final work request, which SIW immediately flushes
> to CQ.
>
> Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
> Signed-off-by: Krishnamraju Eraparaju <krishna2@chelsio.com>
> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> ---
> v1 -> v2:
> - Accept SQ and RQ work requests, if QP is in ERROR
>   state. In that case, immediately flush WR's to CQ.
>   This already provides needed functionality to
>   support ib_drain_sq()/ib_drain_rq() without extra
>   state checking in the fast path.
>
>  drivers/infiniband/sw/siw/siw_main.c  |  20 -----
>  drivers/infiniband/sw/siw/siw_verbs.c | 103 +++++++++++++++++++++-----
>  2 files changed, 86 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
> index 05a92f997f60..fb01407a310f 100644
> --- a/drivers/infiniband/sw/siw/siw_main.c
> +++ b/drivers/infiniband/sw/siw/siw_main.c
> @@ -248,24 +248,6 @@ static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
>  	return NULL;
>  }
>
> -static void siw_verbs_sq_flush(struct ib_qp *base_qp)
> -{
> -	struct siw_qp *qp = to_siw_qp(base_qp);
> -
> -	down_write(&qp->state_lock);
> -	siw_sq_flush(qp);
> -	up_write(&qp->state_lock);
> -}
> -
> -static void siw_verbs_rq_flush(struct ib_qp *base_qp)
> -{
> -	struct siw_qp *qp = to_siw_qp(base_qp);
> -
> -	down_write(&qp->state_lock);
> -	siw_rq_flush(qp);
> -	up_write(&qp->state_lock);
> -}
> -
>  static const struct ib_device_ops siw_device_ops = {
>  	.owner = THIS_MODULE,
>  	.uverbs_abi_ver = SIW_ABI_VERSION,
> @@ -284,8 +266,6 @@ static const struct ib_device_ops siw_device_ops = {
>  	.destroy_cq = siw_destroy_cq,
>  	.destroy_qp = siw_destroy_qp,
>  	.destroy_srq = siw_destroy_srq,
> -	.drain_rq = siw_verbs_rq_flush,
> -	.drain_sq = siw_verbs_sq_flush,
>  	.get_dma_mr = siw_get_dma_mr,
>  	.get_port_immutable = siw_get_port_immutable,
>  	.iw_accept = siw_accept,
> diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
> index 869e02b69a01..40e68e7a4f39 100644
> --- a/drivers/infiniband/sw/siw/siw_verbs.c
> +++ b/drivers/infiniband/sw/siw/siw_verbs.c
> @@ -687,6 +687,47 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
>  	return bytes;
>  }
>
> +/* Complete SQ WR's without processing */
> +static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
> +			   const struct ib_send_wr **bad_wr)
> +{
> +	struct siw_sqe sqe = {};
> +	int rv = 0;
> +
> +	while (wr) {
> +		sqe.id = wr->wr_id;
> +		sqe.opcode = wr->opcode;
> +		rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
> +		if (rv) {
> +			if (bad_wr)
> +				*bad_wr = wr;
> +			break;
> +		}
> +		wr = wr->next;
> +	}
> +	return rv;
> +}
> +
> +/* Complete RQ WR's without processing */
> +static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
> +			   const struct ib_recv_wr **bad_wr)
> +{
> +	struct siw_rqe rqe = {};
> +	int rv = 0;
> +
> +	while (wr) {
> +		rqe.id = wr->wr_id;
> +		rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
> +		if (rv) {
> +			if (bad_wr)
> +				*bad_wr = wr;
> +			break;
> +		}
> +		wr = wr->next;
> +	}
> +	return rv;
> +}
> +
>  /*
>   * siw_post_send()
>   *
> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
>  	unsigned long flags;
>  	int rv = 0;
>
> +	if (wr && !qp->kernel_verbs) {

It is not related to this specific patch, but all siw "kernel_verbs"
should go, we have standard way to distinguish between kernel and user
verbs.

Thanks
Bernard Metzler Oct. 4, 2019, 2:09 p.m. UTC | #2
-----"Leon Romanovsky" <leon@kernel.org> wrote: -----
<...>

>>   *
>> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp, const
>struct ib_send_wr *wr,
>>  	unsigned long flags;
>>  	int rv = 0;
>>
>> +	if (wr && !qp->kernel_verbs) {
>
>It is not related to this specific patch, but all siw "kernel_verbs"
>should go, we have standard way to distinguish between kernel and
>user
>verbs.
>
>Thanks
>
Understood. I think we touched on that already.
rdma core objects have a uobject pointer which
is valid only if it belongs to a user land
application. We might better use that. Let me
see if I can compact QP objects to contain the
ib_qp. I'd like to avoid following pointers
potentially causing cache misses on the
fast path. This is why I still have that
little boolean within the siw private
structure.

Thanks and best regards,
Bernard.
Jason Gunthorpe Oct. 4, 2019, 5:48 p.m. UTC | #3
On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
> <...>
> 
> >>   *
> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp, const
> >struct ib_send_wr *wr,
> >>  	unsigned long flags;
> >>  	int rv = 0;
> >>
> >> +	if (wr && !qp->kernel_verbs) {
> >
> >It is not related to this specific patch, but all siw "kernel_verbs"
> >should go, we have standard way to distinguish between kernel and
> >user
> >verbs.
> >
> >Thanks
> >
> Understood. I think we touched on that already.
> rdma core objects have a uobject pointer which
> is valid only if it belongs to a user land
> application. We might better use that. 

No, the uobject pointer is not to be touched by drivers

Jason
Leon Romanovsky Oct. 5, 2019, 8:26 a.m. UTC | #4
On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
> <...>
>
> >>   *
> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp, const
> >struct ib_send_wr *wr,
> >>  	unsigned long flags;
> >>  	int rv = 0;
> >>
> >> +	if (wr && !qp->kernel_verbs) {
> >
> >It is not related to this specific patch, but all siw "kernel_verbs"
> >should go, we have standard way to distinguish between kernel and
> >user
> >verbs.
> >
> >Thanks
> >
> Understood. I think we touched on that already.
> rdma core objects have a uobject pointer which
> is valid only if it belongs to a user land
> application. We might better use that. Let me
> see if I can compact QP objects to contain the
> ib_qp. I'd like to avoid following pointers
> potentially causing cache misses on the
> fast path. This is why I still have that
> little boolean within the siw private
> structure.

You have this variable in CQ and SRQ too.

I have serious doubts that this value gives any performance advantages.
In both flows, you will need to fetch ib_qp pointer, so you don't save
here anything by looking on kernel_verbs value.

Thanks

>
> Thanks and best regards,
> Bernard.
>
Bernard Metzler Oct. 18, 2019, 1:50 p.m. UTC | #5
-----"Leon Romanovsky" <leon@kernel.org> wrote: -----

>To: "Bernard Metzler" <BMT@zurich.ibm.com>
>From: "Leon Romanovsky" <leon@kernel.org>
>Date: 10/05/2019 10:26AM
>Cc: linux-rdma@vger.kernel.org, bharat@chelsio.com, jgg@ziepe.ca,
>nirranjan@chelsio.com, krishna2@chelsio.com, bvanassche@acm.org
>Subject: [EXTERNAL] Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix SQ/RQ
>drain logic
>
>On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
>> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>> <...>
>>
>> >>   *
>> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp,
>const
>> >struct ib_send_wr *wr,
>> >>  	unsigned long flags;
>> >>  	int rv = 0;
>> >>
>> >> +	if (wr && !qp->kernel_verbs) {
>> >
>> >It is not related to this specific patch, but all siw
>"kernel_verbs"
>> >should go, we have standard way to distinguish between kernel and
>> >user
>> >verbs.
>> >
>> >Thanks
>> >
>> Understood. I think we touched on that already.
>> rdma core objects have a uobject pointer which
>> is valid only if it belongs to a user land
>> application. We might better use that. Let me
>> see if I can compact QP objects to contain the
>> ib_qp. I'd like to avoid following pointers
>> potentially causing cache misses on the
>> fast path. This is why I still have that
>> little boolean within the siw private
>> structure.
>
>You have this variable in CQ and SRQ too.
>
>I have serious doubts that this value gives any performance
>advantages.
>In both flows, you will need to fetch ib_qp pointer, so you don't
>save
>here anything by looking on kernel_verbs value.
>

Yes, I see you are right for both CQ and SRQ.

For the CQ, we have a nested structure where
siw_cq contains ib_cq. So it is not far away.

For SRQ it is the same.

For QP's we have a split between siw_qp and ib_qp. 
I will look into how to get that one solved.

I will prepare an extra patch for that whole
kernel_verbs thing, but let's not have it gating
acceptance of this unrelated patch.

Thanks very much,
Bernard.
Leon Romanovsky Oct. 22, 2019, 5:49 a.m. UTC | #6
On Fri, Oct 18, 2019 at 01:50:22PM +0000, Bernard Metzler wrote:
> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>
> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
> >From: "Leon Romanovsky" <leon@kernel.org>
> >Date: 10/05/2019 10:26AM
> >Cc: linux-rdma@vger.kernel.org, bharat@chelsio.com, jgg@ziepe.ca,
> >nirranjan@chelsio.com, krishna2@chelsio.com, bvanassche@acm.org
> >Subject: [EXTERNAL] Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix SQ/RQ
> >drain logic
> >
> >On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
> >> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
> >> <...>
> >>
> >> >>   *
> >> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp,
> >const
> >> >struct ib_send_wr *wr,
> >> >>  	unsigned long flags;
> >> >>  	int rv = 0;
> >> >>
> >> >> +	if (wr && !qp->kernel_verbs) {
> >> >
> >> >It is not related to this specific patch, but all siw
> >"kernel_verbs"
> >> >should go, we have standard way to distinguish between kernel and
> >> >user
> >> >verbs.
> >> >
> >> >Thanks
> >> >
> >> Understood. I think we touched on that already.
> >> rdma core objects have a uobject pointer which
> >> is valid only if it belongs to a user land
> >> application. We might better use that. Let me
> >> see if I can compact QP objects to contain the
> >> ib_qp. I'd like to avoid following pointers
> >> potentially causing cache misses on the
> >> fast path. This is why I still have that
> >> little boolean within the siw private
> >> structure.
> >
> >You have this variable in CQ and SRQ too.
> >
> >I have serious doubts that this value gives any performance
> >advantages.
> >In both flows, you will need to fetch ib_qp pointer, so you don't
> >save
> >here anything by looking on kernel_verbs value.
> >
>
> Yes, I see you are right for both CQ and SRQ.
>
> For the CQ, we have a nested structure where
> siw_cq contains ib_cq. So it is not far away.
>
> For SRQ it is the same.
>
> For QP's we have a split between siw_qp and ib_qp.
> I will look into how to get that one solved.
>
> I will prepare an extra patch for that whole
> kernel_verbs thing, but let's not have it gating
> acceptance of this unrelated patch.

Sure, it is unrelated.

>
> Thanks very much,
> Bernard.
>
Bernard Metzler Oct. 25, 2019, 12:11 p.m. UTC | #7
-----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: -----

>To: "Bernard Metzler" <BMT@zurich.ibm.com>
>From: "Jason Gunthorpe" <jgg@ziepe.ca>
>Date: 10/04/2019 07:48PM
>Cc: "Leon Romanovsky" <leon@kernel.org>, linux-rdma@vger.kernel.org,
>bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
>bvanassche@acm.org
>Subject: [EXTERNAL] Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix SQ/RQ
>drain logic
>
>On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
>> <...>
>> 
>> >>   *
>> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp,
>const
>> >struct ib_send_wr *wr,
>> >>  	unsigned long flags;
>> >>  	int rv = 0;
>> >>
>> >> +	if (wr && !qp->kernel_verbs) {
>> >
>> >It is not related to this specific patch, but all siw
>"kernel_verbs"
>> >should go, we have standard way to distinguish between kernel and
>> >user
>> >verbs.
>> >
>> >Thanks
>> >
>> Understood. I think we touched on that already.
>> rdma core objects have a uobject pointer which
>> is valid only if it belongs to a user land
>> application. We might better use that. 
>
>No, the uobject pointer is not to be touched by drivers
>
Now what would be the appropriate way of remembering/
detecting user level nature of endpoint resources?
I see drivers _are_ doing 'if (!ibqp->uobject)' ... 

Other drivers keep it with the private state, like iw40,
but I learned we shall get rid of it.

We may export an inline query from RDMA core, or simply
#define is_usermode(ib_obj *) (ib_obj->uobject != NULL)
?

Thanks and best regards,
Bernard
Leon Romanovsky Oct. 27, 2019, 5:21 a.m. UTC | #8
On Fri, Oct 25, 2019 at 12:11:16PM +0000, Bernard Metzler wrote:
> -----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: -----
>
> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
> >From: "Jason Gunthorpe" <jgg@ziepe.ca>
> >Date: 10/04/2019 07:48PM
> >Cc: "Leon Romanovsky" <leon@kernel.org>, linux-rdma@vger.kernel.org,
> >bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
> >bvanassche@acm.org
> >Subject: [EXTERNAL] Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix SQ/RQ
> >drain logic
> >
> >On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
> >> <...>
> >>
> >> >>   *
> >> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp,
> >const
> >> >struct ib_send_wr *wr,
> >> >>  	unsigned long flags;
> >> >>  	int rv = 0;
> >> >>
> >> >> +	if (wr && !qp->kernel_verbs) {
> >> >
> >> >It is not related to this specific patch, but all siw
> >"kernel_verbs"
> >> >should go, we have standard way to distinguish between kernel and
> >> >user
> >> >verbs.
> >> >
> >> >Thanks
> >> >
> >> Understood. I think we touched on that already.
> >> rdma core objects have a uobject pointer which
> >> is valid only if it belongs to a user land
> >> application. We might better use that.
> >
> >No, the uobject pointer is not to be touched by drivers
> >
> Now what would be the appropriate way of remembering/
> detecting user level nature of endpoint resources?
> I see drivers _are_ doing 'if (!ibqp->uobject)' ...

IMHO, you should rely in "udata" to distinguish user/kernel objects.

>
> Other drivers keep it with the private state, like iw40,
> but I learned we shall get rid of it.
>
> We may export an inline query from RDMA core, or simply
> #define is_usermode(ib_obj *) (ib_obj->uobject != NULL)
> ?
>
> Thanks and best regards,
> Bernard
>
Bernard Metzler Oct. 28, 2019, 12:37 p.m. UTC | #9
-----"Leon Romanovsky" <leon@kernel.org> wrote: -----

>To: "Bernard Metzler" <BMT@zurich.ibm.com>
>From: "Leon Romanovsky" <leon@kernel.org>
>Date: 10/27/2019 06:21AM
>Cc: "Jason Gunthorpe" <jgg@ziepe.ca>, linux-rdma@vger.kernel.org,
>bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
>bvanassche@acm.org
>Subject: [EXTERNAL] Re: Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix
>SQ/RQ drain logic
>
>On Fri, Oct 25, 2019 at 12:11:16PM +0000, Bernard Metzler wrote:
>> -----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: -----
>>
>> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
>> >From: "Jason Gunthorpe" <jgg@ziepe.ca>
>> >Date: 10/04/2019 07:48PM
>> >Cc: "Leon Romanovsky" <leon@kernel.org>,
>linux-rdma@vger.kernel.org,
>> >bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
>> >bvanassche@acm.org
>> >Subject: [EXTERNAL] Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix
>SQ/RQ
>> >drain logic
>> >
>> >On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
>> >> <...>
>> >>
>> >> >>   *
>> >> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp,
>> >const
>> >> >struct ib_send_wr *wr,
>> >> >>  	unsigned long flags;
>> >> >>  	int rv = 0;
>> >> >>
>> >> >> +	if (wr && !qp->kernel_verbs) {
>> >> >
>> >> >It is not related to this specific patch, but all siw
>> >"kernel_verbs"
>> >> >should go, we have standard way to distinguish between kernel
>and
>> >> >user
>> >> >verbs.
>> >> >
>> >> >Thanks
>> >> >
>> >> Understood. I think we touched on that already.
>> >> rdma core objects have a uobject pointer which
>> >> is valid only if it belongs to a user land
>> >> application. We might better use that.
>> >
>> >No, the uobject pointer is not to be touched by drivers
>> >
>> Now what would be the appropriate way of remembering/
>> detecting user level nature of endpoint resources?
>> I see drivers _are_ doing 'if (!ibqp->uobject)' ...
>
>IMHO, you should rely in "udata" to distinguish user/kernel objects.
>

right, we already do that at resource creation time, when
'udata' is available.  But there is no such parameter
around during resource access (post_send/post_recv/poll_cq/...),
when user land or kernel land application specific
code might be required.
That's why siw currently saves that info to a resource
(QP/CQ/SRQ) specific parameter 'kernel_verbs'. I agree
this parameter is redundant, if the rdma core object
provides that information as well. The only way I see
it provided is the validity of the uobject pointer of
all those resources.
Either (1) we use that uobject pointer as an indication
of application location, or (2) we remember it from
resource creation time when udata was available, or
(3) we have the rdma core exporting that information
explicitly.
siw, and other drivers, are currently implementing (2).
Some drivers implement (1). I'd be happy to change siw
to implement (1) - to get rid of 'kernel_verbs'.

Thanks and best regards,
Bernard.




>>
>> Other drivers keep it with the private state, like iw40,
>> but I learned we shall get rid of it.
>>
>> We may export an inline query from RDMA core, or simply
>> #define is_usermode(ib_obj *) (ib_obj->uobject != NULL)
>> ?
>>
>> Thanks and best regards,
>> Bernard
>>
>
>
Jason Gunthorpe Oct. 28, 2019, 1:43 p.m. UTC | #10
On Mon, Oct 28, 2019 at 12:37:38PM +0000, Bernard Metzler wrote:

> That's why siw currently saves that info to a resource
> (QP/CQ/SRQ) specific parameter 'kernel_verbs'. I agree
> this parameter is redundant, if the rdma core object
> provides that information as well. The only way I see
> it provided is the validity of the uobject pointer of
> all those resources.

The restrack stuff also keeps track of user/kernel on created objects

Jason
Leon Romanovsky Oct. 29, 2019, 4:54 a.m. UTC | #11
On Mon, Oct 28, 2019 at 12:37:38PM +0000, Bernard Metzler wrote:
> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>
> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
> >From: "Leon Romanovsky" <leon@kernel.org>
> >Date: 10/27/2019 06:21AM
> >Cc: "Jason Gunthorpe" <jgg@ziepe.ca>, linux-rdma@vger.kernel.org,
> >bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
> >bvanassche@acm.org
> >Subject: [EXTERNAL] Re: Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix
> >SQ/RQ drain logic
> >
> >On Fri, Oct 25, 2019 at 12:11:16PM +0000, Bernard Metzler wrote:
> >> -----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: -----
> >>
> >> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
> >> >From: "Jason Gunthorpe" <jgg@ziepe.ca>
> >> >Date: 10/04/2019 07:48PM
> >> >Cc: "Leon Romanovsky" <leon@kernel.org>,
> >linux-rdma@vger.kernel.org,
> >> >bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
> >> >bvanassche@acm.org
> >> >Subject: [EXTERNAL] Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix
> >SQ/RQ
> >> >drain logic
> >> >
> >> >On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler wrote:
> >> >> <...>
> >> >>
> >> >> >>   *
> >> >> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp *base_qp,
> >> >const
> >> >> >struct ib_send_wr *wr,
> >> >> >>  	unsigned long flags;
> >> >> >>  	int rv = 0;
> >> >> >>
> >> >> >> +	if (wr && !qp->kernel_verbs) {
> >> >> >
> >> >> >It is not related to this specific patch, but all siw
> >> >"kernel_verbs"
> >> >> >should go, we have standard way to distinguish between kernel
> >and
> >> >> >user
> >> >> >verbs.
> >> >> >
> >> >> >Thanks
> >> >> >
> >> >> Understood. I think we touched on that already.
> >> >> rdma core objects have a uobject pointer which
> >> >> is valid only if it belongs to a user land
> >> >> application. We might better use that.
> >> >
> >> >No, the uobject pointer is not to be touched by drivers
> >> >
> >> Now what would be the appropriate way of remembering/
> >> detecting user level nature of endpoint resources?
> >> I see drivers _are_ doing 'if (!ibqp->uobject)' ...
> >
> >IMHO, you should rely in "udata" to distinguish user/kernel objects.
> >
>
> right, we already do that at resource creation time, when
> 'udata' is available.  But there is no such parameter
> around during resource access (post_send/post_recv/poll_cq/...),
> when user land or kernel land application specific
> code might be required.
> That's why siw currently saves that info to a resource
> (QP/CQ/SRQ) specific parameter 'kernel_verbs'. I agree
> this parameter is redundant, if the rdma core object
> provides that information as well. The only way I see
> it provided is the validity of the uobject pointer of
> all those resources.
> Either (1) we use that uobject pointer as an indication
> of application location, or (2) we remember it from
> resource creation time when udata was available, or
> (3) we have the rdma core exporting that information
> explicitly.
> siw, and other drivers, are currently implementing (2).
> Some drivers implement (1). I'd be happy to change siw
> to implement (1) - to get rid of 'kernel_verbs'.

Many if not all "kernel_verbs" variables are copy/paste.
The usual way to handle difference in internal flows is to
rely on having pointer initialized, e.g.
if (siw_device->some_specific_kernel_pointer)
 do_kernelwork(siw_device->some_specific_kernel_pointer->extra)

Thanks

>
> Thanks and best regards,
> Bernard.
>
>
>
>
> >>
> >> Other drivers keep it with the private state, like iw40,
> >> but I learned we shall get rid of it.
> >>
> >> We may export an inline query from RDMA core, or simply
> >> #define is_usermode(ib_obj *) (ib_obj->uobject != NULL)
> >> ?
> >>
> >> Thanks and best regards,
> >> Bernard
> >>
> >
> >
>
Bernard Metzler Oct. 31, 2019, 1:38 p.m. UTC | #12
-----"Leon Romanovsky" <leon@kernel.org> wrote: -----

>To: "Bernard Metzler" <BMT@zurich.ibm.com>
>From: "Leon Romanovsky" <leon@kernel.org>
>Date: 10/29/2019 05:55AM
>Cc: "Jason Gunthorpe" <jgg@ziepe.ca>, linux-rdma@vger.kernel.org,
>bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
>bvanassche@acm.org
>Subject: [EXTERNAL] Re: Re: Re: Re: [[PATCH v2 for-next]] RDMA/siw:
>Fix SQ/RQ drain logic
>
>On Mon, Oct 28, 2019 at 12:37:38PM +0000, Bernard Metzler wrote:
>> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>>
>> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
>> >From: "Leon Romanovsky" <leon@kernel.org>
>> >Date: 10/27/2019 06:21AM
>> >Cc: "Jason Gunthorpe" <jgg@ziepe.ca>, linux-rdma@vger.kernel.org,
>> >bharat@chelsio.com, nirranjan@chelsio.com, krishna2@chelsio.com,
>> >bvanassche@acm.org
>> >Subject: [EXTERNAL] Re: Re: Re: [[PATCH v2 for-next]] RDMA/siw:
>Fix
>> >SQ/RQ drain logic
>> >
>> >On Fri, Oct 25, 2019 at 12:11:16PM +0000, Bernard Metzler wrote:
>> >> -----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: -----
>> >>
>> >> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
>> >> >From: "Jason Gunthorpe" <jgg@ziepe.ca>
>> >> >Date: 10/04/2019 07:48PM
>> >> >Cc: "Leon Romanovsky" <leon@kernel.org>,
>> >linux-rdma@vger.kernel.org,
>> >> >bharat@chelsio.com, nirranjan@chelsio.com,
>krishna2@chelsio.com,
>> >> >bvanassche@acm.org
>> >> >Subject: [EXTERNAL] Re: Re: [[PATCH v2 for-next]] RDMA/siw: Fix
>> >SQ/RQ
>> >> >drain logic
>> >> >
>> >> >On Fri, Oct 04, 2019 at 02:09:57PM +0000, Bernard Metzler
>wrote:
>> >> >> <...>
>> >> >>
>> >> >> >>   *
>> >> >> >> @@ -705,6 +746,12 @@ int siw_post_send(struct ib_qp
>*base_qp,
>> >> >const
>> >> >> >struct ib_send_wr *wr,
>> >> >> >>  	unsigned long flags;
>> >> >> >>  	int rv = 0;
>> >> >> >>
>> >> >> >> +	if (wr && !qp->kernel_verbs) {
>> >> >> >
>> >> >> >It is not related to this specific patch, but all siw
>> >> >"kernel_verbs"
>> >> >> >should go, we have standard way to distinguish between
>kernel
>> >and
>> >> >> >user
>> >> >> >verbs.
>> >> >> >
>> >> >> >Thanks
>> >> >> >
>> >> >> Understood. I think we touched on that already.
>> >> >> rdma core objects have a uobject pointer which
>> >> >> is valid only if it belongs to a user land
>> >> >> application. We might better use that.
>> >> >
>> >> >No, the uobject pointer is not to be touched by drivers
>> >> >
>> >> Now what would be the appropriate way of remembering/
>> >> detecting user level nature of endpoint resources?
>> >> I see drivers _are_ doing 'if (!ibqp->uobject)' ...
>> >
>> >IMHO, you should rely in "udata" to distinguish user/kernel
>objects.
>> >
>>
>> right, we already do that at resource creation time, when
>> 'udata' is available.  But there is no such parameter
>> around during resource access (post_send/post_recv/poll_cq/...),
>> when user land or kernel land application specific
>> code might be required.
>> That's why siw currently saves that info to a resource
>> (QP/CQ/SRQ) specific parameter 'kernel_verbs'. I agree
>> this parameter is redundant, if the rdma core object
>> provides that information as well. The only way I see
>> it provided is the validity of the uobject pointer of
>> all those resources.
>> Either (1) we use that uobject pointer as an indication
>> of application location, or (2) we remember it from
>> resource creation time when udata was available, or
>> (3) we have the rdma core exporting that information
>> explicitly.
>> siw, and other drivers, are currently implementing (2).
>> Some drivers implement (1). I'd be happy to change siw
>> to implement (1) - to get rid of 'kernel_verbs'.
>
>Many if not all "kernel_verbs" variables are copy/paste.
>The usual way to handle difference in internal flows is to
>rely on having pointer initialized, e.g.
>if (siw_device->some_specific_kernel_pointer)
> do_kernelwork(siw_device->some_specific_kernel_pointer->extra)
>

The conditional code is always rather short:

- a few lines for extra checks during post_sq,
  post_rq and post_srq to forbid writing queue
  entries via syscall.

- write the qp kernel pointer to a CQE only
  if it is a kernel application. Don't expose
  it to user land.

IMO, these checks do not qualify for a change
to a function indirection, which would establish
two very similar functions, differing only in
very few lines. It would also decrease readability.
The function pointer would have to be part of
the resource itself (QP/SRQ/CQ), as the flag
is now.

Let me limit the usage of this obviously unliked
flag to its possible minimum. During resource
construction/destruction I do not really need
it (except setting it), since 'udata' is there.
It would appear only in the fast path for
meentioned checks.
I still prefer that siw private flag to
avoid to rely on potentially changing semantics
of rdma core private structures the driver
IMHO better should not interpret. But I am
completely open to do it that way, if preferred
by the maintainers.

Thanks and best regards,
Bernard.

>Thanks
>
>>
>> Thanks and best regards,
>> Bernard.
>>
>>
>>
>>
>> >>
>> >> Other drivers keep it with the private state, like iw40,
>> >> but I learned we shall get rid of it.
>> >>
>> >> We may export an inline query from RDMA core, or simply
>> >> #define is_usermode(ib_obj *) (ib_obj->uobject != NULL)
>> >> ?
>> >>
>> >> Thanks and best regards,
>> >> Bernard
>> >>
>> >
>> >
>>
>
>
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index 05a92f997f60..fb01407a310f 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -248,24 +248,6 @@  static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
 	return NULL;
 }
 
-static void siw_verbs_sq_flush(struct ib_qp *base_qp)
-{
-	struct siw_qp *qp = to_siw_qp(base_qp);
-
-	down_write(&qp->state_lock);
-	siw_sq_flush(qp);
-	up_write(&qp->state_lock);
-}
-
-static void siw_verbs_rq_flush(struct ib_qp *base_qp)
-{
-	struct siw_qp *qp = to_siw_qp(base_qp);
-
-	down_write(&qp->state_lock);
-	siw_rq_flush(qp);
-	up_write(&qp->state_lock);
-}
-
 static const struct ib_device_ops siw_device_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = SIW_ABI_VERSION,
@@ -284,8 +266,6 @@  static const struct ib_device_ops siw_device_ops = {
 	.destroy_cq = siw_destroy_cq,
 	.destroy_qp = siw_destroy_qp,
 	.destroy_srq = siw_destroy_srq,
-	.drain_rq = siw_verbs_rq_flush,
-	.drain_sq = siw_verbs_sq_flush,
 	.get_dma_mr = siw_get_dma_mr,
 	.get_port_immutable = siw_get_port_immutable,
 	.iw_accept = siw_accept,
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 869e02b69a01..40e68e7a4f39 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -687,6 +687,47 @@  static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
 	return bytes;
 }
 
+/* Complete SQ WR's without processing */
+static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
+			   const struct ib_send_wr **bad_wr)
+{
+	struct siw_sqe sqe = {};
+	int rv = 0;
+
+	while (wr) {
+		sqe.id = wr->wr_id;
+		sqe.opcode = wr->opcode;
+		rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
+		if (rv) {
+			if (bad_wr)
+				*bad_wr = wr;
+			break;
+		}
+		wr = wr->next;
+	}
+	return rv;
+}
+
+/* Complete RQ WR's without processing */
+static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
+			   const struct ib_recv_wr **bad_wr)
+{
+	struct siw_rqe rqe = {};
+	int rv = 0;
+
+	while (wr) {
+		rqe.id = wr->wr_id;
+		rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
+		if (rv) {
+			if (bad_wr)
+				*bad_wr = wr;
+			break;
+		}
+		wr = wr->next;
+	}
+	return rv;
+}
+
 /*
  * siw_post_send()
  *
@@ -705,6 +746,12 @@  int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
 	unsigned long flags;
 	int rv = 0;
 
+	if (wr && !qp->kernel_verbs) {
+		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+
 	/*
 	 * Try to acquire QP state lock. Must be non-blocking
 	 * to accommodate kernel clients needs.
@@ -715,16 +762,23 @@  int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
 		return -ENOTCONN;
 	}
 	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+			/*
+			 * Immediately flush this WR to CQ, if QP
+			 * is in ERROR state. SQ is guaranteed to
+			 * be empty, so WR complets in-order.
+			 *
+			 * Typically triggered by ib_drain_sq().
+			 */
+			rv = siw_sq_flush_wr(qp, wr, bad_wr);
+		} else {
+			rv = -ENOTCONN;
+			*bad_wr = wr;
+			siw_dbg_qp(qp, "QP out of state %d\n",
+				   qp->attrs.state);
+		}
 		up_read(&qp->state_lock);
-		*bad_wr = wr;
-		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
-		return -ENOTCONN;
-	}
-	if (wr && !qp->kernel_verbs) {
-		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
-		up_read(&qp->state_lock);
-		*bad_wr = wr;
-		return -EINVAL;
+		return rv;
 	}
 	spin_lock_irqsave(&qp->sq_lock, flags);
 
@@ -919,6 +973,13 @@  int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
 		*bad_wr = wr;
 		return -EOPNOTSUPP; /* what else from errno.h? */
 	}
+	if (!qp->kernel_verbs) {
+		siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+
 	/*
 	 * Try to acquire QP state lock. Must be non-blocking
 	 * to accommodate kernel clients needs.
@@ -927,16 +988,24 @@  int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
 		*bad_wr = wr;
 		return -ENOTCONN;
 	}
-	if (!qp->kernel_verbs) {
-		siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
-		up_read(&qp->state_lock);
-		*bad_wr = wr;
-		return -EINVAL;
-	}
 	if (qp->attrs.state > SIW_QP_STATE_RTS) {
+		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+			/*
+			 * Immediately flush this WR to CQ, if QP
+			 * is in ERROR state. RQ is guaranteed to
+			 * be empty, so WR complets in-order.
+			 *
+			 * Typically triggered by ib_drain_rq().
+			 */
+			rv = siw_rq_flush_wr(qp, wr, bad_wr);
+		} else {
+			rv = -ENOTCONN;
+			*bad_wr = wr;
+			siw_dbg_qp(qp, "QP out of state %d\n",
+				   qp->attrs.state);
+		}
 		up_read(&qp->state_lock);
-		*bad_wr = wr;
-		return -EINVAL;
+		return rv;
 	}
 	/*
 	 * Serialize potentially multiple producers.