diff mbox

[V3,for-next,02/10] IB/core: Introduce Work Queue object and its verbs

Message ID 1460903237-16870-3-git-send-email-yishaih@mellanox.com (mailing list archive)
State Superseded
Headers show

Commit Message

Yishai Hadas April 17, 2016, 2:27 p.m. UTC
Introduce Work Queue object and its create/destroy/modify verbs.

QP can be created without internal WQs "packaged" inside it,
this QP can be configured to use "external" WQ object as its
receive/send queue.
WQ is a necessary component for RSS technology since RSS mechanism
is supposed to distribute the traffic between multiple
Receive Work Queues.

A WQ is associated (many to one) with a Completion Queue and it owns the
WQ properties (PD, WQ size, etc.).
A WQ has a type; this patch introduces IB_WQT_RQ (i.e. receive queue),
and it may be extended to others such as IB_WQT_SQ (send queue).
A WQ of type IB_WQT_RQ contains receive work requests.

PD is an attribute of a work queue (i.e. send/receive queue); it's used
by the hardware for security validation before scattering to a memory
region which is pointed to by the WQ. For that, an external WQ object
needs a PD, letting the hardware make that validation.

When accessing a memory region that is pointed to by the WQ, the WQ's PD
is used and not the QP's PD; this behavior is similar
to that of an SRQ and a QP.

WQ context is subject to well-defined state transitions done by
the modify_wq verb.
When WQ is created its initial state becomes IB_WQS_RESET.
From IB_WQS_RESET it can be modified to itself or to IB_WQS_RDY.
From IB_WQS_RDY it can be modified to itself, to IB_WQS_RESET
or to IB_WQS_ERR.
From IB_WQS_ERR it can be modified to IB_WQS_RESET.

Note: transition to IB_WQS_ERR might occur implicitly in case there
was some HW error.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
---
 drivers/infiniband/core/verbs.c | 82 +++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h         | 56 +++++++++++++++++++++++++++-
 2 files changed, 137 insertions(+), 1 deletion(-)

Comments

Ira Weiny May 20, 2016, 5:30 a.m. UTC | #1
On Sun, Apr 17, 2016 at 05:27:09PM +0300, Yishai Hadas wrote:
> Introduce Work Queue object and its create/destroy/modify verbs.
> 
> QP can be created without internal WQs "packaged" inside it,
> this QP can be configured to use "external" WQ object as its
> receive/send queue.
> WQ is a necessary component for RSS technology since RSS mechanism
> is supposed to distribute the traffic between multiple
> Receive Work Queues.

I'm confused by what a WQ actually is.  Does a QP contain a WQ ("'packaged'
inside it")?  Or is a set of WQ's associated with a single QP?  What is meant
by "internal" and "external" WQ?

Can a WQ be associated with more than 1 QP?  I'm thinking not, except
indirectly when it is associated with a single SRQ.

It looks like the user configures a set of WQs which will get wrs.  What types
of QPs can be associated with a IB_WQT_RQ?

Does the user post Recv WR's to the QP or the WQs?  Looks like to the QP/SRQ.
So are there ordering expectations here or can WRs posted to the QP get
processed out of order depending on which WQ they get sent to?  It seems that
then the user is responsible for dealing with out of order messages or
hopefully does not care?

Given the hash fields specified in the patch series and the information
discussed on the last verbs call it seems like only Raw Ethernet QPs are
supported.  Or can IPoIB UD QPs work as well.  If so how does a low level
driver know where to look for the IP headers?

Shouldn't the size of the indirection table determine the number of WQs or vice
versa?  It seems like the user has to do a lot of work here to make that
association.  What types of errors occur if the indirection table/hash
specifies a WQ which does not exist?

Maybe I'm just confused about the differences between the indirection table and
the hash function?

> 
> WQ associated (many to one) with Completion Queue and it owns WQ
> properties (PD, WQ size, etc.).
> WQ has a type, this patch introduces the IB_WQT_RQ (i.e.receive queue),
> it may be extend to others such as IB_WQT_SQ. (send queue).
> WQ from type IB_WQT_RQ contains receive work requests.
> 
> PD is an attribute of a work queue (i.e. send/receive queue), it's used
> by the hardware for security validation before scattering to a memory
> region which is pointed by the WQ. For that, an external WQ object
> needs a PD, letting the hardware makes that validation.
> 
> When accessing a memory region that is pointed by the WQ its PD
> is used and not the QP's PD, this behavior is similar
> to a SRQ and a QP.
> 
> WQ context is subject to a well-defined state transitions done by
> the modify_wq verb.
> When WQ is created its initial state becomes IB_WQS_RESET.
> From IB_WQS_RESET it can be modified to itself or to IB_WQS_RDY.
> From IB_WQS_RDY it can be modified to itself, to IB_WQS_RESET
> or to IB_WQS_ERR.
> From IB_WQS_ERR it can be modified to IB_WQS_RESET.
> 
> Note: transition to IB_WQS_ERR might occur implicitly in case there
> was some HW error.
> 
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> Signed-off-by: Matan Barak <matanb@mellanox.com>
> ---
>  drivers/infiniband/core/verbs.c | 82 +++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_verbs.h         | 56 +++++++++++++++++++++++++++-
>  2 files changed, 137 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> index 15b8adb..c6c5792 100644
> --- a/drivers/infiniband/core/verbs.c
> +++ b/drivers/infiniband/core/verbs.c
> @@ -1516,6 +1516,88 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
>  }
>  EXPORT_SYMBOL(ib_dealloc_xrcd);
>  
> +/**
> + * ib_create_wq - Creates a WQ associated with the specified protection
> + * domain.
> + * @pd: The protection domain associated with the WQ.
> + * @wq_init_attr: A list of initial attributes required to create the

Is this really a list of attributes?

Ira

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yishai Hadas May 23, 2016, 12:33 p.m. UTC | #2
On 5/20/2016 8:30 AM, ira.weiny wrote:
> On Sun, Apr 17, 2016 at 05:27:09PM +0300, Yishai Hadas wrote:
>> Introduce Work Queue object and its create/destroy/modify verbs.
>>
>> QP can be created without internal WQs "packaged" inside it,
>> this QP can be configured to use "external" WQ object as its
>> receive/send queue.
>> WQ is a necessary component for RSS technology since RSS mechanism
>> is supposed to distribute the traffic between multiple
>> Receive Work Queues.
>
> I'm confused by what a WQ actually is.  Does a QP contain a WQ ("'packaged'
> inside it")?  Or is a set of WQ's associated with a single QP?  What is meant
> by "internal" and "external" WQ?

Currently when a QP is created its RQ and SQ parts are created 
internally. A WQ is actually one of above (RQ/SQ) based on its type, 
however, it's given externally as part of the QP create API.
This series exposed IB_WQT_RQ, in the future we may add IB_WQT_SQ.

> Can a WQ be associated with more than 1 QP?  I'm thinking not, except
> indirectly when it is associated with a single SRQ.

This series enables setting WQ(s) by an indirection table to a QP, this 
indirection table can be associated with other QPs as well.

>
> It looks like the user configures a set of WQs which will get wrs.  What types
> of QPs can be associated with a IB_WQT_RQ?

This should be based on capabilities, please see cover letter as well. 
Currently in this series, mlx5 driver supports RAW_ETH_QP but it can be 
extended in the future to others such as UD QP.

> Does the user post Recv WR's to the QP or the WQs?  Looks like to the QP/SRQ.
> So are their ordering expectations here or can WRs posted to the QP get
> processed out of order depending on which WQ they get sent to?  It seems that
> then the user is responsible for dealing with out of order messages or
> hopefully does not care?

No, the user should post to a WQ which holds the memory that the HW 
scatters to.

>
> Given the hash fields specified in the patch series and the information
> discussed on the last verbs call it seems like only Raw Ethernet QPs are
> supported.  Or can IPoIB UD QPs work as well.  If so how does a low level
> driver know where to look for the IP headers?

As discussed in the last verbs call the hash attributes (fields, key, 
etc.) were moved to be vendor specific, this enables any vendor to get 
its specific properties to support different cases. Specifically for IPoIB, 
the HW should be able to detect the packet and to activate the RSS 
offload. Please look at V4 series for above change.

>
> Shouldn't the size of the indirection table determine the number of WQs or vice
> versa?  It seems like the user has to do a lot of work here to make that
> association.

Each WQ can be repeated in the indirection table so the number of 
different WQs can differ from the indirection table size.

The user should create WQs, usually it will be based on number of cores 
then create indirection table holding those WQs. It should be quite 
simple from user point of view to do that.

  What types of errors occur if the indirection table/hash
> specifies a WQ which does not exist?

The IB/uverbs layer will return -EINVAL please follow V4 which addressed 
that specifically.

> Maybe I'm just confused about the differences between the indirection table and
> the hash function?

To better understand the concept, please have a look at the URL below, 
which was also mentioned in the cover letter.
http://lxr.free-electrons.com/source/Documentation/networking/scaling.txt

>>
>> WQ associated (many to one) with Completion Queue and it owns WQ
>> properties (PD, WQ size, etc.).
>> WQ has a type, this patch introduces the IB_WQT_RQ (i.e.receive queue),
>> it may be extend to others such as IB_WQT_SQ. (send queue).
>> WQ from type IB_WQT_RQ contains receive work requests.
>>
>> PD is an attribute of a work queue (i.e. send/receive queue), it's used
>> by the hardware for security validation before scattering to a memory
>> region which is pointed by the WQ. For that, an external WQ object
>> needs a PD, letting the hardware makes that validation.
>>
>> When accessing a memory region that is pointed by the WQ its PD
>> is used and not the QP's PD, this behavior is similar
>> to a SRQ and a QP.
>>
>> WQ context is subject to a well-defined state transitions done by
>> the modify_wq verb.
>> When WQ is created its initial state becomes IB_WQS_RESET.
>> From IB_WQS_RESET it can be modified to itself or to IB_WQS_RDY.
>> From IB_WQS_RDY it can be modified to itself, to IB_WQS_RESET
>> or to IB_WQS_ERR.
>> From IB_WQS_ERR it can be modified to IB_WQS_RESET.
>>
>> Note: transition to IB_WQS_ERR might occur implicitly in case there
>> was some HW error.
>>
>> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
>> Signed-off-by: Matan Barak <matanb@mellanox.com>
>> ---
>>  drivers/infiniband/core/verbs.c | 82 +++++++++++++++++++++++++++++++++++++++++
>>  include/rdma/ib_verbs.h         | 56 +++++++++++++++++++++++++++-
>>  2 files changed, 137 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
>> index 15b8adb..c6c5792 100644
>> --- a/drivers/infiniband/core/verbs.c
>> +++ b/drivers/infiniband/core/verbs.c
>> @@ -1516,6 +1516,88 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
>>  }
>>  EXPORT_SYMBOL(ib_dealloc_xrcd);
>>
>> +/**
>> + * ib_create_wq - Creates a WQ associated with the specified protection
>> + * domain.
>> + * @pd: The protection domain associated with the WQ.
>> + * @wq_init_attr: A list of initial attributes required to create the
>
> Is this really a list of attributes?

Yes, it follows the qp_init_attr notation.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 15b8adb..c6c5792 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1516,6 +1516,88 @@  int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
 }
 EXPORT_SYMBOL(ib_dealloc_xrcd);
 
+/**
+ * ib_create_wq - Creates a WQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the WQ.
+ * @wq_attr: Initial attributes required to create the WQ. If WQ creation
+ * succeeds, the attributes are updated to the actual capabilities of the WQ.
+ *
+ * wq_attr->max_wr and wq_attr->max_sge determine the requested size of
+ * the WQ, and are set to the actual values allocated on return.
+ * If ib_create_wq() succeeds, then max_wr and max_sge will always be
+ * at least as large as the requested values.
+ * Return: the driver's create_wq() result (checked with IS_ERR()), or
+ * ERR_PTR(-ENOSYS) when the device does not provide create_wq.
+ */
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+			   struct ib_wq_init_attr *wq_attr)
+{
+	struct ib_wq *wq;
+
+	if (!pd->device->create_wq)
+		return ERR_PTR(-ENOSYS);
+
+	wq = pd->device->create_wq(pd, wq_attr, NULL);	/* NULL: no udata */
+	if (!IS_ERR(wq)) {
+		wq->event_handler = wq_attr->event_handler;
+		wq->wq_context = wq_attr->wq_context;
+		wq->wq_type = wq_attr->wq_type;
+		wq->cq = wq_attr->cq;
+		wq->device = pd->device;
+		wq->pd = pd;
+		wq->uobject = NULL;
+		atomic_inc(&pd->usecnt);	/* dropped again in ib_destroy_wq() */
+		atomic_inc(&wq_attr->cq->usecnt);	/* likewise for the CQ */
+		atomic_set(&wq->usecnt, 0);
+	}
+	return wq;
+}
+EXPORT_SYMBOL(ib_create_wq);
+
+/**
+ * ib_destroy_wq - Destroys the specified WQ.
+ * @wq: The WQ to destroy; fails with -EBUSY while wq->usecnt is non-zero.
+ */
+int ib_destroy_wq(struct ib_wq *wq)
+{
+	int err;
+	struct ib_cq *cq = wq->cq;	/* read before destroy_wq() is called */
+	struct ib_pd *pd = wq->pd;	/* NOTE(review): presumably wq is freed by the driver - confirm */
+
+	if (atomic_read(&wq->usecnt))
+		return -EBUSY;
+
+	err = wq->device->destroy_wq(wq);
+	if (!err) {	/* only drop the PD/CQ references once the driver succeeded */
+		atomic_dec(&pd->usecnt);
+		atomic_dec(&cq->usecnt);
+	}
+	return err;
+}
+EXPORT_SYMBOL(ib_destroy_wq);
+
+/**
+ * ib_modify_wq - Modifies the specified WQ.
+ * @wq: The WQ to modify.
+ * @wq_attr: On input, specifies the WQ attributes to modify. On output,
+ *   the current values of selected WQ attributes are returned.
+ * @wq_attr_mask: Bit-mask specifying which WQ attributes are being modified.
+ * Return: the driver's modify_wq() result, or -ENOSYS if it has no modify_wq.
+ */
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+		 u32 wq_attr_mask)
+{
+	int err;
+
+	if (!wq->device->modify_wq)
+		return -ENOSYS;
+
+	err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL);	/* NULL: no udata */
+	return err;
+}
+EXPORT_SYMBOL(ib_modify_wq);
+
 struct ib_flow *ib_create_flow(struct ib_qp *qp,
 			       struct ib_flow_attr *flow_attr,
 			       int domain)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index fb2cef4..5934f2d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1416,6 +1416,48 @@  struct ib_srq {
 	} ext;
 };
 
+enum ib_wq_type {
+	IB_WQT_RQ	/* receive queue: contains receive work requests */
+};
+
+enum ib_wq_state {
+	IB_WQS_RESET,	/* initial state of a created WQ; can go to RESET or RDY */
+	IB_WQS_RDY,	/* can go to RDY, RESET or ERR */
+	IB_WQS_ERR	/* can go to RESET; may be entered implicitly on a HW error */
+};
+
+struct ib_wq {
+	struct ib_device       *device;	/* set to pd->device by ib_create_wq() */
+	struct ib_uobject      *uobject;	/* NULL for WQs made by ib_create_wq() */
+	void		    *wq_context;	/* copied from ib_wq_init_attr */
+	void		    (*event_handler)(struct ib_event *, void *);
+	struct ib_pd	       *pd;	/* usecnt held while the WQ exists */
+	struct ib_cq	       *cq;	/* usecnt held while the WQ exists */
+	u32		wq_num;	/* not set by ib_create_wq(); presumably driver-assigned - confirm */
+	enum ib_wq_state       state;	/* not set by ib_create_wq(); presumably driver-maintained - confirm */
+	enum ib_wq_type	wq_type;
+	atomic_t		usecnt;	/* starts at 0; ib_destroy_wq() returns -EBUSY if non-zero */
+};
+
+struct ib_wq_init_attr {
+	void		       *wq_context;	/* copied into ib_wq.wq_context */
+	enum ib_wq_type	wq_type;
+	u32		max_wr;	/* requested WQ size; see ib_create_wq() kernel-doc */
+	u32		max_sge;	/* requested WQ size; see ib_create_wq() kernel-doc */
+	struct	ib_cq	       *cq;	/* CQ for the WQ; its usecnt is incremented on create */
+	void		    (*event_handler)(struct ib_event *, void *);
+};
+
+enum ib_wq_attr_mask {
+	IB_WQ_STATE	= 1 << 0,	/* presumably selects ib_wq_attr.wq_state - confirm */
+	IB_WQ_CUR_STATE	= 1 << 1,	/* presumably selects ib_wq_attr.curr_wq_state - confirm */
+};
+
+struct ib_wq_attr {
+	enum	ib_wq_state	wq_state;	/* presumably the requested new state - TODO confirm */
+	enum	ib_wq_state	curr_wq_state;	/* presumably the assumed current state - TODO confirm */
+};
+
 struct ib_qp {
 	struct ib_device       *device;
 	struct ib_pd	       *pd;
@@ -1878,7 +1920,14 @@  struct ib_device {
 						   struct ifla_vf_stats *stats);
 	int			   (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid,
 						  int type);
-
+	struct ib_wq *		   (*create_wq)(struct ib_pd *pd,
+						struct ib_wq_init_attr *init_attr,
+						struct ib_udata *udata);
+	int			   (*destroy_wq)(struct ib_wq *wq);
+	int			   (*modify_wq)(struct ib_wq *wq,
+						struct ib_wq_attr *attr,
+						u32 wq_attr_mask,
+						struct ib_udata *udata);
 	struct ib_dma_mapping_ops   *dma_ops;
 
 	struct module               *owner;
@@ -3110,6 +3159,11 @@  int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
 					    u16 pkey, const union ib_gid *gid,
 					    const struct sockaddr *addr);
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+			   struct ib_wq_init_attr *init_attr);
+int ib_destroy_wq(struct ib_wq *wq);
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr,
+		 u32 wq_attr_mask);
 
 int ib_map_mr_sg(struct ib_mr *mr,
 		 struct scatterlist *sg,