diff mbox series

[v6,03/25] rtrs: private headers with rtrs protocol structs and helpers

Message ID 20191230102942.18395-4-jinpuwang@gmail.com (mailing list archive)
State New, archived
Headers show
Series RTRS (former IBTRS) rdma transport library and RNBD (former IBNBD) rdma network block device | expand

Commit Message

Jinpu Wang Dec. 30, 2019, 10:29 a.m. UTC
From: Jack Wang <jinpu.wang@cloud.ionos.com>

These are common private headers with rtrs protocol structures,
logging, sysfs and other helper functions, which are used on
both client and server sides.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/infiniband/ulp/rtrs/rtrs-log.h |  32 ++
 drivers/infiniband/ulp/rtrs/rtrs-pri.h | 408 +++++++++++++++++++++++++
 2 files changed, 440 insertions(+)
 create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-log.h
 create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-pri.h

Comments

Bart Van Assche Dec. 30, 2019, 7:48 p.m. UTC | #1
On 2019-12-30 02:29, Jack Wang wrote:
> + * InfiniBand Transport Layer

Is RTRS an InfiniBand or an RDMA transport layer?

> +#define rtrs_prefix(obj) (obj->sessname)

Is it really worth it to introduce a macro for accessing a single member
of a single pointer?

> + * InfiniBand Transport Layer

Same question here: is RTRS an InfiniBand or an RDMA transport layer?

> +enum {
> +	SERVICE_CON_QUEUE_DEPTH = 512,

What is a service connection?

> +	/*
> +	 * With the current size of the tag allocated on the client, 4K
> +	 * is the maximum number of tags we can allocate.  This number is
> +	 * also used on the client to allocate the IU for the user connection
> +	 * to receive the RDMA addresses from the server.
> +	 */

What does the word 'tag' mean in the context of the RTRS protocol?

> +struct rtrs_ib_dev;

What does the "rtrs_ib_dev" data structure represent? Additionally, I
think it's confusing that a single name has an "r" that refers to "RDMA"
and "ib" that refers to InfiniBand.

> +struct rtrs_ib_dev_pool {
> +	struct mutex		mutex;
> +	struct list_head	list;
> +	enum ib_pd_flags	pd_flags;
> +	const struct rtrs_ib_dev_pool_ops *ops;
> +};

What is the purpose of an rtrs_ib_dev_pool and what does it contain?

> +struct rtrs_iu {

A comment that explains what the "iu" abbreviation stands for would be
welcome.

> +/**
> + * enum rtrs_msg_types - RTRS message types.
> + * @RTRS_MSG_INFO_REQ:		Client additional info request to the server
> + * @RTRS_MSG_INFO_RSP:		Server additional info response to the client
> + * @RTRS_MSG_WRITE:		Client writes data per RDMA to server
> + * @RTRS_MSG_READ:		Client requests data transfer from server
> + * @RTRS_MSG_RKEY_RSP:		Server refreshed rkey for rbuf
> + */

What is "additional info" in this context?

> +/**
> + * struct rtrs_msg_conn_req - Client connection request to the server
> + * @magic:	   RTRS magic
> + * @version:	   RTRS protocol version
> + * @cid:	   Current connection id
> + * @cid_num:	   Number of connections per session
> + * @recon_cnt:	   Reconnections counter
> + * @sess_uuid:	   UUID of a session (path)
> + * @paths_uuid:	   UUID of a group of sessions (paths)
> + *
> + * NOTE: max size 56 bytes, see man rdma_connect().
> + */
> +struct rtrs_msg_conn_req {
> +	u8		__cma_version; /* Is set to 0 by cma.c in case of
> +					* AF_IB, do not touch that.
> +					*/
> +	u8		__ip_version;  /* On sender side that should be
> +					* set to 0, or cma_save_ip_info()
> +					* extract garbage and will fail.
> +					*/

The above two fields and the comments next to it look suspicious to me.
Does RTRS perhaps try to generate CMA-formatted messages without using
the CMA to format these messages?

> +	u8		reserved[12];

Please leave out the reserved data. If future versions of the protocol
would need any of these bytes it is easy to add more data to this structure.

> +/**
> + * struct rtrs_msg_conn_rsp - Server connection response to the client
> + * @magic:	   RTRS magic
> + * @version:	   RTRS protocol version
> + * @errno:	   If rdma_accept() then 0, if rdma_reject() indicates error
> + * @queue_depth:   max inflight messages (queue-depth) in this session
> + * @max_io_size:   max io size server supports
> + * @max_hdr_size:  max msg header size server supports
> + *
> + * NOTE: size is 56 bytes, max possible is 136 bytes, see man rdma_accept().
> + */
> +struct rtrs_msg_conn_rsp {
> +	__le16		magic;
> +	__le16		version;
> +	__le16		errno;
> +	__le16		queue_depth;
> +	__le32		max_io_size;
> +	__le32		max_hdr_size;
> +	__le32		flags;
> +	u8		reserved[36];
> +};

Same comment here: please leave out the "reserved[]" array. Sending a
bunch of zero-bytes at the end of a message over the wire is not useful.

> +static inline void rtrs_from_imm(u32 imm, u32 *type, u32 *payload)
> +{
> +	*payload = (imm & MAX_IMM_PAYL_MASK);
> +	*type = (imm >> MAX_IMM_PAYL_BITS);
> +}

Please do not use parentheses when not necessary. Such superfluous
parentheses namely hurt readability of the code.

> +	type = (w_inval ? RTRS_IO_RSP_W_INV_IMM : RTRS_IO_RSP_IMM);

Same comment here: I think the parentheses can be left out from the
above statement.

> +static inline void rtrs_from_io_rsp_imm(u32 payload, u32 *msg_id, int *errno)
> +{
> +	/* 9 bits for errno, 19 bits for msg_id */
> +	*msg_id = (payload & 0x7ffff);

Are the parentheses in the above expression necessary?

> +	*errno = -(int)((payload >> 19) & 0x1ff);

Is the '(int)' cast useful in the above expression? Can it be left out?

> +#define STAT_ATTR(type, stat, print, reset)				\
> +STAT_STORE_FUNC(type, stat, reset)					\
> +STAT_SHOW_FUNC(type, stat, print)					\
> +static struct kobj_attribute stat##_attr =				\
> +		__ATTR(stat, 0644,					\
> +		       stat##_show,					\
> +		       stat##_store)

Is the above use of __ATTR() perhaps an open-coded version of __ATTR_RW()?

Thanks,

Bart.
Bart Van Assche Dec. 31, 2019, 12:07 a.m. UTC | #2
On 2019-12-30 02:29, Jack Wang wrote:
> +static inline u32 rtrs_to_io_rsp_imm(u32 msg_id, int errno, bool w_inval)
> +{
> +	enum rtrs_imm_type type;
> +	u32 payload;
> +
> +	/* 9 bits for errno, 19 bits for msg_id */
> +	payload = (abs(errno) & 0x1ff) << 19 | (msg_id & 0x7ffff);
> +	type = (w_inval ? RTRS_IO_RSP_W_INV_IMM : RTRS_IO_RSP_IMM);
> +
> +	return rtrs_to_imm(type, payload);
> +}
> +
> +static inline void rtrs_from_io_rsp_imm(u32 payload, u32 *msg_id, int *errno)
> +{
> +	/* 9 bits for errno, 19 bits for msg_id */
> +	*msg_id = (payload & 0x7ffff);
> +	*errno = -(int)((payload >> 19) & 0x1ff);
> +}

The above comments mention that 19 bits are used for msg_id. The 0x7ffff
mask however has 23 bits set. Did I see that correctly? If so, does that
mean that the errno and msg_id bitfields overlap partially?

Thanks,

Bart.
Jinpu Wang Jan. 2, 2020, 3:27 p.m. UTC | #3
On Mon, Dec 30, 2019 at 8:48 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 2019-12-30 02:29, Jack Wang wrote:
> > + * InfiniBand Transport Layer
>
> Is RTRS an InfiniBand or an RDMA transport layer?
The later,  will fix.
>
> > +#define rtrs_prefix(obj) (obj->sessname)
>
> Is it really worth it to introduce a macro for accessing a single member
> of a single pointer?
maybe not, will remove.
>
> > + * InfiniBand Transport Layer
>
> Same question here: is RTRS an InfiniBand or an RDMA transport layer?
will fix.

>
> > +enum {
> > +     SERVICE_CON_QUEUE_DEPTH = 512,
>
> What is a service connection?
s/SERVICE_CON_QUEUE_DEPTH/CON_QUEUE_DEPTH/g, do you think
CON_QUEUE_DEPTH is better or just QUEUE_DEPTH?
>
> > +     /*
> > +      * With the current size of the tag allocated on the client, 4K
> > +      * is the maximum number of tags we can allocate.  This number is
> > +      * also used on the client to allocate the IU for the user connection
> > +      * to receive the RDMA addresses from the server.
> > +      */
>
> What does the word 'tag' mean in the context of the RTRS protocol?
should be struct rtrs_permit, will fix.
>
> > +struct rtrs_ib_dev;
>
> What does the "rtrs_ib_dev" data structure represent? Additionally, I
> think it's confusing that a single name has an "r" that refers to "RDMA"
> and "ib" that refers to InfiniBand.
Naming is hard, it's structure mainly to cache struct ib_device
pointer and ib_pd pointer.
more info can be found below, Roman did try to push it to upstream,
see comment below.
>
> > +struct rtrs_ib_dev_pool {
> > +     struct mutex            mutex;
> > +     struct list_head        list;
> > +     enum ib_pd_flags        pd_flags;
> > +     const struct rtrs_ib_dev_pool_ops *ops;
> > +};
>
> What is the purpose of an rtrs_ib_dev_pool and what does it contain?
The idea was documented in the patchset here:
https://www.spinics.net/lists/linux-rdma/msg64025.html
"'
This is an attempt to make a device pool API out of a common code,
which caches pair of ib_device and ib_pd pointers. I found 4 places,
where this common functionality can be replaced by some lib calls:
nvme, nvmet, iser and isert. Total deduplication gain in loc is not
quite significant, but eventually new ULP IB code can also require
the same device/pd pair cache, e.g. in our IBTRS module the same
code has to be repeated again, which was observed by Sagi and he
suggested to make a common helper function instead of producing
another copy.
'''

>
> > +struct rtrs_iu {
>
> A comment that explains what the "iu" abbreviation stands for would be
> welcome.
ok.
>
> > +/**
> > + * enum rtrs_msg_types - RTRS message types.
> > + * @RTRS_MSG_INFO_REQ:               Client additional info request to the server
> > + * @RTRS_MSG_INFO_RSP:               Server additional info response to the client
> > + * @RTRS_MSG_WRITE:          Client writes data per RDMA to server
> > + * @RTRS_MSG_READ:           Client requests data transfer from server
> > + * @RTRS_MSG_RKEY_RSP:               Server refreshed rkey for rbuf
> > + */
>
> What is "additional info" in this context?
We have a bit more documentation in rtrs/README in patch 14,
"'
3. After all connections of a path are established client sends to server the
RTRS_MSG_INFO_REQ message, containing the name of the session. This message
requests the address information from the server.

4. Server replies to the session info request message with RTRS_MSG_INFO_RSP,
which contains the addresses and keys of the RDMA buffers allocated for that
session.
"'
>
> > +/**
> > + * struct rtrs_msg_conn_req - Client connection request to the server
> > + * @magic:      RTRS magic
> > + * @version:    RTRS protocol version
> > + * @cid:        Current connection id
> > + * @cid_num:    Number of connections per session
> > + * @recon_cnt:          Reconnections counter
> > + * @sess_uuid:          UUID of a session (path)
> > + * @paths_uuid:         UUID of a group of sessions (paths)
> > + *
> > + * NOTE: max size 56 bytes, see man rdma_connect().
> > + */
> > +struct rtrs_msg_conn_req {
> > +     u8              __cma_version; /* Is set to 0 by cma.c in case of
> > +                                     * AF_IB, do not touch that.
> > +                                     */
> > +     u8              __ip_version;  /* On sender side that should be
> > +                                     * set to 0, or cma_save_ip_info()
> > +                                     * extract garbage and will fail.
> > +                                     */
>
> The above two fields and the comments next to it look suspicious to me.
> Does RTRS perhaps try to generate CMA-formatted messages without using
> the CMA to format these messages?
The problem is in cma_format_hdr over-writes the first byte for AF_IB
https://www.spinics.net/lists/linux-rdma/msg22397.html

No one fixes the problem since then.

>
> > +     u8              reserved[12];
>
> Please leave out the reserved data. If future versions of the protocol
> would need any of these bytes it is easy to add more data to this structure.
Sorry, we can't do that, as I explained in the past, we have code
running in production and
there are checks expecting the size the of message are the same, it
will make the transition
to upstream version in the future a lot harder if we change the size
of the controll message.
>
> > +/**
> > + * struct rtrs_msg_conn_rsp - Server connection response to the client
> > + * @magic:      RTRS magic
> > + * @version:    RTRS protocol version
> > + * @errno:      If rdma_accept() then 0, if rdma_reject() indicates error
> > + * @queue_depth:   max inflight messages (queue-depth) in this session
> > + * @max_io_size:   max io size server supports
> > + * @max_hdr_size:  max msg header size server supports
> > + *
> > + * NOTE: size is 56 bytes, max possible is 136 bytes, see man rdma_accept().
> > + */
> > +struct rtrs_msg_conn_rsp {
> > +     __le16          magic;
> > +     __le16          version;
> > +     __le16          errno;
> > +     __le16          queue_depth;
> > +     __le32          max_io_size;
> > +     __le32          max_hdr_size;
> > +     __le32          flags;
> > +     u8              reserved[36];
> > +};
>
> Same comment here: please leave out the "reserved[]" array. Sending a
> bunch of zero-bytes at the end of a message over the wire is not useful.
same here.
>
> > +static inline void rtrs_from_imm(u32 imm, u32 *type, u32 *payload)
> > +{
> > +     *payload = (imm & MAX_IMM_PAYL_MASK);
> > +     *type = (imm >> MAX_IMM_PAYL_BITS);
> > +}
>
> Please do not use parentheses when not necessary. Such superfluous
> parentheses namely hurt readability of the code.
ok, will remove.
>
> > +     type = (w_inval ? RTRS_IO_RSP_W_INV_IMM : RTRS_IO_RSP_IMM);
>
> Same comment here: I think the parentheses can be left out from the
> above statement.
ok, will remove
>
> > +static inline void rtrs_from_io_rsp_imm(u32 payload, u32 *msg_id, int *errno)
> > +{
> > +     /* 9 bits for errno, 19 bits for msg_id */
> > +     *msg_id = (payload & 0x7ffff);
>
> Are the parentheses in the above expression necessary?
will remove.
>
> > +     *errno = -(int)((payload >> 19) & 0x1ff);
>
> Is the '(int)' cast useful in the above expression? Can it be left out?
I think it's necessary, and make it more clear errno is a negative int
value, isn't it?

>
> > +#define STAT_ATTR(type, stat, print, reset)                          \
> > +STAT_STORE_FUNC(type, stat, reset)                                   \
> > +STAT_SHOW_FUNC(type, stat, print)                                    \
> > +static struct kobj_attribute stat##_attr =                           \
> > +             __ATTR(stat, 0644,                                      \
> > +                    stat##_show,                                     \
> > +                    stat##_store)
>
> Is the above use of __ATTR() perhaps an open-coded version of __ATTR_RW()?
right, will use __ATTR_RW() instead.
>
> Thanks,
>
> Bart.
Thanks Bart!
Bart Van Assche Jan. 2, 2020, 5 p.m. UTC | #4
On 1/2/20 7:27 AM, Jinpu Wang wrote:
> On Mon, Dec 30, 2019 at 8:48 PM Bart Van Assche <bvanassche@acm.org> wrote:
>> On 2019-12-30 02:29, Jack Wang wrote:
>>> +enum {
>>> +     SERVICE_CON_QUEUE_DEPTH = 512,
>>
>> What is a service connection?
> s/SERVICE_CON_QUEUE_DEPTH/CON_QUEUE_DEPTH/g, do you think
> CON_QUEUE_DEPTH is better or just QUEUE_DEPTH?

The name of the constant is fine, but what I meant is the following: has 
it been documented anywhere what the role of a "service connection" is?

>>> +struct rtrs_ib_dev_pool {
>>> +     struct mutex            mutex;
>>> +     struct list_head        list;
>>> +     enum ib_pd_flags        pd_flags;
>>> +     const struct rtrs_ib_dev_pool_ops *ops;
>>> +};
>>
>> What is the purpose of an rtrs_ib_dev_pool and what does it contain?
> The idea was documented in the patchset here:
> https://www.spinics.net/lists/linux-rdma/msg64025.html
> "'
> This is an attempt to make a device pool API out of a common code,
> which caches pair of ib_device and ib_pd pointers. I found 4 places,
> where this common functionality can be replaced by some lib calls:
> nvme, nvmet, iser and isert. Total deduplication gain in loc is not
> quite significant, but eventually new ULP IB code can also require
> the same device/pd pair cache, e.g. in our IBTRS module the same
> code has to be repeated again, which was observed by Sagi and he
> suggested to make a common helper function instead of producing
> another copy.
> '''

The word "pool" suggest ownership. Since struct rtrs_ib_dev_pool owns 
protection domains instead of RDMA devices, how about renaming that data 
structure into rtrs_pd_per_rdma_dev, rtrs_rdma_dev_pd or something 
similar? How about adding a comment like the following above that data 
structure?

/*
  * Data structure used to associate one protection domain (PD) with each
  * RDMA device.
  */

>>> +/**
>>> + * struct rtrs_msg_conn_req - Client connection request to the server
>>> + * @magic:      RTRS magic
>>> + * @version:    RTRS protocol version
>>> + * @cid:        Current connection id
>>> + * @cid_num:    Number of connections per session
>>> + * @recon_cnt:          Reconnections counter
>>> + * @sess_uuid:          UUID of a session (path)
>>> + * @paths_uuid:         UUID of a group of sessions (paths)
>>> + *
>>> + * NOTE: max size 56 bytes, see man rdma_connect().
>>> + */
>>> +struct rtrs_msg_conn_req {
>>> +     u8              __cma_version; /* Is set to 0 by cma.c in case of
>>> +                                     * AF_IB, do not touch that.
>>> +                                     */
>>> +     u8              __ip_version;  /* On sender side that should be
>>> +                                     * set to 0, or cma_save_ip_info()
>>> +                                     * extract garbage and will fail.
>>> +                                     */
>>
>> The above two fields and the comments next to it look suspicious to me.
>> Does RTRS perhaps try to generate CMA-formatted messages without using
>> the CMA to format these messages?
> The problem is in cma_format_hdr over-writes the first byte for AF_IB
> https://www.spinics.net/lists/linux-rdma/msg22397.html
> 
> No one fixes the problem since then.

How about adding that URL to the comment block above struct 
rtrs_msg_conn_req?

>>
>>> +     *errno = -(int)((payload >> 19) & 0x1ff);
>>
>> Is the '(int)' cast useful in the above expression? Can it be left out?
> I think it's necessary, and make it more clear errno is a negative int
> value, isn't it?

According to the C standard operations on unsigned integers "wrap 
around" so removing the cast should be safe. Anyway, this is not 
something I consider important.

Thanks,

Bart.
Jason Gunthorpe Jan. 2, 2020, 6:26 p.m. UTC | #5
On Thu, Jan 02, 2020 at 09:00:53AM -0800, Bart Van Assche wrote:

> > > > +/**
> > > > + * struct rtrs_msg_conn_req - Client connection request to the server
> > > > + * @magic:      RTRS magic
> > > > + * @version:    RTRS protocol version
> > > > + * @cid:        Current connection id
> > > > + * @cid_num:    Number of connections per session
> > > > + * @recon_cnt:          Reconnections counter
> > > > + * @sess_uuid:          UUID of a session (path)
> > > > + * @paths_uuid:         UUID of a group of sessions (paths)
> > > > + *
> > > > + * NOTE: max size 56 bytes, see man rdma_connect().
> > > > + */
> > > > +struct rtrs_msg_conn_req {
> > > > +     u8              __cma_version; /* Is set to 0 by cma.c in case of
> > > > +                                     * AF_IB, do not touch that.
> > > > +                                     */
> > > > +     u8              __ip_version;  /* On sender side that should be
> > > > +                                     * set to 0, or cma_save_ip_info()
> > > > +                                     * extract garbage and will fail.
> > > > +                                     */
> > > 
> > > The above two fields and the comments next to it look suspicious to me.
> > > Does RTRS perhaps try to generate CMA-formatted messages without using
> > > the CMA to format these messages?
> > The problem is in cma_format_hdr over-writes the first byte for AF_IB
> > https://www.spinics.net/lists/linux-rdma/msg22397.html
> > 
> > No one fixes the problem since then.
> 
> How about adding that URL to the comment block above struct
> rtrs_msg_conn_req?

Or just fixing whatever the problem is..

Jason
Jinpu Wang Jan. 3, 2020, 12:27 p.m. UTC | #6
On Thu, Jan 2, 2020 at 6:00 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 1/2/20 7:27 AM, Jinpu Wang wrote:
> > On Mon, Dec 30, 2019 at 8:48 PM Bart Van Assche <bvanassche@acm.org> wrote:
> >> On 2019-12-30 02:29, Jack Wang wrote:
> >>> +enum {
> >>> +     SERVICE_CON_QUEUE_DEPTH = 512,
> >>
> >> What is a service connection?
> > s/SERVICE_CON_QUEUE_DEPTH/CON_QUEUE_DEPTH/g, do you think
> > CON_QUEUE_DEPTH is better or just QUEUE_DEPTH?
>
> The name of the constant is fine, but what I meant is the following: has
> it been documented anywhere what the role of a "service connection" is?
ah, get your point now, will add a comment before the constant.
>
> >>> +struct rtrs_ib_dev_pool {
> >>> +     struct mutex            mutex;
> >>> +     struct list_head        list;
> >>> +     enum ib_pd_flags        pd_flags;
> >>> +     const struct rtrs_ib_dev_pool_ops *ops;
> >>> +};
> >>
> >> What is the purpose of an rtrs_ib_dev_pool and what does it contain?
> > The idea was documented in the patchset here:
> > https://www.spinics.net/lists/linux-rdma/msg64025.html
> > "'
> > This is an attempt to make a device pool API out of a common code,
> > which caches pair of ib_device and ib_pd pointers. I found 4 places,
> > where this common functionality can be replaced by some lib calls:
> > nvme, nvmet, iser and isert. Total deduplication gain in loc is not
> > quite significant, but eventually new ULP IB code can also require
> > the same device/pd pair cache, e.g. in our IBTRS module the same
> > code has to be repeated again, which was observed by Sagi and he
> > suggested to make a common helper function instead of producing
> > another copy.
> > '''
>
> The word "pool" suggest ownership. Since struct rtrs_ib_dev_pool owns
> protection domains instead of RDMA devices, how about renaming that data
> structure into rtrs_pd_per_rdma_dev, rtrs_rdma_dev_pd or something
> similar? How about adding a comment like the following above that data
> structure?
rtrs_rdma_dev_pd sounds better to me, will also add the comments.
>
> /*
>   * Data structure used to associate one protection domain (PD) with each
>   * RDMA device.
>   */
>
> >>> +/**
> >>> + * struct rtrs_msg_conn_req - Client connection request to the server
> >>> + * @magic:      RTRS magic
> >>> + * @version:    RTRS protocol version
> >>> + * @cid:        Current connection id
> >>> + * @cid_num:    Number of connections per session
> >>> + * @recon_cnt:          Reconnections counter
> >>> + * @sess_uuid:          UUID of a session (path)
> >>> + * @paths_uuid:         UUID of a group of sessions (paths)
> >>> + *
> >>> + * NOTE: max size 56 bytes, see man rdma_connect().
> >>> + */
> >>> +struct rtrs_msg_conn_req {
> >>> +     u8              __cma_version; /* Is set to 0 by cma.c in case of
> >>> +                                     * AF_IB, do not touch that.
> >>> +                                     */
> >>> +     u8              __ip_version;  /* On sender side that should be
> >>> +                                     * set to 0, or cma_save_ip_info()
> >>> +                                     * extract garbage and will fail.
> >>> +                                     */
> >>
> >> The above two fields and the comments next to it look suspicious to me.
> >> Does RTRS perhaps try to generate CMA-formatted messages without using
> >> the CMA to format these messages?
> > The problem is in cma_format_hdr over-writes the first byte for AF_IB
> > https://www.spinics.net/lists/linux-rdma/msg22397.html
> >
> > No one fixes the problem since then.
>
> How about adding that URL to the comment block above struct
> rtrs_msg_conn_req?
Ok

Thanks
Jinpu Wang Jan. 3, 2020, 12:31 p.m. UTC | #7
On Thu, Jan 2, 2020 at 7:26 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Jan 02, 2020 at 09:00:53AM -0800, Bart Van Assche wrote:
>
> > > > > +/**
> > > > > + * struct rtrs_msg_conn_req - Client connection request to the server
> > > > > + * @magic:      RTRS magic
> > > > > + * @version:    RTRS protocol version
> > > > > + * @cid:        Current connection id
> > > > > + * @cid_num:    Number of connections per session
> > > > > + * @recon_cnt:          Reconnections counter
> > > > > + * @sess_uuid:          UUID of a session (path)
> > > > > + * @paths_uuid:         UUID of a group of sessions (paths)
> > > > > + *
> > > > > + * NOTE: max size 56 bytes, see man rdma_connect().
> > > > > + */
> > > > > +struct rtrs_msg_conn_req {
> > > > > +     u8              __cma_version; /* Is set to 0 by cma.c in case of
> > > > > +                                     * AF_IB, do not touch that.
> > > > > +                                     */
> > > > > +     u8              __ip_version;  /* On sender side that should be
> > > > > +                                     * set to 0, or cma_save_ip_info()
> > > > > +                                     * extract garbage and will fail.
> > > > > +                                     */
> > > >
> > > > The above two fields and the comments next to it look suspicious to me.
> > > > Does RTRS perhaps try to generate CMA-formatted messages without using
> > > > the CMA to format these messages?
> > > The problem is in cma_format_hdr over-writes the first byte for AF_IB
> > > https://www.spinics.net/lists/linux-rdma/msg22397.html
> > >
> > > No one fixes the problem since then.
> >
> > How about adding that URL to the comment block above struct
> > rtrs_msg_conn_req?
>
> Or just fixing whatever the problem is..
>
> Jason
I can do another try if no one else fix the problem, but I have plenty of to-do.

Thanks
Jinpu Wang Jan. 3, 2020, 1:48 p.m. UTC | #8
On Tue, Dec 31, 2019 at 1:07 AM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 2019-12-30 02:29, Jack Wang wrote:
> > +static inline u32 rtrs_to_io_rsp_imm(u32 msg_id, int errno, bool w_inval)
> > +{
> > +     enum rtrs_imm_type type;
> > +     u32 payload;
> > +
> > +     /* 9 bits for errno, 19 bits for msg_id */
> > +     payload = (abs(errno) & 0x1ff) << 19 | (msg_id & 0x7ffff);
> > +     type = (w_inval ? RTRS_IO_RSP_W_INV_IMM : RTRS_IO_RSP_IMM);
> > +
> > +     return rtrs_to_imm(type, payload);
> > +}
> > +
> > +static inline void rtrs_from_io_rsp_imm(u32 payload, u32 *msg_id, int *errno)
> > +{
> > +     /* 9 bits for errno, 19 bits for msg_id */
> > +     *msg_id = (payload & 0x7ffff);
> > +     *errno = -(int)((payload >> 19) & 0x1ff);
> > +}
>
> The above comments mention that 19 bits are used for msg_id. The 0x7ffff
> mask however has 23 bits set. Did I see that correctly? If so, does that
> mean that the errno and msg_id bitfields overlap partially?
Double checked with calculator 0x7ffff is 19 bits set, not 23 bits :)
>
> Thanks,
>
> Bart.
diff mbox series

Patch

diff --git a/drivers/infiniband/ulp/rtrs/rtrs-log.h b/drivers/infiniband/ulp/rtrs/rtrs-log.h
new file mode 100644
index 000000000000..570329a73ee4
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-log.h
@@ -0,0 +1,32 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ *
+ * Copyright (c) 2019 1&1 IONOS SE. All rights reserved.
+ */
+#ifndef RTRS_LOG_H
+#define RTRS_LOG_H
+
+#define rtrs_prefix(obj) (obj->sessname)
+
+#define rtrs_log(fn, obj, fmt, ...)				\
+	fn("<%s>: " fmt, rtrs_prefix(obj), ##__VA_ARGS__)
+
+#define rtrs_err(obj, fmt, ...)	\
+	rtrs_log(pr_err, obj, fmt, ##__VA_ARGS__)
+#define rtrs_err_rl(obj, fmt, ...)	\
+	rtrs_log(pr_err_ratelimited, obj, fmt, ##__VA_ARGS__)
+#define rtrs_wrn(obj, fmt, ...)	\
+	rtrs_log(pr_warn, obj, fmt, ##__VA_ARGS__)
+#define rtrs_wrn_rl(obj, fmt, ...) \
+	rtrs_log(pr_warn_ratelimited, obj, fmt, ##__VA_ARGS__)
+#define rtrs_info(obj, fmt, ...) \
+	rtrs_log(pr_info, obj, fmt, ##__VA_ARGS__)
+#define rtrs_info_rl(obj, fmt, ...) \
+	rtrs_log(pr_info_ratelimited, obj, fmt, ##__VA_ARGS__)
+
+#endif /* RTRS_LOG_H */
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h
new file mode 100644
index 000000000000..f215e6c0ce73
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h
@@ -0,0 +1,408 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ *
+ * Copyright (c) 2019 1&1 IONOS SE. All rights reserved.
+ */
+
+#ifndef RTRS_PRI_H
+#define RTRS_PRI_H
+
+#include <linux/uuid.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib.h>
+
+#include "rtrs.h"
+
+#define RTRS_PROTO_VER_MAJOR 2
+#define RTRS_PROTO_VER_MINOR 0
+
+#define RTRS_PROTO_VER_STRING __stringify(RTRS_PROTO_VER_MAJOR) "." \
+			       __stringify(RTRS_PROTO_VER_MINOR)
+
+enum rtrs_imm_const {
+	MAX_IMM_TYPE_BITS = 4,
+	MAX_IMM_TYPE_MASK = ((1 << MAX_IMM_TYPE_BITS) - 1),
+	MAX_IMM_PAYL_BITS = 28,
+	MAX_IMM_PAYL_MASK = ((1 << MAX_IMM_PAYL_BITS) - 1),
+};
+
+enum rtrs_imm_type {
+	RTRS_IO_REQ_IMM       = 0, /* client to server */
+	RTRS_IO_RSP_IMM       = 1, /* server to client */
+	RTRS_IO_RSP_W_INV_IMM = 2, /* server to client */
+
+	RTRS_HB_MSG_IMM = 8,
+	RTRS_HB_ACK_IMM = 9,
+
+	RTRS_LAST_IMM,
+};
+
+enum {
+	SERVICE_CON_QUEUE_DEPTH = 512,
+
+	MIN_RTR_CNT = 1,
+	MAX_RTR_CNT = 7,
+
+	MAX_PATHS_NUM = 128,
+
+	/*
+	 * With the current size of the tag allocated on the client, 4K
+	 * is the maximum number of tags we can allocate.  This number is
+	 * also used on the client to allocate the IU for the user connection
+	 * to receive the RDMA addresses from the server.
+	 */
+	MAX_SESS_QUEUE_DEPTH = 4096,
+
+	RTRS_HB_INTERVAL_MS = 5000,
+	RTRS_HB_MISSED_MAX = 5,
+
+	RTRS_MAGIC = 0x1BBD,
+	RTRS_PROTO_VER = (RTRS_PROTO_VER_MAJOR << 8) | RTRS_PROTO_VER_MINOR,
+};
+
+struct rtrs_ib_dev;
+
+struct rtrs_ib_dev_pool_ops {
+	struct rtrs_ib_dev *(*alloc)(void);
+	void (*free)(struct rtrs_ib_dev *dev);
+	int (*init)(struct rtrs_ib_dev *dev);
+	void (*deinit)(struct rtrs_ib_dev *dev);
+};
+
+struct rtrs_ib_dev_pool {
+	struct mutex		mutex;
+	struct list_head	list;
+	enum ib_pd_flags	pd_flags;
+	const struct rtrs_ib_dev_pool_ops *ops;
+};
+
+struct rtrs_ib_dev {
+	struct ib_device	 *ib_dev;
+	struct ib_pd		 *ib_pd;
+	struct kref		 ref;
+	struct list_head	 entry;
+	struct rtrs_ib_dev_pool *pool;
+};
+
+struct rtrs_con {
+	struct rtrs_sess	*sess;
+	struct ib_qp		*qp;
+	struct ib_cq		*cq;
+	struct rdma_cm_id	*cm_id;
+	unsigned int		cid;
+};
+
+typedef void (rtrs_hb_handler_t)(struct rtrs_con *con);
+
+struct rtrs_sess {
+	struct list_head	entry;
+	struct sockaddr_storage dst_addr;
+	struct sockaddr_storage src_addr;
+	char			sessname[NAME_MAX];
+	uuid_t			uuid;
+	struct rtrs_con	**con;
+	unsigned int		con_num;
+	unsigned int		recon_cnt;
+	struct rtrs_ib_dev	*dev;
+	int			dev_ref;
+	struct ib_cqe		*hb_cqe;
+	rtrs_hb_handler_t	*hb_err_handler;
+	struct workqueue_struct *hb_wq;
+	struct delayed_work	hb_dwork;
+	unsigned int		hb_interval_ms;
+	unsigned int		hb_missed_cnt;
+	unsigned int		hb_missed_max;
+};
+
+struct rtrs_iu {
+	struct list_head        list;
+	struct ib_cqe           cqe;
+	dma_addr_t              dma_addr;
+	void                    *buf;
+	size_t                  size;
+	enum dma_data_direction direction;
+};
+
+/**
+ * enum rtrs_msg_types - RTRS message types.
+ * @RTRS_MSG_INFO_REQ:		Client additional info request to the server
+ * @RTRS_MSG_INFO_RSP:		Server additional info response to the client
+ * @RTRS_MSG_WRITE:		Client writes data per RDMA to server
+ * @RTRS_MSG_READ:		Client requests data transfer from server
+ * @RTRS_MSG_RKEY_RSP:		Server refreshed rkey for rbuf
+ */
+enum rtrs_msg_types {
+	RTRS_MSG_INFO_REQ,
+	RTRS_MSG_INFO_RSP,
+	RTRS_MSG_WRITE,
+	RTRS_MSG_READ,
+	RTRS_MSG_RKEY_RSP,
+};
+
+/**
+ * enum rtrs_msg_flags - RTRS message flags.
+ * @RTRS_NEED_INVAL:	Send invalidation in response.
+ * @RTRS_MSG_NEW_RKEY_F: Send refreshed rkey in response.
+ */
+enum rtrs_msg_flags {
+	RTRS_MSG_NEED_INVAL_F = 1 << 0,
+	RTRS_MSG_NEW_RKEY_F = 1 << 1,
+};
+
+/**
+ * struct rtrs_sg_desc - RDMA-Buffer entry description
+ * @addr:	Address of RDMA destination buffer
+ * @key:	Authorization rkey to write to the buffer
+ * @len:	Size of the buffer
+ */
+struct rtrs_sg_desc {
+	__le64			addr;
+	__le32			key;
+	__le32			len;
+};
+
+/**
+ * struct rtrs_msg_conn_req - Client connection request to the server
+ * @magic:	   RTRS magic
+ * @version:	   RTRS protocol version
+ * @cid:	   Current connection id
+ * @cid_num:	   Number of connections per session
+ * @recon_cnt:	   Reconnections counter
+ * @sess_uuid:	   UUID of a session (path)
+ * @paths_uuid:	   UUID of a group of sessions (paths)
+ *
+ * NOTE: max size 56 bytes, see man rdma_connect().
+ */
+struct rtrs_msg_conn_req {
+	u8		__cma_version; /* Is set to 0 by cma.c in case of
+					* AF_IB, do not touch that.
+					*/
+	u8		__ip_version;  /* On sender side that should be
+					* set to 0, or cma_save_ip_info()
+					* extract garbage and will fail.
+					*/
+	__le16		magic;
+	__le16		version;
+	__le16		cid;
+	__le16		cid_num;
+	__le16		recon_cnt;
+	uuid_t		sess_uuid;
+	uuid_t		paths_uuid;
+	u8		reserved[12];
+};
+
+/**
+ * struct rtrs_msg_conn_rsp - Server connection response to the client
+ * @magic:	   RTRS magic
+ * @version:	   RTRS protocol version
+ * @errno:	   If rdma_accept() then 0, if rdma_reject() indicates error
+ * @queue_depth:   max inflight messages (queue-depth) in this session
+ * @max_io_size:   max io size server supports
+ * @max_hdr_size:  max msg header size server supports
+ *
+ * NOTE: size is 56 bytes, max possible is 136 bytes, see man rdma_accept().
+ */
+struct rtrs_msg_conn_rsp {
+	__le16		magic;
+	__le16		version;
+	__le16		errno;
+	__le16		queue_depth;
+	__le32		max_io_size;
+	__le32		max_hdr_size;
+	__le32		flags;
+	u8		reserved[36];
+};
+
+/**
+ * struct rtrs_msg_info_req
+ * @type:		@RTRS_MSG_INFO_REQ
+ * @sessname:		Session name chosen by client
+ */
+struct rtrs_msg_info_req {
+	__le16		type;
+	u8		sessname[NAME_MAX];
+	u8		reserved[15];
+};
+
+/**
+ * struct rtrs_msg_info_rsp
+ * @type:		@RTRS_MSG_INFO_RSP
+ * @sg_cnt:		Number of @desc entries
+ * @desc:		RDMA buffers where the client can write to server
+ */
+struct rtrs_msg_info_rsp {
+	__le16		type;
+	__le16          sg_cnt;
+	u8              reserved[4];
+	struct rtrs_sg_desc desc[];
+};
+
+/**
+ * struct rtrs_msg_rkey_rsp
+ * @type:		@RTRS_MSG_RKEY_RSP
+ * @buf_id:		RDMA buf_id of the new rkey
+ * @rkey:		new remote key for RDMA buffers id from server
+ */
+struct rtrs_msg_rkey_rsp {
+	__le16		type;
+	__le16          buf_id;
+	__le32		rkey;
+};
+
+/**
+ * struct rtrs_msg_rdma_read - RDMA data transfer request from client
+ * @type:		always @RTRS_MSG_READ
+ * @usr_len:		length of user payload
+ * @sg_cnt:		number of @desc entries
+ * @desc:		RDMA buffers where the server can write the result to
+ */
+struct rtrs_msg_rdma_read {
+	__le16			type;
+	__le16			usr_len;
+	__le16			flags;
+	__le16			sg_cnt;
+	struct rtrs_sg_desc    desc[];
+};
+
+/**
+ * struct_msg_rdma_write - Message transferred to server with RDMA-Write
+ * @type:		always @RTRS_MSG_WRITE
+ * @usr_len:		length of user payload
+ */
+struct rtrs_msg_rdma_write {
+	__le16			type;
+	__le16			usr_len;
+};
+
+/**
+ * struct_msg_rdma_hdr - header for read or write request
+ * @type:		@RTRS_MSG_WRITE | @RTRS_MSG_READ
+ */
+struct rtrs_msg_rdma_hdr {
+	__le16			type;
+};
+
+/* rtrs.c */
+
+struct rtrs_iu *rtrs_iu_alloc(u32 queue_size, size_t size, gfp_t t,
+			      struct ib_device *dev, enum dma_data_direction,
+			      void (*done)(struct ib_cq *cq, struct ib_wc *wc));
+void rtrs_iu_free(struct rtrs_iu *iu, enum dma_data_direction dir,
+		  struct ib_device *dev, u32 queue_size);
+int rtrs_iu_post_recv(struct rtrs_con *con, struct rtrs_iu *iu);
+int rtrs_iu_post_send(struct rtrs_con *con, struct rtrs_iu *iu, size_t size,
+		      struct ib_send_wr *head);
+int rtrs_iu_post_rdma_write_imm(struct rtrs_con *con, struct rtrs_iu *iu,
+				struct ib_sge *sge, unsigned int num_sge,
+				u32 rkey, u64 rdma_addr, u32 imm_data,
+				enum ib_send_flags flags,
+				struct ib_send_wr *head);
+
+int rtrs_post_recv_empty(struct rtrs_con *con, struct ib_cqe *cqe);
+int rtrs_post_recv_empty_x2(struct rtrs_con *con, struct ib_cqe *cqe);
+int rtrs_post_rdma_write_imm_empty(struct rtrs_con *con, struct ib_cqe *cqe,
+				   u32 imm_data, enum ib_send_flags flags,
+				   struct ib_send_wr *head);
+
+int rtrs_cq_qp_create(struct rtrs_sess *rtrs_sess, struct rtrs_con *con,
+		      u32 max_send_sge, int cq_vector, u16 cq_size,
+		      u16 wr_queue_size, enum ib_poll_context poll_ctx);
+void rtrs_cq_qp_destroy(struct rtrs_con *con);
+
+void rtrs_init_hb(struct rtrs_sess *sess, struct ib_cqe *cqe,
+		  unsigned int interval_ms, unsigned int missed_max,
+		  rtrs_hb_handler_t *err_handler,
+		  struct workqueue_struct *wq);
+void rtrs_start_hb(struct rtrs_sess *sess);
+void rtrs_stop_hb(struct rtrs_sess *sess);
+void rtrs_send_hb_ack(struct rtrs_sess *sess);
+
+void rtrs_ib_dev_pool_init(enum ib_pd_flags pd_flags,
+			   struct rtrs_ib_dev_pool *pool);
+void rtrs_ib_dev_pool_deinit(struct rtrs_ib_dev_pool *pool);
+
+struct rtrs_ib_dev *rtrs_ib_dev_find_or_add(struct ib_device *ib_dev,
+					    struct rtrs_ib_dev_pool *pool);
+int rtrs_ib_dev_put(struct rtrs_ib_dev *dev);
+
+static inline u32 rtrs_to_imm(u32 type, u32 payload)
+{
+	BUILD_BUG_ON(MAX_IMM_PAYL_BITS + MAX_IMM_TYPE_BITS != 32);
+	BUILD_BUG_ON(RTRS_LAST_IMM > (1<<MAX_IMM_TYPE_BITS));
+	return ((type & MAX_IMM_TYPE_MASK) << MAX_IMM_PAYL_BITS) |
+		(payload & MAX_IMM_PAYL_MASK);
+}
+
+static inline void rtrs_from_imm(u32 imm, u32 *type, u32 *payload)
+{
+	*payload = (imm & MAX_IMM_PAYL_MASK);
+	*type = (imm >> MAX_IMM_PAYL_BITS);
+}
+
+static inline u32 rtrs_to_io_req_imm(u32 addr)
+{
+	return rtrs_to_imm(RTRS_IO_REQ_IMM, addr);
+}
+
+static inline u32 rtrs_to_io_rsp_imm(u32 msg_id, int errno, bool w_inval)
+{
+	enum rtrs_imm_type type;
+	u32 payload;
+
+	/* 9 bits for errno, 19 bits for msg_id */
+	payload = (abs(errno) & 0x1ff) << 19 | (msg_id & 0x7ffff);
+	type = (w_inval ? RTRS_IO_RSP_W_INV_IMM : RTRS_IO_RSP_IMM);
+
+	return rtrs_to_imm(type, payload);
+}
+
+static inline void rtrs_from_io_rsp_imm(u32 payload, u32 *msg_id, int *errno)
+{
+	/* 9 bits for errno, 19 bits for msg_id */
+	*msg_id = (payload & 0x7ffff);
+	*errno = -(int)((payload >> 19) & 0x1ff);
+}
+
+#define STAT_STORE_FUNC(type, set_value, reset)				\
+static ssize_t set_value##_store(struct kobject *kobj,			\
+			     struct kobj_attribute *attr,		\
+			     const char *buf, size_t count)		\
+{									\
+	int ret = -EINVAL;						\
+	type *sess = container_of(kobj, type, kobj_stats);		\
+									\
+	if (sysfs_streq(buf, "1"))					\
+		ret = reset(&sess->stats, true);			\
+	else if (sysfs_streq(buf, "0"))					\
+		ret = reset(&sess->stats, false);			\
+	if (ret)							\
+		return ret;						\
+									\
+	return count;							\
+}
+
+#define STAT_SHOW_FUNC(type, get_value, print)				\
+static ssize_t get_value##_show(struct kobject *kobj,			\
+			   struct kobj_attribute *attr,			\
+			   char *page)					\
+{									\
+	type *sess = container_of(kobj, type, kobj_stats);		\
+									\
+	return print(&sess->stats, page, PAGE_SIZE);			\
+}
+
+#define STAT_ATTR(type, stat, print, reset)				\
+STAT_STORE_FUNC(type, stat, reset)					\
+STAT_SHOW_FUNC(type, stat, print)					\
+static struct kobj_attribute stat##_attr =				\
+		__ATTR(stat, 0644,					\
+		       stat##_show,					\
+		       stat##_store)
+
+#endif /* RTRS_PRI_H */