diff mbox series

RDMA/efa: Add rdma write capability to device caps

Message ID 20230404154313.35194-1-ynachum@amazon.com (mailing list archive)
State Accepted
Commit 531094dc7164718d28ebb581d729807d7e846363
Headers show
Series RDMA/efa: Add rdma write capability to device caps | expand

Commit Message

Nachum, Yonatan April 4, 2023, 3:43 p.m. UTC
From: Yonatan Nachum <ynachum@amazon.com>

Add rdma write capability that is propagated from the device to
rdma-core.
Enable MR creation with remote write permissions according to this
device capability.

Reviewed-by: Firas Jahjah <firasj@amazon.com>
Reviewed-by: Michael Margolin <mrgolin@amazon.com>
Signed-off-by: Yonatan Nachum <ynachum@amazon.com>
---
 .../infiniband/hw/efa/efa_admin_cmds_defs.h   | 12 ++++--
 drivers/infiniband/hw/efa/efa_io_defs.h       | 42 +++++++++++++------
 drivers/infiniband/hw/efa/efa_verbs.c         |  6 ++-
 include/uapi/rdma/efa-abi.h                   |  1 +
 4 files changed, 44 insertions(+), 17 deletions(-)

Comments

Leon Romanovsky April 9, 2023, 7:32 a.m. UTC | #1
On Tue, Apr 04, 2023 at 03:43:13PM +0000, ynachum@amazon.com wrote:
> From: Yonatan Nachum <ynachum@amazon.com>
> 
> Add rdma write capability that is propagated from the device to
> rdma-core.
> Enable MR creation with remote write permissions according to this
> device capability.
> 
> Reviewed-by: Firas Jahjah <firasj@amazon.com>
> Reviewed-by: Michael Margolin <mrgolin@amazon.com>
> Signed-off-by: Yonatan Nachum <ynachum@amazon.com>
> ---
>  .../infiniband/hw/efa/efa_admin_cmds_defs.h   | 12 ++++--
>  drivers/infiniband/hw/efa/efa_io_defs.h       | 42 +++++++++++++------
>  drivers/infiniband/hw/efa/efa_verbs.c         |  6 ++-
>  include/uapi/rdma/efa-abi.h                   |  1 +
>  4 files changed, 44 insertions(+), 17 deletions(-)

<...>

>  #endif /* _EFA_IO_H_ */
> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
> index c27c36418f7f..a394011a598c 100644
> --- a/drivers/infiniband/hw/efa/efa_verbs.c
> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
> @@ -253,6 +253,9 @@ int efa_query_device(struct ib_device *ibdev,
>  		if (EFA_DEV_CAP(dev, DATA_POLLING_128))
>  			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128;
>  
> +		if (EFA_DEV_CAP(dev, RDMA_WRITE))
> +			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE;
> +
>  		if (dev->neqs)
>  			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS;
>  
> @@ -1571,7 +1574,8 @@ static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
>  
>  	supp_access_flags =
>  		IB_ACCESS_LOCAL_WRITE |
> -		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0);
> +		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0) |
> +		(EFA_DEV_CAP(dev, RDMA_WRITE) ? IB_ACCESS_REMOTE_WRITE : 0);
>  
>  	access_flags &= ~IB_ACCESS_OPTIONAL;
>  	if (access_flags & ~supp_access_flags) {
> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
> index 74406b4817ce..d94c32f28804 100644
> --- a/include/uapi/rdma/efa-abi.h
> +++ b/include/uapi/rdma/efa-abi.h
> @@ -121,6 +121,7 @@ enum {
>  	EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
>  	EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
>  	EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
> +	EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,

Why do you need special device capability while all rdma-core users
set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
providers?

Thanks

>  };
>  
>  struct efa_ibv_ex_query_device_resp {
> -- 
> 2.39.2
>
Nachum, Yonatan April 9, 2023, 11:28 a.m. UTC | #2
>>
>>       access_flags &= ~IB_ACCESS_OPTIONAL;
>>       if (access_flags & ~supp_access_flags) {
>> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
>> index 74406b4817ce..d94c32f28804 100644
>> --- a/include/uapi/rdma/efa-abi.h
>> +++ b/include/uapi/rdma/efa-abi.h
>> @@ -121,6 +121,7 @@ enum {
>>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
>>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
>>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
>> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
> 
> Why do you need special device capability while all rdma-core users
> set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
> providers?
> 
> Thanks

We need to query the device because not every device supports the same RDMA capabilities. Upper layers in the SW stack need these supported flags to indicate which flows they can use. In addition, this is identical to the existing RDMA read support in our code.

Thanks.
Leon Romanovsky April 9, 2023, 5:21 p.m. UTC | #3
On Sun, Apr 09, 2023 at 02:28:04PM +0300, Nachum, Yonatan wrote:
> 
> >>
> >>       access_flags &= ~IB_ACCESS_OPTIONAL;
> >>       if (access_flags & ~supp_access_flags) {
> >> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
> >> index 74406b4817ce..d94c32f28804 100644
> >> --- a/include/uapi/rdma/efa-abi.h
> >> +++ b/include/uapi/rdma/efa-abi.h
> >> @@ -121,6 +121,7 @@ enum {
> >>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
> >>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
> >>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
> >> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
> > 
> > Why do you need special device capability while all rdma-core users
> > set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
> > providers?
> > 
> > Thanks
> 
> We need to query the device because not every device supprort the same RDMA capabilities. Upper layers in the SW stack needs this supported flags to indicate which flows they can use. In addition this is identical to the existing RDMA read support in our code.

Nice, but it doesn't answer my question. Please pay attention to the
second part of my question "while all rdma-core ....".

Thanks

> 
> Thanks.
Nachum, Yonatan April 10, 2023, 12:24 p.m. UTC | #4
>>>>
>>>>       access_flags &= ~IB_ACCESS_OPTIONAL;
>>>>       if (access_flags & ~supp_access_flags) {
>>>> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
>>>> index 74406b4817ce..d94c32f28804 100644
>>>> --- a/include/uapi/rdma/efa-abi.h
>>>> +++ b/include/uapi/rdma/efa-abi.h
>>>> @@ -121,6 +121,7 @@ enum {
>>>>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
>>>>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
>>>>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
>>>> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
>>>
>>> Why do you need special device capability while all rdma-core users
>>> set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
>>> providers?
>>>
>>> Thanks
>>
>> We need to query the device because not every device supprort the same RDMA capabilities. Upper layers in the SW stack needs this supported flags to indicate which flows they can use. In addition this is identical to the existing RDMA read support in our code.
> 
> Nice, but it doesn't answer my question. Please pay attention to the
> second part of my question "while all rdma-core ....".
> 
> Thanks
> 

There are rdma-core users that don’t fail on unsupported features but fall back to supported ones. One example is the Libfabric EFA provider, which emulates RDMA write by read if device write isn’t supported, but there are other similar examples. The correct way to do this in user code is to query rdma-core for the device-supported capabilities, then select a suitable workflow. This is why the existing and the additional capability bits are required.

Thanks
Leon Romanovsky April 10, 2023, 12:38 p.m. UTC | #5
On Mon, Apr 10, 2023 at 03:24:27PM +0300, Nachum, Yonatan wrote:
> 
> >>>>
> >>>>       access_flags &= ~IB_ACCESS_OPTIONAL;
> >>>>       if (access_flags & ~supp_access_flags) {
> >>>> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
> >>>> index 74406b4817ce..d94c32f28804 100644
> >>>> --- a/include/uapi/rdma/efa-abi.h
> >>>> +++ b/include/uapi/rdma/efa-abi.h
> >>>> @@ -121,6 +121,7 @@ enum {
> >>>>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
> >>>>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
> >>>>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
> >>>> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
> >>>
> >>> Why do you need special device capability while all rdma-core users
> >>> set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
> >>> providers?
> >>>
> >>> Thanks
> >>
> >> We need to query the device because not every device supprort the same RDMA capabilities. Upper layers in the SW stack needs this supported flags to indicate which flows they can use. In addition this is identical to the existing RDMA read support in our code.
> > 
> > Nice, but it doesn't answer my question. Please pay attention to the
> > second part of my question "while all rdma-core ....".
> > 
> > Thanks
> > 
> 
> There are rdma-core users that doesn’t fail on unsupported features but fallback to supported ones. One example is Libfabric EFA provider that emulates RDMA write by read if device write isn’t supported but there are other similar examples. Correct way doing this in user code is by querying rdma-core for device supported capabilities, then selecting a suitable work flow. This is why existing and the additional capability bits are required.

AFAIK, RDMA write is different here as fallback is performed in the kernel and
not in the rdma-core provider. So why should EFA be different here?

BTW, please fix your mailer to break lines, so we will be able to reply
to a specific sentence with less effort.

Thanks

> 
> Thanks
Nachum, Yonatan April 10, 2023, 2:14 p.m. UTC | #6
>>>>>>
>>>>>>       access_flags &= ~IB_ACCESS_OPTIONAL;
>>>>>>       if (access_flags & ~supp_access_flags) {
>>>>>> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
>>>>>> index 74406b4817ce..d94c32f28804 100644
>>>>>> --- a/include/uapi/rdma/efa-abi.h
>>>>>> +++ b/include/uapi/rdma/efa-abi.h
>>>>>> @@ -121,6 +121,7 @@ enum {
>>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
>>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
>>>>>>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
>>>>>> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
>>>>>
>>>>> Why do you need special device capability while all rdma-core users
>>>>> set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
>>>>> providers?
>>>>>
>>>>> Thanks
>>>>
>>>> We need to query the device because not every device supprort the same RDMA capabilities. Upper layers in the SW stack needs this supported flags to indicate which flows they can use. In addition this is identical to the existing RDMA read support in our code.
>>>
>>> Nice, but it doesn't answer my question. Please pay attention to the
>>> second part of my question "while all rdma-core ....".
>>>
>>> Thanks
>>>
>>
>> There are rdma-core users that doesn’t fail on unsupported features but fallback to supported ones. One example is Libfabric EFA provider that emulates RDMA write by read if device write isn’t supported but there are other similar examples. Correct way doing this in user code is by querying rdma-core for device supported capabilities, then selecting a suitable work flow. This is why existing and the additional capability bits are required.
> 
> AFAIK, RDMA write is different here as fallback is performed in the kernel and
> not in the rdma-core provider. So why should EFA be different here?
> 
> BTW, Please fix your mailer to break lines, so we will be able to reply
> on specific sentence with less effort.
> 
> Thanks

Can you please elaborate more on the fallback performed in the kernel?
What kind of fallback is being performed? Is it in create MR/QP?
Does the fallback happen when providing an unsupported write capability,
and what does it fall back to?

Thanks
Leon Romanovsky April 13, 2023, 8:22 a.m. UTC | #7
On Mon, Apr 10, 2023 at 05:14:03PM +0300, Nachum, Yonatan wrote:
> 
> >>>>>>
> >>>>>>       access_flags &= ~IB_ACCESS_OPTIONAL;
> >>>>>>       if (access_flags & ~supp_access_flags) {
> >>>>>> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
> >>>>>> index 74406b4817ce..d94c32f28804 100644
> >>>>>> --- a/include/uapi/rdma/efa-abi.h
> >>>>>> +++ b/include/uapi/rdma/efa-abi.h
> >>>>>> @@ -121,6 +121,7 @@ enum {
> >>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
> >>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
> >>>>>>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
> >>>>>> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
> >>>>>
> >>>>> Why do you need special device capability while all rdma-core users
> >>>>> set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
> >>>>> providers?
> >>>>>
> >>>>> Thanks
> >>>>
> >>>> We need to query the device because not every device supprort the same RDMA capabilities. Upper layers in the SW stack needs this supported flags to indicate which flows they can use. In addition this is identical to the existing RDMA read support in our code.
> >>>
> >>> Nice, but it doesn't answer my question. Please pay attention to the
> >>> second part of my question "while all rdma-core ....".
> >>>
> >>> Thanks
> >>>
> >>
> >> There are rdma-core users that doesn’t fail on unsupported features but fallback to supported ones. One example is Libfabric EFA provider that emulates RDMA write by read if device write isn’t supported but there are other similar examples. Correct way doing this in user code is by querying rdma-core for device supported capabilities, then selecting a suitable work flow. This is why existing and the additional capability bits are required.
> > 
> > AFAIK, RDMA write is different here as fallback is performed in the kernel and
> > not in the rdma-core provider. So why should EFA be different here?
> > 
> > BTW, Please fix your mailer to break lines, so we will be able to reply
> > on specific sentence with less effort.
> > 
> > Thanks
> 
> Can you please elaborate more on the fallback performed in the kernel?
> What kind of fallback being performed? Is it in create MR/QP?
> Does the fallback happens when providing unsupported write capability
> and to what it fallback to?

OK, looked again, "Fallback" was in my imagination, sorry about that.

But my main question continues to be: how do other vendors which support
RDMA write work without a capability?

Thanks

> 
> Thanks
Nachum, Yonatan April 13, 2023, 1:43 p.m. UTC | #8
>>>>>>>>
>>>>>>>>       access_flags &= ~IB_ACCESS_OPTIONAL;
>>>>>>>>       if (access_flags & ~supp_access_flags) {
>>>>>>>> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
>>>>>>>> index 74406b4817ce..d94c32f28804 100644
>>>>>>>> --- a/include/uapi/rdma/efa-abi.h
>>>>>>>> +++ b/include/uapi/rdma/efa-abi.h
>>>>>>>> @@ -121,6 +121,7 @@ enum {
>>>>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
>>>>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
>>>>>>>>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
>>>>>>>> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
>>>>>>>
>>>>>>> Why do you need special device capability while all rdma-core users
>>>>>>> set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
>>>>>>> providers?
>>>>>>>
>>>>>>> Thanks
>>>>>>
>>>>>> We need to query the device because not every device supprort the same RDMA capabilities. Upper layers in the SW stack needs this supported flags to indicate which flows they can use. In addition this is identical to the existing RDMA read support in our code.
>>>>>
>>>>> Nice, but it doesn't answer my question. Please pay attention to the
>>>>> second part of my question "while all rdma-core ....".
>>>>>
>>>>> Thanks
>>>>>
>>>>
>>>> There are rdma-core users that doesn’t fail on unsupported features but fallback to supported ones. One example is Libfabric EFA provider that emulates RDMA write by read if device write isn’t supported but there are other similar examples. Correct way doing this in user code is by querying rdma-core for device supported capabilities, then selecting a suitable work flow. This is why existing and the additional capability bits are required.
>>>
>>> AFAIK, RDMA write is different here as fallback is performed in the kernel and
>>> not in the rdma-core provider. So why should EFA be different here?
>>>
>>> BTW, Please fix your mailer to break lines, so we will be able to reply
>>> on specific sentence with less effort.
>>>
>>> Thanks
>>
>> Can you please elaborate more on the fallback performed in the kernel?
>> What kind of fallback being performed? Is it in create MR/QP?
>> Does the fallback happens when providing unsupported write capability
>> and to what it fallback to?
> 
> OK, looked again, "Fallback" was in my imagination, sorry about that.
> 
> But my main question is continued to be, how other vendors which support
> RDMA write work without capability?
> 
> Thanks

Vendors that always support RDMA write don’t need a query for this capability.
Some EFA devices don’t support write capability so we provide the ability to
query the device to know if write is supported.
This is similar to how mlx5 supports querying capabilities through direct verbs.

Thanks
Nachum, Yonatan April 18, 2023, 2:03 p.m. UTC | #9
>>>>>>>>>       access_flags &= ~IB_ACCESS_OPTIONAL;
>>>>>>>>>       if (access_flags & ~supp_access_flags) {
>>>>>>>>> diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
>>>>>>>>> index 74406b4817ce..d94c32f28804 100644
>>>>>>>>> --- a/include/uapi/rdma/efa-abi.h
>>>>>>>>> +++ b/include/uapi/rdma/efa-abi.h
>>>>>>>>> @@ -121,6 +121,7 @@ enum {
>>>>>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
>>>>>>>>>       EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
>>>>>>>>>       EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
>>>>>>>>> +     EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
>>>>>>>>
>>>>>>>> Why do you need special device capability while all rdma-core users
>>>>>>>> set IBV_ACCESS_REMOTE_WRITE anyway without relying on anything from
>>>>>>>> providers?
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>
>>>>>>> We need to query the device because not every device supprort the same RDMA capabilities. Upper layers in the SW stack needs this supported flags to indicate which flows they can use. In addition this is identical to the existing RDMA read support in our code.
>>>>>>
>>>>>> Nice, but it doesn't answer my question. Please pay attention to the
>>>>>> second part of my question "while all rdma-core ....".
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>>
>>>>> There are rdma-core users that doesn’t fail on unsupported features but fallback to supported ones. One example is Libfabric EFA provider that emulates RDMA write by read if device write isn’t supported but there are other similar examples. Correct way doing this in user code is by querying rdma-core for device supported capabilities, then selecting a suitable work flow. This is why existing and the additional capability bits are required.
>>>>
>>>> AFAIK, RDMA write is different here as fallback is performed in the kernel and
>>>> not in the rdma-core provider. So why should EFA be different here?
>>>>
>>>> BTW, Please fix your mailer to break lines, so we will be able to reply
>>>> on specific sentence with less effort.
>>>>
>>>> Thanks
>>>
>>> Can you please elaborate more on the fallback performed in the kernel?
>>> What kind of fallback being performed? Is it in create MR/QP?
>>> Does the fallback happens when providing unsupported write capability
>>> and to what it fallback to?
>>
>> OK, looked again, "Fallback" was in my imagination, sorry about that.
>>
>> But my main question is continued to be, how other vendors which support
>> RDMA write work without capability?
>>
>> Thanks
> 
> Vendors that always support RDMA write don’t need a query for this capability.
> Some EFA devices don’t support write capability so we provide the ability to
> query the device to know if write is supported.
> It is like mlx5 support query capabilities through direct verb.
> 
> Thanks

Hello,
Kind reminder for this patch.

Thanks
Jason Gunthorpe April 21, 2023, 10:20 p.m. UTC | #10
On Tue, Apr 04, 2023 at 03:43:13PM +0000, ynachum@amazon.com wrote:
> From: Yonatan Nachum <ynachum@amazon.com>
> 
> Add rdma write capability that is propagated from the device to
> rdma-core.
> Enable MR creation with remote write permissions according to this
> device capability.
> 
> Reviewed-by: Firas Jahjah <firasj@amazon.com>
> Reviewed-by: Michael Margolin <mrgolin@amazon.com>
> Signed-off-by: Yonatan Nachum <ynachum@amazon.com>
> ---
>  .../infiniband/hw/efa/efa_admin_cmds_defs.h   | 12 ++++--
>  drivers/infiniband/hw/efa/efa_io_defs.h       | 42 +++++++++++++------
>  drivers/infiniband/hw/efa/efa_verbs.c         |  6 ++-
>  include/uapi/rdma/efa-abi.h                   |  1 +
>  4 files changed, 44 insertions(+), 17 deletions(-)

Applied to for-next, thanks

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
index 3db791e6c030..4e93ef7f84ee 100644
--- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -376,7 +376,9 @@  struct efa_admin_reg_mr_cmd {
 	 * 0 : local_write_enable - Local write permissions:
 	 *    must be set for RQ buffers and buffers posted for
 	 *    RDMA Read requests
-	 * 1 : reserved1 - MBZ
+	 * 1 : remote_write_enable - Remote write
+	 *    permissions: must be set to enable RDMA write to
+	 *    the region
 	 * 2 : remote_read_enable - Remote read permissions:
 	 *    must be set to enable RDMA read from the region
 	 * 7:3 : reserved2 - MBZ
@@ -620,7 +622,9 @@  struct efa_admin_feature_device_attr_desc {
 	 *    modify QP command
 	 * 2 : data_polling_128 - If set, 128 bytes data
 	 *    polling is supported
-	 * 31:3 : reserved - MBZ
+	 * 3 : rdma_write - If set, RDMA Write is supported
+	 *    on TX queues
+	 * 31:4 : reserved - MBZ
 	 */
 	u32 device_caps;
 
@@ -674,7 +678,7 @@  struct efa_admin_feature_queue_attr_desc {
 	/* The maximum size of LLQ in bytes */
 	u32 max_llq_size;
 
-	/* Maximum number of SGEs for a single RDMA read WQE */
+	/* Maximum number of SGEs for a single RDMA read/write WQE */
 	u16 max_wr_rdma_sges;
 
 	/*
@@ -979,6 +983,7 @@  struct efa_admin_host_info {
 #define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK      GENMASK(4, 0)
 #define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK      BIT(7)
 #define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK        BIT(0)
+#define EFA_ADMIN_REG_MR_CMD_REMOTE_WRITE_ENABLE_MASK       BIT(1)
 #define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK        BIT(2)
 
 /* create_cq_cmd */
@@ -994,6 +999,7 @@  struct efa_admin_host_info {
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK   BIT(0)
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK   BIT(1)
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2)
+#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK  BIT(3)
 
 /* create_eq_cmd */
 #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK       GENMASK(4, 0)
diff --git a/drivers/infiniband/hw/efa/efa_io_defs.h b/drivers/infiniband/hw/efa/efa_io_defs.h
index 17ba8984b11e..2d8eb96eaa81 100644
--- a/drivers/infiniband/hw/efa/efa_io_defs.h
+++ b/drivers/infiniband/hw/efa/efa_io_defs.h
@@ -1,6 +1,6 @@ 
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_IO_H_
@@ -23,6 +23,8 @@  enum efa_io_send_op_type {
 	EFA_IO_SEND                                 = 0,
 	/* RDMA read */
 	EFA_IO_RDMA_READ                            = 1,
+	/* RDMA write */
+	EFA_IO_RDMA_WRITE                           = 2,
 };
 
 enum efa_io_comp_status {
@@ -62,8 +64,7 @@  struct efa_io_tx_meta_desc {
 
 	/*
 	 * control flags
-	 * 3:0 : op_type - operation type: send/rdma/fast mem
-	 *    ops/etc
+	 * 3:0 : op_type - enum efa_io_send_op_type
 	 * 4 : has_imm - immediate_data field carries valid
 	 *    data.
 	 * 5 : inline_msg - inline mode - inline message data
@@ -219,21 +220,22 @@  struct efa_io_cdesc_common {
 	 * 2:1 : q_type - enum efa_io_queue_type: send/recv
 	 * 3 : has_imm - indicates that immediate data is
 	 *    present - for RX completions only
-	 * 7:4 : reserved28 - MBZ
+	 * 6:4 : op_type - enum efa_io_send_op_type
+	 * 7 : reserved31 - MBZ
 	 */
 	u8 flags;
 
 	/* local QP number */
 	u16 qp_num;
-
-	/* Transferred length */
-	u16 length;
 };
 
 /* Tx completion descriptor */
 struct efa_io_tx_cdesc {
 	/* Common completion info */
 	struct efa_io_cdesc_common common;
+
+	/* MBZ */
+	u16 reserved16;
 };
 
 /* Rx Completion Descriptor */
@@ -241,6 +243,9 @@  struct efa_io_rx_cdesc {
 	/* Common completion info */
 	struct efa_io_cdesc_common common;
 
+	/* Transferred length bits[15:0] */
+	u16 length;
+
 	/* Remote Address Handle FW index, 0xFFFF indicates invalid ah */
 	u16 ah;
 
@@ -250,16 +255,26 @@  struct efa_io_rx_cdesc {
 	u32 imm;
 };
 
+/* Rx Completion Descriptor RDMA write info */
+struct efa_io_rx_cdesc_rdma_write {
+	/* Transferred length bits[31:16] */
+	u16 length_hi;
+};
+
 /* Extended Rx Completion Descriptor */
 struct efa_io_rx_cdesc_ex {
 	/* Base RX completion info */
-	struct efa_io_rx_cdesc rx_cdesc_base;
+	struct efa_io_rx_cdesc base;
 
-	/*
-	 * Valid only in case of unknown AH (0xFFFF) and CQ set_src_addr is
-	 * enabled.
-	 */
-	u8 src_addr[16];
+	union {
+		struct efa_io_rx_cdesc_rdma_write rdma_write;
+
+		/*
+		 * Valid only in case of unknown AH (0xFFFF) and CQ
+		 * set_src_addr is enabled.
+		 */
+		u8 src_addr[16];
+	} u;
 };
 
 /* tx_meta_desc */
@@ -285,5 +300,6 @@  struct efa_io_rx_cdesc_ex {
 #define EFA_IO_CDESC_COMMON_PHASE_MASK                      BIT(0)
 #define EFA_IO_CDESC_COMMON_Q_TYPE_MASK                     GENMASK(2, 1)
 #define EFA_IO_CDESC_COMMON_HAS_IMM_MASK                    BIT(3)
+#define EFA_IO_CDESC_COMMON_OP_TYPE_MASK                    GENMASK(6, 4)
 
 #endif /* _EFA_IO_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index c27c36418f7f..a394011a598c 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -253,6 +253,9 @@  int efa_query_device(struct ib_device *ibdev,
 		if (EFA_DEV_CAP(dev, DATA_POLLING_128))
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128;
 
+		if (EFA_DEV_CAP(dev, RDMA_WRITE))
+			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE;
+
 		if (dev->neqs)
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS;
 
@@ -1571,7 +1574,8 @@  static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
 
 	supp_access_flags =
 		IB_ACCESS_LOCAL_WRITE |
-		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0);
+		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0) |
+		(EFA_DEV_CAP(dev, RDMA_WRITE) ? IB_ACCESS_REMOTE_WRITE : 0);
 
 	access_flags &= ~IB_ACCESS_OPTIONAL;
 	if (access_flags & ~supp_access_flags) {
diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
index 74406b4817ce..d94c32f28804 100644
--- a/include/uapi/rdma/efa-abi.h
+++ b/include/uapi/rdma/efa-abi.h
@@ -121,6 +121,7 @@  enum {
 	EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2,
 	EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
 	EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
+	EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
 };
 
 struct efa_ibv_ex_query_device_resp {