
[RESEND,v5,1/2] RDMA/rxe: Support RDMA Atomic Write operation

Message ID 20220708040228.6703-2-yangx.jy@fujitsu.com (mailing list archive)
State Superseded
Series RDMA/rxe: Add RDMA Atomic Write operation

Commit Message

Xiao Yang July 8, 2022, 4:02 a.m. UTC
This patch implements the RDMA Atomic Write operation for the RC service.

Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe_comp.c   |  4 ++
 drivers/infiniband/sw/rxe/rxe_opcode.c | 18 +++++
 drivers/infiniband/sw/rxe/rxe_opcode.h |  3 +
 drivers/infiniband/sw/rxe/rxe_req.c    | 15 +++-
 drivers/infiniband/sw/rxe/rxe_resp.c   | 94 ++++++++++++++++++++++++--
 include/rdma/ib_pack.h                 |  2 +
 include/rdma/ib_verbs.h                |  2 +
 include/uapi/rdma/ib_user_verbs.h      |  2 +
 include/uapi/rdma/rdma_user_rxe.h      |  1 +
 9 files changed, 134 insertions(+), 7 deletions(-)

Comments

Jason Gunthorpe Sept. 23, 2022, 10:22 p.m. UTC | #1
On Fri, Jul 08, 2022 at 04:02:36AM +0000, yangx.jy@fujitsu.com wrote:
> +static enum resp_states atomic_write_reply(struct rxe_qp *qp,
> +					   struct rxe_pkt_info *pkt)
> +{
> +	u64 src, *dst;
> +	struct resp_res *res = qp->resp.res;
> +	struct rxe_mr *mr = qp->resp.mr;
> +	int payload = payload_size(pkt);
> +
> +	if (!res) {
> +		res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK);
> +		qp->resp.res = res;
> +	}
> +
> +	if (!res->replay) {
> +#ifdef CONFIG_64BIT
> +		memcpy(&src, payload_addr(pkt), payload);
> +
> +		dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload);
> +		/* check vaddr is 8 bytes aligned. */
> +		if (!dst || (uintptr_t)dst & 7)
> +			return RESPST_ERR_MISALIGNED_ATOMIC;
> +
> +		/* Do atomic write after all prior operations have completed */
> +		smp_store_release(dst, src);

Someone needs to fix iova_to_vaddr to do the missing kmap, we can't
just assume you can cast a u64 pfn to a vaddr like this.

> +		/* decrease resp.resid to zero */
> +		qp->resp.resid -= sizeof(payload);
> +
> +		qp->resp.msn++;
> +
> +		/* next expected psn, read handles this separately */
> +		qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
> +		qp->resp.ack_psn = qp->resp.psn;
> +
> +		qp->resp.opcode = pkt->opcode;
> +		qp->resp.status = IB_WC_SUCCESS;
> +
> +		return RESPST_ACKNOWLEDGE;
> +#else
> +		pr_err("32-bit arch doesn't support 8-byte atomic write\n");
> +		return RESPST_ERR_UNSUPPORTED_OPCODE;

No print on receiving a remote packet

Jason
Xiao Yang Sept. 26, 2022, 6:55 a.m. UTC | #2
On 2022/9/24 6:22, Jason Gunthorpe wrote:
> On Fri, Jul 08, 2022 at 04:02:36AM +0000, yangx.jy@fujitsu.com wrote:
>> +static enum resp_states atomic_write_reply(struct rxe_qp *qp,
>> +					   struct rxe_pkt_info *pkt)
>> +{
>> +	u64 src, *dst;
>> +	struct resp_res *res = qp->resp.res;
>> +	struct rxe_mr *mr = qp->resp.mr;
>> +	int payload = payload_size(pkt);
>> +
>> +	if (!res) {
>> +		res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK);
>> +		qp->resp.res = res;
>> +	}
>> +
>> +	if (!res->replay) {
>> +#ifdef CONFIG_64BIT
>> +		memcpy(&src, payload_addr(pkt), payload);
>> +
>> +		dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload);
>> +		/* check vaddr is 8 bytes aligned. */
>> +		if (!dst || (uintptr_t)dst & 7)
>> +			return RESPST_ERR_MISALIGNED_ATOMIC;
>> +
>> +		/* Do atomic write after all prior operations have completed */
>> +		smp_store_release(dst, src);
> 
> Someone needs to fix iova_to_vaddr to do the missing kmap, we can't
> just assume you can cast a u64 pfn to a vaddr like this.

Hi Jason,
Cc Ira,

When using PMEM in DAX mode (devdax or fsdax), we cannot ensure that
iova_to_vaddr() can cast a u64 pfn to a vaddr, right? So we have to
replace page_address() with kmap_local_page().

Without Ira's PKS patch set, I didn't see any failure when accessing
remote PMEM in devdax mode over RDMA based on RXE. I don't know why I
cannot trigger any failure under that condition.

> 
>> +		/* decrease resp.resid to zero */
>> +		qp->resp.resid -= sizeof(payload);
>> +
>> +		qp->resp.msn++;
>> +
>> +		/* next expected psn, read handles this separately */
>> +		qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
>> +		qp->resp.ack_psn = qp->resp.psn;
>> +
>> +		qp->resp.opcode = pkt->opcode;
>> +		qp->resp.status = IB_WC_SUCCESS;
>> +
>> +		return RESPST_ACKNOWLEDGE;
>> +#else
>> +		pr_err("32-bit arch doesn't support 8-byte atomic write\n");
>> +		return RESPST_ERR_UNSUPPORTED_OPCODE;
> 
> No print on receiving a remote packet

OK
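
i.e. something like this for the #else branch (just a sketch of the
planned change, not the committed code):

  #else
  		/* No print here: a remote peer must not be able to flood
  		 * the kernel log by sending an unsupported opcode.
  		 */
  		return RESPST_ERR_UNSUPPORTED_OPCODE;
  #endif /* CONFIG_64BIT */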

Best Regards,
Xiao Yang

> 
> Jason
Zhijian Li (Fujitsu) Sept. 27, 2022, 8:18 a.m. UTC | #3
Hi Yang

I wonder if you need to do something if a user registers an MR with ATOMIC_WRITE on a non-rxe device, something like in my flush series:

Thanks
Zhijian

  static inline int ib_check_mr_access(struct ib_device *ib_dev,
  				     unsigned int flags)
  {
+	u64 device_cap = ib_dev->attrs.device_cap_flags;
+
  	/*
  	 * Local write permission is required if remote write or
  	 * remote atomic permission is also requested.
@@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
  	if (flags & IB_ACCESS_ON_DEMAND &&
  	    !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
  		return -EINVAL;
+
+	if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
+	    !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
+	    (flags & IB_ACCESS_FLUSH_PERSISTENT &&
+	    !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
+		return -EINVAL;
+
  	return 0;
  }
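
For atomic write, the analogous check might look like this rough sketch.
IB_ACCESS_REMOTE_ATOMIC_WRITE is a hypothetical flag used here only to
show the shape of the check (it is not defined anywhere), and
IB_DEVICE_ATOMIC_WRITE is the capability bit this series is expected to
add:

  	/* hypothetical: reject atomic-write MR access on devices that
  	 * don't advertise IB_DEVICE_ATOMIC_WRITE
  	 */
  	if (flags & IB_ACCESS_REMOTE_ATOMIC_WRITE &&
  	    !(device_cap & IB_DEVICE_ATOMIC_WRITE))
  		return -EINVAL;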
  

On 08/07/2022 12:02, Yang, Xiao/杨 晓 wrote:
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 9c6317cf80d5..7834285c8498 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
Jason Gunthorpe Sept. 27, 2022, 1:17 p.m. UTC | #4
On Tue, Sep 27, 2022 at 04:18:52PM +0800, Li Zhijian wrote:
> Hi Yang
> 
> I wonder if you need to do something if a user registers an MR with
> ATOMIC_WRITE on a non-rxe device, something like in my flush series:

This makes sense..

Jason
Xiao Yang Sept. 29, 2022, 3:58 a.m. UTC | #5
On 2022/9/27 21:17, Jason Gunthorpe wrote:
> On Tue, Sep 27, 2022 at 04:18:52PM +0800, Li Zhijian wrote:
>> Hi Yang
>>
>> I wonder if you need to do something if a user registers an MR with
>> ATOMIC_WRITE on a non-rxe device, something like in my flush series:
> 
> This makes sense..

Hi Zhijian, Jason

Agreed. I will add the check in ib_check_mr_access().

Best Regards,
Xiao Yang
> 
> Jason
Xiao Yang Sept. 29, 2022, 5:36 a.m. UTC | #6
On 2022/9/29 11:58, Yang, Xiao/杨 晓 wrote:
> On 2022/9/27 21:17, Jason Gunthorpe wrote:
>> On Tue, Sep 27, 2022 at 04:18:52PM +0800, Li Zhijian wrote:
>>> Hi Yang
>>>
>>> I wonder if you need to do something if a user registers an MR with
>>> ATOMIC_WRITE on a non-rxe device, something like in my flush series:
>>
>> This makes sense..
> 
> Hi Zhijian, Jason
> 
> Agreed. I will add the check in ib_check_mr_access().

Hi Zhijian, Jason,

Sorry for the rough reply.

After reading IBTA Spec A19.4.5.3 again, it does not seem suitable to
do a similar check for Atomic Write in ib_check_mr_access().

Atomic Write uses the original IB_ACCESS_REMOTE_WRITE flag during memory
registration. (In other words, it doesn't introduce a new
IB_ACCESS_REMOTE_ATOMIC_WRITE flag.)

In this case, we should not return -EINVAL when IB_ACCESS_REMOTE_WRITE
is specified but ib_dev->attrs.device_cap_flags doesn't include
IB_DEVICE_ATOMIC_WRITE.
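
That is, a check of this shape would wrongly reject every plain
remote-write MR registration on a device without atomic write support
(sketch only, not proposed code):

  	if (flags & IB_ACCESS_REMOTE_WRITE &&
  	    !(device_cap & IB_DEVICE_ATOMIC_WRITE))
  		return -EINVAL;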

Best Regards,
Xiao Yang

> 
> Best Regards,
> Xiao Yang
>>
>> Jason
Xiao Yang Oct. 6, 2022, 7:53 a.m. UTC | #7
On 2022/9/24 6:22, Jason Gunthorpe wrote:
> On Fri, Jul 08, 2022 at 04:02:36AM +0000, yangx.jy@fujitsu.com wrote:
>> +static enum resp_states atomic_write_reply(struct rxe_qp *qp,
>> +					   struct rxe_pkt_info *pkt)
>> +{
>> +	u64 src, *dst;
>> +	struct resp_res *res = qp->resp.res;
>> +	struct rxe_mr *mr = qp->resp.mr;
>> +	int payload = payload_size(pkt);
>> +
>> +	if (!res) {
>> +		res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK);
>> +		qp->resp.res = res;
>> +	}
>> +
>> +	if (!res->replay) {
>> +#ifdef CONFIG_64BIT
>> +		memcpy(&src, payload_addr(pkt), payload);
>> +
>> +		dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload);
>> +		/* check vaddr is 8 bytes aligned. */
>> +		if (!dst || (uintptr_t)dst & 7)
>> +			return RESPST_ERR_MISALIGNED_ATOMIC;
>> +
>> +		/* Do atomic write after all prior operations have completed */
>> +		smp_store_release(dst, src);
> 
> Someone needs to fix iova_to_vaddr to do the missing kmap, we can't
> just assume you can cast a u64 pfn to a vaddr like this.

Hi Jason,

Sorry, it is still not clear to me after looking into the related code 
again.

IMO, SoftRoCE depends on the INFINIBAND_VIRT_DMA Kconfig option, which
only allows !HIGHMEM, so SoftRoCE can call page_address() to get a
kernel virtual address for a page allocated in the low memory zone. If
a page were allocated in the high memory zone, we would need to get a
kernel virtual address via kmap()/kmap_atomic(). Did I miss something?
I wonder why it is necessary to call kmap() here.

Reference:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b1e678bf290db5a76f1b6a9f7c381310e03440d6

Best Regards,
Xiao Yang

> 
>> +		/* decrease resp.resid to zero */
>> +		qp->resp.resid -= sizeof(payload);
>> +
>> +		qp->resp.msn++;
>> +
>> +		/* next expected psn, read handles this separately */
>> +		qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
>> +		qp->resp.ack_psn = qp->resp.psn;
>> +
>> +		qp->resp.opcode = pkt->opcode;
>> +		qp->resp.status = IB_WC_SUCCESS;
>> +
>> +		return RESPST_ACKNOWLEDGE;
>> +#else
>> +		pr_err("32-bit arch doesn't support 8-byte atomic write\n");
>> +		return RESPST_ERR_UNSUPPORTED_OPCODE;
> 
> No print on receiving a remote packet
> 
> Jason
Jason Gunthorpe Oct. 14, 2022, 4 p.m. UTC | #8
On Thu, Oct 06, 2022 at 03:53:36PM +0800, Yang, Xiao/杨 晓 wrote:
> On 2022/9/24 6:22, Jason Gunthorpe wrote:
> > On Fri, Jul 08, 2022 at 04:02:36AM +0000, yangx.jy@fujitsu.com wrote:
> > > +static enum resp_states atomic_write_reply(struct rxe_qp *qp,
> > > +					   struct rxe_pkt_info *pkt)
> > > +{
> > > +	u64 src, *dst;
> > > +	struct resp_res *res = qp->resp.res;
> > > +	struct rxe_mr *mr = qp->resp.mr;
> > > +	int payload = payload_size(pkt);
> > > +
> > > +	if (!res) {
> > > +		res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK);
> > > +		qp->resp.res = res;
> > > +	}
> > > +
> > > +	if (!res->replay) {
> > > +#ifdef CONFIG_64BIT
> > > +		memcpy(&src, payload_addr(pkt), payload);
> > > +
> > > +		dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload);
> > > +		/* check vaddr is 8 bytes aligned. */
> > > +		if (!dst || (uintptr_t)dst & 7)
> > > +			return RESPST_ERR_MISALIGNED_ATOMIC;
> > > +
> > > +		/* Do atomic write after all prior operations have completed */
> > > +		smp_store_release(dst, src);
> > 
> > Someone needs to fix iova_to_vaddr to do the missing kmap, we can't
> > just assume you can cast a u64 pfn to a vaddr like this.
> 
> Hi Jason,
> 
> Sorry, it is still not clear to me after looking into the related code
> again.
> 
> IMO, SoftRoCE depends on the INFINIBAND_VIRT_DMA Kconfig option, which
> only allows !HIGHMEM, so SoftRoCE can call page_address() to get a
> kernel virtual address for a page allocated in the low memory zone. If
> a page were allocated in the high memory zone, we would need to get a
> kernel virtual address via kmap()/kmap_atomic(). Did I miss something?
> I wonder why it is necessary to call kmap() here.

People have been thinking of new uses for kmap, rxe still should be
calling it.

The above just explains why it doesn't fail today, it doesn't excuse
the wrong usage.

Jason
Xiao Yang Oct. 20, 2022, 1:25 p.m. UTC | #9
On 2022/10/15 0:00, Jason Gunthorpe wrote:
> People have been thinking of new uses for kmap, rxe still should be
> calling it.

Hi Jason,

Thanks a lot for your reply.
Could you tell me what the new uses of kmap() are?

> 
> The above just explains why it doesn't fail today, it doesn't excuse
> the wrong usage.

I wonder why we need to use kmap() in this case.
I'm sorry to ask a stupid question.

Best Regards,
Xiao Yang
> 
> Jason
Jason Gunthorpe Oct. 24, 2022, 5:02 p.m. UTC | #10
On Thu, Oct 20, 2022 at 09:25:46PM +0800, Yang, Xiao/杨 晓 wrote:
> On 2022/10/15 0:00, Jason Gunthorpe wrote:
> > People have been thinking of new uses for kmap, rxe still should be
> > calling it.
> 
> Hi Jason,
> 
> Thanks a lot for your reply.
> Could you tell me what the new uses of kmap() are?

New in-kernel memory protection schemes
 
> > The above just explains why it doesn't fail today, it doesn't excuse
> > the wrong usage.
> 
> I wonder why we need to use kmap() in this case.
> I'm sorry to ask a stupid question.

It is the defined API to convert a struct page into an address the CPU
can access.
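
Roughly, the mapped access would look like this sketch.
rxe_mr_iova_to_page() is a made-up helper used only for illustration
(it is not an existing rxe function); src, mr and the iova come from
the surrounding atomic_write_reply() context:

  	struct page *page;
  	unsigned int page_off;
  	u64 *vaddr;

  	/* resolve the iova inside the MR to a struct page + offset;
  	 * rxe_mr_iova_to_page() is hypothetical
  	 */
  	if (rxe_mr_iova_to_page(mr, qp->resp.va + qp->resp.offset,
  				&page, &page_off))
  		return RESPST_ERR_RKEY_VIOLATION;

  	/* the 8-byte payload must be naturally aligned */
  	if (page_off & 7)
  		return RESPST_ERR_MISALIGNED_ATOMIC;

  	vaddr = kmap_local_page(page) + page_off;
  	/* do the atomic write after all prior operations have completed */
  	smp_store_release(vaddr, src);
  	kunmap_local(vaddr);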

Jason

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index da3a398053b8..16b90d68d2cb 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -104,6 +104,7 @@  static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
 	case IB_WR_LOCAL_INV:			return IB_WC_LOCAL_INV;
 	case IB_WR_REG_MR:			return IB_WC_REG_MR;
 	case IB_WR_BIND_MW:			return IB_WC_BIND_MW;
+	case IB_WR_RDMA_ATOMIC_WRITE:		return IB_WC_RDMA_ATOMIC_WRITE;
 
 	default:
 		return 0xff;
@@ -256,6 +257,9 @@  static inline enum comp_state check_ack(struct rxe_qp *qp,
 		if ((syn & AETH_TYPE_MASK) != AETH_ACK)
 			return COMPST_ERROR;
 
+		if (wqe->wr.opcode == IB_WR_RDMA_ATOMIC_WRITE)
+			return COMPST_WRITE_SEND;
+
 		fallthrough;
 		/* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH)
 		 */
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
index d4ba4d506f17..d284fa8798c3 100644
--- a/drivers/infiniband/sw/rxe/rxe_opcode.c
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
@@ -101,6 +101,12 @@  struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
 			[IB_QPT_UC]	= WR_LOCAL_OP_MASK,
 		},
 	},
+	[IB_WR_RDMA_ATOMIC_WRITE]			= {
+		.name   = "IB_WR_RDMA_ATOMIC_WRITE",
+		.mask   = {
+			[IB_QPT_RC]	= WR_ATOMIC_WRITE_MASK,
+		},
+	},
 };
 
 struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
@@ -378,6 +384,18 @@  struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
 					  RXE_IETH_BYTES,
 		}
 	},
+	[IB_OPCODE_RC_RDMA_ATOMIC_WRITE]			= {
+		.name   = "IB_OPCODE_RC_RDMA_ATOMIC_WRITE",
+		.mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+			  RXE_ATOMIC_WRITE_MASK | RXE_START_MASK |
+			  RXE_END_MASK,
+		.length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+		.offset = {
+			[RXE_BTH]	= 0,
+			[RXE_RETH]	= RXE_BTH_BYTES,
+			[RXE_PAYLOAD]	= RXE_BTH_BYTES + RXE_RETH_BYTES,
+		}
+	},
 
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]			= {
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
index 8f9aaaf260f2..5962f5fc66a6 100644
--- a/drivers/infiniband/sw/rxe/rxe_opcode.h
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
@@ -20,6 +20,7 @@  enum rxe_wr_mask {
 	WR_READ_MASK			= BIT(3),
 	WR_WRITE_MASK			= BIT(4),
 	WR_LOCAL_OP_MASK		= BIT(5),
+	WR_ATOMIC_WRITE_MASK            = BIT(7),
 
 	WR_READ_OR_WRITE_MASK		= WR_READ_MASK | WR_WRITE_MASK,
 	WR_WRITE_OR_SEND_MASK		= WR_WRITE_MASK | WR_SEND_MASK,
@@ -81,6 +82,8 @@  enum rxe_hdr_mask {
 
 	RXE_LOOPBACK_MASK	= BIT(NUM_HDR_TYPES + 12),
 
+	RXE_ATOMIC_WRITE_MASK	= BIT(NUM_HDR_TYPES + 14),
+
 	RXE_READ_OR_ATOMIC_MASK	= (RXE_READ_MASK | RXE_ATOMIC_MASK),
 	RXE_WRITE_OR_SEND_MASK	= (RXE_WRITE_MASK | RXE_SEND_MASK),
 	RXE_READ_OR_WRITE_MASK	= (RXE_READ_MASK | RXE_WRITE_MASK),
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 15fefc689ca3..613c7031f562 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -235,6 +235,10 @@  static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
 		else
 			return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
 				IB_OPCODE_RC_SEND_FIRST;
+
+	case IB_WR_RDMA_ATOMIC_WRITE:
+		return IB_OPCODE_RC_RDMA_ATOMIC_WRITE;
+
 	case IB_WR_REG_MR:
 	case IB_WR_LOCAL_INV:
 		return opcode;
@@ -463,6 +467,11 @@  static int finish_packet(struct rxe_qp *qp, struct rxe_av *av,
 		}
 	}
 
+	if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
+		memcpy(payload_addr(pkt), wqe->wr.wr.rdma.atomic_wr, payload);
+		wqe->dma.resid -= payload;
+	}
+
 	return 0;
 }
 
@@ -663,13 +672,15 @@  int rxe_requester(void *arg)
 	}
 
 	mask = rxe_opcode[opcode].mask;
-	if (unlikely(mask & RXE_READ_OR_ATOMIC_MASK)) {
+	if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK |
+			RXE_ATOMIC_WRITE_MASK))) {
 		if (check_init_depth(qp, wqe))
 			goto exit;
 	}
 
 	mtu = get_mtu(qp);
-	payload = (mask & RXE_WRITE_OR_SEND_MASK) ? wqe->dma.resid : 0;
+	payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ?
+			wqe->dma.resid : 0;
 	if (payload > mtu) {
 		if (qp_type(qp) == IB_QPT_UD) {
 			/* C10-93.1.1: If the total sum of all the buffer lengths specified for a
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 28033849d404..2cf544abe0dc 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -22,6 +22,7 @@  enum resp_states {
 	RESPST_EXECUTE,
 	RESPST_READ_REPLY,
 	RESPST_ATOMIC_REPLY,
+	RESPST_ATOMIC_WRITE_REPLY,
 	RESPST_COMPLETE,
 	RESPST_ACKNOWLEDGE,
 	RESPST_CLEANUP,
@@ -57,6 +58,7 @@  static char *resp_state_name[] = {
 	[RESPST_EXECUTE]			= "EXECUTE",
 	[RESPST_READ_REPLY]			= "READ_REPLY",
 	[RESPST_ATOMIC_REPLY]			= "ATOMIC_REPLY",
+	[RESPST_ATOMIC_WRITE_REPLY]		= "ATOMIC_WRITE_REPLY",
 	[RESPST_COMPLETE]			= "COMPLETE",
 	[RESPST_ACKNOWLEDGE]			= "ACKNOWLEDGE",
 	[RESPST_CLEANUP]			= "CLEANUP",
@@ -260,7 +262,7 @@  static enum resp_states check_op_valid(struct rxe_qp *qp,
 	case IB_QPT_RC:
 		if (((pkt->mask & RXE_READ_MASK) &&
 		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
-		    ((pkt->mask & RXE_WRITE_MASK) &&
+		    ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) &&
 		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
 		    ((pkt->mask & RXE_ATOMIC_MASK) &&
 		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) {
@@ -364,7 +366,7 @@  static enum resp_states check_resource(struct rxe_qp *qp,
 		}
 	}
 
-	if (pkt->mask & RXE_READ_OR_ATOMIC_MASK) {
+	if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) {
 		/* it is the requesters job to not send
 		 * too many read/atomic ops, we just
 		 * recycle the responder resource queue
@@ -415,7 +417,7 @@  static enum resp_states check_rkey(struct rxe_qp *qp,
 	enum resp_states state;
 	int access;
 
-	if (pkt->mask & RXE_READ_OR_WRITE_MASK) {
+	if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
 		if (pkt->mask & RXE_RETH_MASK) {
 			qp->resp.va = reth_va(pkt);
 			qp->resp.offset = 0;
@@ -483,7 +485,7 @@  static enum resp_states check_rkey(struct rxe_qp *qp,
 		goto err;
 	}
 
-	if (pkt->mask & RXE_WRITE_MASK)	 {
+	if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
 		if (resid > mtu) {
 			if (pktlen != mtu || bth_pad(pkt)) {
 				state = RESPST_ERR_LENGTH;
@@ -583,6 +585,7 @@  static struct resp_res *rxe_prepare_res(struct rxe_qp *qp,
 		res->state = rdatm_res_state_new;
 		break;
 	case RXE_ATOMIC_MASK:
+	case RXE_ATOMIC_WRITE_MASK:
 		res->first_psn = pkt->psn;
 		res->last_psn = pkt->psn;
 		res->cur_psn = pkt->psn;
@@ -652,6 +655,53 @@  static enum resp_states atomic_reply(struct rxe_qp *qp,
 	return ret;
 }
 
+static enum resp_states atomic_write_reply(struct rxe_qp *qp,
+					   struct rxe_pkt_info *pkt)
+{
+	u64 src, *dst;
+	struct resp_res *res = qp->resp.res;
+	struct rxe_mr *mr = qp->resp.mr;
+	int payload = payload_size(pkt);
+
+	if (!res) {
+		res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK);
+		qp->resp.res = res;
+	}
+
+	if (!res->replay) {
+#ifdef CONFIG_64BIT
+		memcpy(&src, payload_addr(pkt), payload);
+
+		dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload);
+		/* check vaddr is 8 bytes aligned. */
+		if (!dst || (uintptr_t)dst & 7)
+			return RESPST_ERR_MISALIGNED_ATOMIC;
+
+		/* Do atomic write after all prior operations have completed */
+		smp_store_release(dst, src);
+
+		/* decrease resp.resid to zero */
+		qp->resp.resid -= sizeof(payload);
+
+		qp->resp.msn++;
+
+		/* next expected psn, read handles this separately */
+		qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+		qp->resp.ack_psn = qp->resp.psn;
+
+		qp->resp.opcode = pkt->opcode;
+		qp->resp.status = IB_WC_SUCCESS;
+
+		return RESPST_ACKNOWLEDGE;
+#else
+		pr_err("32-bit arch doesn't support 8-byte atomic write\n");
+		return RESPST_ERR_UNSUPPORTED_OPCODE;
+#endif /* CONFIG_64BIT */
+	}
+
+	return RESPST_ACKNOWLEDGE;
+}
+
 static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
 					  struct rxe_pkt_info *ack,
 					  int opcode,
@@ -892,6 +942,8 @@  static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
 		return RESPST_READ_REPLY;
 	} else if (pkt->mask & RXE_ATOMIC_MASK) {
 		return RESPST_ATOMIC_REPLY;
+	} else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
+		return RESPST_ATOMIC_WRITE_REPLY;
 	} else {
 		/* Unreachable */
 		WARN_ON_ONCE(1);
@@ -1074,6 +1126,31 @@  static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
 	return err;
 }
 
+static int send_read_response(struct rxe_qp *qp, u8 syndrome, u32 psn)
+{
+	int err = 0;
+	struct rxe_pkt_info ack_pkt;
+	struct sk_buff *skb;
+
+	skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY,
+				0, psn, syndrome);
+	if (!skb) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = rxe_xmit_packet(qp, &ack_pkt, skb);
+	if (err)
+		pr_err_ratelimited("Failed sending read response\n");
+
+	/* have to clear this since it is used to trigger
+	 * long read replies
+	 */
+	qp->resp.res = NULL;
+out:
+	return err;
+}
+
 static enum resp_states acknowledge(struct rxe_qp *qp,
 				    struct rxe_pkt_info *pkt)
 {
@@ -1084,6 +1161,8 @@  static enum resp_states acknowledge(struct rxe_qp *qp,
 		send_ack(qp, qp->resp.aeth_syndrome, pkt->psn);
 	else if (pkt->mask & RXE_ATOMIC_MASK)
 		send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
+	else if (pkt->mask & RXE_ATOMIC_WRITE_MASK)
+		send_read_response(qp, AETH_ACK_UNLIMITED, pkt->psn);
 	else if (bth_ack(pkt))
 		send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
 
@@ -1195,7 +1274,9 @@  static enum resp_states duplicate_request(struct rxe_qp *qp,
 			res->replay = 1;
 			res->cur_psn = pkt->psn;
 			qp->resp.res = res;
-			rc = RESPST_ATOMIC_REPLY;
+			rc = pkt->mask & RXE_ATOMIC_MASK ?
+					RESPST_ATOMIC_REPLY :
+					RESPST_ATOMIC_WRITE_REPLY;
 			goto out;
 		}
 
@@ -1335,6 +1416,9 @@  int rxe_responder(void *arg)
 		case RESPST_ATOMIC_REPLY:
 			state = atomic_reply(qp, pkt);
 			break;
+		case RESPST_ATOMIC_WRITE_REPLY:
+			state = atomic_write_reply(qp, pkt);
+			break;
 		case RESPST_ACKNOWLEDGE:
 			state = acknowledge(qp, pkt);
 			break;
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index a9162f25beaf..519ec6b841e7 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -84,6 +84,7 @@  enum {
 	/* opcode 0x15 is reserved */
 	IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
 	IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
+	IB_OPCODE_RDMA_ATOMIC_WRITE                 = 0x1D,
 
 	/* real constants follow -- see comment about above IB_OPCODE()
 	   macro for more details */
@@ -112,6 +113,7 @@  enum {
 	IB_OPCODE(RC, FETCH_ADD),
 	IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
 	IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+	IB_OPCODE(RC, RDMA_ATOMIC_WRITE),
 
 	/* UC */
 	IB_OPCODE(UC, SEND_FIRST),
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9c6317cf80d5..7834285c8498 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -985,6 +985,7 @@  enum ib_wc_opcode {
 	IB_WC_REG_MR,
 	IB_WC_MASKED_COMP_SWAP,
 	IB_WC_MASKED_FETCH_ADD,
+	IB_WC_RDMA_ATOMIC_WRITE = IB_UVERBS_WC_RDMA_ATOMIC_WRITE,
 /*
  * Set value of IB_WC_RECV so consumers can test if a completion is a
  * receive by testing (opcode & IB_WC_RECV).
@@ -1325,6 +1326,7 @@  enum ib_wr_opcode {
 		IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
 	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
 		IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+	IB_WR_RDMA_ATOMIC_WRITE = IB_UVERBS_WR_RDMA_ATOMIC_WRITE,
 
 	/* These are kernel only and can not be issued by userspace */
 	IB_WR_REG_MR = 0x20,
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 7dd903d932e5..175ade79e358 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -466,6 +466,7 @@  enum ib_uverbs_wc_opcode {
 	IB_UVERBS_WC_BIND_MW = 5,
 	IB_UVERBS_WC_LOCAL_INV = 6,
 	IB_UVERBS_WC_TSO = 7,
+	IB_UVERBS_WC_RDMA_ATOMIC_WRITE = 9,
 };
 
 struct ib_uverbs_wc {
@@ -784,6 +785,7 @@  enum ib_uverbs_wr_opcode {
 	IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
 	IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
 	IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
+	IB_UVERBS_WR_RDMA_ATOMIC_WRITE = 15,
 	/* Review enum ib_wr_opcode before modifying this */
 };
 
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h
index f09c5c9e3dd5..845da9cb04fd 100644
--- a/include/uapi/rdma/rdma_user_rxe.h
+++ b/include/uapi/rdma/rdma_user_rxe.h
@@ -86,6 +86,7 @@  struct rxe_send_wr {
 			__aligned_u64 remote_addr;
 			__u32	rkey;
 			__u32	reserved;
+			__u8	atomic_wr[8];
 		} rdma;
 		struct {
 			__aligned_u64 remote_addr;