[rdma-core,07/14] mlx4: Update to use new udma write barriers

Message ID 1487272989-8215-8-git-send-email-jgunthorpe@obsidianresearch.com (mailing list archive)
State Accepted

Commit Message

Jason Gunthorpe Feb. 16, 2017, 7:23 p.m. UTC
The mlx4 comments are good so these translate fairly directly.

- Added a barrier at the top of mlx4_post_send. This makes the driver
  ready for a change to a stronger udma_to_device_barrier /
  weaker udma_ordering_write_barrier(), which would make the post loop a
  bit faster. No change on x86-64.
- The wmb() directly before the BF copy is upgraded to a wc_wmb(); this
  is consistent with what mlx5 does and makes sense.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
---
 providers/mlx4/cq.c  |  6 +++---
 providers/mlx4/qp.c  | 19 +++++++++++--------
 providers/mlx4/srq.c |  2 +-
 3 files changed, 15 insertions(+), 12 deletions(-)

Comments

Yishai Hadas Feb. 20, 2017, 5:46 p.m. UTC | #1
On 2/16/2017 9:23 PM, Jason Gunthorpe wrote:
> The mlx4 comments are good so these translate fairly directly.
>
> - Added a barrier at the top of mlx4_post_send. This makes the driver
>   ready for a change to a stronger udma_to_device_barrier /
>   weaker udma_ordering_write_barrier(), which would make the post loop a
>   bit faster. No change on x86-64.
> - The wmb() directly before the BF copy is upgraded to a wc_wmb(); this
>   is consistent with what mlx5 does and makes sense.
>
> Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> ---
>  providers/mlx4/cq.c  |  6 +++---
>  providers/mlx4/qp.c  | 19 +++++++++++--------
>  providers/mlx4/srq.c |  2 +-
>  3 files changed, 15 insertions(+), 12 deletions(-)
>
> diff --git a/providers/mlx4/cq.c b/providers/mlx4/cq.c
> index 6a5cf8be218892..14f8cbce6d75ed 100644
> --- a/providers/mlx4/cq.c
> +++ b/providers/mlx4/cq.c
> @@ -222,7 +222,7 @@ static inline int mlx4_get_next_cqe(struct mlx4_cq *cq,
>  	 * Make sure we read CQ entry contents after we've checked the
>  	 * ownership bit.
>  	 */
> -	rmb();
> +	udma_from_device_barrier();
>
>  	*pcqe = cqe;
>
> @@ -698,7 +698,7 @@ int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
>  	 * Make sure that the doorbell record in host memory is
>  	 * written before ringing the doorbell via PCI MMIO.
>  	 */
> -	wmb();
> +	udma_to_device_barrier();
>
>  	doorbell[0] = htonl(sn << 28 | cmd | cq->cqn);
>  	doorbell[1] = htonl(ci);
> @@ -764,7 +764,7 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
>  		 * Make sure update of buffer contents is done before
>  		 * updating consumer index.
>  		 */
> -		wmb();
> +		udma_to_device_barrier();
>  		mlx4_update_cons_index(cq);
>  	}
>  }
> diff --git a/providers/mlx4/qp.c b/providers/mlx4/qp.c
> index a607326c7c452c..77a4a34576cb69 100644
> --- a/providers/mlx4/qp.c
> +++ b/providers/mlx4/qp.c
> @@ -204,7 +204,7 @@ static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
>  	 * chunk and get a valid (!= * 0xffffffff) byte count but
>  	 * stale data, and end up sending the wrong data.
>  	 */
> -	wmb();
> +	udma_ordering_write_barrier();
>
>  	if (likely(sg->length))
>  		dseg->byte_count = htonl(sg->length);
> @@ -228,6 +228,9 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>
>  	pthread_spin_lock(&qp->sq.lock);
>
> +	/* Get all user DMA buffers ready to go */
> +	udma_to_device_barrier();
> +
>  	/* XXX check that state is OK to post send */

It's not clear why we need an extra barrier here. What is the future
optimization that you are pointing to?
We would prefer not to add any new instructions, at least on some
architectures, without a clear justification.

>  	ind = qp->sq.head;
> @@ -400,7 +403,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>  					wqe += to_copy;
>  					addr += to_copy;
>  					seg_len += to_copy;
> -					wmb(); /* see comment below */
> +					udma_ordering_write_barrier(); /* see comment below */
>  					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
>  					seg_len = 0;
>  					seg = wqe;
> @@ -428,7 +431,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>  				 * data, and end up sending the wrong
>  				 * data.
>  				 */
> -				wmb();
> +				udma_ordering_write_barrier();
>  				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
>  			}
>
> @@ -450,7 +453,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>  		 * setting ownership bit (because HW can start
>  		 * executing as soon as we do).
>  		 */
> -		wmb();
> +		udma_ordering_write_barrier();
>
>  		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
>  			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);
> @@ -478,7 +481,7 @@ out:
>  		 * Make sure that descriptor is written to memory
>  		 * before writing to BlueFlame page.
>  		 */
> -		wmb();
> +		mmio_wc_start();

Why make this change, which affects at least x86_64? The data was
previously written to host memory, so we expect that wmb() is enough
here. See the comment above, which explicitly says so.
>
>  		++qp->sq.head;
>
> @@ -486,7 +489,7 @@ out:
>
>  		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
>  			     align(size * 16, 64));
> -		wc_wmb();
> +		mmio_flush_writes();
>
>  		ctx->bf_offset ^= ctx->bf_buf_size;
>
> @@ -498,7 +501,7 @@ out:
>  		 * Make sure that descriptors are written before
>  		 * doorbell record.
>  		 */
> -		wmb();
> +		udma_to_device_barrier();
>
>  		mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
>  			    qp->doorbell_qpn);
> @@ -566,7 +569,7 @@ out:
>  		 * Make sure that descriptors are written before
>  		 * doorbell record.
>  		 */
> -		wmb();
> +		udma_to_device_barrier();
>
>  		*qp->db = htonl(qp->rq.head & 0xffff);
>  	}
> diff --git a/providers/mlx4/srq.c b/providers/mlx4/srq.c
> index 4f90efdf927209..6e4ff5663d019b 100644
> --- a/providers/mlx4/srq.c
> +++ b/providers/mlx4/srq.c
> @@ -113,7 +113,7 @@ int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
>  		 * Make sure that descriptors are written before
>  		 * we write doorbell record.
>  		 */
> -		wmb();
> +		udma_to_device_barrier();
>
>  		*srq->db = htonl(srq->counter);
>  	}
>

Jason Gunthorpe Feb. 21, 2017, 6:14 p.m. UTC | #2
On Mon, Feb 20, 2017 at 07:46:02PM +0200, Yishai Hadas wrote:

> > 	pthread_spin_lock(&qp->sq.lock);
> >
> >+	/* Get all user DMA buffers ready to go */
> >+	udma_to_device_barrier();
> >+
> > 	/* XXX check that state is OK to post send */
> 
> It's not clear why we need an extra barrier here. What is the future
> optimization that you are pointing to?

Writes to different memory types are not guaranteed to be strongly
ordered. This is only narrowly true on x86-64 too, apparently.

The purpose of udma_to_device_barrier is to serialize all memory
types.

This allows the follow on code to use the weaker
'udma_ordering_write_barrier' when it knows it is working exclusively
with cached memory.

Eg in an ideal world on x86 udma_to_device_barrier should be SFENCE
and 'udma_ordering_write_barrier' should be compiler_barrier()

Since the barriers have been so ill-defined and wonky on x86, other
arches use much stronger barriers than they actually need in each
case. Eg ARM64 can switch to much weaker barriers.

Since there is no cost on x86-64 to this barrier I would like to leave
it here as it lets us actually optimize the ARM and other barriers. If
you take it out then 'udma_ordering_write_barrier' is forced to be the
strongest barrier on all arches.
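
To make the intended split concrete, here is a minimal sketch of per-arch
definitions along the lines described above. It is illustrative only, not
the actual udma barrier header from this series, and the per-architecture
instruction choices are assumptions:

/* Illustrative sketch only -- not the real rdma-core definitions.
 * What matters is the difference in strength, not the exact opcodes. */
#if defined(__x86_64__)
/* Serialize stores to all memory types before the device may see them. */
#define udma_to_device_barrier()      __asm__ volatile("sfence" ::: "memory")
/* Order stores within cached (WB) memory only; x86 keeps WB stores in
 * program order, so a compiler barrier would be enough. */
#define udma_ordering_write_barrier() __asm__ volatile("" ::: "memory")
#elif defined(__aarch64__)
/* Assumed ARM64 mapping: the one-time outer barrier stays strong... */
#define udma_to_device_barrier()      __asm__ volatile("dsb st" ::: "memory")
/* ...so the more frequent inner barrier can be weaker. */
#define udma_ordering_write_barrier() __asm__ volatile("dmb oshst" ::: "memory")
#endif

With that split, the single udma_to_device_barrier() at the top of
mlx4_post_send() pays the cross-memory-type cost once, and the
udma_ordering_write_barrier() calls inside the per-WQE loop only have to
order ordinary cached stores.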

> >@@ -478,7 +481,7 @@ out:
> > 		 * Make sure that descriptor is written to memory
> > 		 * before writing to BlueFlame page.
> > 		 */
> >-		wmb();
> >+		mmio_wc_start();
> 
> Why make this change, which affects at least x86_64? The data was
> previously written to host memory, so we expect that wmb() is
> enough here.

Same as above, writes to different memory types are not strongly
ordered and wmb() is not the right barrier to serialize writes to
DMA'able ctrl with the WC memcpy.

If this is left wrong then other arches are again required to
adopt the strongest barrier for everything, which hurts them.

Even on x86, it is very questionable to not have the SFENCE in that
spot. AFAIK it is not defined to be strongly ordered.

mlx5 has the SFENCE here, for instance.
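
As a minimal sketch of the ordering the BlueFlame path needs (simplified,
with made-up parameter names, and assuming the mmio_* macros introduced by
this series are in scope; the real code is the mlx4_post_send() hunk
above):

#include <string.h>

static void post_bf_doorbell(void *wqe_wb,  /* WQE in WB host memory      */
                             void *bf_wc,   /* mapped WC BlueFlame page   */
                             const void *desc, size_t len)
{
	memcpy(wqe_wb, desc, len); /* 1. build the WQE in cached memory      */
	mmio_wc_start();           /* 2. order the WB stores before the WC
	                            *    stores; a plain wmb() does not
	                            *    order across memory types           */
	memcpy(bf_wc, desc, len);  /* 3. copy into the WC BlueFlame page     */
	mmio_flush_writes();       /* 4. flush the WC buffer as one burst    */
}

Without the barrier at step 2 the device can observe the WC doorbell
before the WQE contents are globally visible.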

Jason
Yishai Hadas March 6, 2017, 2:57 p.m. UTC | #3
On 2/21/2017 8:14 PM, Jason Gunthorpe wrote:
> On Mon, Feb 20, 2017 at 07:46:02PM +0200, Yishai Hadas wrote:
>
>>> 	pthread_spin_lock(&qp->sq.lock);
>>>
>>> +	/* Get all user DMA buffers ready to go */
>>> +	udma_to_device_barrier();
>>> +
>>> 	/* XXX check that state is OK to post send */
>>
>> It's not clear why we need an extra barrier here. What is the future
>> optimization that you are pointing to?
>
> Writes to different memory types are not guaranteed to be strongly
> ordered. This is only narrowly true on x86-64 too, apparently.
>
> The purpose of udma_to_device_barrier is to serialize all memory
> types.
>
> This allows the follow on code to use the weaker
> 'udma_ordering_write_barrier' when it knows it is working exclusively
> with cached memory.
>
> Eg in an ideal world on x86 udma_to_device_barrier should be SFENCE
> and 'udma_ordering_write_barrier' should be compiler_barrier()
>
> Since the barriers have been so ill-defined and wonky on x86, other
> arches use much stronger barriers than they actually need in each
> case. Eg ARM64 can switch to much weaker barriers.
>
> Since there is no cost on x86-64 to this barrier I would like to leave
> it here as it lets us actually optimize the ARM and other barriers. If
> you take it out then 'udma_ordering_write_barrier' is forced to be the
> strongest barrier on all arches.

Until we make the further optimizations, we suspect a performance
degradation on architectures other than x86, as this patch introduces an
extra barrier that wasn't there before (i.e. udma_to_device_barrier).
Usually such a change should come together with the follow-up improvement
patch that justifies it. The impact is still something that we should
measure, but basically any new code in the data path must be justified
from a performance point of view when it is added.

>>> @@ -478,7 +481,7 @@ out:
>>> 		 * Make sure that descriptor is written to memory
>>> 		 * before writing to BlueFlame page.
>>> 		 */
>>> -		wmb();
>>> +		mmio_wc_start();
>>
>> Why make this change, which affects at least x86_64? The data was
>> previously written to host memory, so we expect that wmb() is
>> enough here.
>
> Same as above, writes to different memory types are not strongly
> ordered and wmb() is not the right barrier to serialize writes to
> DMA'able ctrl with the WC memcpy.
>
> If this is left wrong then other arches are again required to
> adopt the strongest barrier for everything, which hurts them.
>
> Even on x86, it is very questionable to not have the SFENCE in that
> spot. AFAIK it is not defined to be strongly ordered.
>
> mlx5 has the SFENCE here, for instance.

We did some performance testing with the above change; initial results
point to a degradation of about 3% in the message rate in the above
BlueFlame path on x86, which is something that we should prevent.

Based on the analysis below, it looks as if the change to use
'mmio_wc_start()', which is mapped to SFENCE on x86, is redundant.

Details:
There is a call to 'pthread_spin_lock()' in between the WB (write-back
host memory) writes and the WC writes.

pthread_spin_lock() is defined [1] in terms of the x86 XCHG atomic
instruction. Based on the Intel® 64 and IA-32 Architectures manual [2],
such an operation acts as a barrier that makes sure previous memory
writes have completed.
As such, we would expect to see here some other macro that considers
whether the lock already acts as a barrier on the given architecture,
instead of paying for the SFENCE barrier in all cases; a sketch of that
idea follows the quoted manual text below.



[1] http://code.metager.de/source/xref/gnu/glibc/nptl/pthread_spin_lock.c
It calls atomic_exchange_acq(), which is defined at line 115 of:
http://code.metager.de/source/xref/gnu/glibc/sysdeps/x86_64/atomic-machine.h


[2] Intel® 64 and IA-32 Architectures Software Developer’s Manual
At: 
https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf

8.1.2.2 Software Controlled Bus Locking

On page 8-3 Vol. 3A: “The LOCK prefix is automatically assumed for XCHG
instruction.”

On page 8-4 Vol. 3A: “Locked instructions can be used to synchronize data
written by one processor and read by another processor. For the P6 family
processors, locked operations serialize all outstanding load and store
operations (that is, wait for them to complete). This rule is also true
for the Pentium 4 and Intel Xeon processors, with one exception. Load
operations that reference weakly ordered memory types (such as the WC
memory type) may not be serialized.”

8.3 SERIALIZING INSTRUCTIONS

On page 8-16 Vol. 3A: “Locking operations typically operate like I/O
operations in that they wait for all previous instructions to complete
and for all buffered writes to drain to memory.”
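
A minimal sketch of the kind of macro being suggested here, under the
assumption that the locked XCHG inside the spin lock provides the needed
ordering on x86 (the helper name and the #if split are illustrative, not
an API that exists at this point, and mmio_wc_start() is assumed to be
the macro from this series):

#include <pthread.h>

/* Sketch: take the spin lock that sits between the WB writes and the WC
 * copy, and provide mmio_wc_start() semantics at the same time.  On x86
 * the locked XCHG in pthread_spin_lock() is argued above to already
 * order the earlier WB stores against the later WC stores, so the
 * explicit SFENCE is skipped there; other architectures keep the real
 * barrier. */
static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
{
	pthread_spin_lock(lock);
#if !defined(__i386__) && !defined(__x86_64__)
	mmio_wc_start();
#endif
}

With something like this, the x86 BlueFlame path would only pay for the
lock it already takes, while other architectures keep their full
write-combining barrier.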










Patch

diff --git a/providers/mlx4/cq.c b/providers/mlx4/cq.c
index 6a5cf8be218892..14f8cbce6d75ed 100644
--- a/providers/mlx4/cq.c
+++ b/providers/mlx4/cq.c
@@ -222,7 +222,7 @@  static inline int mlx4_get_next_cqe(struct mlx4_cq *cq,
 	 * Make sure we read CQ entry contents after we've checked the
 	 * ownership bit.
 	 */
-	rmb();
+	udma_from_device_barrier();
 
 	*pcqe = cqe;
 
@@ -698,7 +698,7 @@  int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
 	 * Make sure that the doorbell record in host memory is
 	 * written before ringing the doorbell via PCI MMIO.
 	 */
-	wmb();
+	udma_to_device_barrier();
 
 	doorbell[0] = htonl(sn << 28 | cmd | cq->cqn);
 	doorbell[1] = htonl(ci);
@@ -764,7 +764,7 @@  void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
 		 * Make sure update of buffer contents is done before
 		 * updating consumer index.
 		 */
-		wmb();
+		udma_to_device_barrier();
 		mlx4_update_cons_index(cq);
 	}
 }
diff --git a/providers/mlx4/qp.c b/providers/mlx4/qp.c
index a607326c7c452c..77a4a34576cb69 100644
--- a/providers/mlx4/qp.c
+++ b/providers/mlx4/qp.c
@@ -204,7 +204,7 @@  static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
 	 * chunk and get a valid (!= * 0xffffffff) byte count but
 	 * stale data, and end up sending the wrong data.
 	 */
-	wmb();
+	udma_ordering_write_barrier();
 
 	if (likely(sg->length))
 		dseg->byte_count = htonl(sg->length);
@@ -228,6 +228,9 @@  int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 
 	pthread_spin_lock(&qp->sq.lock);
 
+	/* Get all user DMA buffers ready to go */
+	udma_to_device_barrier();
+
 	/* XXX check that state is OK to post send */
 
 	ind = qp->sq.head;
@@ -400,7 +403,7 @@  int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 					wqe += to_copy;
 					addr += to_copy;
 					seg_len += to_copy;
-					wmb(); /* see comment below */
+					udma_ordering_write_barrier(); /* see comment below */
 					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
 					seg_len = 0;
 					seg = wqe;
@@ -428,7 +431,7 @@  int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 				 * data, and end up sending the wrong
 				 * data.
 				 */
-				wmb();
+				udma_ordering_write_barrier();
 				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
 			}
 
@@ -450,7 +453,7 @@  int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		 * setting ownership bit (because HW can start
 		 * executing as soon as we do).
 		 */
-		wmb();
+		udma_ordering_write_barrier();
 
 		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
 			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);
@@ -478,7 +481,7 @@  out:
 		 * Make sure that descriptor is written to memory
 		 * before writing to BlueFlame page.
 		 */
-		wmb();
+		mmio_wc_start();
 
 		++qp->sq.head;
 
@@ -486,7 +489,7 @@  out:
 
 		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
 			     align(size * 16, 64));
-		wc_wmb();
+		mmio_flush_writes();
 
 		ctx->bf_offset ^= ctx->bf_buf_size;
 
@@ -498,7 +501,7 @@  out:
 		 * Make sure that descriptors are written before
 		 * doorbell record.
 		 */
-		wmb();
+		udma_to_device_barrier();
 
 		mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
 			    qp->doorbell_qpn);
@@ -566,7 +569,7 @@  out:
 		 * Make sure that descriptors are written before
 		 * doorbell record.
 		 */
-		wmb();
+		udma_to_device_barrier();
 
 		*qp->db = htonl(qp->rq.head & 0xffff);
 	}
diff --git a/providers/mlx4/srq.c b/providers/mlx4/srq.c
index 4f90efdf927209..6e4ff5663d019b 100644
--- a/providers/mlx4/srq.c
+++ b/providers/mlx4/srq.c
@@ -113,7 +113,7 @@  int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
 		 * Make sure that descriptors are written before
 		 * we write doorbell record.
 		 */
-		wmb();
+		udma_to_device_barrier();
 
 		*srq->db = htonl(srq->counter);
 	}