[rdma-core,06/14] i40iw: Get rid of unique barrier macros

Message ID 1487272989-8215-7-git-send-email-jgunthorpe@obsidianresearch.com (mailing list archive)
State Accepted

Commit Message

Jason Gunthorpe Feb. 16, 2017, 7:23 p.m. UTC
Use our standard versions from util instead. There doesn't seem
to be anything tricky here, but the inlined versions were like our
wc_wmb() barriers, not the wmb().

There appears to be no WC memory in this driver, so despite the comments,
these barriers are also making sure that user DMA data is flushed out. Make
them all wmb()

Guess at where the missing rmb() should be.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
---
 providers/i40iw/i40iw_osdep.h | 14 --------------
 providers/i40iw/i40iw_uk.c    | 26 ++++++++++++++------------
 2 files changed, 14 insertions(+), 26 deletions(-)

Comments

Saleem, Shiraz March 1, 2017, 5:29 p.m. UTC | #1
On Thu, Feb 16, 2017 at 12:23:01PM -0700, Jason Gunthorpe wrote:
> Use our standard versions from util instead. There doesn't seem
> to be anything tricky here, but the inlined versions were like our
> wc_wmb() barriers, not the wmb().
> 
> There appears to be no WC memory in this driver, so despite the comments,
> these barriers are also making sure that user DMA data is flushed out. Make
> them all wmb()
> 
> Guess at where the missing rmb() should be.
> 
> Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> ---
>  providers/i40iw/i40iw_osdep.h | 14 --------------
>  providers/i40iw/i40iw_uk.c    | 26 ++++++++++++++------------
>  2 files changed, 14 insertions(+), 26 deletions(-)
> 
> diff --git a/providers/i40iw/i40iw_osdep.h b/providers/i40iw/i40iw_osdep.h
> index fddedf40dd8ae2..92bedd31633eb5 100644
> --- a/providers/i40iw/i40iw_osdep.h
> +++ b/providers/i40iw/i40iw_osdep.h
> @@ -105,18 +105,4 @@ static inline void db_wr32(u32 value, u32 *wqe_word)
>  #define ACQUIRE_LOCK()
>  #define RELEASE_LOCK()
>  
> -#if defined(__i386__)
> -#define i40iw_mb() mb()		/* full memory barrier */
> -#define i40iw_wmb() mb()	/* write memory barrier */
> -#elif defined(__x86_64__)
> -#define i40iw_mb() asm volatile("mfence" ::: "memory")	 /* full memory barrier */
> -#define i40iw_wmb() asm volatile("sfence" ::: "memory")	 /* write memory barrier */
> -#else
> -#define i40iw_mb() mb()		/* full memory barrier */
> -#define i40iw_wmb() wmb()	/* write memory barrier */
> -#endif
> -#define i40iw_rmb() rmb()	/* read memory barrier */
> -#define i40iw_smp_mb() smp_mb()		/* memory barrier */
> -#define i40iw_smp_wmb() smp_wmb()	/* write memory barrier */
> -#define i40iw_smp_rmb() smp_rmb()	/* read memory barrier */
>  #endif				/* _I40IW_OSDEP_H_ */
> diff --git a/providers/i40iw/i40iw_uk.c b/providers/i40iw/i40iw_uk.c
> index d3e4fec7d8515b..b20748e9f09199 100644
> --- a/providers/i40iw/i40iw_uk.c
> +++ b/providers/i40iw/i40iw_uk.c
> @@ -75,7 +75,7 @@ static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp)
>  	    LS_64(signaled, I40IWQPSQ_SIGCOMPL) |
>  	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID) | nop_signature++;
>  
> -	i40iw_wmb();	/* Memory barrier to ensure data is written before valid bit is set */
> +	udma_to_device_barrier();	/* Memory barrier to ensure data is written before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  	return 0;
> @@ -91,7 +91,7 @@ void i40iw_qp_post_wr(struct i40iw_qp_uk *qp)
>  	u32 hw_sq_tail;
>  	u32 sw_sq_head;
>  
> -	i40iw_mb(); /* valid bit is written and loads completed before reading shadow */
> +	udma_to_device_barrier(); /* valid bit is written and loads completed before reading shadow */

The constraint here is that the writes to the SQ WQE must be globally visible to the 
PCIe device before the read of the shadow area. For x86, since loads can be reordered 
with older stores to a different location, we need some sort of a store-load barrier to 
enforce the constraint, and hence the mfence was chosen. The udma_to_device_barrier() 
equates to just a compiler barrier on x86 and isn't sufficient for this purpose.
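
For reference, a minimal sketch of the three x86-64 flavours being discussed
(mirroring the macros this patch removes from i40iw_osdep.h; the
udma_to_device_barrier() expansion shown reflects only the compiler-barrier
behaviour described above and is illustrative, not a copy of the util header):

/* Old i40iw macros, removed by this patch (see the hunk above): */
#define i40iw_wmb()  asm volatile("sfence" ::: "memory") /* orders stores vs. stores      */
#define i40iw_mb()   asm volatile("mfence" ::: "memory") /* orders stores vs. later loads */

/* Replacement used by the patch; on x86 this acts only as a compiler
 * barrier, so it does not keep the CPU from hoisting the shadow-area
 * load above the preceding WQE stores. */
#define udma_to_device_barrier() asm volatile("" ::: "memory")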

>  
>  	/* read the doorbell shadow area */
>  	get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp);
> @@ -297,7 +297,7 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
>  		byte_off += 16;
>  	}
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  
> @@ -347,7 +347,7 @@ static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp,
>  
>  	i40iw_set_fragment(wqe, I40IW_BYTE_0, &op_info->lo_addr);
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  	if (post_sq)
> @@ -410,7 +410,7 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
>  		byte_off += 16;
>  	}
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  	if (post_sq)
> @@ -478,7 +478,7 @@ static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp,
>  		memcpy(dest, src, op_info->len - I40IW_BYTE_16);
>  	}
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  
> @@ -552,7 +552,7 @@ static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp,
>  		memcpy(dest, src, op_info->len - I40IW_BYTE_16);
>  	}
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  
> @@ -601,7 +601,7 @@ static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp
>  	    LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
>  	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  
> @@ -650,7 +650,7 @@ static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp,
>  	    LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
>  	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  
> @@ -694,7 +694,7 @@ static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp,
>  		byte_off += 16;
>  	}
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  
> @@ -731,7 +731,7 @@ static void i40iw_cq_request_notification(struct i40iw_cq_uk *cq,
>  
>  	set_64bit_val(cq->shadow_area, I40IW_BYTE_32, temp_val);
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	db_wr32(cq->cq_id, cq->cqe_alloc_reg);
>  }
> @@ -780,6 +780,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
>  	if (polarity != cq->polarity)
>  		return I40IW_ERR_QUEUE_EMPTY;
>  
> +	udma_from_device_barrier();
> +
>  	q_type = (u8)RS_64(qword3, I40IW_CQ_SQ);
>  	info->error = (bool)RS_64(qword3, I40IW_CQ_ERROR);
>  	info->push_dropped = (bool)RS_64(qword3, I40IWCQ_PSHDROP);
> @@ -1121,7 +1123,7 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp,
>  	    LS_64(signaled, I40IWQPSQ_SIGCOMPL) |
>  	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
>  
> -	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
> +	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
>  
>  	set_64bit_val(wqe, I40IW_BYTE_24, header);
>  	if (post_sq)
> -- 
> 2.7.4
> 
Jason Gunthorpe March 1, 2017, 5:55 p.m. UTC | #2
On Wed, Mar 01, 2017 at 11:29:20AM -0600, Shiraz Saleem wrote:

> The constraint here is that the writes to SQ WQE must be globally
> visible to the PCIe device before the read of the shadow area.

I'm struggling to understand how this can make any sense..

It looks like shadow_area is in system memory:

providers/i40iw/i40iw_uverbs.c: info.cq_base = memalign(I40IW_HW_PAGE_SIZE, totalsize);
providers/i40iw/i40iw_uverbs.c: info.shadow_area = (u64 *)((u8 *)info.cq_base + (cq_pages << 12));
providers/i40iw/i40iw_uk.c:     qp->shadow_area = info->shadow_area;

Is there DMA occurring to shadow_area?

Is this trying to implement CPU atomics in shadow_area for SMP?

I'm struggling to understand what a store/load barrier should even do
when talking about DMA.

> For x86, since loads can be reordered with older stores to a
> different location, we need some sort of a store-load barrier to
> enforce the constraint and hence the mfence was chosen. The
> udma_to_device_barrier() equates to just a compiler barrier on x86
> and isn't sufficient for this purpose.

We've never had MFENCE in our set of standard barriers. If it really
is a needed operation for DMA then we will have to add a new barrier
macro for it.

However - I can't explain what that macro should be doing relative to
DMA, so I need more help.. Can you diagram a ladder chart to show the
issue?

This is the sequence at the CPU we are looking at:

udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
set_64bit_val(wqe, I40IW_BYTE_24, header);
udma_to_device_barrier();
get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp);

What is the 2nd actor doing? Is it DMA from the PCI-E device? Is it
another SMP core?

What can go wrong if it executes like this?

get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp);
udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
set_64bit_val(wqe, I40IW_BYTE_24, header);
udma_to_device_barrier();

Is this the only barrier you are worried about?
Are the other changes OK?

Jason
Saleem, Shiraz March 1, 2017, 10:14 p.m. UTC | #3
On Wed, Mar 01, 2017 at 10:55:21AM -0700, Jason Gunthorpe wrote:
> On Wed, Mar 01, 2017 at 11:29:20AM -0600, Shiraz Saleem wrote:
> 
> 
> Is there DMA occurring to shadow_area?

The shadow area contains status variables which are read by SW and 
updated by the PCI device.

> 
> Is this trying to implement CPU atomics in shadow_area for SMP?

No.

> What can go wrong if it executes like this?
> 
> get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp);
> udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
> set_64bit_val(wqe, I40IW_BYTE_24, header);
> udma_to_device_barrier();

We need strict ordering that ensures the write of the WQE completes before 
the read of the shadow area. This ensures the value read from the shadow can 
be used to determine whether a DB ring is needed. If the shadow area is read first, 
the algorithm, in certain cases, would not ring the DB when it should, and the 
HW may go idle with work requests posted.
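
As a rough sketch of that pattern (simplified, with hypothetical helpers
hw_tail_of()/sw_head_of() and assumed QP field names; this is not the actual
i40iw_qp_post_wr() logic, which has additional HW-specific steps):

static void post_wr_sketch(struct i40iw_qp_uk *qp)
{
	u64 shadow;

	/* The WQE payload and its valid bit were already written by the
	 * post verbs, separated by udma_to_device_barrier(). */

	/* Store-load ordering point: the valid-bit store must be globally
	 * visible before the shadow read below.  On x86 that needs MFENCE
	 * (the old i40iw_mb()); a compiler barrier is not enough. */
	asm volatile("mfence" ::: "memory");

	/* The shadow area is DMA-updated by HW with its consumed tail. */
	get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &shadow);

	/* Ring the doorbell only when the shadow shows HW fell behind SW. */
	if (hw_tail_of(shadow) != sw_head_of(qp))
		db_wr32(qp->qp_id, qp->wqe_alloc_reg);
}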

> Is this the only barrier you are worried about?
> Are the other changes OK?

This is of most concern. The other changes look ok. Being reviewed.
Jason Gunthorpe March 1, 2017, 11:05 p.m. UTC | #4
On Wed, Mar 01, 2017 at 04:14:20PM -0600, Shiraz Saleem wrote:
> > Is there DMA occurring to shadow_area?
> 
> The shadow area contains status variables which are read by SW and 
> updated by the PCI device.

So the device is DMA'ing to it, and the driver is reading DMA memory..

> > What can go wrong if it executes like this?
> > 
> > get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp);
> > udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
> > set_64bit_val(wqe, I40IW_BYTE_24, header);
> > udma_to_device_barrier();
> 
> We need strict ordering that ensures write of the WQE completes before 
> read of the shadow area.

> This ensures the value read from the shadow can be used to determine
> if a DB ring is needed. If the shadow area is read first, the
> algorithm, in certain cases, would not ring the DB when it should
> and the HW may go idle with work requests posted.

This still is not making a lot of sense to me.. I really need to see a
ladder diagram to understand your case.

Here is an example, I think what you are saying is: The HW could have
fetched valid = 0 and stopped the queue and the driver needs to
doorbell it to wake it up again. However, the driver optimizes away
the doorbell rings in certain cases based on reading a DMA result.

So here is a possible ladder diagram:

CPU                         DMA DEVICE
                            Issue READ#1 of valid bit
 Respond to READ#1
 SFENCE
 set_valid_bit
 MFENCE
 read_tail
                            Receive READ#1 response with valid bit unset
			    Issue DMA WRITE to shadow_area indicating STOPPED
 DMA WRITE arrives

And the version where the DMA is seen:

CPU                         DMA DEVICE
                            Issue READ#1 of valid bit
 SFENCE
 Respond to READ#1
 set_valid_bit
 MFENCE
                            Receive READ#1 response with valid bit unset
			    Issue DMA WRITE to shadow_area indicating STOPPED
 DMA WRITE arrives
 read_tail

These diagrams attempt to show that the DMA device reads the valid bit
then DMA's back to the shadow_area depending on what it read.

Given the semantics of MFENCE, both are possible, so I still don't
understand what the MFENCE is supposed to help with.

I get the feeling this approach requires MFENCE to do something it
doesn't...

Jason
Saleem, Shiraz March 3, 2017, 9:45 p.m. UTC | #5
On Wed, Mar 01, 2017 at 04:05:06PM -0700, Jason Gunthorpe wrote:
> On Wed, Mar 01, 2017 at 04:14:20PM -0600, Shiraz Saleem wrote:
> > > Is there DMA occurring to shadow_area?
> > 
> > The shadow area contains status variables which are read by SW and 
> > updated by the PCI device.
> 
> So the device is DMA'ing to it, and the driver is reading DMA memory..
> 
> > > What can go wrong if it executes like this?
> > > 
> > > get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp);
> > > udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
> > > set_64bit_val(wqe, I40IW_BYTE_24, header);
> > > udma_to_device_barrier();
> > 
> > We need strict ordering that ensures write of the WQE completes before 
> > read of the shadow area.
> 
> > This ensures the value read from the shadow can be used to determine
> > if a DB ring is needed. If the shadow area is read first, the
> > algorithm, in certain cases, would not ring the DB when it should
> > and the HW may go idle with work requests posted.
> 
> This still is not making a lot of sense to me.. I really need to see a
> ladder diagram to understand your case.
> 
> Here is an example, I think what you are saying is: The HW could have
> fetched valid = 0 and stopped the queue and the driver needs to
> doorbell it to wake it up again. However, the driver optimizes away
> the doorbell rings in certain cases based on reading a DMA result.
> 
> So here is a possible ladder diagram:
> 
> CPU                         DMA DEVICE
>                             Issue READ#1 of valid bit
>  Respond to READ#1
>  SFENCE
>  set_valid_bit
>  MFENCE
>  read_tail
>                             Receive READ#1 response with valid bit unset
> 			    Issue DMA WRITE to shadow_area indicating STOPPED
>  DMA WRITE arrives
> 
> And the version where the DMA is seen:
> 
> CPU                         DMA DEVICE
>                             Issue READ#1 of valid bit
>  SFENCE
>  Respond to READ#1
>  set_valid_bit
>  MFENCE
>                             Receive READ#1 response with valid bit unset
> 			    Issue DMA WRITE to shadow_area indicating STOPPED
>  DMA WRITE arrives
>  read_tail
> 
> These diagrams attempt to show that the DMA device reads the valid bit
> then DMA's back to the shadow_area depending on what it read.


This is not quite how our DB logic works. There are additional HW steps and nuances 
in the flow. Unfortunately, to explain this, we need to provide details of our internal 
HW flow for the DB logic. We are unable to do so at this time.

The ordering is a HW requirement, i.e., the write of the WQE valid bit __must__ precede 
the read of the shadow tail.

> 
> I get the feeling this approach requires MFENCE to do something it
> doesn't...

MFENCE guarantees that the load won't be reordered before the store, and thus 
we are using it.

We understand this is a unique requirement specific to our design but it is necessary.

The rest of the changes look ok.

Jason Gunthorpe March 3, 2017, 10:22 p.m. UTC | #6
On Fri, Mar 03, 2017 at 03:45:14PM -0600, Shiraz Saleem wrote:

> This is not quite how our DB logic works. There are additional HW
> steps and nuances in the flow. Unfortunately, to explain this, we
> need to provide details of our internal HW flow for the DB logic. We
> are unable to do so at this time.

Well, it is very problematic to help you define what a cross-arch
barrier should do if you can't explain what you need to have happen
relative to PCI-E.

> > I get the feeling this approach requires MFENCE to do something it
> > doesn't...
> 
> Mfence guarantees that load won't be reordered before the store, and
> thus we are using it.

If that is all then the driver can use LFENCE and the
udma_from_device_barrier() .. Is that OK?

But fundamentally, PCI is fairly lax about what it permits the root
complex to do, and to what degree it requires strong ordering within
the root complex itself for PCI issued LOAD/STORES.

It is hard to understand how the order of CPU operations matters when
the PCI operations to different cache lines can be re-ordered inside
the root complex.

An approach may work on some x86-64 systems but be unreliable on other
arches, or even on unusual x86 (e.g. SGI's x86 NUMA systems have
conformant, but lax, ordering rules).

Jason
Saleem, Shiraz March 6, 2017, 6:18 p.m. UTC | #7
On Thu, Feb 16, 2017 at 12:23:01PM -0700, Jason Gunthorpe wrote:
> Use our standard versions from util instead. There doesn't seem
> to be anything tricky here, but the inlined versions were like our
> wc_wmb() barriers, not the wmb().
> 
> There appears to be no WC memory in this driver, so despite the comments,
> these barriers are also making sure that user DMA data is flushed out. Make
> them all wmb()
> 
> Guess at where the missing rmb() should be.
> 
> Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> ---


> @@ -780,6 +780,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
>  	if (polarity != cq->polarity)
>  		return I40IW_ERR_QUEUE_EMPTY;
>  
> +	udma_from_device_barrier();
> +

What is the need for the barrier here?

>  	q_type = (u8)RS_64(qword3, I40IW_CQ_SQ);
>  	info->error = (bool)RS_64(qword3, I40IW_CQ_ERROR);
>  	info->push_dropped = (bool)RS_64(qword3, I40IWCQ_PSHDROP);
Jason Gunthorpe March 6, 2017, 7:07 p.m. UTC | #8
On Mon, Mar 06, 2017 at 12:18:09PM -0600, Shiraz Saleem wrote:
> On Thu, Feb 16, 2017 at 12:23:01PM -0700, Jason Gunthorpe wrote:
> > Use our standard versions from util instead. There doesn't seem
> > to be anything tricky here, but the inlined versions were like our
> > wc_wmb() barriers, not the wmb().
> > 
> > There appears to be no WC memory in this driver, so despite the comments,
> > these barriers are also making sure that user DMA data is flushed out. Make
> > them all wmb()
> > 
> > Guess at where the missing rmb() should be.
> > 
> > Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> 
> 
> > @@ -780,6 +780,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
> >  	if (polarity != cq->polarity)
> >  		return I40IW_ERR_QUEUE_EMPTY;
> >  
> > +	udma_from_device_barrier();
> > +
> 
> What is the need for the barrier here?

Every driver doing DMA needs to have an rmb(); I guessed this is
approximately the right place to put it for i40iw.

Within the PCI producer/consumer model the rmb needs to be placed
after the CPU observes a DMA write that proves other DMA writes have
occurred.

It serializes the CPU with the write stream from PCI and prevents the
compiler/CPU from re-ordering dependent loads to be before the proving
load.
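
Condensed from the hunk in question, the poll path then looks roughly like
this (the qword3/valid-bit extraction at the top is paraphrased, not copied
from the driver):

	get_64bit_val(cqe, I40IW_BYTE_24, &qword3);        /* paraphrased */
	polarity = (u8)RS_64(qword3, I40IW_CQ_VALID);      /* paraphrased */
	if (polarity != cq->polarity)   /* proving load: CQE not published yet */
		return I40IW_ERR_QUEUE_EMPTY;

	udma_from_device_barrier();     /* later CQE reads must not be satisfied */
	                                /* before the valid bit was observed     */

	q_type = (u8)RS_64(qword3, I40IW_CQ_SQ);
	info->error = (bool)RS_64(qword3, I40IW_CQ_ERROR);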

Jason
Saleem, Shiraz March 6, 2017, 7:16 p.m. UTC | #9
On Fri, Mar 03, 2017 at 03:22:44PM -0700, Jason Gunthorpe wrote:
> On Fri, Mar 03, 2017 at 03:45:14PM -0600, Shiraz Saleem wrote:
> 
> > This is not quite how our DB logic works. There are additional HW
> > steps and nuances in the flow. Unfortunately, to explain this, we
> > need to provide details of our internal HW flow for the DB logic. We
> > are unable to do so at this time.
> 
> Well, it is very problematic to help you define what a cross-arch
> barrier should do if you can't explain what you need to have happen
> relative to PCI-E.
> 

Unfortunately, we can help with this only at the point when this information 
is made public. If you must have an explanation for all barriers defined in 
utils, an option here is to leave this barrier in i40iw and migrate it to 
utils when documentation is available. 

> > Mfence guarantees that load won't be reordered before the store, and
> > thus we are using it.
> 
> If that is all then the driver can use LFENCE and the
> udma_from_device_barrier() .. Is that OK?
>

The write of the WQE valid bit needs to be globally visible before the read of the tail. 
LFENCE does not guarantee this; MFENCE does.

https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf

LFENCE (Vol. 3A 8-16)
"Serializes all load (read) operations that occurred prior to the LFENCE instruction 
in the program instruction stream, but does not affect store operations"

LFENCE (Vol. 2A 3-529)
"An LFENCE that follows an instruction that stores to memory might complete before 
the data being stored have become globally visible. Instructions following an LFENCE 
may be fetched from memory before the LFENCE, but they will not execute until the LFENCE 
completes"	

MFENCE (Vol. 2B 4-22)
"This serializing operation guarantees that every load and store instruction that precedes 
the MFENCE instruction in program order becomes globally visible before any load or store 
instruction that follows the MFENCE instruction"
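
Put as a compilable sketch (illustrative only; 'valid' and 'shadow' stand in
for the WQE valid-bit word and the DMA-updated shadow tail):

#include <stdint.h>

/* MFENCE variant: the valid-bit store becomes globally visible before the
 * shadow load is performed -- the ordering described above. */
uint64_t post_then_read_tail_mfence(volatile uint64_t *valid,
				    volatile uint64_t *shadow)
{
	*valid = 1;
	asm volatile("mfence" ::: "memory");
	return *shadow;
}

/* LFENCE variant: per the Vol. 2A quote, the store may still become globally
 * visible only after the fence, so the required ordering is not guaranteed. */
uint64_t post_then_read_tail_lfence(volatile uint64_t *valid,
				    volatile uint64_t *shadow)
{
	*valid = 1;
	asm volatile("lfence" ::: "memory");
	return *shadow;
}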

Saleem, Shiraz March 7, 2017, 11:16 p.m. UTC | #10
On Mon, Mar 06, 2017 at 12:07:51PM -0700, Jason Gunthorpe wrote:
> On Mon, Mar 06, 2017 at 12:18:09PM -0600, Shiraz Saleem wrote:
> > On Thu, Feb 16, 2017 at 12:23:01PM -0700, Jason Gunthorpe wrote:
> > > Use our standard versions from util instead. There doesn't seem
> > > to be anything tricky here, but the inlined versions were like our
> > > wc_wmb() barriers, not the wmb().
> > > 
> > > There appears to be no WC memory in this driver, so despite the comments,
> > > these barriers are also making sure that user DMA data is flushed out. Make
> > > them all wmb()
> > > 
> > > Guess at where the missing rmb() should be.
> > > 
> > > Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> > 
> > 
> > > @@ -780,6 +780,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
> > >  	if (polarity != cq->polarity)
> > >  		return I40IW_ERR_QUEUE_EMPTY;
> > >  
> > > +	udma_from_device_barrier();
> > > +
> > 
> > What is the need for the barrier here?
> 
> Every driver doing DMA needs to have an rmb(); I guessed this is
> approximately the right place to put it for i40iw.
> 
> Within the PCI producer/consumer model the rmb needs to be placed
> after the CPU observes a DMA write that proves other DMA writes have
> occurred.
> 
> It serializes the CPU with the write stream from PCI and prevents the
> compiler/CPU from re-ordering dependent loads to be before the proving
> load.
> 

OK. This change makes sense. ACK.

Patch

diff --git a/providers/i40iw/i40iw_osdep.h b/providers/i40iw/i40iw_osdep.h
index fddedf40dd8ae2..92bedd31633eb5 100644
--- a/providers/i40iw/i40iw_osdep.h
+++ b/providers/i40iw/i40iw_osdep.h
@@ -105,18 +105,4 @@  static inline void db_wr32(u32 value, u32 *wqe_word)
 #define ACQUIRE_LOCK()
 #define RELEASE_LOCK()
 
-#if defined(__i386__)
-#define i40iw_mb() mb()		/* full memory barrier */
-#define i40iw_wmb() mb()	/* write memory barrier */
-#elif defined(__x86_64__)
-#define i40iw_mb() asm volatile("mfence" ::: "memory")	 /* full memory barrier */
-#define i40iw_wmb() asm volatile("sfence" ::: "memory")	 /* write memory barrier */
-#else
-#define i40iw_mb() mb()		/* full memory barrier */
-#define i40iw_wmb() wmb()	/* write memory barrier */
-#endif
-#define i40iw_rmb() rmb()	/* read memory barrier */
-#define i40iw_smp_mb() smp_mb()		/* memory barrier */
-#define i40iw_smp_wmb() smp_wmb()	/* write memory barrier */
-#define i40iw_smp_rmb() smp_rmb()	/* read memory barrier */
 #endif				/* _I40IW_OSDEP_H_ */
diff --git a/providers/i40iw/i40iw_uk.c b/providers/i40iw/i40iw_uk.c
index d3e4fec7d8515b..b20748e9f09199 100644
--- a/providers/i40iw/i40iw_uk.c
+++ b/providers/i40iw/i40iw_uk.c
@@ -75,7 +75,7 @@  static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp)
 	    LS_64(signaled, I40IWQPSQ_SIGCOMPL) |
 	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID) | nop_signature++;
 
-	i40iw_wmb();	/* Memory barrier to ensure data is written before valid bit is set */
+	udma_to_device_barrier();	/* Memory barrier to ensure data is written before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 	return 0;
@@ -91,7 +91,7 @@  void i40iw_qp_post_wr(struct i40iw_qp_uk *qp)
 	u32 hw_sq_tail;
 	u32 sw_sq_head;
 
-	i40iw_mb(); /* valid bit is written and loads completed before reading shadow */
+	udma_to_device_barrier(); /* valid bit is written and loads completed before reading shadow */
 
 	/* read the doorbell shadow area */
 	get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp);
@@ -297,7 +297,7 @@  static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
 		byte_off += 16;
 	}
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 
@@ -347,7 +347,7 @@  static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp,
 
 	i40iw_set_fragment(wqe, I40IW_BYTE_0, &op_info->lo_addr);
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 	if (post_sq)
@@ -410,7 +410,7 @@  static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
 		byte_off += 16;
 	}
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 	if (post_sq)
@@ -478,7 +478,7 @@  static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp,
 		memcpy(dest, src, op_info->len - I40IW_BYTE_16);
 	}
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 
@@ -552,7 +552,7 @@  static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp,
 		memcpy(dest, src, op_info->len - I40IW_BYTE_16);
 	}
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 
@@ -601,7 +601,7 @@  static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp
 	    LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
 	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 
@@ -650,7 +650,7 @@  static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp,
 	    LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
 	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 
@@ -694,7 +694,7 @@  static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp,
 		byte_off += 16;
 	}
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 
@@ -731,7 +731,7 @@  static void i40iw_cq_request_notification(struct i40iw_cq_uk *cq,
 
 	set_64bit_val(cq->shadow_area, I40IW_BYTE_32, temp_val);
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	db_wr32(cq->cq_id, cq->cqe_alloc_reg);
 }
@@ -780,6 +780,8 @@  static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
 	if (polarity != cq->polarity)
 		return I40IW_ERR_QUEUE_EMPTY;
 
+	udma_from_device_barrier();
+
 	q_type = (u8)RS_64(qword3, I40IW_CQ_SQ);
 	info->error = (bool)RS_64(qword3, I40IW_CQ_ERROR);
 	info->push_dropped = (bool)RS_64(qword3, I40IWCQ_PSHDROP);
@@ -1121,7 +1123,7 @@  enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp,
 	    LS_64(signaled, I40IWQPSQ_SIGCOMPL) |
 	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
 
-	i40iw_wmb(); /* make sure WQE is populated before valid bit is set */
+	udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */
 
 	set_64bit_val(wqe, I40IW_BYTE_24, header);
 	if (post_sq)