diff mbox

[v4,4/6] infiniband: cxgb4: Eliminate duplicate barriers on weakly-ordered archs

Message ID 1521514068-8856-5-git-send-email-okaya@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Sinan Kaya March 20, 2018, 2:47 a.m. UTC
Code includes wmb() followed by writel(). writel() already has a barrier on
some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/infiniband/hw/cxgb4/t4.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

Comments

Jason Gunthorpe March 20, 2018, 2:51 p.m. UTC | #1
On Mon, Mar 19, 2018 at 10:47:46PM -0400, Sinan Kaya wrote:
> Code includes wmb() followed by writel(). writel() already has a barrier on
> some architectures like arm64.
> 
> This ends up CPU observing two barriers back to back before executing the
> register write.
> 
> Since code already has an explicit barrier call, changing writel() to
> writel_relaxed().
> 
> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
>  drivers/infiniband/hw/cxgb4/t4.h | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
> index 8369c7c..6e5658a 100644
> +++ b/drivers/infiniband/hw/cxgb4/t4.h
> @@ -457,7 +457,7 @@ static inline void pio_copy(u64 __iomem *dst, u64 *src)
>  	int count = 8;
>  
>  	while (count) {
> -		writeq(*src, dst);
> +		writeq_relaxed(*src, dst);
>  		src++;
>  		dst++;
>  		count--;

This is another case where writes can be re-ordered.. IIRC dst is WC
BAR memory, so the NIC should tolerate re-ordering, but Steve will
have to ack this.

Jason
Steve Wise March 20, 2018, 3:10 p.m. UTC | #2
> On Mon, Mar 19, 2018 at 10:47:46PM -0400, Sinan Kaya wrote:
> > Code includes wmb() followed by writel(). writel() already has a barrier
on
> > some architectures like arm64.
> >
> > This ends up CPU observing two barriers back to back before executing
> the
> > register write.
> >
> > Since code already has an explicit barrier call, changing writel() to
> > writel_relaxed().
> >
> > Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> >  drivers/infiniband/hw/cxgb4/t4.h | 14 +++++++-------
> >  1 file changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/cxgb4/t4.h
> b/drivers/infiniband/hw/cxgb4/t4.h
> > index 8369c7c..6e5658a 100644
> > +++ b/drivers/infiniband/hw/cxgb4/t4.h
> > @@ -457,7 +457,7 @@ static inline void pio_copy(u64 __iomem *dst, u64
> *src)
> >  	int count = 8;
> >
> >  	while (count) {
> > -		writeq(*src, dst);
> > +		writeq_relaxed(*src, dst);
> >  		src++;
> >  		dst++;
> >  		count--;
> 
> This is another case where writes can be re-ordered.. IIRC dst is WC
> BAR memory, so the NIC should tolerate re-ordering, but Steve will
> have to ack this.
> 

Yes, this is WC BAR memory.  The goal is that pio_copy() will enable
write-combining this into a single 64B pci-e transaction.
Steve Wise March 20, 2018, 3:38 p.m. UTC | #3
> > On Mon, Mar 19, 2018 at 10:47:46PM -0400, Sinan Kaya wrote:
> > > Code includes wmb() followed by writel(). writel() already has a
barrier
> on
> > > some architectures like arm64.
> > >
> > > This ends up CPU observing two barriers back to back before executing
> > the
> > > register write.
> > >
> > > Since code already has an explicit barrier call, changing writel() to
> > > writel_relaxed().
> > >
> > > Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> > >  drivers/infiniband/hw/cxgb4/t4.h | 14 +++++++-------
> > >  1 file changed, 7 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/infiniband/hw/cxgb4/t4.h
> > b/drivers/infiniband/hw/cxgb4/t4.h
> > > index 8369c7c..6e5658a 100644
> > > +++ b/drivers/infiniband/hw/cxgb4/t4.h
> > > @@ -457,7 +457,7 @@ static inline void pio_copy(u64 __iomem *dst,
> u64
> > *src)
> > >  	int count = 8;
> > >
> > >  	while (count) {
> > > -		writeq(*src, dst);
> > > +		writeq_relaxed(*src, dst);
> > >  		src++;
> > >  		dst++;
> > >  		count--;
> >
> > This is another case where writes can be re-ordered.. IIRC dst is WC
> > BAR memory, so the NIC should tolerate re-ordering, but Steve will
> > have to ack this.
> >
> 
> Yes, this is WC BAR memory.  The goal is that pio_copy() will enable
write-
> combining this into a single 64B pci-e transaction.
> 


I'd like to see the PPC issue resolved...but

Acked-by: Steve Wise <swise@opengridcomputing.com>
kernel test robot March 22, 2018, 6:44 a.m. UTC | #4
Hi Sinan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc6 next-20180321]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Sinan-Kaya/ib-Eliminate-duplicate-barriers-on-weakly-ordered-archs/20180321-091659
config: xtensa-allyesconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   In file included from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:73:0,
                    from drivers/infiniband/hw/cxgb4/device.c:40:
   drivers/infiniband/hw/cxgb4/t4.h: In function 'pio_copy':
>> drivers/infiniband/hw/cxgb4/t4.h:460:3: error: implicit declaration of function 'writeq_relaxed'; did you mean 'writel_relaxed'? [-Werror=implicit-function-declaration]
      writeq_relaxed(*src, dst);
      ^~~~~~~~~~~~~~
      writel_relaxed
   cc1: some warnings being treated as errors

vim +460 drivers/infiniband/hw/cxgb4/t4.h

   450	
   451	/* This function copies 64 byte coalesced work request to memory
   452	 * mapped BAR2 space. For coalesced WRs, the SGE fetches data
   453	 * from the FIFO instead of from Host.
   454	 */
   455	static inline void pio_copy(u64 __iomem *dst, u64 *src)
   456	{
   457		int count = 8;
   458	
   459		while (count) {
 > 460			writeq_relaxed(*src, dst);
   461			src++;
   462			dst++;
   463			count--;
   464		}
   465	}
   466	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Sinan Kaya March 22, 2018, 12:24 p.m. UTC | #5
On 2018-03-22 02:44, kbuild test robot wrote:
> Hi Sinan,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on linus/master]
> [also build test ERROR on v4.16-rc6 next-20180321]
> [if your patch is applied to the wrong git tree, please drop us a note
> to help improve the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/Sinan-Kaya/ib-Eliminate-duplicate-barriers-on-weakly-ordered-archs/20180321-091659
> config: xtensa-allyesconfig (attached as .config)
> compiler: xtensa-linux-gcc (GCC) 7.2.0
> reproduce:
>         wget
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross
> -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # save the attached .config to linux build tree
>         make.cross ARCH=xtensa
> 

Jason,

Can you remove the writeq change if it is too late for me to fix?

This is an infrastructural issue on xtensa arch.

Probably, it won't get fixed today.

Sinan


> All errors (new ones prefixed by >>):
> 
>    In file included from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:73:0,
>                     from drivers/infiniband/hw/cxgb4/device.c:40:
>    drivers/infiniband/hw/cxgb4/t4.h: In function 'pio_copy':
>>> drivers/infiniband/hw/cxgb4/t4.h:460:3: error: implicit declaration 
>>> of function 'writeq_relaxed'; did you mean 'writel_relaxed'? 
>>> [-Werror=implicit-function-declaration]
>       writeq_relaxed(*src, dst);
>       ^~~~~~~~~~~~~~
>       writel_relaxed
>    cc1: some warnings being treated as errors
> 
> vim +460 drivers/infiniband/hw/cxgb4/t4.h
> 
>    450
>    451	/* This function copies 64 byte coalesced work request to memory
>    452	 * mapped BAR2 space. For coalesced WRs, the SGE fetches data
>    453	 * from the FIFO instead of from Host.
>    454	 */
>    455	static inline void pio_copy(u64 __iomem *dst, u64 *src)
>    456	{
>    457		int count = 8;
>    458
>    459		while (count) {
>  > 460			writeq_relaxed(*src, dst);
>    461			src++;
>    462			dst++;
>    463			count--;
>    464		}
>    465	}
>    466
> 
> ---
> 0-DAY kernel test infrastructure                Open Source Technology 
> Center
> https://lists.01.org/pipermail/kbuild-all                   Intel 
> Corporation
Sinan Kaya March 22, 2018, 12:48 p.m. UTC | #6
On 2018-03-22 08:24, okaya@codeaurora.org wrote:
> On 2018-03-22 02:44, kbuild test robot wrote:
>> Hi Sinan,
>> 
>> Thank you for the patch! Yet something to improve:
>> 
>> [auto build test ERROR on linus/master]
>> [also build test ERROR on v4.16-rc6 next-20180321]
>> [if your patch is applied to the wrong git tree, please drop us a note
>> to help improve the system]
>> 
>> url:
>> https://github.com/0day-ci/linux/commits/Sinan-Kaya/ib-Eliminate-duplicate-barriers-on-weakly-ordered-archs/20180321-091659
>> config: xtensa-allyesconfig (attached as .config)
>> compiler: xtensa-linux-gcc (GCC) 7.2.0
>> reproduce:
>>         wget
>> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross
>> -O ~/bin/make.cross
>>         chmod +x ~/bin/make.cross
>>         # save the attached .config to linux build tree
>>         make.cross ARCH=xtensa
>> 
> 
> Jason,
> 
> Can you remove the writeq change if it is too late for me to fix?
> 
> This is an infrastructural issue on xtensa arch.
> 
> Probably, it won't get fixed today.

AFAIS, even writeq won't compile on this arch. I started questioning 
this build test.


> 
> Sinan
> 
> 
>> All errors (new ones prefixed by >>):
>> 
>>    In file included from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:73:0,
>>                     from drivers/infiniband/hw/cxgb4/device.c:40:
>>    drivers/infiniband/hw/cxgb4/t4.h: In function 'pio_copy':
>>>> drivers/infiniband/hw/cxgb4/t4.h:460:3: error: implicit declaration 
>>>> of function 'writeq_relaxed'; did you mean 'writel_relaxed'? 
>>>> [-Werror=implicit-function-declaration]
>>       writeq_relaxed(*src, dst);
>>       ^~~~~~~~~~~~~~
>>       writel_relaxed
>>    cc1: some warnings being treated as errors
>> 
>> vim +460 drivers/infiniband/hw/cxgb4/t4.h
>> 
>>    450
>>    451	/* This function copies 64 byte coalesced work request to 
>> memory
>>    452	 * mapped BAR2 space. For coalesced WRs, the SGE fetches data
>>    453	 * from the FIFO instead of from Host.
>>    454	 */
>>    455	static inline void pio_copy(u64 __iomem *dst, u64 *src)
>>    456	{
>>    457		int count = 8;
>>    458
>>    459		while (count) {
>>  > 460			writeq_relaxed(*src, dst);
>>    461			src++;
>>    462			dst++;
>>    463			count--;
>>    464		}
>>    465	}
>>    466
>> 
>> ---
>> 0-DAY kernel test infrastructure                Open Source Technology 
>> Center
>> https://lists.01.org/pipermail/kbuild-all                   Intel 
>> Corporation
Sinan Kaya March 22, 2018, 2:33 p.m. UTC | #7
One more time hopefully without HTML this time.


On 03/22/2018 08:48 AM, okaya@codeaurora.org wrote:
>> Can you remove the writeq change if it is too late for me to fix?
>>
>> This is an infrastructural issue on xtensa arch.
>>
>> Probably, it won't get fixed today.
>
> AFAIS, even writeq won't compile on this arch. I started questioning 
> this build test.

I found out that the solution is this:

#include <linux/io-64-nonatomic-hi-lo.h>

https://patchwork.ozlabs.org/patch/511801/

I did a compile test with this change on xtensa and it passed. I'll 
repost with the added diff.

+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -46,7 +46,7 @@
  #include <linux/timer.h>
  #include <linux/io.h>
  #include <linux/workqueue.h>
-
+#include <linux/io-64-nonatomic-hi-lo.h>
  #include <asm/byteorder.h>

  #include <net/net_namespace.h>
Steve Wise March 22, 2018, 2:40 p.m. UTC | #8
> 
> On 2018-03-22 08:24, okaya@codeaurora.org wrote:
> > On 2018-03-22 02:44, kbuild test robot wrote:
> >> Hi Sinan,
> >>
> >> Thank you for the patch! Yet something to improve:
> >>
> >> [auto build test ERROR on linus/master]
> >> [also build test ERROR on v4.16-rc6 next-20180321]
> >> [if your patch is applied to the wrong git tree, please drop us a note
> >> to help improve the system]
> >>
> >> url:
> >> https://github.com/0day-ci/linux/commits/Sinan-Kaya/ib-Eliminate-
> duplicate-barriers-on-weakly-ordered-archs/20180321-091659
> >> config: xtensa-allyesconfig (attached as .config)
> >> compiler: xtensa-linux-gcc (GCC) 7.2.0
> >> reproduce:
> >>         wget
> >> https://raw.githubusercontent.com/intel/lkp-
> tests/master/sbin/make.cross
> >> -O ~/bin/make.cross
> >>         chmod +x ~/bin/make.cross
> >>         # save the attached .config to linux build tree
> >>         make.cross ARCH=xtensa
> >>
> >
> > Jason,
> >
> > Can you remove the writeq change if it is too late for me to fix?
> >
> > This is an infrastructural issue on xtensa arch.
> >
> > Probably, it won't get fixed today.
> 
> AFAIS, even writeq won't compile on this arch. I started questioning
> this build test.
> 

I think all these iw_cxgb4 changes should be reverted until we really have a
plan for multi-platform that works.  

> 
> >
> > Sinan
> >
> >
> >> All errors (new ones prefixed by >>):
> >>
> >>    In file included from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:73:0,
> >>                     from drivers/infiniband/hw/cxgb4/device.c:40:
> >>    drivers/infiniband/hw/cxgb4/t4.h: In function 'pio_copy':
> >>>> drivers/infiniband/hw/cxgb4/t4.h:460:3: error: implicit declaration
> >>>> of function 'writeq_relaxed'; did you mean 'writel_relaxed'?
> >>>> [-Werror=implicit-function-declaration]
> >>       writeq_relaxed(*src, dst);
> >>       ^~~~~~~~~~~~~~
> >>       writel_relaxed
> >>    cc1: some warnings being treated as errors
> >>
> >> vim +460 drivers/infiniband/hw/cxgb4/t4.h
> >>
> >>    450
> >>    451	/* This function copies 64 byte coalesced work request to
> >> memory
> >>    452	 * mapped BAR2 space. For coalesced WRs, the SGE fetches
> data
> >>    453	 * from the FIFO instead of from Host.
> >>    454	 */
> >>    455	static inline void pio_copy(u64 __iomem *dst, u64 *src)
> >>    456	{
> >>    457		int count = 8;
> >>    458
> >>    459		while (count) {
> >>  > 460			writeq_relaxed(*src, dst);
> >>    461			src++;
> >>    462			dst++;
> >>    463			count--;
> >>    464		}
> >>    465	}
> >>    466
> >>
> >> ---
> >> 0-DAY kernel test infrastructure                Open Source Technology
> >> Center
> >> https://lists.01.org/pipermail/kbuild-all                   Intel
> >> Corporation
Sinan Kaya March 22, 2018, 2:52 p.m. UTC | #9
On 3/22/2018 9:40 AM, Steve Wise wrote:
> I think all these iw_cxgb4 changes should be reverted until we really have a
> plan for multi-platform that works.  

I know you are looking to have support for PowerPC. 

Isn't this a PowerPC problem? Why penalize other architectures?

Do you see anything wrong with the code itself?

I started this thread with the PowerPC develoeprs on your request.
"RFC on writel and writel_relaxed"

They are looking into adding the relaxed API support. Support can come
in later. Why block this change now?

benh@kernel.crashing.org:
"I've been wanting to implement the relaxed accessors for a while but
was battling with this to try to also better support WC, and due to
other commitments, this somewhat fell down the cracks."

I have seen four different responses on this thread. Since this is an
architecture change it will take a while to get the semantics right.
It won't happen in the new few days.
Steve Wise March 22, 2018, 4:28 p.m. UTC | #10
On 3/22/2018 9:52 AM, Sinan Kaya wrote:
> On 3/22/2018 9:40 AM, Steve Wise wrote:
>> I think all these iw_cxgb4 changes should be reverted until we really have a
>> plan for multi-platform that works.  
> I know you are looking to have support for PowerPC. 
>
> Isn't this a PowerPC problem? Why penalize other architectures?
>
> Do you see anything wrong with the code itself?

I worry it breaks PPC.

> I started this thread with the PowerPC develoeprs on your request.
> "RFC on writel and writel_relaxed"
>
> They are looking into adding the relaxed API support. Support can come
> in later. Why block this change now?
>
> benh@kernel.crashing.org:
> "I've been wanting to implement the relaxed accessors for a while but
> was battling with this to try to also better support WC, and due to
> other commitments, this somewhat fell down the cracks."
>
> I have seen four different responses on this thread. Since this is an
> architecture change it will take a while to get the semantics right.
> It won't happen in the new few days.

I appreciate you doing this.
Jason Gunthorpe March 22, 2018, 6:46 p.m. UTC | #11
On Thu, Mar 22, 2018 at 10:28:13AM -0400, Sinan Kaya wrote:
>    On 03/22/2018 08:48 AM, [1]okaya@codeaurora.org wrote:
> 
>      Jason,
>      Can you remove the writeq change if it is too late for me to fix?
>      This is an infrastructural issue on xtensa arch.
>      Probably, it won't get fixed today.

I was able to drop the patch, please resend.

>      AFAIS, even writeq won't compile on this arch. I started questioning
>      this build test.

>    I found out that the solution is this:
> #include <linux/io-64-nonatomic-hi-lo.h>

Yuk, what an ugly API..

>    [2]https://patchwork.ozlabs.org/patch/511801/
>    I did a compile test with this change on xtensa and it passed. I'll
>    repost with the added diff.
>    +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
>    @@ -46,7 +46,7 @@
>     #include <linux/timer.h>
>     #include <linux/io.h>
>     #include <linux/workqueue.h>
>    -
>    +#include <linux/io-64-nonatomic-hi-lo.h>

I think this is the wrong one. I would expect all PCI-E devices should
use lo-hi, eg writes are done in address increasing order on the bus.

This is what the PCI-E spec would require if a write were to be
fragmented so I would expect devices to handle it properly.

Steve? Any idea of specific things for this HW?

Jason
Jason Gunthorpe March 22, 2018, 6:48 p.m. UTC | #12
On Thu, Mar 22, 2018 at 08:48:35AM -0400, okaya@codeaurora.org wrote:
> On 2018-03-22 08:24, okaya@codeaurora.org wrote:
> >On 2018-03-22 02:44, kbuild test robot wrote:
> >>Hi Sinan,
> >>
> >>Thank you for the patch! Yet something to improve:
> >>
> >>[auto build test ERROR on linus/master]
> >>[also build test ERROR on v4.16-rc6 next-20180321]
> >>[if your patch is applied to the wrong git tree, please drop us a note
> >>to help improve the system]
> >>
> >>url:
> >>https://github.com/0day-ci/linux/commits/Sinan-Kaya/ib-Eliminate-duplicate-barriers-on-weakly-ordered-archs/20180321-091659
> >>config: xtensa-allyesconfig (attached as .config)
> >>compiler: xtensa-linux-gcc (GCC) 7.2.0
> >>reproduce:
> >>        wget
> >>https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross
> >>-O ~/bin/make.cross
> >>        chmod +x ~/bin/make.cross
> >>        # save the attached .config to linux build tree
> >>        make.cross ARCH=xtensa
> >>
> >
> >Jason,
> >
> >Can you remove the writeq change if it is too late for me to fix?
> >
> >This is an infrastructural issue on xtensa arch.
> >
> >Probably, it won't get fixed today.
> 
> AFAIS, even writeq won't compile on this arch. I started questioning this
> build test.

I have the same confusion. Did you figure out an explanation?

Jason
Sinan Kaya March 22, 2018, 6:58 p.m. UTC | #13
On 3/22/2018 1:48 PM, Jason Gunthorpe wrote:
>> AFAIS, even writeq won't compile on this arch. I started questioning this
>> build test.
> I have the same confusion. Did you figure out an explanation?

I did a compile test without the relaxed change. It built just fine.
CONFIG_64BIT is also not defined in the .config file from kbuild.
I also dropped this patch on v5 due to Steve's request.

However, I think I still need an answer for 
v6: RDMA/qedr: eliminate duplicate barriers on weakly-ordered archs #2
I can switch to #include <linux/io-64-nonatomic-lo-hi.h> there.
Casey Leedom March 22, 2018, 7:44 p.m. UTC | #14
| From: Steve Wise <swise@opengridcomputing.com>
| Sent: Thursday, March 22, 2018 9:28 AM
|
| | From: Sinan Kaya <okaya@codeaurora.org>
| | Date: Thursday, March 22, 2018 7:52 AM
| |
| | Isn't this a PowerPC problem? Why penalize other architectures?
| 
| I worry it breaks PPC.

And all other architectures.  Aparraently there isn't a formal API
description for writel_relaxed() and Co., nor __raw_writel(), etc.  What I
think we need is a formal semantic definition of exactly what these APIs is
supposed to do and then we can make sure that they all do that.  Till we
have a consistent definition/implementation, trying to use these APIs in
multi-platform code will be a problem.

For instance, and this is merely an example not a prescription of what I'm
talking about:

    writel():
      -- Ensures correct byte ordering.
      -- Ensures no compiler reordering.
      -- Ensures instruction level synchronization with respect
      --   to previous and succeeding reads and writes.

    writel_relaxed():
      -- Ensures correct byte ordering.
      -- Ensures no compiler reordering.
      -- Ensures instruction level synchronization with respect
      --   to previous writes.

    __raw_writel():
      -- Ensures correct byte ordering.
      -- Ensures no compiler reordering.
 
| I appreciate you doing this.

As do I!  I'm just worried that because the API semantic definitions don't
seem to be formalized for writel_relaxed() and Co., we're in danger of
getting wildly different results on one platform or another based on
differring implementation semantics.

Casey
Jason Gunthorpe March 22, 2018, 8:16 p.m. UTC | #15
On Thu, Mar 22, 2018 at 07:44:51PM +0000, Casey Leedom wrote:
> | From: Steve Wise <swise@opengridcomputing.com>
> | Sent: Thursday, March 22, 2018 9:28 AM
> |
> | | From: Sinan Kaya <okaya@codeaurora.org>
> | | Date: Thursday, March 22, 2018 7:52 AM
> | |
> | | Isn't this a PowerPC problem? Why penalize other architectures?
> | 
> | I worry it breaks PPC.
>
> And all other architectures.  Aparraently there isn't a formal API
> description for writel_relaxed() and Co., nor __raw_writel(), etc.

We have this:

Documentation/memory-barriers.txt lines 2600-2677/3136 85%

 (*) readX_relaxed(), writeX_relaxed()

     These are similar to readX() and writeX(), but provide weaker memory
     ordering guarantees.  Specifically, they do not guarantee ordering with
     respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee
     ordering with respect to LOCK or UNLOCK operations.  If the latter is
     required, an mmiowb() barrier can be used.  Note that relaxed accesses to
     the same peripheral are guaranteed to be ordered with respect to each
     other.

Which basically says they are the same as writel() except they are not
required to be contained by a spinlock, which is the expensive thing
ARM and PPC are doing with the barriers in writel()

Jason
Casey Leedom March 22, 2018, 8:45 p.m. UTC | #16
Yes, but ...

For instance, I see that the x86 writel() has "memory" in its asm(), which
prevents GCC from reordering generated instructions.  And it ~looks like~
arm64 ~sort of~ gets that with the inclusion of __iowmb() (which translates
to wmb() then dsb(st) which finally holds the GCC "memory" barrier).  Is
this part of the documented semantic of the writel_relaxed()?  The PowerPC
stuff simply defines writel_relaxed() as writel() and I can't find the
bottom of that Rabbit Hole ...

I'm guessing~ that this line in the documentation ~may~ imply the GCC
ordering:

     ... Note that relaxed accesses to
     the same peripheral are guaranteed to be ordered with respect to each
     other. ...

In any case, we really only have a few places where we (the various Chelsio
drivers) need to worry about this: the "Fast Paths" where we have a lot of
I/O to the device.  I think we should leave everything else alone.

Casey
Jason Gunthorpe March 22, 2018, 9:25 p.m. UTC | #17
On Thu, Mar 22, 2018 at 08:45:11PM +0000, Casey Leedom wrote:

> I'm guessing~ that this line in the documentation ~may~ imply the GCC
> ordering:
> 
>      ... Note that relaxed accesses to
>      the same peripheral are guaranteed to be ordered with respect to each
>      other. ...

An arch can't guarentee "ordered with respect to each other" without
preventing the compiler from re-ordering, so yes, any correct
implementation of writel_relaxed must prevent compiler re-ordering.

eg with volatile or a compiler barrier, or whatever.

> In any case, we really only have a few places where we (the various Chelsio
> drivers) need to worry about this: the "Fast Paths" where we have a lot of
> I/O to the device.  I think we should leave everything else alone.

Yes.

Jason
Sinan Kaya March 22, 2018, 9:27 p.m. UTC | #18
On 3/22/2018 4:45 PM, Casey Leedom wrote:
> Yes, but ...
> 
> For instance, I see that the x86 writel() has "memory" in its asm(), which
> prevents GCC from reordering generated instructions.  And it ~looks like~
> arm64 ~sort of~ gets that with the inclusion of __iowmb() (which translates
> to wmb() then dsb(st) which finally holds the GCC "memory" barrier).  Is
> this part of the documented semantic of the writel_relaxed()?  The PowerPC
> stuff simply defines writel_relaxed() as writel() and I can't find the
> bottom of that Rabbit Hole ...
> 

This is changing. See "RFC on writel and writel_relaxed" thread. PowerPC 
maintainers are looking for a way to implement this.

What matters is the description in the barriers document. See also 
section "MMIO access primitives" here about mmiowb()

https://lwn.net/Articles/697539/


> I'm guessing~ that this line in the documentation ~may~ imply the GCC
> ordering:
> 
>      ... Note that relaxed accesses to
>      the same peripheral are guaranteed to be ordered with respect to each
>      other. ...
> 

This can be a compiler barrier for some arches and/or can be architecturally 
guaranteed as in ARM64's device nGnRE mapping (non-gathering non-reordering with
early acknowledgment).

Both writel() and writel_relaxed() need to guarantee ordering with respect to
what HW observes for writes. 

They have different guarantees regarding the code surrounding write like you
identified.

> In any case, we really only have a few places where we (the various Chelsio
> drivers) need to worry about this: the "Fast Paths" where we have a lot of
> I/O to the device.  I think we should leave everything else alone.

makes sense

> 
> Casey
>
Casey Leedom March 22, 2018, 10:02 p.m. UTC | #19
Okay, thanks Sinan.  I ~think~ we're on the same page here.

  Our guy Michael Werner is carefully going through our drivers with Steve
Wise.  I'd like to let them work on the changes with a lot of thought and
testing before diving in too far.

  We had thought that we could get around the lack of a real Relaxed
Ordering Write on the PowerPC via the __raw_write*() API, but that isn't
really part of the defined API and various platforms define that API
differently (some doing PCI Byte Ordering rearrangements and some not).  I
think that we're going to have to work with the PowerPC platform folks to
get a real Relaxed Ordering Write.

Casey
kernel test robot March 23, 2018, 4:14 a.m. UTC | #20
Hi Sinan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc6 next-20180322]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Sinan-Kaya/ib-Eliminate-duplicate-barriers-on-weakly-ordered-archs/20180321-091659
config: i386-allyesconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:73:0,
                    from drivers/infiniband/hw/cxgb4/device.c:40:
   drivers/infiniband/hw/cxgb4/t4.h: In function 'pio_copy':
>> drivers/infiniband/hw/cxgb4/t4.h:460:3: error: implicit declaration of function 'writeq_relaxed'; did you mean 'write_pnet'? [-Werror=implicit-function-declaration]
      writeq_relaxed(*src, dst);
      ^~~~~~~~~~~~~~
      write_pnet
   cc1: some warnings being treated as errors
--
   In file included from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:73:0,
                    from drivers/infiniband/hw/cxgb4/cm.c:53:
   drivers/infiniband/hw/cxgb4/t4.h: In function 'pio_copy':
>> drivers/infiniband/hw/cxgb4/t4.h:460:3: error: implicit declaration of function 'writeq_relaxed'; did you mean 'rt6_release'? [-Werror=implicit-function-declaration]
      writeq_relaxed(*src, dst);
      ^~~~~~~~~~~~~~
      rt6_release
   cc1: some warnings being treated as errors

vim +460 drivers/infiniband/hw/cxgb4/t4.h

   450	
   451	/* This function copies 64 byte coalesced work request to memory
   452	 * mapped BAR2 space. For coalesced WRs, the SGE fetches data
   453	 * from the FIFO instead of from Host.
   454	 */
   455	static inline void pio_copy(u64 __iomem *dst, u64 *src)
   456	{
   457		int count = 8;
   458	
   459		while (count) {
 > 460			writeq_relaxed(*src, dst);
   461			src++;
   462			dst++;
   463			count--;
   464		}
   465	}
   466	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox

Patch

diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index 8369c7c..6e5658a 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -457,7 +457,7 @@  static inline void pio_copy(u64 __iomem *dst, u64 *src)
 	int count = 8;
 
 	while (count) {
-		writeq(*src, dst);
+		writeq_relaxed(*src, dst);
 		src++;
 		dst++;
 		count--;
@@ -477,15 +477,15 @@  static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, union t4_wr *wqe)
 				 (u64 *)wqe);
 		} else {
 			pr_debug("DB wq->sq.pidx = %d\n", wq->sq.pidx);
-			writel(PIDX_T5_V(inc) | QID_V(wq->sq.bar2_qid),
-			       wq->sq.bar2_va + SGE_UDB_KDOORBELL);
+			writel_relaxed(PIDX_T5_V(inc) | QID_V(wq->sq.bar2_qid),
+				       wq->sq.bar2_va + SGE_UDB_KDOORBELL);
 		}
 
 		/* Flush user doorbell area writes. */
 		wmb();
 		return;
 	}
-	writel(QID_V(wq->sq.qid) | PIDX_V(inc), wq->db);
+	writel_relaxed(QID_V(wq->sq.qid) | PIDX_V(inc), wq->db);
 }
 
 static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc,
@@ -502,15 +502,15 @@  static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc,
 				 (void *)wqe);
 		} else {
 			pr_debug("DB wq->rq.pidx = %d\n", wq->rq.pidx);
-			writel(PIDX_T5_V(inc) | QID_V(wq->rq.bar2_qid),
-			       wq->rq.bar2_va + SGE_UDB_KDOORBELL);
+			writel_relaxed(PIDX_T5_V(inc) | QID_V(wq->rq.bar2_qid),
+				       wq->rq.bar2_va + SGE_UDB_KDOORBELL);
 		}
 
 		/* Flush user doorbell area writes. */
 		wmb();
 		return;
 	}
-	writel(QID_V(wq->rq.qid) | PIDX_V(inc), wq->db);
+	writel_relaxed(QID_V(wq->rq.qid) | PIDX_V(inc), wq->db);
 }
 
 static inline int t4_wq_in_error(struct t4_wq *wq)