diff mbox

[8/9] Remove most checks of __BYTE_ORDER

Message ID 1475182076-5411-9-git-send-email-jgunthorpe@obsidianresearch.com (mailing list archive)
State Accepted
Headers show

Commit Message

Jason Gunthorpe Sept. 29, 2016, 8:47 p.m. UTC
For a long time now endian.h has defined sane fixed width conversion
macros, so let's just use them instead of rolling our own.

Also, htonll is defined in this source tree under infiniband/arch.h,
so all users of that macro can just use the header.

Someday we should also get rid of all the endless wrappers..

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
---
 ibacm/linux/osd.h                    |  8 +-------
 libcxgb3/src/cxio_wr.h               | 10 +++-------
 libcxgb4/src/t4.h                    | 11 ++++-------
 libi40iw/src/i40iw_umain.h           | 18 ++----------------
 libibumad/src/sysfs.c                |  8 +-------
 libibverbs/include/infiniband/arch.h | 15 ++++++---------
 libnes/src/nes_umain.h               | 17 ++---------------
 libocrdma/src/ocrdma_verbs.c         | 32 ++++----------------------------
 librdmacm/examples/common.h          | 11 +++--------
 librdmacm/src/cma.h                  |  9 +--------
 srp_daemon/srp_daemon/srp_daemon.h   | 17 +----------------
 11 files changed, 28 insertions(+), 128 deletions(-)

Comments

Steve Wise Sept. 29, 2016, 9:26 p.m. UTC | #1
> For a long time now endian.h has defined sane fixed width conversion
> macros, so let's just use them instead of rolling our own.
> 
> Also, htonll is defined in this source tree under infiniband/arch.h,
> so all users of that macro can just use the header.
> 
> Someday we should also get rid of all the endless wrappers..
> 
> Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> ---
>  ibacm/linux/osd.h                    |  8 +-------
>  libcxgb3/src/cxio_wr.h               | 10 +++-------
>  libcxgb4/src/t4.h                    | 11 ++++-------
>  libi40iw/src/i40iw_umain.h           | 18 ++----------------
>  libibumad/src/sysfs.c                |  8 +-------
>  libibverbs/include/infiniband/arch.h | 15 ++++++---------
>  libnes/src/nes_umain.h               | 17 ++---------------
>  libocrdma/src/ocrdma_verbs.c         | 32 ++++----------------------------
>  librdmacm/examples/common.h          | 11 +++--------
>  librdmacm/src/cma.h                  |  9 +--------
>  srp_daemon/srp_daemon/srp_daemon.h   | 17 +----------------
>  11 files changed, 28 insertions(+), 128 deletions(-)
> 

<snip>

> diff --git a/libcxgb3/src/cxio_wr.h b/libcxgb3/src/cxio_wr.h
> index ece06bd0568c..e24c7fed7d76 100644
> --- a/libcxgb3/src/cxio_wr.h
> +++ b/libcxgb3/src/cxio_wr.h
> @@ -50,13 +50,9 @@
>  #define Q_COUNT(rptr,wptr) ((wptr)-(rptr))
>  #define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1))
> 
> -#if __BYTE_ORDER == __LITTLE_ENDIAN
> -#  define cpu_to_pci32(val) ((val))
> -#elif __BYTE_ORDER == __BIG_ENDIAN
> -#  define cpu_to_pci32(val) (__bswap_32((val)))
> -#else
> -#  error __BYTE_ORDER not defined
> -#endif
> +/* Generally speaking, PCI systems auto-byteswap on PCI accesses, so this is
> +   probably wrong */
> +#define cpu_to_pci32(val) htole32(val)
> 

If this was wrong, then nothing would work.  So I think you should remove the
comment.

>  #define RING_DOORBELL(doorbell, QPID) { \
>  	*doorbell = cpu_to_pci32(QPID); \

See RING_DOORBELL() is used in the .c files

> diff --git a/libcxgb4/src/t4.h b/libcxgb4/src/t4.h
> index e8c5cf66cb14..e519cc4087e6 100644
> --- a/libcxgb4/src/t4.h
> +++ b/libcxgb4/src/t4.h
> @@ -62,13 +62,10 @@
>  #define unlikely
>  #define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
>  #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
> -#if __BYTE_ORDER == __LITTLE_ENDIAN
> -#  define cpu_to_pci32(val) ((val))
> -#elif __BYTE_ORDER == __BIG_ENDIAN
> -#  define cpu_to_pci32(val) (__bswap_32((val)))
> -#else
> -#  error __BYTE_ORDER not defined
> -#endif
> +
> +/* Generally speaking, PCI systems auto-byteswap on PCI accesses, so this is
> +   probably wrong */
> +#define cpu_to_pci32(val) htole32(val)
> 

Ditto.

>  #define writel(v, a) do { *((volatile u32 *)(a)) = cpu_to_pci32(v); } while
(0)
>

Ditto.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Sept. 29, 2016, 10:28 p.m. UTC | #2
On Thu, Sep 29, 2016 at 04:26:52PM -0500, Steve Wise wrote:

> > +/* Generally speaking, PCI systems auto-byteswap on PCI accesses, so this is
> > +   probably wrong */
> > +#define cpu_to_pci32(val) htole32(val)
> 
> If this was wrong, then nothing would work.  So I think you should
> remove the comment.

I added the comment because cxgb* was the only provider that seemed to
be obviously byte swapping MMIO.

Since htole32 is a NOP on x86 and ARM64 the driver would work fine
with or without it. Do you know this driver works on big endian
powerpc? That would help answer the question for sure.

I looked at this some more, and I still don't really know what is the
right answer.

In the kernel the swapping behavior of 'writel' is architecture (and
sometimes even platform) specific. Looking at the kernel headers
suggests that BE PPC needs an explicit swap and BE ARM does not (eg
the hardware swaps). Maybe. There are a lot of layers of macros in
this area..

However, I can't find an obvious swap in mlx4, which I think is the
most widely tested provider. Does it work on BE PPC?

One of the things I'd like to see done is to consolidate the mmio
accessors.. The note was a reminder to try and figure out what is
actually right here.. Appreciate any help!

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise Sept. 29, 2016, 11:46 p.m. UTC | #3
> On Sep 29, 2016, at 5:28 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
> 
> On Thu, Sep 29, 2016 at 04:26:52PM -0500, Steve Wise wrote:
> 
>>> +/* Generally speaking, PCI systems auto-byteswap on PCI accesses, so this is
>>> +   probably wrong */
>>> +#define cpu_to_pci32(val) htole32(val)
>> 
>> If this was wrong, then nothing would work.  So I think you should
>> remove the comment.
> 
> I added the comment because cxgb* was the only provider that seemed to
> be obviously byte swapping MMIO.
> 
> Since htole32 is a NOP on x86 and ARM64 the driver would work fine
> with or without it. Do you know this driver works on big endian
> powerpc? That would help answer the question for sure.
> 

Yes, PowerPC and arm. 

> I looked at this some more, and I still don't really know what is the
> right answer.
> 
> In the kernel the swapping behavior of 'writel' is architecture (and
> sometimes even platform) specific. Looking at the kernel headers
> suggests that BE PPC needs an explicit swap and BE ARM does not (eg
> the hardware swaps). Maybe. There are a lot of layers of macros in
> this area..
> 
> However, I can't find an obvious swap in mlx4, which I think is the
> most widely tested provider. Does it work on BE PPC?
> 
> One of the things I'd like to see done is to consolidate the mmio
> accessors.. The note was a reminder to try and figure out what is
> actually right here.. Appreciate any help!
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Sept. 30, 2016, 12:08 a.m. UTC | #4
On Thu, Sep 29, 2016 at 06:46:02PM -0500, Steve Wise wrote:
> > I added the comment because cxgb* was the only provider that seemed to
> > be obviously byte swapping MMIO.
> > 
> > Since htole32 is a NOP on x86 and ARM64 the driver would work fine
> > with or without it. Do you know this driver works on big endian
> > powerpc? That would help answer the question for sure.
> 
> Yes, PowerPC and arm. 

Okay great. Maybe Yishai can explain what mlx drivers are doing..

Is this better:

/* NOTE: The endianness of PCI MMIO accesses is architecture specific,
   this is known correct for BE PPC and LE x86/ARM */

Thanks again Steve,
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise Sept. 30, 2016, 1:32 p.m. UTC | #5
> On Thu, Sep 29, 2016 at 06:46:02PM -0500, Steve Wise wrote:
> > > I added the comment because cxgb* was the only provider that seemed
> to
> > > be obviously byte swapping MMIO.
> > >
> > > Since htole32 is a NOP on x86 and ARM64 the driver would work fine
> > > with or without it. Do you know this driver works on big endian
> > > powerpc? That would help answer the question for sure.
> >
> > Yes, PowerPC and arm.
> 
> Okay great. Maybe Yishai can explain what mlx drivers are doing..
> 
> Is this better:
> 
> /* NOTE: The endianness of PCI MMIO accesses is architecture specific,
>    this is known correct for BE PPC and LE x86/ARM */
>

Sure.
 


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche Sept. 30, 2016, 4:51 p.m. UTC | #6
On 09/29/2016 01:47 PM, Jason Gunthorpe wrote:
> For a long time now endian.h has defined sane fixed width conversion
> macros, so let's just use them instead of rolling our own.
>
> Also, htonll is defined in this source tree under infiniband/arch.h,
> so all users of that macro can just use the header.
>
> Someday we should also get rid of all the endless wrappers..
>
> Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> ---
>  ibacm/linux/osd.h                    |  8 +-------
>  libcxgb3/src/cxio_wr.h               | 10 +++-------
>  libcxgb4/src/t4.h                    | 11 ++++-------
>  libi40iw/src/i40iw_umain.h           | 18 ++----------------
>  libibumad/src/sysfs.c                |  8 +-------
>  libibverbs/include/infiniband/arch.h | 15 ++++++---------
>  libnes/src/nes_umain.h               | 17 ++---------------
>  libocrdma/src/ocrdma_verbs.c         | 32 ++++----------------------------
>  librdmacm/examples/common.h          | 11 +++--------
>  librdmacm/src/cma.h                  |  9 +--------
>  srp_daemon/srp_daemon/srp_daemon.h   | 17 +----------------
>  11 files changed, 28 insertions(+), 128 deletions(-)

For the srp_daemon changes:

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hal Rosenstock Sept. 30, 2016, 5:02 p.m. UTC | #7
On 9/29/2016 4:47 PM, Jason Gunthorpe wrote:
>  libibumad/src/sysfs.c                |  8 +-------

For libibumad bit,

Acked-by: Hal Rosenstock <hal@mellanox.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yishai Hadas Oct. 5, 2016, 3:16 p.m. UTC | #8
On 9/30/2016 3:08 AM, Jason Gunthorpe wrote:
> On Thu, Sep 29, 2016 at 06:46:02PM -0500, Steve Wise wrote:
>>> I added the comment because cxgb* was the only provider that seemed to
>>> be obviously byte swapping MMIO.
>>>
>>> Since htole32 is a NOP on x86 and ARM64 the driver would work fine
>>> with or without it. Do you know this driver works on big endian
>>> powerpc? That would help answer the question for sure.
>>
>> Yes, PowerPC and arm.
>
> Okay great. Maybe Yishai can explain what mlx drivers are doing..

MLX drivers use BE format for QPN as part of writing a door bell in both 
X86 and PPC BE as their NICs expect to get, code is certified to work 
fine on both.

Specifically,
Upon QP creation the code uses htonl for setting qp->doorbell_qpn (see 
mlx4_create_qp_ex), later on this value is used without any change upon 
writing a door bell. (see mmio_writel). No PCI swap is expected.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Oct. 5, 2016, 3:37 p.m. UTC | #9
On Thu, Sep 29, 2016 at 04:28:02PM -0600, Jason Gunthorpe wrote:
> In the kernel the swapping behavior of 'writel' is architecture (and
> sometimes even platform) specific. Looking at the kernel headers
> suggests that BE PPC needs an explicit swap and BE ARM does not (eg
> the hardware swaps). Maybe. There are a lot of layers of macros in
> this area..

read{s,l,q} and write{s,l,q} in the kernel always perform an implicit
byteswap, that is the device is expected to use a LE layout.  This
is not architecture specific, otherwise hell would break loose.

If the device has a BE layout you'll need to use ioread*be/iowrite*be
instead.

But none of this has an effect on userspace mappings of registers, as
those don't go through the kernel read{s,l,q} and write{s,l,q} helpers,
so if you mmap the resource directly you need to manually byte swap
to whatever the hardware expects - which seems to be LE for Chelsio
and BE for Mellanox.

Btw, one thing that would be useful is to introduce endianness annotations
for sparse in rdma-core.   I've added these to a few userspace projects
already, here is the most recent example:

https://github.com/linux-nvme/nvme-cli/commit/126b8beb35b76fe9f61b137e34aea66b64c07b25


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Oct. 5, 2016, 5:33 p.m. UTC | #10
On Wed, Oct 05, 2016 at 08:37:01AM -0700, Christoph Hellwig wrote:
> On Thu, Sep 29, 2016 at 04:28:02PM -0600, Jason Gunthorpe wrote:
> > In the kernel the swapping behavior of 'writel' is architecture (and
> > sometimes even platform) specific. Looking at the kernel headers
> > suggests that BE PPC needs an explicit swap and BE ARM does not (eg
> > the hardware swaps). Maybe. There are a lot of layers of macros in
> > this area..
> 
> read{s,l,q} and write{s,l,q} in the kernel always perform an implicit
> byteswap, that is the device is expected to use a LE layout.  This
> is not architecture specific, otherwise hell would break loose.

Right, stated differently, the kernel requires that writel()/etc
always produce the same PCI-E packet on the wire. (eg writel(1)
produces a TLP with bit 0 of the data payload set)

But each arch implements its own set of instructions to get there,
that is what I mean by architecture specific.

> But none of this has an effect on userspace mappings of registers, as
> those don't go through the kernel read{s,l,q} and write{s,l,q}
> helpers,

Not going through the kernel's writel is the whole problem. The writel
helper generates the arch-specific instruction sequence required to
generate the required PCI-E packet.

Today (at best, some drivers do not even do this) our userspace
assumes all archs implement writel as:

   *(u32 *)reg = cpu_to_le32(val);

Which is a good start, but not portable to every arch the kernel
supports.

> Btw, one thing that would be useful is to introduce endianness annotations
> for sparse in rdma-core.   I've added these to a few userspace projects
> already, here is the most recent example:
> 
> https://github.com/linux-nvme/nvme-cli/commit/126b8beb35b76fe9f61b137e34aea66b64c07b25

Yeah, it would be nice to get that working too. I guess we need to
standardize on the cpu_to_xx macro style as a first step?

The __iomem annotation would be nice as well.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Oct. 5, 2016, 5:44 p.m. UTC | #11
On Wed, Oct 05, 2016 at 06:16:45PM +0300, Yishai Hadas wrote:

> Upon QP creation the code uses htonl for setting qp->doorbell_qpn (see
> mlx4_create_qp_ex), later on this value is used without any change upon
> writing a door bell. (see mmio_writel). No PCI swap is expected.

Thanks, so the mlx drivers are doing what the kernel calls writel_be()?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Oct. 6, 2016, 11:40 a.m. UTC | #12
On Wed, Oct 05, 2016 at 11:33:59AM -0600, Jason Gunthorpe wrote:
> Right, stated differently, the kernel requires that writel()/etc
> always produce the same PCI-E packet on the wire. (eg writel(1)
> produces a TLP with bit 0 of the data payload set)

Exactly.

> Not going through the kernel's writel is the whole problem. The writel
> helper generates the arch-specific instruction sequence required to
> generate the required PCI-E packet.

All the architectures (including your quoted above ARM example) seem
to do the byte swap in software.  That seems to be important for
accesses like memcpy_{to,from}_io, which would be painful to handle.

But yes, in theory an architecture could do it any way it wants.

> Today (at best, some drivers do not even do this) our userspace
> assumes all archs implement writel as:
> 
>    *(u32 *)reg = cpu_to_le32(val);
> 
> Which is a good start, but not portable to every arch the kernel
> supports.

It should do the right thing for every architecture that matters.
That being said having an iomem abstraction certainly makes sense
for various reasons, and handling any oddball architecture (or rather
PCI hostbridge implementation, I would not expect something this
broken to be universal) would come as a bonus.

> Yeah, it would be nice to get that working too. I guess we need to
> standardize on the cpu_to_xx macro style as a first step?

Any style will work as long as it separates the swap directions,
but for low-level Linux projects using the kernel style certainly
makes sense.

> The __iomem annotation would be nice as well.

I can look into that as well.  Usually the first step before adding
sparse annotations is fixing all the misc sparse warnings, as typical
userspace projects have a not too stellar code quality.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yishai Hadas Oct. 6, 2016, 12:09 p.m. UTC | #13
On 10/5/2016 8:44 PM, Jason Gunthorpe wrote:
> On Wed, Oct 05, 2016 at 06:16:45PM +0300, Yishai Hadas wrote:
>
>> Upon QP creation the code uses htonl for setting qp->doorbell_qpn (see
>> mlx4_create_qp_ex), later on this value is used without any change upon
>> writing a door bell. (see mmio_writel). No PCI swap is expected.
>
> Thanks, so the mlx drivers are doing what the kernel calls writel_be()?

It's logically the same. You can look also in mlx4 driver code in the 
kernel [1], writel() is used on qp->doorbell_qpn which was previously 
set to be BE as the hardware expects.

[1] 
http://lxr.free-electrons.com/source/drivers/infiniband/hw/mlx4/qp.c#L3223


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/ibacm/linux/osd.h b/ibacm/linux/osd.h
index 2c4db81b9e8e..c1e7d996c371 100644
--- a/ibacm/linux/osd.h
+++ b/ibacm/linux/osd.h
@@ -46,6 +46,7 @@ 
 #include <arpa/inet.h>
 #include <sys/time.h>
 #include <netinet/in.h>
+#include <infiniband/arch.h>
 
 #include <ccan/minmax.h>
 
@@ -55,13 +56,6 @@ 
 #define LIB_DESTRUCTOR __attribute__((destructor))
 #define CDECL_FUNC
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define htonll(x) bswap_64(x)
-#else
-#define htonll(x) (x)
-#endif
-#define ntohll(x) htonll(x)
-
 #if DEFINE_ATOMICS
 typedef struct { pthread_mutex_t mut; int val; } atomic_t;
 static inline int atomic_inc(atomic_t *atomic)
diff --git a/libcxgb3/src/cxio_wr.h b/libcxgb3/src/cxio_wr.h
index ece06bd0568c..e24c7fed7d76 100644
--- a/libcxgb3/src/cxio_wr.h
+++ b/libcxgb3/src/cxio_wr.h
@@ -50,13 +50,9 @@ 
 #define Q_COUNT(rptr,wptr) ((wptr)-(rptr))
 #define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1))
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#  define cpu_to_pci32(val) ((val))
-#elif __BYTE_ORDER == __BIG_ENDIAN
-#  define cpu_to_pci32(val) (__bswap_32((val)))
-#else
-#  error __BYTE_ORDER not defined
-#endif
+/* Generally speaking, PCI systems auto-byteswap on PCI accesses, so this is
+   probably wrong */
+#define cpu_to_pci32(val) htole32(val)
 
 #define RING_DOORBELL(doorbell, QPID) { \
 	*doorbell = cpu_to_pci32(QPID); \
diff --git a/libcxgb4/src/t4.h b/libcxgb4/src/t4.h
index e8c5cf66cb14..e519cc4087e6 100644
--- a/libcxgb4/src/t4.h
+++ b/libcxgb4/src/t4.h
@@ -62,13 +62,10 @@ 
 #define unlikely
 #define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
 #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#  define cpu_to_pci32(val) ((val))
-#elif __BYTE_ORDER == __BIG_ENDIAN
-#  define cpu_to_pci32(val) (__bswap_32((val)))
-#else
-#  error __BYTE_ORDER not defined
-#endif
+
+/* Generally speaking, PCI systems auto-byteswap on PCI accesses, so this is
+   probably wrong */
+#define cpu_to_pci32(val) htole32(val)
 
 #define writel(v, a) do { *((volatile u32 *)(a)) = cpu_to_pci32(v); } while (0)
 
diff --git a/libi40iw/src/i40iw_umain.h b/libi40iw/src/i40iw_umain.h
index 39b54ba29dcb..13d3da89eba9 100644
--- a/libi40iw/src/i40iw_umain.h
+++ b/libi40iw/src/i40iw_umain.h
@@ -195,28 +195,14 @@  int i40iw_uattach_mcast(struct ibv_qp *, const union ibv_gid *, uint16_t);
 int i40iw_udetach_mcast(struct ibv_qp *, const union ibv_gid *, uint16_t);
 void i40iw_async_event(struct ibv_async_event *event);
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
 static inline uint32_t cpu_to_le32(uint32_t x)
 {
-	return x;
+	return htole32(x);
 }
 
 static inline uint32_t le32_to_cpu(uint32_t x)
 {
-	return x;
+	return le32toh(x);
 }
-#else
-static inline uint32_t cpu_to_le32(uint32_t x)
-{
-	return ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) |
-	    ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24);
-}
-
-static inline uint32_t le32_to_cpu(uint32_t x)
-{
-	return ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) |
-	    ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24);
-}
-#endif
 
 #endif /* i40iw_umain_H */
diff --git a/libibumad/src/sysfs.c b/libibumad/src/sysfs.c
index d10f312285ea..011e411f0951 100644
--- a/libibumad/src/sysfs.c
+++ b/libibumad/src/sysfs.c
@@ -41,15 +41,9 @@ 
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
-#include <endian.h>
 #include <byteswap.h>
 #include <netinet/in.h>
-
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define htonll(x) bswap_64(x)
-#else
-#define htonll(x) (x)
-#endif
+#include <infiniband/arch.h>
 
 static int ret_code(void)
 {
diff --git a/libibverbs/include/infiniband/arch.h b/libibverbs/include/infiniband/arch.h
index e35ecf05bceb..bf0feec08d73 100644
--- a/libibverbs/include/infiniband/arch.h
+++ b/libibverbs/include/infiniband/arch.h
@@ -37,15 +37,12 @@ 
 #include <endian.h>
 #include <byteswap.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-static inline uint64_t htonll(uint64_t x) { return bswap_64(x); }
-static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); }
-#elif __BYTE_ORDER == __BIG_ENDIAN
-static inline uint64_t htonll(uint64_t x) { return x; }
-static inline uint64_t ntohll(uint64_t x) { return x; }
-#else
-#error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN
-#endif
+#undef htonll
+#undef ntohll
+static inline uint64_t htonll(uint64_t x) { return htobe64(x); }
+static inline uint64_t ntohll(uint64_t x) { return be64toh(x); }
+#define htonll htonll
+#define ntohll ntohll
 
 /*
  * Architecture-specific defines.  Currently, an architecture is
diff --git a/libnes/src/nes_umain.h b/libnes/src/nes_umain.h
index c53acd7e90fe..91299823e675 100644
--- a/libnes/src/nes_umain.h
+++ b/libnes/src/nes_umain.h
@@ -393,26 +393,13 @@  int nes_uattach_mcast(struct ibv_qp *, const union ibv_gid *, uint16_t);
 int nes_udetach_mcast(struct ibv_qp *, const union ibv_gid *, uint16_t);
 void nes_async_event(struct ibv_async_event *event);
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
 static inline uint32_t cpu_to_le32(uint32_t x)
 {
-	return x;
+	return htole32(x);
 }
 static inline uint32_t le32_to_cpu(uint32_t x)
 {
-	return x;
+	return le32toh(x);
 }
-#else
-static inline uint32_t cpu_to_le32(uint32_t x)
-{
-	return (((x&0xFF000000)>>24) | ((x&0x00FF0000)>>8) |
-			((x&0x0000FF00)<<8) | ((x&0x000000FF)<<24));
-}
-static inline uint32_t le32_to_cpu(uint32_t x)
-{
-	return (((x&0xFF000000)>>24) | ((x&0x00FF0000)>>8) |
-			((x&0x0000FF00)<<8) | ((x&0x000000FF)<<24));
-}
-#endif
 
 #endif				/* nes_umain_H */
diff --git a/libocrdma/src/ocrdma_verbs.c b/libocrdma/src/ocrdma_verbs.c
index 6d58cb219fe1..163bf23f8914 100644
--- a/libocrdma/src/ocrdma_verbs.c
+++ b/libocrdma/src/ocrdma_verbs.c
@@ -46,6 +46,7 @@ 
 #include <sys/mman.h>
 #include <netinet/in.h>
 #include <unistd.h>
+#include <endian.h>
 
 #include "ocrdma_main.h"
 #include "ocrdma_abi.h"
@@ -54,48 +55,23 @@ 
 static void ocrdma_ring_cq_db(struct ocrdma_cq *cq, uint32_t armed,
 			      int solicited, uint32_t num_cqe);
 
-static inline uint32_t ocrdma_swap_endianness(uint32_t val)
-{
-	return ((val & 0xFF000000) >> 24) | ((val & 0xFF) << 24) |
-	    ((val & 0xFF00) << 8) | ((val & 0xFF0000) >> 8);
-}
-
 static inline uint32_t ocrdma_cpu_to_le(uint32_t val)
 {
-#if __BYTE_ORDER == __BIG_ENDIAN
-	return ocrdma_swap_endianness(val);
-#else
-	return val;
-#endif
+	return htole32(val);
 }
 
 static inline uint32_t ocrdma_le_to_cpu(uint32_t val)
 {
-#if __BYTE_ORDER == __BIG_ENDIAN
-	return ocrdma_swap_endianness(val);
-#else
-	return val;
-#endif
-}
-
-static inline uint32_t ocrdma_cpu_to_be(uint32_t val)
-{
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-	return ocrdma_swap_endianness(val);
-#else
-	return val;
-#endif
+	return le32toh(val);
 }
 
 static inline void ocrdma_swap_cpu_to_le(void *dst, uint32_t len)
 {
-#if __BYTE_ORDER == __BIG_ENDIAN
 	int i = 0;
 	uint32_t *src_ptr = dst;
 	uint32_t *dst_ptr = dst;
 	for (; i < (len / 4); i++)
-		*dst_ptr++ = ocrdma_swap_endianness(*src_ptr++);
-#endif
+		*dst_ptr++ = le32toh(*src_ptr++);
 }
 
 /*
diff --git a/librdmacm/examples/common.h b/librdmacm/examples/common.h
index f7511f039697..ac2d160b0044 100644
--- a/librdmacm/examples/common.h
+++ b/librdmacm/examples/common.h
@@ -34,20 +34,15 @@ 
 
 #include <stdlib.h>
 #include <sys/types.h>
-#include <byteswap.h>
+#include <endian.h>
 #include <poll.h>
 
 #include <rdma/rdma_cma.h>
 #include <rdma/rsocket.h>
 #include <infiniband/ib.h>
 
-#if __BYTE_ORDER == __BIG_ENDIAN
-static inline uint64_t cpu_to_be64(uint64_t x) { return x; }
-static inline uint32_t cpu_to_be32(uint32_t x) { return x; }
-#else
-static inline uint64_t cpu_to_be64(uint64_t x) { return bswap_64(x); }
-static inline uint32_t cpu_to_be32(uint32_t x) { return bswap_32(x); }
-#endif
+static inline uint64_t cpu_to_be64(uint64_t x) { return htobe64(x); }
+static inline uint32_t cpu_to_be32(uint32_t x) { return htobe32(x); }
 
 extern int use_rs;
 
diff --git a/librdmacm/src/cma.h b/librdmacm/src/cma.h
index 16a55a67af9e..c2f603d6f439 100644
--- a/librdmacm/src/cma.h
+++ b/librdmacm/src/cma.h
@@ -44,6 +44,7 @@ 
 
 #include <rdma/rdma_cma.h>
 #include <infiniband/ib.h>
+#include <infiniband/arch.h>
 
 #include <ccan/minmax.h>
 
@@ -51,14 +52,6 @@ 
 
 #define PFX "librdmacm: "
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-static inline uint64_t htonll(uint64_t x) { return bswap_64(x); }
-static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); }
-#else
-static inline uint64_t htonll(uint64_t x) { return x; }
-static inline uint64_t ntohll(uint64_t x) { return x; }
-#endif
-
 /*
  * Fast synchronization for low contention locking.
  */
diff --git a/srp_daemon/srp_daemon/srp_daemon.h b/srp_daemon/srp_daemon/srp_daemon.h
index d6a2d8a84728..5d42d51e6d1c 100644
--- a/srp_daemon/srp_daemon/srp_daemon.h
+++ b/srp_daemon/srp_daemon/srp_daemon.h
@@ -42,26 +42,11 @@ 
 #include <byteswap.h>
 #include <infiniband/verbs.h>
 #include <infiniband/umad.h>
+#include <infiniband/arch.h>
 
 #include "config.h"
 #include "srp_ib_types.h"
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#ifndef htonll
-#define htonll(x) bswap_64(x)
-#endif
-#ifndef ntohll
-#define ntohll(x) bswap_64(x)
-#endif
-#elif __BYTE_ORDER == __BIG_ENDIAN
-#ifndef htonll
-#define htonll(x) (x)
-#endif
-#ifndef ntohll
-#define ntohll(x) (x)
-#endif
-#endif
-
 #ifdef __cplusplus
 template <bool b> struct vki_static_assert { int m_bitfield:(2*b-1); };
 #define STATIC_ASSERT(expr) \