diff mbox series

[v3,01/16] cxl/pci: Fix CDAT retrieval on big endian

Message ID bbbe1c4f3788052865941572565aeb2be67a6770.1676043318.git.lukas@wunner.de (mailing list archive)
State Superseded
Delegated to: Bjorn Helgaas
Headers show
Series Collection of DOE material | expand

Commit Message

Lukas Wunner Feb. 10, 2023, 8:25 p.m. UTC
The CDAT exposed in sysfs differs between little endian and big endian
arches:  On big endian, every 4 bytes are byte-swapped.

PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
such as pci_read_config_dword() implicitly swap bytes on big endian.
That way, the macros in include/uapi/linux/pci_regs.h work regardless of
the arch's endianness.  For an example of implicit byte-swapping, see
ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
(Load Word Byte-Reverse Indexed).

DOE Read/Write Data Mailbox Registers are unlike other registers in
Configuration Space in that they contain or receive a 4 byte portion of
an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
They need to be copied to or from the request/response buffer verbatim.
So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
byte-swapping.

The CXL_DOE_TABLE_ACCESS_* and PCI_DOE_DATA_OBJECT_DISC_* macros assume
implicit byte-swapping.  Byte-swap requests after constructing them with
those macros and byte-swap responses before parsing them.

Change the request and response type to __le32 to avoid sparse warnings.

Fixes: c97006046c79 ("cxl/port: Read CDAT table")
Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: stable@vger.kernel.org # v6.0+
---
 Changes v2 -> v3:
 * Newly added patch in v3

 drivers/cxl/core/pci.c  | 12 ++++++------
 drivers/pci/doe.c       | 13 ++++++++-----
 include/linux/pci-doe.h |  8 ++++++--
 3 files changed, 20 insertions(+), 13 deletions(-)

Comments

Dan Williams Feb. 11, 2023, 12:22 a.m. UTC | #1
Lukas Wunner wrote:
> The CDAT exposed in sysfs differs between little endian and big endian
> arches:  On big endian, every 4 bytes are byte-swapped.
> 
> PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
> such as pci_read_config_dword() implicitly swap bytes on big endian.
> That way, the macros in include/uapi/linux/pci_regs.h work regardless of
> the arch's endianness.  For an example of implicit byte-swapping, see
> ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
> (Load Word Byte-Reverse Indexed).
> 
> DOE Read/Write Data Mailbox Registers are unlike other registers in
> Configuration Space in that they contain or receive a 4 byte portion of
> an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
> They need to be copied to or from the request/response buffer verbatim.
> So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
> byte-swapping.
> 
> The CXL_DOE_TABLE_ACCESS_* and PCI_DOE_DATA_OBJECT_DISC_* macros assume
> implicit byte-swapping.  Byte-swap requests after constructing them with
> those macros and byte-swap responses before parsing them.
> 
> Change the request and response type to __le32 to avoid sparse warnings.
> 
> Fixes: c97006046c79 ("cxl/port: Read CDAT table")
> Tested-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Lukas Wunner <lukas@wunner.de>
> Cc: stable@vger.kernel.org # v6.0+

Good catch, a question below, but either way:

Reviewed-by: Dan Williams <dan.j.williams@intel.com>

> ---
>  Changes v2 -> v3:
>  * Newly added patch in v3
> 
>  drivers/cxl/core/pci.c  | 12 ++++++------
>  drivers/pci/doe.c       | 13 ++++++++-----
>  include/linux/pci-doe.h |  8 ++++++--
>  3 files changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 57764e9cd19d..d3cf1d9d67d4 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -480,7 +480,7 @@ static struct pci_doe_mb *find_cdat_doe(struct device *uport)
>  	return NULL;
>  }
>  
> -#define CDAT_DOE_REQ(entry_handle)					\
> +#define CDAT_DOE_REQ(entry_handle) cpu_to_le32				\
>  	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
>  		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
>  	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
> @@ -493,8 +493,8 @@ static void cxl_doe_task_complete(struct pci_doe_task *task)
>  }
>  
>  struct cdat_doe_task {
> -	u32 request_pl;
> -	u32 response_pl[32];
> +	__le32 request_pl;
> +	__le32 response_pl[32];
>  	struct completion c;
>  	struct pci_doe_task task;
>  };
> @@ -531,7 +531,7 @@ static int cxl_cdat_get_length(struct device *dev,
>  	if (t.task.rv < sizeof(u32))
>  		return -EIO;
>  
> -	*length = t.response_pl[1];
> +	*length = le32_to_cpu(t.response_pl[1]);
>  	dev_dbg(dev, "CDAT length %zu\n", *length);
>  
>  	return 0;
> @@ -548,7 +548,7 @@ static int cxl_cdat_read_table(struct device *dev,
>  	do {
>  		DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t);
>  		size_t entry_dw;
> -		u32 *entry;
> +		__le32 *entry;
>  		int rc;
>  
>  		rc = pci_doe_submit_task(cdat_doe, &t.task);
> @@ -563,7 +563,7 @@ static int cxl_cdat_read_table(struct device *dev,
>  
>  		/* Get the CXL table access header entry handle */
>  		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
> -					 t.response_pl[0]);
> +					 le32_to_cpu(t.response_pl[0]));
>  		entry = t.response_pl + 1;
>  		entry_dw = t.task.rv / sizeof(u32);
>  		/* Skip Header */
> diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
> index 66d9ab288646..69efa9a250b9 100644
> --- a/drivers/pci/doe.c
> +++ b/drivers/pci/doe.c
> @@ -143,7 +143,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
>  					  length));
>  	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
>  		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
> -				       task->request_pl[i]);
> +				       le32_to_cpu(task->request_pl[i]));

What do you think about defining:

int pci_doe_write(const struct pci_dev *dev, int where, __le32 val);
int pci_doe_read(const struct pci_dev *dev, int where, __le32 *val);

...local to this file to make it extra clear that DOE transfers are
passing raw byte-streams and not register values as
pci_{write,read}_config_dword() expect.
Jonathan Cameron Feb. 14, 2023, 11:15 a.m. UTC | #2
On Fri, 10 Feb 2023 21:25:01 +0100
Lukas Wunner <lukas@wunner.de> wrote:

> The CDAT exposed in sysfs differs between little endian and big endian
> arches:  On big endian, every 4 bytes are byte-swapped.
> 
> PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
> such as pci_read_config_dword() implicitly swap bytes on big endian.
> That way, the macros in include/uapi/linux/pci_regs.h work regardless of
> the arch's endianness.  For an example of implicit byte-swapping, see
> ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
> (Load Word Byte-Reverse Indexed).
> 
> DOE Read/Write Data Mailbox Registers are unlike other registers in
> Configuration Space in that they contain or receive a 4 byte portion of
> an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
> They need to be copied to or from the request/response buffer verbatim.
> So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
> byte-swapping.
> 
> The CXL_DOE_TABLE_ACCESS_* and PCI_DOE_DATA_OBJECT_DISC_* macros assume
> implicit byte-swapping.  Byte-swap requests after constructing them with
> those macros and byte-swap responses before parsing them.
> 
> Change the request and response type to __le32 to avoid sparse warnings.

I'll start by saying I hate endian issues. Endless headaches.

The type swaps in here are confusing me but after arguing myself both
ways this morning, I think you have it right.

In brief this patch set ensures that internal handling of the payloads
is in le32 chunks by unwinding the implicit conversions in the pci config
accessors for BE platforms.  Thus the data in the payloads is always
in the same order.  Good start.  Hence the binary read back of CDAT is
little endian (including all fields within it).

Should we reflect that in the type of cdat->table or at least he u32 *data
pointer in cxl_cdat_read_table()

A few other trivial simplifications inline.

Jonathan


> 
> Fixes: c97006046c79 ("cxl/port: Read CDAT table")
> Tested-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Lukas Wunner <lukas@wunner.de>
> Cc: stable@vger.kernel.org # v6.0+
> ---
>  Changes v2 -> v3:
>  * Newly added patch in v3
> 
>  drivers/cxl/core/pci.c  | 12 ++++++------
>  drivers/pci/doe.c       | 13 ++++++++-----
>  include/linux/pci-doe.h |  8 ++++++--
>  3 files changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 57764e9cd19d..d3cf1d9d67d4 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -480,7 +480,7 @@ static struct pci_doe_mb *find_cdat_doe(struct device *uport)
>  	return NULL;
>  }
>  
> -#define CDAT_DOE_REQ(entry_handle)					\
> +#define CDAT_DOE_REQ(entry_handle) cpu_to_le32				\
>  	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
>  		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
>  	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
> @@ -493,8 +493,8 @@ static void cxl_doe_task_complete(struct pci_doe_task *task)
>  }
>  
>  struct cdat_doe_task {
> -	u32 request_pl;
> -	u32 response_pl[32];
> +	__le32 request_pl;
> +	__le32 response_pl[32];
>  	struct completion c;
>  	struct pci_doe_task task;
>  };
> @@ -531,7 +531,7 @@ static int cxl_cdat_get_length(struct device *dev,
>  	if (t.task.rv < sizeof(u32))
>  		return -EIO;
>  
> -	*length = t.response_pl[1];
> +	*length = le32_to_cpu(t.response_pl[1]);
>  	dev_dbg(dev, "CDAT length %zu\n", *length);
>  
>  	return 0;
> @@ -548,7 +548,7 @@ static int cxl_cdat_read_table(struct device *dev,
>  	do {
>  		DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t);
>  		size_t entry_dw;
> -		u32 *entry;
> +		__le32 *entry;
>  		int rc;
>  
>  		rc = pci_doe_submit_task(cdat_doe, &t.task);
> @@ -563,7 +563,7 @@ static int cxl_cdat_read_table(struct device *dev,
>  
>  		/* Get the CXL table access header entry handle */
>  		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
> -					 t.response_pl[0]);
> +					 le32_to_cpu(t.response_pl[0]));
>  		entry = t.response_pl + 1;
>  		entry_dw = t.task.rv / sizeof(u32);

Given we are manipulating it as __le32, I'd prefer to see that reflected
in the sizeof() calls.

>  		/* Skip Header */
> diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
> index 66d9ab288646..69efa9a250b9 100644
> --- a/drivers/pci/doe.c
> +++ b/drivers/pci/doe.c
> @@ -143,7 +143,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
>  					  length));
>  	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)

sizeof(__le32) or sizeof(task->request_pl[0])

>  		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
> -				       task->request_pl[i]);
> +				       le32_to_cpu(task->request_pl[i]));
>  
>  	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
>  
> @@ -198,8 +198,8 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas
>  	payload_length = min(length, task->response_pl_sz / sizeof(u32));
>  	/* Read the rest of the response payload */
>  	for (i = 0; i < payload_length; i++) {
> -		pci_read_config_dword(pdev, offset + PCI_DOE_READ,
> -				      &task->response_pl[i]);
> +		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
> +		task->response_pl[i] = cpu_to_le32(val);
>  		/* Prior to the last ack, ensure Data Object Ready */
>  		if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb))
>  			return -EIO;
> @@ -322,15 +322,17 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
>  	struct pci_doe_task task = {
>  		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
>  		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
> -		.request_pl = &request_pl,
> +		.request_pl = (__le32 *)&request_pl,

I don't like this type cast. request_pl is local
anyway, so why not change it's type and set it appropriately in
the first place.

>  		.request_pl_sz = sizeof(request_pl),
> -		.response_pl = &response_pl,
> +		.response_pl = (__le32 *)&response_pl,

Likewise here.

>  		.response_pl_sz = sizeof(response_pl),
>  		.complete = pci_doe_task_complete,
>  		.private = &c,
>  	};
>  	int rc;
>  
> +	cpu_to_le32s(&request_pl);
> +
>  	rc = pci_doe_submit_task(doe_mb, &task);
>  	if (rc < 0)
>  		return rc;
> @@ -340,6 +342,7 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
>  	if (task.rv != sizeof(response_pl))
>  		return -EIO;
>  
> +	le32_to_cpus(&response_pl);
>  	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
>  	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
>  			      response_pl);
> diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
> index ed9b4df792b8..43765eaf2342 100644
> --- a/include/linux/pci-doe.h
> +++ b/include/linux/pci-doe.h
> @@ -34,6 +34,10 @@ struct pci_doe_mb;
>   * @work: Used internally by the mailbox
>   * @doe_mb: Used internally by the mailbox
>   *
> + * Payloads are treated as opaque byte streams which are transmitted verbatim,
> + * without byte-swapping.  If payloads contain little-endian register values,
> + * the caller is responsible for conversion with cpu_to_le32() / le32_to_cpu().
> + *
>   * The payload sizes and rv are specified in bytes with the following
>   * restrictions concerning the protocol.
>   *
> @@ -45,9 +49,9 @@ struct pci_doe_mb;
>   */
>  struct pci_doe_task {
>  	struct pci_doe_protocol prot;
> -	u32 *request_pl;
> +	__le32 *request_pl;
>  	size_t request_pl_sz;
> -	u32 *response_pl;
> +	__le32 *response_pl;
>  	size_t response_pl_sz;
>  	int rv;
>  	void (*complete)(struct pci_doe_task *task);
Lukas Wunner Feb. 14, 2023, 1:51 p.m. UTC | #3
On Tue, Feb 14, 2023 at 11:15:04AM +0000, Jonathan Cameron wrote:
> On Fri, 10 Feb 2023 21:25:01 +0100 Lukas Wunner <lukas@wunner.de> wrote:
> > The CDAT exposed in sysfs differs between little endian and big endian
> > arches:  On big endian, every 4 bytes are byte-swapped.
> > 
> > PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
> > such as pci_read_config_dword() implicitly swap bytes on big endian.
> > That way, the macros in include/uapi/linux/pci_regs.h work regardless of
> > the arch's endianness.  For an example of implicit byte-swapping, see
> > ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
> > (Load Word Byte-Reverse Indexed).
> > 
> > DOE Read/Write Data Mailbox Registers are unlike other registers in
> > Configuration Space in that they contain or receive a 4 byte portion of
> > an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
> > They need to be copied to or from the request/response buffer verbatim.
> > So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
> > byte-swapping.
> > 
> > The CXL_DOE_TABLE_ACCESS_* and PCI_DOE_DATA_OBJECT_DISC_* macros assume
> > implicit byte-swapping.  Byte-swap requests after constructing them with
> > those macros and byte-swap responses before parsing them.
> > 
> > Change the request and response type to __le32 to avoid sparse warnings.
> 
> In brief this patch set ensures that internal handling of the payloads
> is in le32 chunks by unwinding the implicit conversions in the pci config
> accessors for BE platforms.  Thus the data in the payloads is always
> in the same order.  Good start.  Hence the binary read back of CDAT is
> little endian (including all fields within it).

Correct.  On big endian this means writing request dwords and
reading response dwords involves 2 endianness conversions.

If the request is constructed with macros such as CXL_DOE_TABLE_ACCESS_*
and PCI_DOE_DATA_OBJECT_DISC_*, then an additional (3rd) endianness
conversion is necessary:  The request is constructed in native big endian
order, then converted to a little endian DOE payload.  That payload is
then converted back to big endian so that it can be written with the
config space accessor, which converts to little endian.

I recognize this is crazy but I do not see a better solution.
Dan suggested amending struct pci_ops with ->read_raw and ->write_raw
callbacks which omit the endianness conversion.  However there are
175 struct pci_ops in the kernel and each one would have to be looked at.
I also do not know the Assembler instructions for every architecture to
perform a non-byte-swapped write or read.  It's a big task and lots of
code to add.  Given that DOE is likely the only user and big endian
is seeing decreasing use, it's probably not worth the effort and LoC.


> > @@ -322,15 +322,17 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
> >  	struct pci_doe_task task = {
> >  		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
> >  		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
> > -		.request_pl = &request_pl,
> > +		.request_pl = (__le32 *)&request_pl,
> 
> I don't like this type cast. request_pl is local
> anyway, so why not change it's type and set it appropriately in
> the first place.

That doesn't work I'm afraid.  The request_pl is constructed with
FIELD_PREP() here and the response_pl is parsed with FIELD_GET().

Those macros operate on an unsigned integer of native endianness.
If I declare them __le32, I get sparse warnings and rightfully so.

The casts are the simplest, least intrusive solution.  The only
alternative would be to use separate variables but then I'd have
to change a lot more lines and it would be more difficult to review
and probably not all that more readable.

Thanks,

Lukas

> >  		.request_pl_sz = sizeof(request_pl),
> > -		.response_pl = &response_pl,
> > +		.response_pl = (__le32 *)&response_pl,
> 
> Likewise here.
> 
> >  		.response_pl_sz = sizeof(response_pl),
> >  		.complete = pci_doe_task_complete,
> >  		.private = &c,
> >  	};
> >  	int rc;
> >  
> > +	cpu_to_le32s(&request_pl);
> > +
> >  	rc = pci_doe_submit_task(doe_mb, &task);
> >  	if (rc < 0)
> >  		return rc;
> > @@ -340,6 +342,7 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
> >  	if (task.rv != sizeof(response_pl))
> >  		return -EIO;
> >  
> > +	le32_to_cpus(&response_pl);
> >  	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
> >  	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
> >  			      response_pl);
Jonathan Cameron Feb. 14, 2023, 3:45 p.m. UTC | #4
On Tue, 14 Feb 2023 14:51:10 +0100
Lukas Wunner <lukas@wunner.de> wrote:

> On Tue, Feb 14, 2023 at 11:15:04AM +0000, Jonathan Cameron wrote:
> > On Fri, 10 Feb 2023 21:25:01 +0100 Lukas Wunner <lukas@wunner.de> wrote:  
> > > The CDAT exposed in sysfs differs between little endian and big endian
> > > arches:  On big endian, every 4 bytes are byte-swapped.
> > > 
> > > PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
> > > such as pci_read_config_dword() implicitly swap bytes on big endian.
> > > That way, the macros in include/uapi/linux/pci_regs.h work regardless of
> > > the arch's endianness.  For an example of implicit byte-swapping, see
> > > ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
> > > (Load Word Byte-Reverse Indexed).
> > > 
> > > DOE Read/Write Data Mailbox Registers are unlike other registers in
> > > Configuration Space in that they contain or receive a 4 byte portion of
> > > an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
> > > They need to be copied to or from the request/response buffer verbatim.
> > > So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
> > > byte-swapping.
> > > 
> > > The CXL_DOE_TABLE_ACCESS_* and PCI_DOE_DATA_OBJECT_DISC_* macros assume
> > > implicit byte-swapping.  Byte-swap requests after constructing them with
> > > those macros and byte-swap responses before parsing them.
> > > 
> > > Change the request and response type to __le32 to avoid sparse warnings.  
> > 
> > In brief this patch set ensures that internal handling of the payloads
> > is in le32 chunks by unwinding the implicit conversions in the pci config
> > accessors for BE platforms.  Thus the data in the payloads is always
> > in the same order.  Good start.  Hence the binary read back of CDAT is
> > little endian (including all fields within it).  
> 
> Correct.  On big endian this means writing request dwords and
> reading response dwords involves 2 endianness conversions.
> 
> If the request is constructed with macros such as CXL_DOE_TABLE_ACCESS_*
> and PCI_DOE_DATA_OBJECT_DISC_*, then an additional (3rd) endianness
> conversion is necessary:  The request is constructed in native big endian
> order, then converted to a little endian DOE payload.  That payload is
> then converted back to big endian so that it can be written with the
> config space accessor, which converts to little endian.
> 
> I recognize this is crazy but I do not see a better solution.
> Dan suggested amending struct pci_ops with ->read_raw and ->write_raw
> callbacks which omit the endianness conversion.  However there are
> 175 struct pci_ops in the kernel and each one would have to be looked at.
> I also do not know the Assembler instructions for every architecture to
> perform a non-byte-swapped write or read.  It's a big task and lots of
> code to add.  Given that DOE is likely the only user and big endian
> is seeing decreasing use, it's probably not worth the effort and LoC.
> 
> 
> > > @@ -322,15 +322,17 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
> > >  	struct pci_doe_task task = {
> > >  		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
> > >  		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
> > > -		.request_pl = &request_pl,
> > > +		.request_pl = (__le32 *)&request_pl,  
> > 
> > I don't like this type cast. request_pl is local
> > anyway, so why not change it's type and set it appropriately in
> > the first place.  
> 
> That doesn't work I'm afraid.  The request_pl is constructed with
> FIELD_PREP() here and the response_pl is parsed with FIELD_GET().
> 
> Those macros operate on an unsigned integer of native endianness.
> If I declare them __le32, I get sparse warnings and rightfully so.
> 
> The casts are the simplest, least intrusive solution.  The only
> alternative would be to use separate variables but then I'd have
> to change a lot more lines and it would be more difficult to review
> and probably not all that more readable.

Separate variables was what I meant (less than clear)

Add a __le32 le32_req_pl then
do the endian conversion into that rather than in place conversion.
Equivalent in teh other direction for the response.

That way the endian markings end up correct on all the local variables
and it shouldn't require many more lines to change.


> 
> Thanks,
> 
> Lukas
> 
> > >  		.request_pl_sz = sizeof(request_pl),
> > > -		.response_pl = &response_pl,
> > > +		.response_pl = (__le32 *)&response_pl,  
> > 
> > Likewise here.
> >   
> > >  		.response_pl_sz = sizeof(response_pl),
> > >  		.complete = pci_doe_task_complete,
> > >  		.private = &c,
> > >  	};
> > >  	int rc;
> > >  
> > > +	cpu_to_le32s(&request_pl);
> > > +
> > >  	rc = pci_doe_submit_task(doe_mb, &task);
> > >  	if (rc < 0)
> > >  		return rc;
> > > @@ -340,6 +342,7 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
> > >  	if (task.rv != sizeof(response_pl))
> > >  		return -EIO;
> > >  
> > > +	le32_to_cpus(&response_pl);
> > >  	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
> > >  	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
> > >  			      response_pl);  
>
Lukas Wunner Feb. 19, 2023, 1:03 p.m. UTC | #5
On Fri, Feb 10, 2023 at 04:22:47PM -0800, Dan Williams wrote:
> Lukas Wunner wrote:
> > PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
> > such as pci_read_config_dword() implicitly swap bytes on big endian.
> > That way, the macros in include/uapi/linux/pci_regs.h work regardless of
> > the arch's endianness.  For an example of implicit byte-swapping, see
> > ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
> > (Load Word Byte-Reverse Indexed).
> > 
> > DOE Read/Write Data Mailbox Registers are unlike other registers in
> > Configuration Space in that they contain or receive a 4 byte portion of
> > an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
> > They need to be copied to or from the request/response buffer verbatim.
> > So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
> > byte-swapping.
[...]
> > --- a/drivers/pci/doe.c
> > +++ b/drivers/pci/doe.c
> > @@ -143,7 +143,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
> >  					  length));
> >  	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
> >  		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
> > -				       task->request_pl[i]);
> > +				       le32_to_cpu(task->request_pl[i]));
> 
> What do you think about defining:
> 
> int pci_doe_write(const struct pci_dev *dev, int where, __le32 val);
> int pci_doe_read(const struct pci_dev *dev, int where, __le32 *val);
> 
> ...local to this file to make it extra clear that DOE transfers are
> passing raw byte-streams and not register values as
> pci_{write,read}_config_dword() expect.

I've done a prototype (see below), but it doesn't strike me as an
improvement:

There are only two occurrences for each of those read and write accessors,
so the diffstat isn't pretty (27 insertions, 12 deletions).

Moreover, the request header constructed in pci_doe_send_req() is
constructed in native endianness and written using the standard
pci_write_config_dword() accessor.  Same for the response header
parsed in pci_doe_recv_resp().  Thus, the functions do not
consistently use the new custom accessors, but rather use a mix
of the standard accessors and custom accessors.  So clarity may
not improve all that much.

Let me know if you feel otherwise!

The patch below goes on top of the series.  I'm using a variation
of your suggested approach in that I've named the custom accessors
pci_doe_{write,read}_data() (for consistency with the existing
pci_doe_write_ctrl()).

Thanks,

Lukas

-- >8 --

diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index 8499cf9..6b0148e 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -106,6 +106,24 @@ static void pci_doe_write_ctrl(struct pci_doe_mb *doe_mb, u32 val)
 	pci_write_config_dword(pdev, offset + PCI_DOE_CTRL, val);
 }
 
+static void pci_doe_write_data(struct pci_doe_mb *doe_mb, __le32 lav)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+
+	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, le32_to_cpu(lav));
+}
+
+static void pci_doe_read_data(struct pci_doe_mb *doe_mb, __le32 *lav)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	u32 val;
+
+	pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+	*lav = cpu_to_le32(val);
+}
+
 static int pci_doe_abort(struct pci_doe_mb *doe_mb)
 {
 	struct pci_dev *pdev = doe_mb->pdev;
@@ -144,6 +162,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
 	struct pci_dev *pdev = doe_mb->pdev;
 	int offset = doe_mb->cap_offset;
 	size_t length, remainder;
+	__le32 lav;
 	u32 val;
 	int i;
 
@@ -177,16 +196,14 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
 
 	/* Write payload */
 	for (i = 0; i < task->request_pl_sz / sizeof(__le32); i++)
-		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
-				       le32_to_cpu(task->request_pl[i]));
+		pci_doe_write_data(doe_mb, task->request_pl[i]);
 
 	/* Write last payload dword */
 	remainder = task->request_pl_sz % sizeof(__le32);
 	if (remainder) {
-		val = 0;
-		memcpy(&val, &task->request_pl[i], remainder);
-		le32_to_cpus(&val);
-		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val);
+		lav = 0;
+		memcpy(&lav, &task->request_pl[i], remainder);
+		pci_doe_write_data(doe_mb, lav);
 	}
 
 	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
@@ -211,6 +228,7 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas
 	size_t length, payload_length, remainder, received;
 	struct pci_dev *pdev = doe_mb->pdev;
 	int offset = doe_mb->cap_offset;
+	__le32 lav;
 	int i = 0;
 	u32 val;
 
@@ -256,16 +274,13 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas
 	if (payload_length) {
 		/* Read all payload dwords except the last */
 		for (; i < payload_length - 1; i++) {
-			pci_read_config_dword(pdev, offset + PCI_DOE_READ,
-					      &val);
-			task->response_pl[i] = cpu_to_le32(val);
+			pci_doe_read_data(doe_mb, &task->response_pl[i]);
 			pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
 		}
 
 		/* Read last payload dword */
-		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
-		cpu_to_le32s(&val);
-		memcpy(&task->response_pl[i], &val, remainder);
+		pci_doe_read_data(doe_mb, &lav);
+		memcpy(&task->response_pl[i], &lav, remainder);
 		/* Prior to the last ack, ensure Data Object Ready */
 		if (!pci_doe_data_obj_ready(doe_mb))
 			return -EIO;
Alexey Kardashevskiy Feb. 28, 2023, 2:53 a.m. UTC | #6
On 11/2/23 07:25, Lukas Wunner wrote:
> The CDAT exposed in sysfs differs between little endian and big endian
> arches:  On big endian, every 4 bytes are byte-swapped.


hexdump prints different byte streams on LE and BE? Does not seem right.


> PCI Configuration Space is little endian (PCI r3.0 sec 6.1).  Accessors
> such as pci_read_config_dword() implicitly swap bytes on big endian.
> That way, the macros in include/uapi/linux/pci_regs.h work regardless of
> the arch's endianness.  For an example of implicit byte-swapping, see
> ppc4xx_pciex_read_config(), which calls in_le32(), which uses lwbrx
> (Load Word Byte-Reverse Indexed).
> 
> DOE Read/Write Data Mailbox Registers are unlike other registers in
> Configuration Space in that they contain or receive a 4 byte portion of
> an opaque byte stream (a "Data Object" per PCIe r6.0 sec 7.9.24.5f).
> They need to be copied to or from the request/response buffer verbatim.
> So amend pci_doe_send_req() and pci_doe_recv_resp() to undo the implicit
> byte-swapping.
> 
> The CXL_DOE_TABLE_ACCESS_* and PCI_DOE_DATA_OBJECT_DISC_* macros assume
> implicit byte-swapping.  Byte-swap requests after constructing them with
> those macros and byte-swap responses before parsing them.
> 
> Change the request and response type to __le32 to avoid sparse warnings.
>
> Fixes: c97006046c79 ("cxl/port: Read CDAT table")
> Tested-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Lukas Wunner <lukas@wunner.de>
> Cc: stable@vger.kernel.org # v6.0+
> ---
>   Changes v2 -> v3:
>   * Newly added patch in v3
> 
>   drivers/cxl/core/pci.c  | 12 ++++++------
>   drivers/pci/doe.c       | 13 ++++++++-----
>   include/linux/pci-doe.h |  8 ++++++--
>   3 files changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 57764e9cd19d..d3cf1d9d67d4 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -480,7 +480,7 @@ static struct pci_doe_mb *find_cdat_doe(struct device *uport)
>   	return NULL;
>   }
>   
> -#define CDAT_DOE_REQ(entry_handle)					\
> +#define CDAT_DOE_REQ(entry_handle) cpu_to_le32				\
>   	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
>   		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
>   	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
> @@ -493,8 +493,8 @@ static void cxl_doe_task_complete(struct pci_doe_task *task)
>   }
>   
>   struct cdat_doe_task {
> -	u32 request_pl;
> -	u32 response_pl[32];
> +	__le32 request_pl;
> +	__le32 response_pl[32];

This is ok as it is a binary format of DOE message (is it?)...

>   	struct completion c;
>   	struct pci_doe_task task;
>   };
> @@ -531,7 +531,7 @@ static int cxl_cdat_get_length(struct device *dev,
>   	if (t.task.rv < sizeof(u32))
>   		return -EIO;
>   
> -	*length = t.response_pl[1];
> +	*length = le32_to_cpu(t.response_pl[1]);
>   	dev_dbg(dev, "CDAT length %zu\n", *length);
>   
>   	return 0;
> @@ -548,7 +548,7 @@ static int cxl_cdat_read_table(struct device *dev,
>   	do {
>   		DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t);
>   		size_t entry_dw;
> -		u32 *entry;
> +		__le32 *entry;
>   		int rc;
>   
>   		rc = pci_doe_submit_task(cdat_doe, &t.task);
> @@ -563,7 +563,7 @@ static int cxl_cdat_read_table(struct device *dev,
>   
>   		/* Get the CXL table access header entry handle */
>   		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
> -					 t.response_pl[0]);
> +					 le32_to_cpu(t.response_pl[0]));
>   		entry = t.response_pl + 1;
>   		entry_dw = t.task.rv / sizeof(u32);
>   		/* Skip Header */
> diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
> index 66d9ab288646..69efa9a250b9 100644
> --- a/drivers/pci/doe.c
> +++ b/drivers/pci/doe.c
> @@ -143,7 +143,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
>   					  length));
>   	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
>   		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
> -				       task->request_pl[i]);
> +				       le32_to_cpu(task->request_pl[i]));

Does it really work on BE? My little brain explodes on all these 
convertions :)

char buf[] = { 1, 2, 3, 4 }
u32 *request_pl = buf;

request_pl[0] will be 0x01020304.
le32_to_cpu(request_pl[0]) will be 0x04030201
And then pci_write_config_dword() will do another swap.

Did I miss something? (/me is gone bringing up a BE system).

>   
>   	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
>   
> @@ -198,8 +198,8 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas
>   	payload_length = min(length, task->response_pl_sz / sizeof(u32));
>   	/* Read the rest of the response payload */
>   	for (i = 0; i < payload_length; i++) {
> -		pci_read_config_dword(pdev, offset + PCI_DOE_READ,
> -				      &task->response_pl[i]);
> +		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
> +		task->response_pl[i] = cpu_to_le32(val);
>   		/* Prior to the last ack, ensure Data Object Ready */
>   		if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb))
>   			return -EIO;
> @@ -322,15 +322,17 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
>   	struct pci_doe_task task = {
>   		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
>   		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
> -		.request_pl = &request_pl,
> +		.request_pl = (__le32 *)&request_pl,
>   		.request_pl_sz = sizeof(request_pl),
> -		.response_pl = &response_pl,
> +		.response_pl = (__le32 *)&response_pl,
>   		.response_pl_sz = sizeof(response_pl),
>   		.complete = pci_doe_task_complete,
>   		.private = &c,
>   	};
>   	int rc;
>   
> +	cpu_to_le32s(&request_pl);
> +
>   	rc = pci_doe_submit_task(doe_mb, &task);
>   	if (rc < 0)
>   		return rc;
> @@ -340,6 +342,7 @@ static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
>   	if (task.rv != sizeof(response_pl))
>   		return -EIO;
>   
> +	le32_to_cpus(&response_pl);
>   	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
>   	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
>   			      response_pl);
> diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
> index ed9b4df792b8..43765eaf2342 100644
> --- a/include/linux/pci-doe.h
> +++ b/include/linux/pci-doe.h
> @@ -34,6 +34,10 @@ struct pci_doe_mb;
>    * @work: Used internally by the mailbox
>    * @doe_mb: Used internally by the mailbox
>    *
> + * Payloads are treated as opaque byte streams which are transmitted verbatim,
> + * without byte-swapping.  If payloads contain little-endian register values,
> + * the caller is responsible for conversion with cpu_to_le32() / le32_to_cpu().
> + *
>    * The payload sizes and rv are specified in bytes with the following
>    * restrictions concerning the protocol.
>    *
> @@ -45,9 +49,9 @@ struct pci_doe_mb;
>    */
>   struct pci_doe_task {
>   	struct pci_doe_protocol prot;
> -	u32 *request_pl;
> +	__le32 *request_pl;
>   	size_t request_pl_sz;
> -	u32 *response_pl;
> +	__le32 *response_pl;


This does not look right. Either:
- pci_doe() should also take __le32* or
- pci_doe() should do cpu_to_le32() for the request, and 
pci_doe_task_complete() - for the response.


Thanks,


>   	size_t response_pl_sz;
>   	int rv;
>   	void (*complete)(struct pci_doe_task *task);
Lukas Wunner Feb. 28, 2023, 8:24 a.m. UTC | #7
On Tue, Feb 28, 2023 at 01:53:46PM +1100, Alexey Kardashevskiy wrote:
> On 11/2/23 07:25, Lukas Wunner wrote:
> > @@ -493,8 +493,8 @@ static void cxl_doe_task_complete(struct pci_doe_task *task)
> >   }
> >   struct cdat_doe_task {
> > -	u32 request_pl;
> > -	u32 response_pl[32];
> > +	__le32 request_pl;
> > +	__le32 response_pl[32];
> 
> This is ok as it is a binary format of DOE message (is it?)...

Yes, the DOE request and response is an opaque byte stream.

The above-quoted type changes are necessary to avoid sparse warnings
(as noted in the commit message).


> > --- a/drivers/pci/doe.c
> > +++ b/drivers/pci/doe.c
> > @@ -143,7 +143,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
> >   					  length));
> >   	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
> >   		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
> > -				       task->request_pl[i]);
> > +				       le32_to_cpu(task->request_pl[i]));
> 
> Does it really work on BE? My little brain explodes on all these convertions
> 
> char buf[] = { 1, 2, 3, 4 }
> u32 *request_pl = buf;
> 
> request_pl[0] will be 0x01020304.
> le32_to_cpu(request_pl[0]) will be 0x04030201
> And then pci_write_config_dword() will do another swap.
> 
> Did I miss something? (/me is gone bringing up a BE system).

Correct.


> > @@ -45,9 +49,9 @@ struct pci_doe_mb;
> >    */
> >   struct pci_doe_task {
> >   	struct pci_doe_protocol prot;
> > -	u32 *request_pl;
> > +	__le32 *request_pl;
> >   	size_t request_pl_sz;
> > -	u32 *response_pl;
> > +	__le32 *response_pl;
> 
> 
> This does not look right. Either:
> - pci_doe() should also take __le32* or
> - pci_doe() should do cpu_to_le32() for the request, and
> pci_doe_task_complete() - for the response.

Again, the type changes are necessary to avoid sparse warnings.

pci_doe() takes a void * because patch [15/16] eliminates the need
for the request and response size to be a multiple of dwords.
Passing __le32 * to pci_doe() would then no longer be correct.

Thanks,

Lukas
Alexey Kardashevskiy Feb. 28, 2023, 12:08 p.m. UTC | #8
On 28/2/23 19:24, Lukas Wunner wrote:
> On Tue, Feb 28, 2023 at 01:53:46PM +1100, Alexey Kardashevskiy wrote:
>> On 11/2/23 07:25, Lukas Wunner wrote:
>>> @@ -493,8 +493,8 @@ static void cxl_doe_task_complete(struct pci_doe_task *task)
>>>    }
>>>    struct cdat_doe_task {
>>> -	u32 request_pl;
>>> -	u32 response_pl[32];
>>> +	__le32 request_pl;
>>> +	__le32 response_pl[32];
>>
>> This is ok as it is a binary format of DOE message (is it?)...
> 
> Yes, the DOE request and response is an opaque byte stream.
> 
> The above-quoted type changes are necessary to avoid sparse warnings
> (as noted in the commit message).
> 
> 
>>> --- a/drivers/pci/doe.c
>>> +++ b/drivers/pci/doe.c
>>> @@ -143,7 +143,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
>>>    					  length));
>>>    	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
>>>    		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
>>> -				       task->request_pl[i]);
>>> +				       le32_to_cpu(task->request_pl[i]));
>>
>> Does it really work on BE? My little brain explodes on all these convertions
>>
>> char buf[] = { 1, 2, 3, 4 }
>> u32 *request_pl = buf;
>>
>> request_pl[0] will be 0x01020304.
>> le32_to_cpu(request_pl[0]) will be 0x04030201
>> And then pci_write_config_dword() will do another swap.
>>
>> Did I miss something? (/me is gone bringing up a BE system).
> 
> Correct.

Uff, ok, your code works correct, sorry for the noise.


> 
>>> @@ -45,9 +49,9 @@ struct pci_doe_mb;
>>>     */
>>>    struct pci_doe_task {
>>>    	struct pci_doe_protocol prot;
>>> -	u32 *request_pl;
>>> +	__le32 *request_pl;
>>>    	size_t request_pl_sz;
>>> -	u32 *response_pl;
>>> +	__le32 *response_pl;
>>
>>
>> This does not look right. Either:
>> - pci_doe() should also take __le32* or
>> - pci_doe() should do cpu_to_le32() for the request, and
>> pci_doe_task_complete() - for the response.
> 
> Again, the type changes are necessary to avoid sparse warnings.

I'd shut it up like this:

===
          for (i = 0; i < task->request_pl_sz / sizeof(__le32); i++)
                 pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
-                                      le32_to_cpu(task->request_pl[i]));
+                                      le32_to_cpu(*(__le32 
*)&task->request_pl[i]));
===

+ may be a comment that LE is a natural order. Dunno.

Changing types to __le32 without explicit conversions or comments just 
confuses. Thanks,


> pci_doe() takes a void * because patch [15/16] eliminates the need
> for the request and response size to be a multiple of dwords.
> Passing __le32 * to pci_doe() would then no longer be correct.
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 57764e9cd19d..d3cf1d9d67d4 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -480,7 +480,7 @@  static struct pci_doe_mb *find_cdat_doe(struct device *uport)
 	return NULL;
 }
 
-#define CDAT_DOE_REQ(entry_handle)					\
+#define CDAT_DOE_REQ(entry_handle) cpu_to_le32				\
 	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
 		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
 	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
@@ -493,8 +493,8 @@  static void cxl_doe_task_complete(struct pci_doe_task *task)
 }
 
 struct cdat_doe_task {
-	u32 request_pl;
-	u32 response_pl[32];
+	__le32 request_pl;
+	__le32 response_pl[32];
 	struct completion c;
 	struct pci_doe_task task;
 };
@@ -531,7 +531,7 @@  static int cxl_cdat_get_length(struct device *dev,
 	if (t.task.rv < sizeof(u32))
 		return -EIO;
 
-	*length = t.response_pl[1];
+	*length = le32_to_cpu(t.response_pl[1]);
 	dev_dbg(dev, "CDAT length %zu\n", *length);
 
 	return 0;
@@ -548,7 +548,7 @@  static int cxl_cdat_read_table(struct device *dev,
 	do {
 		DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t);
 		size_t entry_dw;
-		u32 *entry;
+		__le32 *entry;
 		int rc;
 
 		rc = pci_doe_submit_task(cdat_doe, &t.task);
@@ -563,7 +563,7 @@  static int cxl_cdat_read_table(struct device *dev,
 
 		/* Get the CXL table access header entry handle */
 		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
-					 t.response_pl[0]);
+					 le32_to_cpu(t.response_pl[0]));
 		entry = t.response_pl + 1;
 		entry_dw = t.task.rv / sizeof(u32);
 		/* Skip Header */
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index 66d9ab288646..69efa9a250b9 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -143,7 +143,7 @@  static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
 					  length));
 	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
 		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
-				       task->request_pl[i]);
+				       le32_to_cpu(task->request_pl[i]));
 
 	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
 
@@ -198,8 +198,8 @@  static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas
 	payload_length = min(length, task->response_pl_sz / sizeof(u32));
 	/* Read the rest of the response payload */
 	for (i = 0; i < payload_length; i++) {
-		pci_read_config_dword(pdev, offset + PCI_DOE_READ,
-				      &task->response_pl[i]);
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+		task->response_pl[i] = cpu_to_le32(val);
 		/* Prior to the last ack, ensure Data Object Ready */
 		if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb))
 			return -EIO;
@@ -322,15 +322,17 @@  static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
 	struct pci_doe_task task = {
 		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
 		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
-		.request_pl = &request_pl,
+		.request_pl = (__le32 *)&request_pl,
 		.request_pl_sz = sizeof(request_pl),
-		.response_pl = &response_pl,
+		.response_pl = (__le32 *)&response_pl,
 		.response_pl_sz = sizeof(response_pl),
 		.complete = pci_doe_task_complete,
 		.private = &c,
 	};
 	int rc;
 
+	cpu_to_le32s(&request_pl);
+
 	rc = pci_doe_submit_task(doe_mb, &task);
 	if (rc < 0)
 		return rc;
@@ -340,6 +342,7 @@  static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
 	if (task.rv != sizeof(response_pl))
 		return -EIO;
 
+	le32_to_cpus(&response_pl);
 	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
 	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
 			      response_pl);
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index ed9b4df792b8..43765eaf2342 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -34,6 +34,10 @@  struct pci_doe_mb;
  * @work: Used internally by the mailbox
  * @doe_mb: Used internally by the mailbox
  *
+ * Payloads are treated as opaque byte streams which are transmitted verbatim,
+ * without byte-swapping.  If payloads contain little-endian register values,
+ * the caller is responsible for conversion with cpu_to_le32() / le32_to_cpu().
+ *
  * The payload sizes and rv are specified in bytes with the following
  * restrictions concerning the protocol.
  *
@@ -45,9 +49,9 @@  struct pci_doe_mb;
  */
 struct pci_doe_task {
 	struct pci_doe_protocol prot;
-	u32 *request_pl;
+	__le32 *request_pl;
 	size_t request_pl_sz;
-	u32 *response_pl;
+	__le32 *response_pl;
 	size_t response_pl_sz;
 	int rv;
 	void (*complete)(struct pci_doe_task *task);