diff mbox series

[4/5] cxl/mem: Add CDAT table reading from DOE

Message ID 20211105235056.3711389-5-ira.weiny@intel.com
State New, archived
Headers show
Series CXL: Read CDAT and DSMAS data from the device. | expand

Commit Message

Ira Weiny Nov. 5, 2021, 11:50 p.m. UTC
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Read CDAT raw table data from the cxl_mem state object.  Currently this
is only supported by a PCI CXL object through a DOE mailbox which supports
CDAT.  But any cxl_mem type object can provide this data later if need
be.  For example for testing.

Cache this data for later parsing.  Provide a sysfs binary attribute to
allow dumping of the CDAT.

Binary dumping is modeled on /sys/firmware/ACPI/tables/

The ability to dump this table will be very useful for emulation of real
devices once they become available as QEMU CXL type 3 device emulation will
be able to load this file in.

This does not support table updates at runtime. It will always provide
whatever was there when first cached. Handling of table updates can be
implemented later.

Once there are more users, this code can move out to driver/cxl/cdat.c
or similar.

Finally create a complete list of DOE defines within cdat.h for anyone
wishing to decode the CDAT table.

Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

---
Changes from V4:
	Split this into it's own patch
	Rearchitect this such that the memdev driver calls into the DOE
	driver via the cxl_mem state object.  This allows CDAT data to
	come from any type of cxl_mem object not just PCI DOE.
	Rebase on new struct cxl_dev_state
---
 drivers/cxl/cdat.h        | 81 +++++++++++++++++++++++++++++++++
 drivers/cxl/core/memdev.c | 46 +++++++++++++++++++
 drivers/cxl/cxl.h         |  7 +++
 drivers/cxl/cxlmem.h      | 25 +++++++++++
 drivers/cxl/pci.c         | 94 ++++++++++++++++++++++++++++++++++++++-
 5 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 drivers/cxl/cdat.h

Comments

Jonathan Cameron Nov. 8, 2021, 1:21 p.m. UTC | #1
On Fri, 5 Nov 2021 16:50:55 -0700
<ira.weiny@intel.com> wrote:

> From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> Read CDAT raw table data from the cxl_mem state object.  Currently this
> is only supported by a PCI CXL object through a DOE mailbox which supports
> CDAT.  But any cxl_mem type object can provide this data later if need
> be.  For example for testing.
> 
> Cache this data for later parsing.  Provide a sysfs binary attribute to
> allow dumping of the CDAT.
> 
> Binary dumping is modeled on /sys/firmware/ACPI/tables/
> 
> The ability to dump this table will be very useful for emulation of real
> devices once they become available as QEMU CXL type 3 device emulation will
> be able to load this file in.
> 
> This does not support table updates at runtime. It will always provide
> whatever was there when first cached. Handling of table updates can be
> implemented later.
> 
> Once there are more users, this code can move out to driver/cxl/cdat.c
> or similar.
> 
> Finally create a complete list of DOE defines within cdat.h for anyone
> wishing to decode the CDAT table.
> 
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Hi Ira,

A few other comments inline, some of which are really updates of
earlier comments now I see how it is fitting together.

Jonathan

> 
> ---
> Changes from V4:
> 	Split this into it's own patch
> 	Rearchitect this such that the memdev driver calls into the DOE
> 	driver via the cxl_mem state object.  This allows CDAT data to
> 	come from any type of cxl_mem object not just PCI DOE.

Ah.  Is this to allow mocking? Or is there another architected source
of this information that I've missed?

> 	Rebase on new struct cxl_dev_state
> ---

...

> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index df524b74f1d2..086532a42480 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -11,6 +11,7 @@
>  #include "cxlmem.h"
>  #include "pci.h"
>  #include "cxl.h"
> +#include "cdat.h"
>  
>  /**
>   * DOC: cxl pci
> @@ -575,17 +576,106 @@ static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
>  		if (rc)
>  			return rc;
>  
> -		if (device_attach(&adev->dev) != 1)
> +		if (device_attach(&adev->dev) != 1) {
>  			dev_err(&adev->dev,
>  				"Failed to attach a driver to DOE device %d\n",
>  				adev->id);
> +			goto next;
> +		}
> +
> +		if (pci_doe_supports_prot(new_dev, PCI_DVSEC_VENDOR_ID_CXL,
> +					  CXL_DOE_PROTOCOL_TABLE_ACCESS))
> +			cxlds->cdat_doe = new_dev;

Ah. If we did try to make this block generic, we'd then need a look
up function to call after the generic part.  I guess it is getting more
complex so maybe not having it generic is the right choice for now.

Also, this explains why you passed cxlds in.  So ignore that comment on
the previous.

>  
> +next:
>  		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
>  	}
>  
>  	return 0;
>  }
>  
> +#define CDAT_DOE_REQ(entry_handle)					\
> +	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
> +		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
> +	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
> +		    CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) |		\
> +	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle)))
> +
> +static int cxl_cdat_get_length(struct cxl_dev_state *cxlds, size_t *length)
> +{
> +	struct pci_doe_dev *doe_dev = cxlds->cdat_doe;
> +	u32 cdat_request_pl = CDAT_DOE_REQ(0);
> +	u32 cdat_response_pl[32];
> +	struct pci_doe_exchange ex = {
> +		.prot.vid = PCI_DVSEC_VENDOR_ID_CXL,
> +		.prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS,
> +		.request_pl = &cdat_request_pl,
> +		.request_pl_sz = sizeof(cdat_request_pl),
> +		.response_pl = cdat_response_pl,
> +		.response_pl_sz = sizeof(cdat_response_pl),
> +	};
> +
> +	ssize_t rc;
> +
> +	rc = pci_doe_exchange_sync(doe_dev, &ex);
> +	if (rc < 0)
> +		return rc;
> +	if (rc < 1)
> +		return -EIO;
> +
> +	*length = cdat_response_pl[1];
> +	dev_dbg(cxlds->dev, "CDAT length %zu\n", *length);

Probably not useful any more... 

> +	return 0;
> +}
> +
Jonathan Cameron Nov. 8, 2021, 3:02 p.m. UTC | #2
On Fri, 5 Nov 2021 16:50:55 -0700
<ira.weiny@intel.com> wrote:

> From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> Read CDAT raw table data from the cxl_mem state object.  Currently this
> is only supported by a PCI CXL object through a DOE mailbox which supports
> CDAT.  But any cxl_mem type object can provide this data later if need
> be.  For example for testing.
> 
> Cache this data for later parsing.  Provide a sysfs binary attribute to
> allow dumping of the CDAT.
> 
> Binary dumping is modeled on /sys/firmware/ACPI/tables/
> 
> The ability to dump this table will be very useful for emulation of real
> devices once they become available as QEMU CXL type 3 device emulation will
> be able to load this file in.
> 
> This does not support table updates at runtime. It will always provide
> whatever was there when first cached. Handling of table updates can be
> implemented later.
> 
> Once there are more users, this code can move out to driver/cxl/cdat.c
> or similar.
> 
> Finally create a complete list of DOE defines within cdat.h for anyone
> wishing to decode the CDAT table.
> 
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

A few more things came to mind whilst reading the rest of the series. In particular
lifetime management for the doe structures.

>   * DOC: cxl pci
> @@ -575,17 +576,106 @@ static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
>  		if (rc)
>  			return rc;
>  
> -		if (device_attach(&adev->dev) != 1)
> +		if (device_attach(&adev->dev) != 1) {
>  			dev_err(&adev->dev,
>  				"Failed to attach a driver to DOE device %d\n",
>  				adev->id);
> +			goto next;
> +		}
> +
> +		if (pci_doe_supports_prot(new_dev, PCI_DVSEC_VENDOR_ID_CXL,
> +					  CXL_DOE_PROTOCOL_TABLE_ACCESS))
> +			cxlds->cdat_doe = new_dev;

I'm probably missing something, but what prevents new_dev from going away after
this assignment?  Perhaps a force unbind or driver removal.  Should we get a
reference?

Also it's possible we'll have multiple CDAT supporting DOEs so
I'd suggest checking if cxlds->cdata_doe is already set before setting it.

We could break out of the loop early, but I want to bolt the CMA doe detection
in there so I'd rather we didn't.  This is all subject to whether we attempt
to generalize this support and move it over to the PCI side of things.

>  
> +next:
>  		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
>  	}
>  
>  	return 0;
>  }
>
Ira Weiny Nov. 8, 2021, 10:25 p.m. UTC | #3
On Mon, Nov 08, 2021 at 03:02:36PM +0000, Jonathan Cameron wrote:
> On Fri, 5 Nov 2021 16:50:55 -0700
> <ira.weiny@intel.com> wrote:
> 
> > From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > 

[snip]

> 
> A few more things came to mind whilst reading the rest of the series. In particular
> lifetime management for the doe structures.

Thanks for the review I'm just working through all the comments and so I'm
somewhat working backwards.

> 
> >   * DOC: cxl pci
> > @@ -575,17 +576,106 @@ static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
> >  		if (rc)
> >  			return rc;
> >  
> > -		if (device_attach(&adev->dev) != 1)
> > +		if (device_attach(&adev->dev) != 1) {
> >  			dev_err(&adev->dev,
> >  				"Failed to attach a driver to DOE device %d\n",
> >  				adev->id);
> > +			goto next;
> > +		}
> > +
> > +		if (pci_doe_supports_prot(new_dev, PCI_DVSEC_VENDOR_ID_CXL,
> > +					  CXL_DOE_PROTOCOL_TABLE_ACCESS))
> > +			cxlds->cdat_doe = new_dev;
> 
> I'm probably missing something, but what prevents new_dev from going away after
> this assignment?
> Perhaps a force unbind or driver removal.  Should we get a
> reference?

I had a get_device() here at one point but took it out...  Because I was
thinking that new_dev's lifetime was equal to cxlds because cxlds 'owned' the
DOE devices.  However this is totally not true.  And there is a race between
the device going away and cxlds going away which could be a problem.

> 
> Also it's possible we'll have multiple CDAT supporting DOEs so
> I'd suggest checking if cxlds->cdata_doe is already set before setting it.

Sure.

> 
> We could break out of the loop early, but I want to bolt the CMA doe detection
> in there so I'd rather we didn't.  This is all subject to whether we attempt
> to generalize this support and move it over to the PCI side of things.

I'm not 100% sure about moving it to the PCI side but it does make some sense
because really the auxiliary devices are only bounded by the PCI device being
available.  None of the CXL stuff needs to exist for the DOE driver to talk to
the device but the pdev does need to be there...  :-/

This is all part of what drove the cxl_mem rename because that structure was
really confusing me.  Dan got me straightened out but I did not revisit this
series after that.  Now off the top of my head I'm not sure that cxlds needs to
be involved in the auxiliary device creation.  OTOH I was making it a central
place for in kernel users to know where/how to get information from DOE
mailboxes.  Hence caching which of these devices had CDAT capability.[1]

Since you seem to have arrived at this conclusion before me where in the PCI
code do you think is appropriate for this?

Ira

[1] I'm not really sure what is going to happen if multiple DOE boxes have CDAT
capability.  This seems like a recipe for confusion.

> 
> >  
> > +next:
> >  		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> >  	}
> >  
> >  	return 0;
> >  }
> >
Ira Weiny Nov. 8, 2021, 11:19 p.m. UTC | #4
On Mon, Nov 08, 2021 at 01:21:51PM +0000, Jonathan Cameron wrote:
> On Fri, 5 Nov 2021 16:50:55 -0700
> <ira.weiny@intel.com> wrote:
> 
> > From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > 
> > Read CDAT raw table data from the cxl_mem state object.  Currently this
> > is only supported by a PCI CXL object through a DOE mailbox which supports
> > CDAT.  But any cxl_mem type object can provide this data later if need
> > be.  For example for testing.
> > 
> > Cache this data for later parsing.  Provide a sysfs binary attribute to
> > allow dumping of the CDAT.
> > 
> > Binary dumping is modeled on /sys/firmware/ACPI/tables/
> > 
> > The ability to dump this table will be very useful for emulation of real
> > devices once they become available as QEMU CXL type 3 device emulation will
> > be able to load this file in.
> > 
> > This does not support table updates at runtime. It will always provide
> > whatever was there when first cached. Handling of table updates can be
> > implemented later.
> > 
> > Once there are more users, this code can move out to driver/cxl/cdat.c
> > or similar.
> > 
> > Finally create a complete list of DOE defines within cdat.h for anyone
> > wishing to decode the CDAT table.
> > 
> > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> Hi Ira,
> 
> A few other comments inline, some of which are really updates of
> earlier comments now I see how it is fitting together.
> 
> Jonathan
> 
> > 
> > ---
> > Changes from V4:
> > 	Split this into it's own patch
> > 	Rearchitect this such that the memdev driver calls into the DOE
> > 	driver via the cxl_mem state object.  This allows CDAT data to
> > 	come from any type of cxl_mem object not just PCI DOE.
> 
> Ah.  Is this to allow mocking? Or is there another architected source
> of this information that I've missed?

Right now yes as the testing stuff could mock this.  But I did not plumb that
up yet.

> 
> > 	Rebase on new struct cxl_dev_state
> > ---
> 
> ...
> 
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index df524b74f1d2..086532a42480 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -11,6 +11,7 @@
> >  #include "cxlmem.h"
> >  #include "pci.h"
> >  #include "cxl.h"
> > +#include "cdat.h"
> >  
> >  /**
> >   * DOC: cxl pci
> > @@ -575,17 +576,106 @@ static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
> >  		if (rc)
> >  			return rc;
> >  
> > -		if (device_attach(&adev->dev) != 1)
> > +		if (device_attach(&adev->dev) != 1) {
> >  			dev_err(&adev->dev,
> >  				"Failed to attach a driver to DOE device %d\n",
> >  				adev->id);
> > +			goto next;
> > +		}
> > +
> > +		if (pci_doe_supports_prot(new_dev, PCI_DVSEC_VENDOR_ID_CXL,
> > +					  CXL_DOE_PROTOCOL_TABLE_ACCESS))
> > +			cxlds->cdat_doe = new_dev;
> 
> Ah. If we did try to make this block generic, we'd then need a look
> up function to call after the generic part.  I guess it is getting more
> complex so maybe not having it generic is the right choice for now.

It is complex and I'm still concerned about getting the driver attached such
that this works.  But the MODULE_SOFTDEP should take care of this.

> 
> Also, this explains why you passed cxlds in.  So ignore that comment on
> the previous.

Yea.

> 
> >  
> > +next:
> >  		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> >  	}
> >  
> >  	return 0;
> >  }
> >  
> > +#define CDAT_DOE_REQ(entry_handle)					\
> > +	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
> > +		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
> > +	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
> > +		    CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) |		\
> > +	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle)))
> > +
> > +static int cxl_cdat_get_length(struct cxl_dev_state *cxlds, size_t *length)
> > +{
> > +	struct pci_doe_dev *doe_dev = cxlds->cdat_doe;
> > +	u32 cdat_request_pl = CDAT_DOE_REQ(0);
> > +	u32 cdat_response_pl[32];
> > +	struct pci_doe_exchange ex = {
> > +		.prot.vid = PCI_DVSEC_VENDOR_ID_CXL,
> > +		.prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS,
> > +		.request_pl = &cdat_request_pl,
> > +		.request_pl_sz = sizeof(cdat_request_pl),
> > +		.response_pl = cdat_response_pl,
> > +		.response_pl_sz = sizeof(cdat_response_pl),
> > +	};
> > +
> > +	ssize_t rc;
> > +
> > +	rc = pci_doe_exchange_sync(doe_dev, &ex);
> > +	if (rc < 0)
> > +		return rc;
> > +	if (rc < 1)
> > +		return -EIO;
> > +
> > +	*length = cdat_response_pl[1];
> > +	dev_dbg(cxlds->dev, "CDAT length %zu\n", *length);
> 
> Probably not useful any more... 

yea. removed.

Ira

> 
> > +	return 0;
> > +}
> > +
>
Jonathan Cameron Nov. 9, 2021, 11:09 a.m. UTC | #5
...

> > We could break out of the loop early, but I want to bolt the CMA doe detection
> > in there so I'd rather we didn't.  This is all subject to whether we attempt
> > to generalize this support and move it over to the PCI side of things.  
> 
> I'm not 100% sure about moving it to the PCI side but it does make some sense
> because really the auxiliary devices are only bounded by the PCI device being
> available.  None of the CXL stuff needs to exist for the DOE driver to talk to
> the device but the pdev does need to be there...  :-/

This will become more relevant with CMA etc on top of this series as that
is not CXL specific, so definitely shouldn't live in here.

> 
> This is all part of what drove the cxl_mem rename because that structure was
> really confusing me.  Dan got me straightened out but I did not revisit this
> series after that.  Now off the top of my head I'm not sure that cxlds needs to
> be involved in the auxiliary device creation.  OTOH I was making it a central
> place for in kernel users to know where/how to get information from DOE
> mailboxes.  Hence caching which of these devices had CDAT capability.[1]
Caching a particular instance makes sense (with a reference taken).

I'd expect something similar to the divide between
pci_alloc_irq_vectors() which enumerates them in the pci core, and
actually getting for a particular instance with request_irq()

So maybe
pci_alloc_doe_instances() which adds the auxiliary devices to the bus.

and

pci_doe_get(vendor_id, protcol_id);
with the _get() implemented using auxilliary_find_device() with
appropriate match function.


> 
> Since you seem to have arrived at this conclusion before me where in the PCI
> code do you think is appropriate for this?

I'm not sure to be honest.  Given the dependency on MSI/MSIX it may be that the best
we can do is to provide some utility functions for the auxiliary device
creation and then every driver for devices with a DOE would need to call
them manually.  As this isn't dependent on the DOE driver, it would need
to be tied to the PCI core rather than that, possibly stubbed if
PCI_DOE isn't built.

> 
> Ira
> 
> [1] I'm not really sure what is going to happen if multiple DOE boxes have CDAT
> capability.  This seems like a recipe for confusion.

They will all report the same thing so just use the first one.
I can't really think why someone would do this deliberately but I can conceive of
people deciding to support multiple because they have a sneaky firmware running
somewhere and they want to avoid mediating between that and the OS. Mind you
that needs something to indicate to the OS which one it is which is still
an open problem.

Jonathan

> 
> >   
> > >  
> > > +next:
> > >  		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> > >  	}
> > >  
> > >  	return 0;
> > >  }
> > >
Jonathan Cameron Nov. 19, 2021, 2:40 p.m. UTC | #6
On Fri, 5 Nov 2021 16:50:55 -0700
<ira.weiny@intel.com> wrote:

> From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> Read CDAT raw table data from the cxl_mem state object.  Currently this
> is only supported by a PCI CXL object through a DOE mailbox which supports
> CDAT.  But any cxl_mem type object can provide this data later if need
> be.  For example for testing.
> 
> Cache this data for later parsing.  Provide a sysfs binary attribute to
> allow dumping of the CDAT.
> 
> Binary dumping is modeled on /sys/firmware/ACPI/tables/
> 
> The ability to dump this table will be very useful for emulation of real
> devices once they become available as QEMU CXL type 3 device emulation will
> be able to load this file in.
> 
> This does not support table updates at runtime. It will always provide
> whatever was there when first cached. Handling of table updates can be
> implemented later.
> 
> Once there are more users, this code can move out to driver/cxl/cdat.c
> or similar.
> 
> Finally create a complete list of DOE defines within cdat.h for anyone
> wishing to decode the CDAT table.
> 
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 

>  
>  static struct attribute_group cxl_memdev_ram_attribute_group = {
> @@ -293,6 +329,16 @@ devm_cxl_add_memdev(struct cxl_dev_state *cxlds)
>  	if (rc)
>  		goto err;
>  
> +	/* Cache the data early to ensure is_visible() works */
> +	if (!cxl_mem_cdat_get_length(cxlds, &cxlmd->cdat_length)) {
> +		cxlmd->cdat_table = devm_kzalloc(dev, cxlmd->cdat_length, GFP_KERNEL);

I think this devm_ call should be using the parent device, not this one.

As it stands it breaks Ben's mem.c driver which probes for this device
and fails because you can't call probe for a device that already has things
in it's devres queue.

Too many patches in flight at the same time makes for some entertaining
rebases if you want them all at once..

Jonathan

> +		if (!cxlmd->cdat_table) {
> +			rc = -ENOMEM;
> +			goto err;
> +		}
> +		cxl_mem_cdat_read_table(cxlds, cxlmd->cdat_table, cxlmd->cdat_length);
> +	}
> +
>  	/*
>  	 * Activate ioctl operations, no cxl_memdev_rwsem manipulation
>  	 * needed as this is ordered with cdev_add() publishing the device.
diff mbox series

Patch

diff --git a/drivers/cxl/cdat.h b/drivers/cxl/cdat.h
new file mode 100644
index 000000000000..ee78eb822166
--- /dev/null
+++ b/drivers/cxl/cdat.h
@@ -0,0 +1,81 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Coherent Device Attribute table (CDAT)
+ *
+ * Specification available from UEFI.org
+ *
+ * Whilst CDAT is defined as a single table, the access via DOE maiboxes is
+ * done one entry at a time, where the first entry is the header.
+ */
+
+#define CXL_DOE_TABLE_ACCESS_REQ_CODE		0x000000ff
+#define   CXL_DOE_TABLE_ACCESS_REQ_CODE_READ	0
+#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE		0x0000ff00
+#define   CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA	0
+#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE	0xffff0000
+
+/*
+ * CDAT entries are little endian and are read from PCI config space which
+ * is also little endian.
+ * As such, on a big endian system these will have been reversed.
+ * This prevents us from making easy use of packed structures.
+ * Style form pci_regs.h
+ */
+
+#define CDAT_HEADER_LENGTH_DW 4
+#define CDAT_HEADER_LENGTH_BYTES (CDAT_HEADER_LENGTH_DW * sizeof(u32))
+#define CDAT_HEADER_DW0_LENGTH		0xffffffff
+#define CDAT_HEADER_DW1_REVISION	0x000000ff
+#define CDAT_HEADER_DW1_CHECKSUM	0x0000ff00
+/* CDAT_HEADER_DW2_RESERVED	*/
+#define CDAT_HEADER_DW3_SEQUENCE	0xffffffff
+
+/* All structures have a common first DW */
+#define CDAT_STRUCTURE_DW0_TYPE		0x000000ff
+#define   CDAT_STRUCTURE_DW0_TYPE_DSMAS 0
+#define   CDAT_STRUCTURE_DW0_TYPE_DSLBIS 1
+#define   CDAT_STRUCTURE_DW0_TYPE_DSMSCIS 2
+#define   CDAT_STRUCTURE_DW0_TYPE_DSIS 3
+#define   CDAT_STRUCTURE_DW0_TYPE_DSEMTS 4
+#define   CDAT_STRUCTURE_DW0_TYPE_SSLBIS 5
+
+#define CDAT_STRUCTURE_DW0_LENGTH	0xffff0000
+
+/* Device Scoped Memory Affinity Structure */
+#define CDAT_DSMAS_DW1_DSMAD_HANDLE	0x000000ff
+#define CDAT_DSMAS_DW1_FLAGS		0x0000ff00
+#define CDAT_DSMAS_DPA_OFFSET(entry) ((u64)((entry)[3]) << 32 | (entry)[2])
+#define CDAT_DSMAS_DPA_LEN(entry) ((u64)((entry)[5]) << 32 | (entry)[4])
+#define CDAT_DSMAS_NON_VOLATILE(flags)  ((flags & 0x04) >> 2)
+
+/* Device Scoped Latency and Bandwidth Information Structure */
+#define CDAT_DSLBIS_DW1_HANDLE		0x000000ff
+#define CDAT_DSLBIS_DW1_FLAGS		0x0000ff00
+#define CDAT_DSLBIS_DW1_DATA_TYPE	0x00ff0000
+#define CDAT_DSLBIS_BASE_UNIT(entry) ((u64)((entry)[3]) << 32 | (entry)[2])
+#define CDAT_DSLBIS_DW4_ENTRY_0		0x0000ffff
+#define CDAT_DSLBIS_DW4_ENTRY_1		0xffff0000
+#define CDAT_DSLBIS_DW5_ENTRY_2		0x0000ffff
+
+/* Device Scoped Memory Side Cache Information Structure */
+#define CDAT_DSMSCIS_DW1_HANDLE		0x000000ff
+#define CDAT_DSMSCIS_MEMORY_SIDE_CACHE_SIZE(entry) \
+	((u64)((entry)[3]) << 32 | (entry)[2])
+#define CDAT_DSMSCIS_DW4_MEMORY_SIDE_CACHE_ATTRS 0xffffffff
+
+/* Device Scoped Initiator Structure */
+#define CDAT_DSIS_DW1_FLAGS		0x000000ff
+#define CDAT_DSIS_DW1_HANDLE		0x0000ff00
+
+/* Device Scoped EFI Memory Type Structure */
+#define CDAT_DSEMTS_DW1_HANDLE		0x000000ff
+#define CDAT_DSEMTS_DW1_EFI_MEMORY_TYPE_ATTR	0x0000ff00
+#define CDAT_DSEMTS_DPA_OFFSET(entry)	((u64)((entry)[3]) << 32 | (entry)[2])
+#define CDAT_DSEMTS_DPA_LENGTH(entry)	((u64)((entry)[5]) << 32 | (entry)[4])
+
+/* Switch Scoped Latency and Bandwidth Information Structure */
+#define CDAT_SSLBIS_DW1_DATA_TYPE	0x000000ff
+#define CDAT_SSLBIS_BASE_UNIT(entry)	((u64)((entry)[3]) << 32 | (entry)[2])
+#define CDAT_SSLBIS_ENTRY_PORT_X(entry, i) ((entry)[4 + (i) * 2] & 0x0000ffff)
+#define CDAT_SSLBIS_ENTRY_PORT_Y(entry, i) (((entry)[4 + (i) * 2] & 0xffff0000) >> 16)
+#define CDAT_SSLBIS_ENTRY_LAT_OR_BW(entry, i) ((entry)[4 + (i) * 2 + 1] & 0x0000ffff)
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 5341b0ba99a7..c35de9e8298e 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -86,6 +86,35 @@  static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
 	return sysfs_emit(buf, "%#llx\n", len);
 }
 
+static ssize_t CDAT_read(struct file *filp, struct kobject *kobj,
+			 struct bin_attribute *bin_attr, char *buf,
+			 loff_t offset, size_t count)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+	if (!cxlmd->cdat_table)
+		return 0;
+
+	return memory_read_from_buffer(buf, count, &offset,
+				       cxlmd->cdat_table,
+				       cxlmd->cdat_length);
+}
+
+static BIN_ATTR_RO(CDAT, 0);
+
+static umode_t cxl_memdev_bin_attr_is_visible(struct kobject *kobj,
+					      struct bin_attribute *attr, int i)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+	if ((attr == &bin_attr_CDAT) && cxlmd->cdat_table)
+		return 0400;
+
+	return 0;
+}
+
 static struct device_attribute dev_attr_pmem_size =
 	__ATTR(size, 0444, pmem_size_show, NULL);
 
@@ -96,6 +125,11 @@  static struct attribute *cxl_memdev_attributes[] = {
 	NULL,
 };
 
+static struct bin_attribute *cxl_memdev_bin_attributes[] = {
+	&bin_attr_CDAT,
+	NULL,
+};
+
 static struct attribute *cxl_memdev_pmem_attributes[] = {
 	&dev_attr_pmem_size.attr,
 	NULL,
@@ -108,6 +142,8 @@  static struct attribute *cxl_memdev_ram_attributes[] = {
 
 static struct attribute_group cxl_memdev_attribute_group = {
 	.attrs = cxl_memdev_attributes,
+	.bin_attrs = cxl_memdev_bin_attributes,
+	.is_bin_visible = cxl_memdev_bin_attr_is_visible,
 };
 
 static struct attribute_group cxl_memdev_ram_attribute_group = {
@@ -293,6 +329,16 @@  devm_cxl_add_memdev(struct cxl_dev_state *cxlds)
 	if (rc)
 		goto err;
 
+	/* Cache the data early to ensure is_visible() works */
+	if (!cxl_mem_cdat_get_length(cxlds, &cxlmd->cdat_length)) {
+		cxlmd->cdat_table = devm_kzalloc(dev, cxlmd->cdat_length, GFP_KERNEL);
+		if (!cxlmd->cdat_table) {
+			rc = -ENOMEM;
+			goto err;
+		}
+		cxl_mem_cdat_read_table(cxlds, cxlmd->cdat_table, cxlmd->cdat_length);
+	}
+
 	/*
 	 * Activate ioctl operations, no cxl_memdev_rwsem manipulation
 	 * needed as this is ordered with cdev_add() publishing the device.
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index f1241a7f2b7b..f5dd38c6ce0f 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -88,6 +88,13 @@  static inline int cxl_hdm_decoder_count(u32 cap_hdr)
 #define CXL_DOE_PROTOCOL_COMPLIANCE 0
 #define CXL_DOE_PROTOCOL_TABLE_ACCESS 2
 
+/* Common to request and response */
+#define CXL_DOE_TABLE_ACCESS_3_CODE GENMASK(7, 0)
+#define   CXL_DOE_TABLE_ACCESS_3_CODE_READ 0
+#define CXL_DOE_TABLE_ACCESS_3_TYPE GENMASK(15, 8)
+#define   CXL_DOE_TABLE_ACCESS_3_TYPE_CDAT 0
+#define CXL_DOE_TABLE_ACCESS_3_ENTRY_HANDLE GENMASK(31, 16)
+
 #define CXL_COMPONENT_REGS() \
 	void __iomem *hdm_decoder
 
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 8d96d009ad90..f6c62cd537bb 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -34,12 +34,16 @@ 
  * @dev: driver core device object
  * @cdev: char dev core object for ioctl operations
  * @cxlds: The device state backing this device
+ * @cdat_table: cache of CDAT table
+ * @cdat_length: length of cached CDAT table
  * @id: id number of this memdev instance.
  */
 struct cxl_memdev {
 	struct device dev;
 	struct cdev cdev;
 	struct cxl_dev_state *cxlds;
+	void *cdat_table;
+	size_t cdat_length;
 	int id;
 };
 
@@ -97,6 +101,7 @@  struct cxl_mbox_cmd {
  * Currently only memory devices are represented.
  *
  * @dev: The device associated with this CXL state
+ * @cdat_doe: Auxiliary DOE device capabile of reading CDAT
  * @regs: Parsed register blocks
  * @payload_size: Size of space for payload
  *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
@@ -117,6 +122,10 @@  struct cxl_mbox_cmd {
  * @next_volatile_bytes: volatile capacity change pending device reset
  * @next_persistent_bytes: persistent capacity change pending device reset
  * @mbox_send: @dev specific transport for transmitting mailbox commands
+ * @cdat_get_length: @dev specific function for reading the CDAT table length
+ *                   returns -errno if CDAT not supported on this device
+ * @cdat_read_table: @dev specific function for reading the table
+ *                   returns -errno if CDAT not supported on this device
  *
  * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
  * details on capacity parameters.
@@ -124,6 +133,7 @@  struct cxl_mbox_cmd {
 struct cxl_dev_state {
 	struct device *dev;
 
+	struct pci_doe_dev *cdat_doe;
 	struct cxl_regs regs;
 
 	size_t payload_size;
@@ -146,6 +156,8 @@  struct cxl_dev_state {
 	u64 next_persistent_bytes;
 
 	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
+	int (*cdat_get_length)(struct cxl_dev_state *cxlds, size_t *length);
+	int (*cdat_read_table)(struct cxl_dev_state *cxlds, u32 *data, size_t length);
 };
 
 enum cxl_opcode {
@@ -264,4 +276,17 @@  int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
 struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
 void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
 void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
+
+static inline int cxl_mem_cdat_get_length(struct cxl_dev_state *cxlds, size_t *length)
+{
+	if (cxlds->cdat_get_length)
+		return cxlds->cdat_get_length(cxlds, length);
+	return -EOPNOTSUPP;
+}
+static inline int cxl_mem_cdat_read_table(struct cxl_dev_state *cxlds, u32 *data, size_t length)
+{
+	if (cxlds->cdat_read_table)
+		return cxlds->cdat_read_table(cxlds, data, length);
+	return -EOPNOTSUPP;
+}
 #endif /* __CXL_MEM_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index df524b74f1d2..086532a42480 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -11,6 +11,7 @@ 
 #include "cxlmem.h"
 #include "pci.h"
 #include "cxl.h"
+#include "cdat.h"
 
 /**
  * DOC: cxl pci
@@ -575,17 +576,106 @@  static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
 		if (rc)
 			return rc;
 
-		if (device_attach(&adev->dev) != 1)
+		if (device_attach(&adev->dev) != 1) {
 			dev_err(&adev->dev,
 				"Failed to attach a driver to DOE device %d\n",
 				adev->id);
+			goto next;
+		}
+
+		if (pci_doe_supports_prot(new_dev, PCI_DVSEC_VENDOR_ID_CXL,
+					  CXL_DOE_PROTOCOL_TABLE_ACCESS))
+			cxlds->cdat_doe = new_dev;
 
+next:
 		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
 	}
 
 	return 0;
 }
 
+#define CDAT_DOE_REQ(entry_handle)					\
+	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
+		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
+	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
+		    CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) |		\
+	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle)))
+
+static int cxl_cdat_get_length(struct cxl_dev_state *cxlds, size_t *length)
+{
+	struct pci_doe_dev *doe_dev = cxlds->cdat_doe;
+	u32 cdat_request_pl = CDAT_DOE_REQ(0);
+	u32 cdat_response_pl[32];
+	struct pci_doe_exchange ex = {
+		.prot.vid = PCI_DVSEC_VENDOR_ID_CXL,
+		.prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS,
+		.request_pl = &cdat_request_pl,
+		.request_pl_sz = sizeof(cdat_request_pl),
+		.response_pl = cdat_response_pl,
+		.response_pl_sz = sizeof(cdat_response_pl),
+	};
+
+	ssize_t rc;
+
+	rc = pci_doe_exchange_sync(doe_dev, &ex);
+	if (rc < 0)
+		return rc;
+	if (rc < 1)
+		return -EIO;
+
+	*length = cdat_response_pl[1];
+	dev_dbg(cxlds->dev, "CDAT length %zu\n", *length);
+	return 0;
+}
+
+static int cxl_cdat_read_table(struct cxl_dev_state *cxlds, u32 *data, size_t length)
+{
+	struct pci_doe_dev *doe_dev = cxlds->cdat_doe;
+	int entry_handle = 0;
+	int rc;
+
+	do {
+		u32 cdat_request_pl = CDAT_DOE_REQ(entry_handle);
+		u32 cdat_response_pl[32];
+		struct pci_doe_exchange ex = {
+			.prot.vid = PCI_DVSEC_VENDOR_ID_CXL,
+			.prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS,
+			.request_pl = &cdat_request_pl,
+			.request_pl_sz = sizeof(cdat_request_pl),
+			.response_pl = cdat_response_pl,
+			.response_pl_sz = sizeof(cdat_response_pl),
+		};
+		size_t entry_dw;
+		u32 *entry;
+
+		rc = pci_doe_exchange_sync(doe_dev, &ex);
+		if (rc < 0)
+			return rc;
+
+		entry = cdat_response_pl + 1;
+		entry_dw = rc / sizeof(u32);
+		/* Skip Header */
+		entry_dw -= 1;
+		entry_dw = min(length / 4, entry_dw);
+		memcpy(data, entry, entry_dw * sizeof(u32));
+		length -= entry_dw * sizeof(u32);
+		data += entry_dw;
+		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, cdat_response_pl[0]);
+
+	} while (entry_handle != 0xFFFF);
+
+	return 0;
+}
+
+static void cxl_setup_cdat(struct cxl_dev_state *cxlds)
+{
+	if (!cxlds->cdat_doe)
+		return;
+
+	cxlds->cdat_get_length = cxl_cdat_get_length;
+	cxlds->cdat_read_table = cxl_cdat_read_table;
+}
+
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct cxl_register_map map;
@@ -636,6 +726,8 @@  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	cxl_setup_cdat(cxlds);
+
 	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);