diff mbox series

[v3,06/25] cxl/mem: Read dynamic capacity configuration from the device

Message ID 20240816-dcd-type2-upstream-v3-6-7c9b96cba6d7@intel.com (mailing list archive)
State New, archived
Headers show
Series DCD: Add support for Dynamic Capacity Devices (DCD) | expand

Commit Message

Ira Weiny Aug. 16, 2024, 2:44 p.m. UTC
From: Navneet Singh <navneet.singh@intel.com>

Devices which optionally support Dynamic Capacity (DC) are configured
via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
Configuration command in order to properly configure DCDs.  Without the
Get DC Configuration command DCD can't be supported.

Implement the DC mailbox commands as specified in CXL 3.1 section
8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
information.  Disable DCD if DCD is not supported.  Leverage the Get DC
Configuration command supported bit to indicate if DCD support.

Linux has no use for the trailing fields of the Get Dynamic Capacity
Configuration Output Payload (Total number of supported extents, number
of available extents, total number of supported tags, and number of
available tags).  Avoid defining those fields to use the more useful
dynamic C array.

Cc: "Li, Ming" <ming4.li@intel.com>
Signed-off-by: Navneet Singh <navneet.singh@intel.com>
Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes:
[Li, Ming: Fix bug in total_bytes calculation]
[iweiny: update commit message]
[Jonathan: fix formatting]
[Jonathan: Define block line size]
[Jonathan/Fan: use regions returned field instead of macro in get config]
[Jørgen: Rename memdev state range variables]
[Jonathan: adjust use of rc in cxl_dev_dynamic_capacity_identify()]
[Jonathan: white space cleanup]
[fan: make a comment about the trailing configuration output fields]
---
 drivers/cxl/core/mbox.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/cxl/cxlmem.h    |  64 +++++++++++++++++-
 drivers/cxl/pci.c       |   4 ++
 3 files changed, 237 insertions(+), 2 deletions(-)

Comments

Dave Jiang Aug. 16, 2024, 9:45 p.m. UTC | #1
On 8/16/24 7:44 AM, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Devices which optionally support Dynamic Capacity (DC) are configured
> via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
> Configuration command in order to properly configure DCDs.  Without the
> Get DC Configuration command DCD can't be supported.
> 
> Implement the DC mailbox commands as specified in CXL 3.1 section
> 8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
> information.  Disable DCD if DCD is not supported.  Leverage the Get DC
> Configuration command supported bit to indicate if DCD support.
> 
> Linux has no use for the trailing fields of the Get Dynamic Capacity
> Configuration Output Payload (Total number of supported extents, number
> of available extents, total number of supported tags, and number of
> available tags).  Avoid defining those fields to use the more useful
> dynamic C array.
> 
> Cc: "Li, Ming" <ming4.li@intel.com>
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes:
> [Li, Ming: Fix bug in total_bytes calculation]
> [iweiny: update commit message]
> [Jonathan: fix formatting]
> [Jonathan: Define block line size]
> [Jonathan/Fan: use regions returned field instead of macro in get config]
> [Jørgen: Rename memdev state range variables]
> [Jonathan: adjust use of rc in cxl_dev_dynamic_capacity_identify()]
> [Jonathan: white space cleanup]
> [fan: make a comment about the trailing configuration output fields]
> ---
>  drivers/cxl/core/mbox.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/cxl/cxlmem.h    |  64 +++++++++++++++++-
>  drivers/cxl/pci.c       |   4 ++
>  3 files changed, 237 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 8eb196858abe..68c26c4be91a 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1157,7 +1157,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>  	if (rc < 0)
>  		return rc;
>  
> -	mds->total_bytes =
> +	mds->static_bytes =
>  		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
>  	mds->volatile_only_bytes =
>  		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
> @@ -1264,6 +1264,159 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
>  	return rc;
>  }
>  
> +static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index,
> +				   struct cxl_dc_region_config *region_config)
> +{
> +	struct cxl_dc_region_info *dcr = &mds->dc_region[index];
> +	struct device *dev = mds->cxlds.dev;
> +
> +	dcr->base = le64_to_cpu(region_config->region_base);
> +	dcr->decode_len = le64_to_cpu(region_config->region_decode_length);
> +	dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> +	dcr->len = le64_to_cpu(region_config->region_length);
> +	dcr->blk_size = le64_to_cpu(region_config->region_block_size);
> +	dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle);
> +	dcr->flags = region_config->flags;
> +	snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index);
> +
> +	/* Check regions are in increasing DPA order */
> +	if (index > 0) {
> +		struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1];
> +
> +		if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) {
> +			dev_err(dev,
> +				"DPA ordering violation for DC region %d and %d\n",
> +				index - 1, index);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (!IS_ALIGNED(dcr->base, SZ_256M) ||
> +	    !IS_ALIGNED(dcr->base, dcr->blk_size)) {
> +		dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n",
> +			index, dcr->base, dcr->blk_size);
> +		return -EINVAL;
> +	}
> +
> +	if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len ||
> +	    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> +		dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n",
> +			index, dcr->decode_len, dcr->len, dcr->blk_size);
> +		return -EINVAL;
> +	}
> +
> +	if (dcr->blk_size == 0 || dcr->blk_size % CXL_DCD_BLOCK_LINE_SIZE ||
> +	    !is_power_of_2(dcr->blk_size)) {
> +		dev_err(dev, "DC region %d invalid block size; %#llx\n",
> +			index, dcr->blk_size);
> +		return -EINVAL;
> +	}
> +
> +	dev_dbg(dev,
> +		"DC region %s base %#llx length %#llx block size %#llx\n",
> +		dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> +
> +	return 0;
> +}
> +
> +/* Returns the number of regions in dc_resp or -ERRNO */
> +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region,
> +			     struct cxl_mbox_get_dc_config_out *dc_resp,
> +			     size_t dc_resp_size)
> +{
> +	struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) {
> +		.region_count = CXL_MAX_DC_REGION,
> +		.start_region_index = start_region,
> +	};
> +	struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> +		.payload_in = &get_dc,
> +		.size_in = sizeof(get_dc),
> +		.size_out = dc_resp_size,
> +		.payload_out = dc_resp,
> +		.min_out = 1,
> +	};
> +	struct device *dev = mds->cxlds.dev;
> +	int rc;
> +
> +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> +	if (rc < 0)
> +		return rc;
> +
> +	dev_dbg(dev, "Read %d/%d DC regions\n",
> +		dc_resp->regions_returned, dc_resp->avail_region_count);
> +	return dc_resp->regions_returned;
> +}
> +
> +/**
> + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> + *					 information from the device.
> + * @mds: The memory device state
> + *
> + * Read Dynamic Capacity information from the device and populate the state
> + * structures for later use.
> + *
> + * Return: 0 if identify was executed successfully, -ERRNO on error.
> + */
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> +{
> +	size_t dc_resp_size = mds->payload_size;
> +	struct device *dev = mds->cxlds.dev;
> +	u8 start_region, i;
> +
> +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> +		snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>");
> +
> +	if (!cxl_dcd_supported(mds)) {
> +		dev_dbg(dev, "DCD not supported\n");
> +		return 0;
> +	}

This should happen before you pre-format the name string? I would assume that if DCD is not supported then the dcd name sysfs attribs would be not be visible?

> +
> +	struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) =
> +					kvmalloc(dc_resp_size, GFP_KERNEL);
> +	if (!dc_resp)
> +		return -ENOMEM;
> +
> +	start_region = 0;
> +	do {
> +		int rc, j;
> +
> +		rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size);
> +		if (rc < 0) {
> +			dev_dbg(dev, "Failed to get DC config: %d\n", rc);
> +			return rc;
> +		}
> +
> +		mds->nr_dc_region += rc;
> +
> +		if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> +			dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> +				mds->nr_dc_region);
> +			return -EINVAL;
> +		}
> +
> +		for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) {

This should be 'j < mds->nr_dc_region'? Otherwise if your start region say is '3' and you have '2' DC regions, you never enter the loop. Or does that not happen? I also wonder if you need to check if 'start_region + mds->nr_dc_region > CXL_MAX_DC_REGION'.

> +			rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]);
> +			if (rc) {
> +				dev_dbg(dev, "Failed to save region info: %d\n", rc);
> +				return rc;
> +			}
> +		}
> +
> +		start_region = mds->nr_dc_region;
> +
> +	} while (mds->nr_dc_region < dc_resp->avail_region_count);
> +
> +	mds->dynamic_bytes =
> +		mds->dc_region[mds->nr_dc_region - 1].base +
> +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> +		mds->dc_region[0].base;
> +	dev_dbg(dev, "Total dynamic range: %#llx\n", mds->dynamic_bytes);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> +
>  static int add_dpa_res(struct device *dev, struct resource *parent,
>  		       struct resource *res, resource_size_t start,
>  		       resource_size_t size, const char *type)
> @@ -1294,8 +1447,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  {
>  	struct cxl_dev_state *cxlds = &mds->cxlds;
>  	struct device *dev = cxlds->dev;
> +	size_t untenanted_mem;
>  	int rc;
>  
> +	mds->total_bytes = mds->static_bytes;
> +	if (mds->nr_dc_region) {
> +		untenanted_mem = mds->dc_region[0].base - mds->static_bytes;
> +		mds->total_bytes += untenanted_mem + mds->dynamic_bytes;
> +	}
> +
>  	if (!cxlds->media_ready) {
>  		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
>  		cxlds->ram_res = DEFINE_RES_MEM(0, 0);
> @@ -1305,6 +1465,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  
>  	cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes);
>  
> +	for (int i = 0; i < mds->nr_dc_region; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> +				 dcr->base, dcr->decode_len, dcr->name);
> +		if (rc)
> +			return rc;
> +	}
> +
>  	if (mds->partition_align_bytes == 0) {
>  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
>  				 mds->volatile_only_bytes, "ram");
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index f2f8b567e0e7..b4eb8164d05d 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -402,6 +402,7 @@ enum cxl_devtype {
>  	CXL_DEVTYPE_CLASSMEM,
>  };
>  
> +#define CXL_MAX_DC_REGION 8
>  /**
>   * struct cxl_dpa_perf - DPA performance property entry
>   * @dpa_range: range for DPA address
> @@ -431,6 +432,8 @@ struct cxl_dpa_perf {
>   * @dpa_res: Overall DPA resource tree for the device
>   * @pmem_res: Active Persistent memory capacity configuration
>   * @ram_res: Active Volatile memory capacity configuration
> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> + *          region
>   * @serial: PCIe Device Serial Number
>   * @type: Generic Memory Class device or Vendor Specific Memory device
>   */
> @@ -445,10 +448,22 @@ struct cxl_dev_state {
>  	struct resource dpa_res;
>  	struct resource pmem_res;
>  	struct resource ram_res;
> +	struct resource dc_res[CXL_MAX_DC_REGION];
>  	u64 serial;
>  	enum cxl_devtype type;
>  };
>  
> +#define CXL_DC_REGION_STRLEN > +struct cxl_dc_region_info {
> +	u64 base;
> +	u64 decode_len;
> +	u64 len;
> +	u64 blk_size;
> +	u32 dsmad_handle;
> +	u8 flags;
> +	u8 name[CXL_DC_REGION_STRLEN];
> +};

Does this need kdoc comments?


> +
>  /**
>   * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
>   *
> @@ -466,7 +481,9 @@ struct cxl_dev_state {
>   * @dcd_cmds: List of DCD commands implemented by memory device
>   * @enabled_cmds: Hardware commands found enabled in CEL.
>   * @exclusive_cmds: Commands that are kernel-internal only
> - * @total_bytes: sum of all possible capacities
> + * @total_bytes: length of all possible capacities
> + * @static_bytes: length of possible static RAM and PMEM partitions
> + * @dynamic_bytes: length of possible DC partitions (DC Regions)

Did this get added to the wrong struct comment header? 'cxl_dev_state' instead of 'cxl_memdev_state'?
>   * @volatile_only_bytes: hard volatile capacity
>   * @persistent_only_bytes: hard persistent capacity
>   * @partition_align_bytes: alignment size for partition-able capacity
> @@ -476,6 +493,8 @@ struct cxl_dev_state {
>   * @next_persistent_bytes: persistent capacity change pending device reset
>   * @ram_perf: performance data entry matched to RAM partition
>   * @pmem_perf: performance data entry matched to PMEM partition
> + * @nr_dc_region: number of DC regions implemented in the memory device
> + * @dc_region: array containing info about the DC regions
Did this get added to the wrong struct comment header? 'cxl_dev_state' instead of 'cxl_memdev_state'?

DJ

>   * @event: event log driver state
>   * @poison: poison driver state info
>   * @security: security driver state info
> @@ -496,6 +515,8 @@ struct cxl_memdev_state {
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	u64 total_bytes;
> +	u64 static_bytes;
> +	u64 dynamic_bytes;
>  	u64 volatile_only_bytes;
>  	u64 persistent_only_bytes;
>  	u64 partition_align_bytes;
> @@ -507,6 +528,9 @@ struct cxl_memdev_state {
>  	struct cxl_dpa_perf ram_perf;
>  	struct cxl_dpa_perf pmem_perf;
>  
> +	u8 nr_dc_region;
> +	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];
> +
>  	struct cxl_event_state event;
>  	struct cxl_poison_state poison;
>  	struct cxl_security_state security;
> @@ -709,6 +733,32 @@ struct cxl_mbox_set_partition_info {
>  
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
>  
> +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
> +struct cxl_mbox_get_dc_config_in {
> +	u8 region_count;
> +	u8 start_region_index;
> +} __packed;
> +
> +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
> +struct cxl_mbox_get_dc_config_out {
> +	u8 avail_region_count;
> +	u8 regions_returned;
> +	u8 rsvd[6];
> +	/* See CXL 3.1 Table 8-165 */
> +	struct cxl_dc_region_config {
> +		__le64 region_base;
> +		__le64 region_decode_length;
> +		__le64 region_length;
> +		__le64 region_block_size;
> +		__le32 region_dsmad_handle;
> +		u8 flags;
> +		u8 rsvd[3];
> +	} __packed region[];
> +	/* Trailing fields unused */
> +} __packed;
> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
> +#define CXL_DCD_BLOCK_LINE_SIZE 0x40
> +
>  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
>  struct cxl_mbox_set_timestamp_in {
>  	__le64 timestamp;
> @@ -832,6 +882,7 @@ enum {
>  int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
>  			  struct cxl_mbox_cmd *cmd);
>  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
>  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
>  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
>  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> @@ -845,6 +896,17 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>  			    enum cxl_event_log_type type,
>  			    enum cxl_event_type event_type,
>  			    const uuid_t *uuid, union cxl_event *evt);
> +
> +static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
> +{
> +	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +}
> +
> +static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
> +{
> +	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +}
> +
>  int cxl_set_timestamp(struct cxl_memdev_state *mds);
>  int cxl_poison_state_init(struct cxl_memdev_state *mds);
>  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 3a60cd66263e..f7f03599bc83 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -874,6 +874,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	rc = cxl_dev_dynamic_capacity_identify(mds);
> +	if (rc)
> +		cxl_disable_dcd(mds);
> +
>  	rc = cxl_mem_create_range_info(mds);
>  	if (rc)
>  		return rc;
>
Fan Ni Aug. 20, 2024, 5:01 p.m. UTC | #2
On Fri, Aug 16, 2024 at 02:45:47PM -0700, Dave Jiang wrote:
> 
> 
> On 8/16/24 7:44 AM, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > Devices which optionally support Dynamic Capacity (DC) are configured
> > via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
> > Configuration command in order to properly configure DCDs.  Without the
> > Get DC Configuration command DCD can't be supported.
> > 
> > Implement the DC mailbox commands as specified in CXL 3.1 section
> > 8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
> > information.  Disable DCD if DCD is not supported.  Leverage the Get DC
> > Configuration command supported bit to indicate if DCD support.
> > 
> > Linux has no use for the trailing fields of the Get Dynamic Capacity
> > Configuration Output Payload (Total number of supported extents, number
> > of available extents, total number of supported tags, and number of
> > available tags).  Avoid defining those fields to use the more useful
> > dynamic C array.
> > 
> > Cc: "Li, Ming" <ming4.li@intel.com>
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > 
> > ---
> > Changes:
> > [Li, Ming: Fix bug in total_bytes calculation]
> > [iweiny: update commit message]
> > [Jonathan: fix formatting]
> > [Jonathan: Define block line size]
> > [Jonathan/Fan: use regions returned field instead of macro in get config]
> > [Jørgen: Rename memdev state range variables]
> > [Jonathan: adjust use of rc in cxl_dev_dynamic_capacity_identify()]
> > [Jonathan: white space cleanup]
> > [fan: make a comment about the trailing configuration output fields]
> > ---
> >  drivers/cxl/core/mbox.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/cxl/cxlmem.h    |  64 +++++++++++++++++-
> >  drivers/cxl/pci.c       |   4 ++
> >  3 files changed, 237 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 8eb196858abe..68c26c4be91a 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -1157,7 +1157,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
> >  	if (rc < 0)
> >  		return rc;
> >  
> > -	mds->total_bytes =
> > +	mds->static_bytes =
> >  		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
> >  	mds->volatile_only_bytes =
> >  		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
> > @@ -1264,6 +1264,159 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
> >  	return rc;
> >  }
> >  
> > +static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index,
> > +				   struct cxl_dc_region_config *region_config)
> > +{
> > +	struct cxl_dc_region_info *dcr = &mds->dc_region[index];
> > +	struct device *dev = mds->cxlds.dev;
> > +
> > +	dcr->base = le64_to_cpu(region_config->region_base);
> > +	dcr->decode_len = le64_to_cpu(region_config->region_decode_length);
> > +	dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> > +	dcr->len = le64_to_cpu(region_config->region_length);
> > +	dcr->blk_size = le64_to_cpu(region_config->region_block_size);
> > +	dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle);
> > +	dcr->flags = region_config->flags;
> > +	snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index);
> > +
> > +	/* Check regions are in increasing DPA order */
> > +	if (index > 0) {
> > +		struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1];
> > +
> > +		if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) {
> > +			dev_err(dev,
> > +				"DPA ordering violation for DC region %d and %d\n",
> > +				index - 1, index);
> > +			return -EINVAL;
> > +		}
> > +	}
> > +
> > +	if (!IS_ALIGNED(dcr->base, SZ_256M) ||
> > +	    !IS_ALIGNED(dcr->base, dcr->blk_size)) {
> > +		dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n",
> > +			index, dcr->base, dcr->blk_size);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len ||
> > +	    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> > +		dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n",
> > +			index, dcr->decode_len, dcr->len, dcr->blk_size);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (dcr->blk_size == 0 || dcr->blk_size % CXL_DCD_BLOCK_LINE_SIZE ||
> > +	    !is_power_of_2(dcr->blk_size)) {
> > +		dev_err(dev, "DC region %d invalid block size; %#llx\n",
> > +			index, dcr->blk_size);
> > +		return -EINVAL;
> > +	}
> > +
> > +	dev_dbg(dev,
> > +		"DC region %s base %#llx length %#llx block size %#llx\n",
> > +		dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> > +
> > +	return 0;
> > +}
> > +
> > +/* Returns the number of regions in dc_resp or -ERRNO */
> > +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region,
> > +			     struct cxl_mbox_get_dc_config_out *dc_resp,
> > +			     size_t dc_resp_size)
> > +{
> > +	struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) {
> > +		.region_count = CXL_MAX_DC_REGION,
> > +		.start_region_index = start_region,
> > +	};
> > +	struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) {
> > +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> > +		.payload_in = &get_dc,
> > +		.size_in = sizeof(get_dc),
> > +		.size_out = dc_resp_size,
> > +		.payload_out = dc_resp,
> > +		.min_out = 1,
> > +	};
> > +	struct device *dev = mds->cxlds.dev;
> > +	int rc;
> > +
> > +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> > +	if (rc < 0)
> > +		return rc;
> > +
> > +	dev_dbg(dev, "Read %d/%d DC regions\n",
> > +		dc_resp->regions_returned, dc_resp->avail_region_count);
> > +	return dc_resp->regions_returned;
> > +}
> > +
> > +/**
> > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> > + *					 information from the device.
> > + * @mds: The memory device state
> > + *
> > + * Read Dynamic Capacity information from the device and populate the state
> > + * structures for later use.
> > + *
> > + * Return: 0 if identify was executed successfully, -ERRNO on error.
> > + */
> > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> > +{
> > +	size_t dc_resp_size = mds->payload_size;
> > +	struct device *dev = mds->cxlds.dev;
> > +	u8 start_region, i;
> > +
> > +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> > +		snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>");
> > +
> > +	if (!cxl_dcd_supported(mds)) {
> > +		dev_dbg(dev, "DCD not supported\n");
> > +		return 0;
> > +	}
> 
> This should happen before you pre-format the name string? I would assume that if DCD is not supported then the dcd name sysfs attribs would be not be visible?
> 
> > +
> > +	struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) =
> > +					kvmalloc(dc_resp_size, GFP_KERNEL);
> > +	if (!dc_resp)
> > +		return -ENOMEM;
> > +
> > +	start_region = 0;
> > +	do {
> > +		int rc, j;
> > +
> > +		rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size);
> > +		if (rc < 0) {
> > +			dev_dbg(dev, "Failed to get DC config: %d\n", rc);
> > +			return rc;
> > +		}
> > +
> > +		mds->nr_dc_region += rc;
> > +
> > +		if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> > +			dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> > +				mds->nr_dc_region);
> > +			return -EINVAL;
> > +		}
> > +
> > +		for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) {
> 
> This should be 'j < mds->nr_dc_region'? Otherwise if your start region say is '3' and you have '2' DC regions, you never enter the loop. Or does that not happen? I also wonder if you need to check if 'start_region + mds->nr_dc_region > CXL_MAX_DC_REGION'.
> 
That can not happen, start_region was updated to the number of regions
has returned till now (not counting the current call), while
nr_dc_region is the total number of regions returned till now (including
the current call) as we update it above, so start_region should never be larger
than nr_dc_region.

> > +			rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]);
> > +			if (rc) {
> > +				dev_dbg(dev, "Failed to save region info: %d\n", rc);

I am not sure why we sometimes use dev_err and sometimes we use dev_dbg
here, if dcd is supported, error from getting dc configuration is an
error to me.

Fan

> > +				return rc;
> > +			}
> > +		}
> > +
> > +		start_region = mds->nr_dc_region;
> > +
> > +	} while (mds->nr_dc_region < dc_resp->avail_region_count);
> > +
> > +	mds->dynamic_bytes =
> > +		mds->dc_region[mds->nr_dc_region - 1].base +
> > +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> > +		mds->dc_region[0].base;
> > +	dev_dbg(dev, "Total dynamic range: %#llx\n", mds->dynamic_bytes);
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> > +
> >  static int add_dpa_res(struct device *dev, struct resource *parent,
> >  		       struct resource *res, resource_size_t start,
> >  		       resource_size_t size, const char *type)
> > @@ -1294,8 +1447,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> >  {
> >  	struct cxl_dev_state *cxlds = &mds->cxlds;
> >  	struct device *dev = cxlds->dev;
> > +	size_t untenanted_mem;
> >  	int rc;
> >  
> > +	mds->total_bytes = mds->static_bytes;
> > +	if (mds->nr_dc_region) {
> > +		untenanted_mem = mds->dc_region[0].base - mds->static_bytes;
> > +		mds->total_bytes += untenanted_mem + mds->dynamic_bytes;
> > +	}
> > +
> >  	if (!cxlds->media_ready) {
> >  		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
> >  		cxlds->ram_res = DEFINE_RES_MEM(0, 0);
> > @@ -1305,6 +1465,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> >  
> >  	cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes);
> >  
> > +	for (int i = 0; i < mds->nr_dc_region; i++) {
> > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +
> > +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> > +				 dcr->base, dcr->decode_len, dcr->name);
> > +		if (rc)
> > +			return rc;
> > +	}
> > +
> >  	if (mds->partition_align_bytes == 0) {
> >  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
> >  				 mds->volatile_only_bytes, "ram");
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index f2f8b567e0e7..b4eb8164d05d 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -402,6 +402,7 @@ enum cxl_devtype {
> >  	CXL_DEVTYPE_CLASSMEM,
> >  };
> >  
> > +#define CXL_MAX_DC_REGION 8
> >  /**
> >   * struct cxl_dpa_perf - DPA performance property entry
> >   * @dpa_range: range for DPA address
> > @@ -431,6 +432,8 @@ struct cxl_dpa_perf {
> >   * @dpa_res: Overall DPA resource tree for the device
> >   * @pmem_res: Active Persistent memory capacity configuration
> >   * @ram_res: Active Volatile memory capacity configuration
> > + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> > + *          region
> >   * @serial: PCIe Device Serial Number
> >   * @type: Generic Memory Class device or Vendor Specific Memory device
> >   */
> > @@ -445,10 +448,22 @@ struct cxl_dev_state {
> >  	struct resource dpa_res;
> >  	struct resource pmem_res;
> >  	struct resource ram_res;
> > +	struct resource dc_res[CXL_MAX_DC_REGION];
> >  	u64 serial;
> >  	enum cxl_devtype type;
> >  };
> >  
> > +#define CXL_DC_REGION_STRLEN > +struct cxl_dc_region_info {
> > +	u64 base;
> > +	u64 decode_len;
> > +	u64 len;
> > +	u64 blk_size;
> > +	u32 dsmad_handle;
> > +	u8 flags;
> > +	u8 name[CXL_DC_REGION_STRLEN];
> > +};
> 
> Does this need kdoc comments?
> 
> 
> > +
> >  /**
> >   * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
> >   *
> > @@ -466,7 +481,9 @@ struct cxl_dev_state {
> >   * @dcd_cmds: List of DCD commands implemented by memory device
> >   * @enabled_cmds: Hardware commands found enabled in CEL.
> >   * @exclusive_cmds: Commands that are kernel-internal only
> > - * @total_bytes: sum of all possible capacities
> > + * @total_bytes: length of all possible capacities
> > + * @static_bytes: length of possible static RAM and PMEM partitions
> > + * @dynamic_bytes: length of possible DC partitions (DC Regions)
> 
> Did this get added to the wrong struct comment header? 'cxl_dev_state' instead of 'cxl_memdev_state'?
> >   * @volatile_only_bytes: hard volatile capacity
> >   * @persistent_only_bytes: hard persistent capacity
> >   * @partition_align_bytes: alignment size for partition-able capacity
> > @@ -476,6 +493,8 @@ struct cxl_dev_state {
> >   * @next_persistent_bytes: persistent capacity change pending device reset
> >   * @ram_perf: performance data entry matched to RAM partition
> >   * @pmem_perf: performance data entry matched to PMEM partition
> > + * @nr_dc_region: number of DC regions implemented in the memory device
> > + * @dc_region: array containing info about the DC regions
> Did this get added to the wrong struct comment header? 'cxl_dev_state' instead of 'cxl_memdev_state'?
> 
> DJ
> 
> >   * @event: event log driver state
> >   * @poison: poison driver state info
> >   * @security: security driver state info
> > @@ -496,6 +515,8 @@ struct cxl_memdev_state {
> >  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> >  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> >  	u64 total_bytes;
> > +	u64 static_bytes;
> > +	u64 dynamic_bytes;
> >  	u64 volatile_only_bytes;
> >  	u64 persistent_only_bytes;
> >  	u64 partition_align_bytes;
> > @@ -507,6 +528,9 @@ struct cxl_memdev_state {
> >  	struct cxl_dpa_perf ram_perf;
> >  	struct cxl_dpa_perf pmem_perf;
> >  
> > +	u8 nr_dc_region;
> > +	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];
> > +
> >  	struct cxl_event_state event;
> >  	struct cxl_poison_state poison;
> >  	struct cxl_security_state security;
> > @@ -709,6 +733,32 @@ struct cxl_mbox_set_partition_info {
> >  
> >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> >  
> > +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
> > +struct cxl_mbox_get_dc_config_in {
> > +	u8 region_count;
> > +	u8 start_region_index;
> > +} __packed;
> > +
> > +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
> > +struct cxl_mbox_get_dc_config_out {
> > +	u8 avail_region_count;
> > +	u8 regions_returned;
> > +	u8 rsvd[6];
> > +	/* See CXL 3.1 Table 8-165 */
> > +	struct cxl_dc_region_config {
> > +		__le64 region_base;
> > +		__le64 region_decode_length;
> > +		__le64 region_length;
> > +		__le64 region_block_size;
> > +		__le32 region_dsmad_handle;
> > +		u8 flags;
> > +		u8 rsvd[3];
> > +	} __packed region[];
> > +	/* Trailing fields unused */
> > +} __packed;
> > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
> > +#define CXL_DCD_BLOCK_LINE_SIZE 0x40
> > +
> >  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
> >  struct cxl_mbox_set_timestamp_in {
> >  	__le64 timestamp;
> > @@ -832,6 +882,7 @@ enum {
> >  int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
> >  			  struct cxl_mbox_cmd *cmd);
> >  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
> >  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
> >  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
> >  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> > @@ -845,6 +896,17 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> >  			    enum cxl_event_log_type type,
> >  			    enum cxl_event_type event_type,
> >  			    const uuid_t *uuid, union cxl_event *evt);
> > +
> > +static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
> > +{
> > +	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> > +}
> > +
> > +static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
> > +{
> > +	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> > +}
> > +
> >  int cxl_set_timestamp(struct cxl_memdev_state *mds);
> >  int cxl_poison_state_init(struct cxl_memdev_state *mds);
> >  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 3a60cd66263e..f7f03599bc83 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -874,6 +874,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	if (rc)
> >  		return rc;
> >  
> > +	rc = cxl_dev_dynamic_capacity_identify(mds);
> > +	if (rc)
> > +		cxl_disable_dcd(mds);
> > +
> >  	rc = cxl_mem_create_range_info(mds);
> >  	if (rc)
> >  		return rc;
> >
Ira Weiny Aug. 23, 2024, 2:01 a.m. UTC | #3
Fan Ni wrote:
> On Fri, Aug 16, 2024 at 02:45:47PM -0700, Dave Jiang wrote:
> > 
> > > +
> > > +/**
> > > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> > > + *					 information from the device.
> > > + * @mds: The memory device state
> > > + *
> > > + * Read Dynamic Capacity information from the device and populate the state
> > > + * structures for later use.
> > > + *
> > > + * Return: 0 if identify was executed successfully, -ERRNO on error.
> > > + */
> > > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> > > +{
> > > +	size_t dc_resp_size = mds->payload_size;
> > > +	struct device *dev = mds->cxlds.dev;
> > > +	u8 start_region, i;
> > > +
> > > +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> > > +		snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>");
> > > +
> > > +	if (!cxl_dcd_supported(mds)) {
> > > +		dev_dbg(dev, "DCD not supported\n");
> > > +		return 0;
> > > +	}
> > 
> > This should happen before you pre-format the name string? I would assume that if DCD is not supported then the dcd name sysfs attribs would be not be visible?
> > 

No this string is not used for sysfs.  It is used to label the dpa
resources...  That said in review I don't recall why it was necessary to
add the '<nil>' to them by default.  I'm actually going to remove that and
continue testing and if I recall where this was showing up I might add it
back in.

> > > +
> > > +	struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) =
> > > +					kvmalloc(dc_resp_size, GFP_KERNEL);
> > > +	if (!dc_resp)
> > > +		return -ENOMEM;
> > > +
> > > +	start_region = 0;
> > > +	do {
> > > +		int rc, j;
> > > +
> > > +		rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size);
> > > +		if (rc < 0) {
> > > +			dev_dbg(dev, "Failed to get DC config: %d\n", rc);
> > > +			return rc;
> > > +		}
> > > +
> > > +		mds->nr_dc_region += rc;
> > > +
> > > +		if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> > > +			dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> > > +				mds->nr_dc_region);
> > > +			return -EINVAL;
> > > +		}
> > > +
> > > +		for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) {
> > 
> > This should be 'j < mds->nr_dc_region'? Otherwise if your start region say is '3' and you have '2' DC regions, you never enter the loop. Or does that not happen? I also wonder if you need to check if 'start_region + mds->nr_dc_region > CXL_MAX_DC_REGION'.
> > 
> That can not happen, start_region was updated to the number of regions
> has returned till now (not counting the current call), while
> nr_dc_region is the total number of regions returned till now (including
> the current call) as we update it above, so start_region should never be larger
> than nr_dc_region.

Yep.

> 
> > > +			rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]);
> > > +			if (rc) {
> > > +				dev_dbg(dev, "Failed to save region info: %d\n", rc);
> 
> I am not sure why we sometimes use dev_err and sometimes we use dev_dbg
> here, if dcd is supported, error from getting dc configuration is an
> error to me.

We are trying to reduce the dev_err() use.  cxl_dc_save_region_info() has
dev_err() which is much more specific as to the error.  At worse this is
just redundant as a debug.

I'll remove it because the debug output is pretty verbose too.

Ira

> 
> Fan
> 
> > > +				return rc;
> > > +			}
> > > +		}
> > > +
> > > +		start_region = mds->nr_dc_region;
> > > +
> > > +	} while (mds->nr_dc_region < dc_resp->avail_region_count);
> > > +
> > > +	mds->dynamic_bytes =
> > > +		mds->dc_region[mds->nr_dc_region - 1].base +
> > > +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> > > +		mds->dc_region[0].base;
> > > +	dev_dbg(dev, "Total dynamic range: %#llx\n", mds->dynamic_bytes);
> > > +
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> > > +
> > >  static int add_dpa_res(struct device *dev, struct resource *parent,
> > >  		       struct resource *res, resource_size_t start,
> > >  		       resource_size_t size, const char *type)
> > > @@ -1294,8 +1447,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> > >  {
> > >  	struct cxl_dev_state *cxlds = &mds->cxlds;
> > >  	struct device *dev = cxlds->dev;
> > > +	size_t untenanted_mem;
> > >  	int rc;
> > >  
> > > +	mds->total_bytes = mds->static_bytes;
> > > +	if (mds->nr_dc_region) {
> > > +		untenanted_mem = mds->dc_region[0].base - mds->static_bytes;
> > > +		mds->total_bytes += untenanted_mem + mds->dynamic_bytes;
> > > +	}
> > > +
> > >  	if (!cxlds->media_ready) {
> > >  		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
> > >  		cxlds->ram_res = DEFINE_RES_MEM(0, 0);
> > > @@ -1305,6 +1465,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> > >  
> > >  	cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes);
> > >  
> > > +	for (int i = 0; i < mds->nr_dc_region; i++) {
> > > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > > +
> > > +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> > > +				 dcr->base, dcr->decode_len, dcr->name);
> > > +		if (rc)
> > > +			return rc;
> > > +	}
> > > +
> > >  	if (mds->partition_align_bytes == 0) {
> > >  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
> > >  				 mds->volatile_only_bytes, "ram");
> > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > index f2f8b567e0e7..b4eb8164d05d 100644
> > > --- a/drivers/cxl/cxlmem.h
> > > +++ b/drivers/cxl/cxlmem.h
> > > @@ -402,6 +402,7 @@ enum cxl_devtype {
> > >  	CXL_DEVTYPE_CLASSMEM,
> > >  };
> > >  
> > > +#define CXL_MAX_DC_REGION 8
> > >  /**
> > >   * struct cxl_dpa_perf - DPA performance property entry
> > >   * @dpa_range: range for DPA address
> > > @@ -431,6 +432,8 @@ struct cxl_dpa_perf {
> > >   * @dpa_res: Overall DPA resource tree for the device
> > >   * @pmem_res: Active Persistent memory capacity configuration
> > >   * @ram_res: Active Volatile memory capacity configuration
> > > + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> > > + *          region
> > >   * @serial: PCIe Device Serial Number
> > >   * @type: Generic Memory Class device or Vendor Specific Memory device
> > >   */
> > > @@ -445,10 +448,22 @@ struct cxl_dev_state {
> > >  	struct resource dpa_res;
> > >  	struct resource pmem_res;
> > >  	struct resource ram_res;
> > > +	struct resource dc_res[CXL_MAX_DC_REGION];
> > >  	u64 serial;
> > >  	enum cxl_devtype type;
> > >  };
> > >  
> > > +#define CXL_DC_REGION_STRLEN > +struct cxl_dc_region_info {
> > > +	u64 base;
> > > +	u64 decode_len;
> > > +	u64 len;
> > > +	u64 blk_size;
> > > +	u32 dsmad_handle;
> > > +	u8 flags;
> > > +	u8 name[CXL_DC_REGION_STRLEN];
> > > +};
> > 
> > Does this need kdoc comments?
> > 
> > 
> > > +
> > >  /**
> > >   * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
> > >   *
> > > @@ -466,7 +481,9 @@ struct cxl_dev_state {
> > >   * @dcd_cmds: List of DCD commands implemented by memory device
> > >   * @enabled_cmds: Hardware commands found enabled in CEL.
> > >   * @exclusive_cmds: Commands that are kernel-internal only
> > > - * @total_bytes: sum of all possible capacities
> > > + * @total_bytes: length of all possible capacities
> > > + * @static_bytes: length of possible static RAM and PMEM partitions
> > > + * @dynamic_bytes: length of possible DC partitions (DC Regions)
> > 
> > Did this get added to the wrong struct comment header? 'cxl_dev_state' instead of 'cxl_memdev_state'?
> > >   * @volatile_only_bytes: hard volatile capacity
> > >   * @persistent_only_bytes: hard persistent capacity
> > >   * @partition_align_bytes: alignment size for partition-able capacity
> > > @@ -476,6 +493,8 @@ struct cxl_dev_state {
> > >   * @next_persistent_bytes: persistent capacity change pending device reset
> > >   * @ram_perf: performance data entry matched to RAM partition
> > >   * @pmem_perf: performance data entry matched to PMEM partition
> > > + * @nr_dc_region: number of DC regions implemented in the memory device
> > > + * @dc_region: array containing info about the DC regions
> > Did this get added to the wrong struct comment header? 'cxl_dev_state' instead of 'cxl_memdev_state'?
> > 
> > DJ
> > 
> > >   * @event: event log driver state
> > >   * @poison: poison driver state info
> > >   * @security: security driver state info
> > > @@ -496,6 +515,8 @@ struct cxl_memdev_state {
> > >  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> > >  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> > >  	u64 total_bytes;
> > > +	u64 static_bytes;
> > > +	u64 dynamic_bytes;
> > >  	u64 volatile_only_bytes;
> > >  	u64 persistent_only_bytes;
> > >  	u64 partition_align_bytes;
> > > @@ -507,6 +528,9 @@ struct cxl_memdev_state {
> > >  	struct cxl_dpa_perf ram_perf;
> > >  	struct cxl_dpa_perf pmem_perf;
> > >  
> > > +	u8 nr_dc_region;
> > > +	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];
> > > +
> > >  	struct cxl_event_state event;
> > >  	struct cxl_poison_state poison;
> > >  	struct cxl_security_state security;
> > > @@ -709,6 +733,32 @@ struct cxl_mbox_set_partition_info {
> > >  
> > >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> > >  
> > > +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
> > > +struct cxl_mbox_get_dc_config_in {
> > > +	u8 region_count;
> > > +	u8 start_region_index;
> > > +} __packed;
> > > +
> > > +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
> > > +struct cxl_mbox_get_dc_config_out {
> > > +	u8 avail_region_count;
> > > +	u8 regions_returned;
> > > +	u8 rsvd[6];
> > > +	/* See CXL 3.1 Table 8-165 */
> > > +	struct cxl_dc_region_config {
> > > +		__le64 region_base;
> > > +		__le64 region_decode_length;
> > > +		__le64 region_length;
> > > +		__le64 region_block_size;
> > > +		__le32 region_dsmad_handle;
> > > +		u8 flags;
> > > +		u8 rsvd[3];
> > > +	} __packed region[];
> > > +	/* Trailing fields unused */
> > > +} __packed;
> > > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
> > > +#define CXL_DCD_BLOCK_LINE_SIZE 0x40
> > > +
> > >  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
> > >  struct cxl_mbox_set_timestamp_in {
> > >  	__le64 timestamp;
> > > @@ -832,6 +882,7 @@ enum {
> > >  int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
> > >  			  struct cxl_mbox_cmd *cmd);
> > >  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> > > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
> > >  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
> > >  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
> > >  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> > > @@ -845,6 +896,17 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> > >  			    enum cxl_event_log_type type,
> > >  			    enum cxl_event_type event_type,
> > >  			    const uuid_t *uuid, union cxl_event *evt);
> > > +
> > > +static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
> > > +{
> > > +	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> > > +}
> > > +
> > > +static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
> > > +{
> > > +	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> > > +}
> > > +
> > >  int cxl_set_timestamp(struct cxl_memdev_state *mds);
> > >  int cxl_poison_state_init(struct cxl_memdev_state *mds);
> > >  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > > index 3a60cd66263e..f7f03599bc83 100644
> > > --- a/drivers/cxl/pci.c
> > > +++ b/drivers/cxl/pci.c
> > > @@ -874,6 +874,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > >  	if (rc)
> > >  		return rc;
> > >  
> > > +	rc = cxl_dev_dynamic_capacity_identify(mds);
> > > +	if (rc)
> > > +		cxl_disable_dcd(mds);
> > > +
> > >  	rc = cxl_mem_create_range_info(mds);
> > >  	if (rc)
> > >  		return rc;
> > > 
>
Ira Weiny Aug. 23, 2024, 2:02 a.m. UTC | #4
Fan Ni wrote:
> On Fri, Aug 16, 2024 at 02:45:47PM -0700, Dave Jiang wrote:

[snip]

> > > +
> > > +/**
> > > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> > > + *					 information from the device.
> > > + * @mds: The memory device state
> > > + *
> > > + * Read Dynamic Capacity information from the device and populate the state
> > > + * structures for later use.
> > > + *
> > > + * Return: 0 if identify was executed successfully, -ERRNO on error.
> > > + */
> > > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> > > +{
> > > +	size_t dc_resp_size = mds->payload_size;
> > > +	struct device *dev = mds->cxlds.dev;
> > > +	u8 start_region, i;
> > > +
> > > +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> > > +		snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>");
> > > +
> > > +	if (!cxl_dcd_supported(mds)) {
> > > +		dev_dbg(dev, "DCD not supported\n");
> > > +		return 0;
> > > +	}
> > 
> > This should happen before you pre-format the name string? I would assume that if DCD is not supported then the dcd name sysfs attribs would be not be visible?
> > 

No this string is not used for sysfs.  It is used to label the dpa
resources...  That said in review I don't recall why it was necessary to
add the '<nil>' to them by default.  I'm actually going to remove that and
continue testing and if I recall where this was showing up I might add it
back in.

> > > +
> > > +	struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) =
> > > +					kvmalloc(dc_resp_size, GFP_KERNEL);
> > > +	if (!dc_resp)
> > > +		return -ENOMEM;
> > > +
> > > +	start_region = 0;
> > > +	do {
> > > +		int rc, j;
> > > +
> > > +		rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size);
> > > +		if (rc < 0) {
> > > +			dev_dbg(dev, "Failed to get DC config: %d\n", rc);
> > > +			return rc;
> > > +		}
> > > +
> > > +		mds->nr_dc_region += rc;
> > > +
> > > +		if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> > > +			dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> > > +				mds->nr_dc_region);
> > > +			return -EINVAL;
> > > +		}
> > > +
> > > +		for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) {
> > 
> > This should be 'j < mds->nr_dc_region'? Otherwise if your start region say is '3' and you have '2' DC regions, you never enter the loop. Or does that not happen? I also wonder if you need to check if 'start_region + mds->nr_dc_region > CXL_MAX_DC_REGION'.
> > 
> That can not happen, start_region was updated to the number of regions
> has returned till now (not counting the current call), while
> nr_dc_region is the total number of regions returned till now (including
> the current call) as we update it above, so start_region should never be larger
> than nr_dc_region.

Yep.

> 
> > > +			rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]);
> > > +			if (rc) {
> > > +				dev_dbg(dev, "Failed to save region info: %d\n", rc);
> 
> I am not sure why we sometimes use dev_err and sometimes we use dev_dbg
> here, if dcd is supported, error from getting dc configuration is an
> error to me.

We are trying to reduce the dev_err() use.  cxl_dc_save_region_info() has
dev_err() which is much more specific as to the error.  At worse this is
just redundant as a debug.

I'll remove it because the debug output is pretty verbose too.

Ira

> 
> Fan

[snip]
Jonathan Cameron Aug. 23, 2024, 3:45 p.m. UTC | #5
On Fri, 16 Aug 2024 09:44:14 -0500
ira.weiny@intel.com wrote:

> From: Navneet Singh <navneet.singh@intel.com>
> 
> Devices which optionally support Dynamic Capacity (DC) are configured
> via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
> Configuration command in order to properly configure DCDs.  Without the
> Get DC Configuration command DCD can't be supported.
> 
> Implement the DC mailbox commands as specified in CXL 3.1 section
> 8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
> information.  Disable DCD if DCD is not supported.  Leverage the Get DC
> Configuration command supported bit to indicate if DCD support.
> 
> Linux has no use for the trailing fields of the Get Dynamic Capacity
> Configuration Output Payload (Total number of supported extents, number
> of available extents, total number of supported tags, and number of
> available tags).  Avoid defining those fields to use the more useful
> dynamic C array.
> 
> Cc: "Li, Ming" <ming4.li@intel.com>
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
LGTM
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
If you can get rid of the <nil> thing even better.
diff mbox series

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 8eb196858abe..68c26c4be91a 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1157,7 +1157,7 @@  int cxl_dev_state_identify(struct cxl_memdev_state *mds)
 	if (rc < 0)
 		return rc;
 
-	mds->total_bytes =
+	mds->static_bytes =
 		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
 	mds->volatile_only_bytes =
 		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
@@ -1264,6 +1264,159 @@  int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
 	return rc;
 }
 
+static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index,
+				   struct cxl_dc_region_config *region_config)
+{
+	struct cxl_dc_region_info *dcr = &mds->dc_region[index];
+	struct device *dev = mds->cxlds.dev;
+
+	dcr->base = le64_to_cpu(region_config->region_base);
+	dcr->decode_len = le64_to_cpu(region_config->region_decode_length);
+	dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
+	dcr->len = le64_to_cpu(region_config->region_length);
+	dcr->blk_size = le64_to_cpu(region_config->region_block_size);
+	dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle);
+	dcr->flags = region_config->flags;
+	snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index);
+
+	/* Check regions are in increasing DPA order */
+	if (index > 0) {
+		struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1];
+
+		if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) {
+			dev_err(dev,
+				"DPA ordering violation for DC region %d and %d\n",
+				index - 1, index);
+			return -EINVAL;
+		}
+	}
+
+	if (!IS_ALIGNED(dcr->base, SZ_256M) ||
+	    !IS_ALIGNED(dcr->base, dcr->blk_size)) {
+		dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n",
+			index, dcr->base, dcr->blk_size);
+		return -EINVAL;
+	}
+
+	if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len ||
+	    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
+		dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n",
+			index, dcr->decode_len, dcr->len, dcr->blk_size);
+		return -EINVAL;
+	}
+
+	if (dcr->blk_size == 0 || dcr->blk_size % CXL_DCD_BLOCK_LINE_SIZE ||
+	    !is_power_of_2(dcr->blk_size)) {
+		dev_err(dev, "DC region %d invalid block size; %#llx\n",
+			index, dcr->blk_size);
+		return -EINVAL;
+	}
+
+	dev_dbg(dev,
+		"DC region %s base %#llx length %#llx block size %#llx\n",
+		dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
+
+	return 0;
+}
+
+/* Returns the number of regions in dc_resp or -ERRNO */
+static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region,
+			     struct cxl_mbox_get_dc_config_out *dc_resp,
+			     size_t dc_resp_size)
+{
+	struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) {
+		.region_count = CXL_MAX_DC_REGION,
+		.start_region_index = start_region,
+	};
+	struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
+		.payload_in = &get_dc,
+		.size_in = sizeof(get_dc),
+		.size_out = dc_resp_size,
+		.payload_out = dc_resp,
+		.min_out = 1,
+	};
+	struct device *dev = mds->cxlds.dev;
+	int rc;
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc < 0)
+		return rc;
+
+	dev_dbg(dev, "Read %d/%d DC regions\n",
+		dc_resp->regions_returned, dc_resp->avail_region_count);
+	return dc_resp->regions_returned;
+}
+
+/**
+ * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
+ *					 information from the device.
+ * @mds: The memory device state
+ *
+ * Read Dynamic Capacity information from the device and populate the state
+ * structures for later use.
+ *
+ * Return: 0 if identify was executed successfully, -ERRNO on error.
+ */
+int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
+{
+	size_t dc_resp_size = mds->payload_size;
+	struct device *dev = mds->cxlds.dev;
+	u8 start_region, i;
+
+	for (i = 0; i < CXL_MAX_DC_REGION; i++)
+		snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>");
+
+	if (!cxl_dcd_supported(mds)) {
+		dev_dbg(dev, "DCD not supported\n");
+		return 0;
+	}
+
+	struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) =
+					kvmalloc(dc_resp_size, GFP_KERNEL);
+	if (!dc_resp)
+		return -ENOMEM;
+
+	start_region = 0;
+	do {
+		int rc, j;
+
+		rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size);
+		if (rc < 0) {
+			dev_dbg(dev, "Failed to get DC config: %d\n", rc);
+			return rc;
+		}
+
+		mds->nr_dc_region += rc;
+
+		if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
+			dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
+				mds->nr_dc_region);
+			return -EINVAL;
+		}
+
+		for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) {
+			rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]);
+			if (rc) {
+				dev_dbg(dev, "Failed to save region info: %d\n", rc);
+				return rc;
+			}
+		}
+
+		start_region = mds->nr_dc_region;
+
+	} while (mds->nr_dc_region < dc_resp->avail_region_count);
+
+	mds->dynamic_bytes =
+		mds->dc_region[mds->nr_dc_region - 1].base +
+		mds->dc_region[mds->nr_dc_region - 1].decode_len -
+		mds->dc_region[0].base;
+	dev_dbg(dev, "Total dynamic range: %#llx\n", mds->dynamic_bytes);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
+
 static int add_dpa_res(struct device *dev, struct resource *parent,
 		       struct resource *res, resource_size_t start,
 		       resource_size_t size, const char *type)
@@ -1294,8 +1447,15 @@  int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 {
 	struct cxl_dev_state *cxlds = &mds->cxlds;
 	struct device *dev = cxlds->dev;
+	size_t untenanted_mem;
 	int rc;
 
+	mds->total_bytes = mds->static_bytes;
+	if (mds->nr_dc_region) {
+		untenanted_mem = mds->dc_region[0].base - mds->static_bytes;
+		mds->total_bytes += untenanted_mem + mds->dynamic_bytes;
+	}
+
 	if (!cxlds->media_ready) {
 		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
 		cxlds->ram_res = DEFINE_RES_MEM(0, 0);
@@ -1305,6 +1465,15 @@  int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 
 	cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes);
 
+	for (int i = 0; i < mds->nr_dc_region; i++) {
+		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
+
+		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
+				 dcr->base, dcr->decode_len, dcr->name);
+		if (rc)
+			return rc;
+	}
+
 	if (mds->partition_align_bytes == 0) {
 		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
 				 mds->volatile_only_bytes, "ram");
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index f2f8b567e0e7..b4eb8164d05d 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -402,6 +402,7 @@  enum cxl_devtype {
 	CXL_DEVTYPE_CLASSMEM,
 };
 
+#define CXL_MAX_DC_REGION 8
 /**
  * struct cxl_dpa_perf - DPA performance property entry
  * @dpa_range: range for DPA address
@@ -431,6 +432,8 @@  struct cxl_dpa_perf {
  * @dpa_res: Overall DPA resource tree for the device
  * @pmem_res: Active Persistent memory capacity configuration
  * @ram_res: Active Volatile memory capacity configuration
+ * @dc_res: Active Dynamic Capacity memory configuration for each possible
+ *          region
  * @serial: PCIe Device Serial Number
  * @type: Generic Memory Class device or Vendor Specific Memory device
  */
@@ -445,10 +448,22 @@  struct cxl_dev_state {
 	struct resource dpa_res;
 	struct resource pmem_res;
 	struct resource ram_res;
+	struct resource dc_res[CXL_MAX_DC_REGION];
 	u64 serial;
 	enum cxl_devtype type;
 };
 
+#define CXL_DC_REGION_STRLEN 8
+struct cxl_dc_region_info {
+	u64 base;
+	u64 decode_len;
+	u64 len;
+	u64 blk_size;
+	u32 dsmad_handle;
+	u8 flags;
+	u8 name[CXL_DC_REGION_STRLEN];
+};
+
 /**
  * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
  *
@@ -466,7 +481,9 @@  struct cxl_dev_state {
  * @dcd_cmds: List of DCD commands implemented by memory device
  * @enabled_cmds: Hardware commands found enabled in CEL.
  * @exclusive_cmds: Commands that are kernel-internal only
- * @total_bytes: sum of all possible capacities
+ * @total_bytes: length of all possible capacities
+ * @static_bytes: length of possible static RAM and PMEM partitions
+ * @dynamic_bytes: length of possible DC partitions (DC Regions)
  * @volatile_only_bytes: hard volatile capacity
  * @persistent_only_bytes: hard persistent capacity
  * @partition_align_bytes: alignment size for partition-able capacity
@@ -476,6 +493,8 @@  struct cxl_dev_state {
  * @next_persistent_bytes: persistent capacity change pending device reset
  * @ram_perf: performance data entry matched to RAM partition
  * @pmem_perf: performance data entry matched to PMEM partition
+ * @nr_dc_region: number of DC regions implemented in the memory device
+ * @dc_region: array containing info about the DC regions
  * @event: event log driver state
  * @poison: poison driver state info
  * @security: security driver state info
@@ -496,6 +515,8 @@  struct cxl_memdev_state {
 	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
 	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
 	u64 total_bytes;
+	u64 static_bytes;
+	u64 dynamic_bytes;
 	u64 volatile_only_bytes;
 	u64 persistent_only_bytes;
 	u64 partition_align_bytes;
@@ -507,6 +528,9 @@  struct cxl_memdev_state {
 	struct cxl_dpa_perf ram_perf;
 	struct cxl_dpa_perf pmem_perf;
 
+	u8 nr_dc_region;
+	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];
+
 	struct cxl_event_state event;
 	struct cxl_poison_state poison;
 	struct cxl_security_state security;
@@ -709,6 +733,32 @@  struct cxl_mbox_set_partition_info {
 
 #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
 
+/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
+struct cxl_mbox_get_dc_config_in {
+	u8 region_count;
+	u8 start_region_index;
+} __packed;
+
+/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
+struct cxl_mbox_get_dc_config_out {
+	u8 avail_region_count;
+	u8 regions_returned;
+	u8 rsvd[6];
+	/* See CXL 3.1 Table 8-165 */
+	struct cxl_dc_region_config {
+		__le64 region_base;
+		__le64 region_decode_length;
+		__le64 region_length;
+		__le64 region_block_size;
+		__le32 region_dsmad_handle;
+		u8 flags;
+		u8 rsvd[3];
+	} __packed region[];
+	/* Trailing fields unused */
+} __packed;
+#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
+#define CXL_DCD_BLOCK_LINE_SIZE 0x40
+
 /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
 struct cxl_mbox_set_timestamp_in {
 	__le64 timestamp;
@@ -832,6 +882,7 @@  enum {
 int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
 			  struct cxl_mbox_cmd *cmd);
 int cxl_dev_state_identify(struct cxl_memdev_state *mds);
+int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
 int cxl_await_media_ready(struct cxl_dev_state *cxlds);
 int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
 int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
@@ -845,6 +896,17 @@  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 			    enum cxl_event_log_type type,
 			    enum cxl_event_type event_type,
 			    const uuid_t *uuid, union cxl_event *evt);
+
+static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
+{
+	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
+}
+
+static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
+{
+	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
+}
+
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 3a60cd66263e..f7f03599bc83 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -874,6 +874,10 @@  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	rc = cxl_dev_dynamic_capacity_identify(mds);
+	if (rc)
+		cxl_disable_dcd(mds);
+
 	rc = cxl_mem_create_range_info(mds);
 	if (rc)
 		return rc;