diff mbox series

[1/5] cxl/mem: Read Dynamic capacity configuration from the device

Message ID 20230604-dcd-type2-upstream-v1-1-71b6341bae54@intel.com
State New, archived
Headers show
Series cxl/dcd: Add support for Dynamic Capacity Devices (DCD) | expand

Commit Message

Ira Weiny June 14, 2023, 7:16 p.m. UTC
From: Navneet Singh <navneet.singh@intel.com>

Read the Dynamic Capacity configuration and store the dynamic capacity region
information in the device state, which the driver will use to map into the HDM
ranges.

Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
command as specified in CXL 3.0 spec section 8.2.9.8.9.1.

Signed-off-by: Navneet Singh <navneet.singh@intel.com>

---
[iweiny: ensure all mds->dc_region's are named]
---
 drivers/cxl/core/mbox.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/cxl/cxlmem.h    |  70 +++++++++++++++++-
 drivers/cxl/pci.c       |   4 +
 3 files changed, 256 insertions(+), 8 deletions(-)

Comments

Dave Jiang June 14, 2023, 10:53 p.m. UTC | #1
On 6/14/23 12:16, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Read the Dynamic capacity configuration and store dynamic capacity region
> information in the device state which driver will use to map into the HDM
> ranges.
> 
> Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> ---
> [iweiny: ensure all mds->dc_region's are named]
> ---
>   drivers/cxl/core/mbox.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++--
>   drivers/cxl/cxlmem.h    |  70 +++++++++++++++++-
>   drivers/cxl/pci.c       |   4 +
>   3 files changed, 256 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 3ca0bf12c55f..c5b696737c87 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -111,6 +111,37 @@ static u8 security_command_sets[] = {
>   	0x46, /* Security Passthrough */
>   };
>   
> +static bool cxl_is_dcd_command(u16 opcode)
> +{
> +#define CXL_MBOX_OP_DCD_CMDS 0x48

Move this to cxlmem.h?

> +
> +	if ((opcode >> 8) == CXL_MBOX_OP_DCD_CMDS)
> +		return true;
> +
> +	return false;

I think you simplify by:

return (opcode >> 8) == CXL_MBOX_OP_DCD_CMDS;


> +}
> +
> +static void cxl_set_dcd_cmd_enabled(struct cxl_memdev_state *mds,
> +					u16 opcode)
> +{
> +	switch (opcode) {
> +	case CXL_MBOX_OP_GET_DC_CONFIG:
> +		set_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_GET_DC_EXTENT_LIST:
> +		set_bit(CXL_DCD_ENABLED_GET_EXTENT_LIST, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_ADD_DC_RESPONSE:
> +		set_bit(CXL_DCD_ENABLED_ADD_RESPONSE, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_RELEASE_DC:
> +		set_bit(CXL_DCD_ENABLED_RELEASE, mds->dcd_cmds);
> +		break;
> +	default:
> +		break;
> +	}
> +}
> +
>   static bool cxl_is_security_command(u16 opcode)
>   {
>   	int i;
> @@ -666,6 +697,7 @@ static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
>   static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>   {
>   	struct cxl_cel_entry *cel_entry;
> +	struct cxl_mem_command *cmd;
>   	const int cel_entries = size / sizeof(*cel_entry);
>   	struct device *dev = mds->cxlds.dev;
>   	int i;
> @@ -674,11 +706,12 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>   
>   	for (i = 0; i < cel_entries; i++) {
>   		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
> -		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
> +		cmd = cxl_mem_find_command(opcode);
>   
> -		if (!cmd && !cxl_is_poison_command(opcode)) {
> -			dev_dbg(dev,
> -				"Opcode 0x%04x unsupported by driver\n", opcode);
> +		if (!cmd && !cxl_is_poison_command(opcode) &&
> +		    !cxl_is_dcd_command(opcode)) {
> +			dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n",
> +				opcode);
>   			continue;
>   		}
>   
> @@ -688,6 +721,9 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>   		if (cxl_is_poison_command(opcode))
>   			cxl_set_poison_cmd_enabled(&mds->poison, opcode);
>   
> +		if (cxl_is_dcd_command(opcode))
> +			cxl_set_dcd_cmd_enabled(mds, opcode);
> +
>   		dev_dbg(dev, "Opcode 0x%04x enabled\n", opcode);
>   	}
>   }
> @@ -1059,7 +1095,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>   	if (rc < 0)
>   		return rc;
>   
> -	mds->total_bytes =
> +	mds->total_static_capacity =
>   		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
>   	mds->volatile_only_bytes =
>   		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
> @@ -1077,10 +1113,137 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>   		mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
>   	}
>   
> +	mds->dc_event_log_size = le16_to_cpu(id.dc_event_log_size);
> +
>   	return 0;
>   }
>   EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
>   
> +/**
> + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> + * information from the device.
> + * @mds: The memory device state
> + * Return: 0 if identify was executed successfully.
> + *
> + * This will dispatch the get_dynamic_capacity command to the device
> + * and on success populate structures to be exported to sysfs.
> + */
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> +{
> +	struct cxl_dev_state *cxlds = &mds->cxlds;
> +	struct device *dev = cxlds->dev;
> +	struct cxl_mbox_dynamic_capacity *dc;
> +	struct cxl_mbox_get_dc_config get_dc;
> +	struct cxl_mbox_cmd mbox_cmd;
> +	u64 next_dc_region_start;
> +	int rc, i;
> +
> +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> +		sprintf(mds->dc_region[i].name, "dc%d", i);
> +
> +	/* Check GET_DC_CONFIG is supported by device */
> +	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
> +		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
> +		return 0;
> +	}
> +
> +	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
> +	if (!dc)
> +		return -ENOMEM;
> +
> +	get_dc = (struct cxl_mbox_get_dc_config) {
> +		.region_count = CXL_MAX_DC_REGION,
> +		.start_region_index = 0,
> +	};
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> +		.payload_in = &get_dc,
> +		.size_in = sizeof(get_dc),
> +		.size_out = mds->payload_size,
> +		.payload_out = dc,
> +		.min_out = 1,
> +	};
> +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> +	if (rc < 0)
> +		goto dc_error;
> +
> +	mds->nr_dc_region = dc->avail_region_count;
> +
> +	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> +		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> +			mds->nr_dc_region);
> +		rc = -EINVAL;
> +		goto dc_error;
> +	}
> +
> +	for (i = 0; i < mds->nr_dc_region; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		dcr->base = le64_to_cpu(dc->region[i].region_base);
> +		dcr->decode_len =
> +			le64_to_cpu(dc->region[i].region_decode_length);
> +		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> +		dcr->len = le64_to_cpu(dc->region[i].region_length);
> +		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
> +
> +		/* Check regions are in increasing DPA order */
> +		if ((i + 1) < mds->nr_dc_region) {
> +			next_dc_region_start =
> +				le64_to_cpu(dc->region[i + 1].region_base);
> +			if ((dcr->base > next_dc_region_start) ||
> +			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {
> +				dev_err(dev,
> +					"DPA ordering violation for DC region %d and %d\n",
> +					i, i + 1);
> +				rc = -EINVAL;
> +				goto dc_error;
> +			}
> +		}
> +
> +		/* Check the region is 256 MB aligned */
> +		if (!IS_ALIGNED(dcr->base, SZ_256M)) {
> +			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		/* Check Region base and length are aligned to block size */
> +		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
> +		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> +			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
> +				dcr->blk_size);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		dcr->dsmad_handle =
> +			le32_to_cpu(dc->region[i].region_dsmad_handle);
> +		dcr->flags = dc->region[i].flags;
> +		sprintf(dcr->name, "dc%d", i);
> +
> +		dev_dbg(dev,
> +			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
> +			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> +	}
> +
> +	/*
> +	 * Calculate entire DPA range of all configured regions which will be mapped by
> +	 * one or more HDM decoders
> +	 */
> +	mds->total_dynamic_capacity =
> +		mds->dc_region[mds->nr_dc_region - 1].base +
> +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> +		mds->dc_region[0].base;
> +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> +		mds->total_dynamic_capacity);
> +
> +dc_error:
> +	kvfree(dc);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> +
>   static int add_dpa_res(struct device *dev, struct resource *parent,
>   		       struct resource *res, resource_size_t start,
>   		       resource_size_t size, const char *type)
> @@ -1112,6 +1275,11 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>   	struct cxl_dev_state *cxlds = &mds->cxlds;
>   	struct device *dev = cxlds->dev;
>   	int rc;
> +	size_t untenanted_mem =
> +		mds->dc_region[0].base - mds->total_static_capacity;
> +
> +	mds->total_capacity = mds->total_static_capacity +
> +			untenanted_mem + mds->total_dynamic_capacity;
>   
>   	if (!cxlds->media_ready) {
>   		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
> @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>   	}
>   
>   	cxlds->dpa_res =
> -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> +
> +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> +				 dcr->base, dcr->decode_len, dcr->name);
> +		if (rc)
> +			return rc;
> +	}
>   
>   	if (mds->partition_align_bytes == 0) {
>   		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
>   				 mds->volatile_only_bytes, "ram");
>   		if (rc)
>   			return rc;
> +

Stray blank line?

DJ

>   		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
>   				   mds->volatile_only_bytes,
>   				   mds->persistent_only_bytes, "pmem");
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 89e560ea14c0..9c0b2fa72bdd 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -239,6 +239,15 @@ struct cxl_event_state {
>   	struct mutex log_lock;
>   };
>   
> +/* Device enabled DCD commands */
> +enum dcd_cmd_enabled_bits {
> +	CXL_DCD_ENABLED_GET_CONFIG,
> +	CXL_DCD_ENABLED_GET_EXTENT_LIST,
> +	CXL_DCD_ENABLED_ADD_RESPONSE,
> +	CXL_DCD_ENABLED_RELEASE,
> +	CXL_DCD_ENABLED_MAX
> +};
> +
>   /* Device enabled poison commands */
>   enum poison_cmd_enabled_bits {
>   	CXL_POISON_ENABLED_LIST,
> @@ -284,6 +293,9 @@ enum cxl_devtype {
>   	CXL_DEVTYPE_CLASSMEM,
>   };
>   
> +#define CXL_MAX_DC_REGION 8
> +#define CXL_DC_REGION_SRTLEN 8
> +
>   /**
>    * struct cxl_dev_state - The driver device state
>    *
> @@ -300,6 +312,8 @@ enum cxl_devtype {
>    * @dpa_res: Overall DPA resource tree for the device
>    * @pmem_res: Active Persistent memory capacity configuration
>    * @ram_res: Active Volatile memory capacity configuration
> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> + *          region
>    * @component_reg_phys: register base of component registers
>    * @info: Cached DVSEC information about the device.
>    * @serial: PCIe Device Serial Number
> @@ -315,6 +329,7 @@ struct cxl_dev_state {
>   	struct resource dpa_res;
>   	struct resource pmem_res;
>   	struct resource ram_res;
> +	struct resource dc_res[CXL_MAX_DC_REGION];
>   	resource_size_t component_reg_phys;
>   	u64 serial;
>   	enum cxl_devtype type;
> @@ -334,9 +349,12 @@ struct cxl_dev_state {
>    *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
>    * @mbox_mutex: Mutex to synchronize mailbox access.
>    * @firmware_version: Firmware version for the memory device.
> + * @dcd_cmds: List of DCD commands implemented by memory device
>    * @enabled_cmds: Hardware commands found enabled in CEL.
>    * @exclusive_cmds: Commands that are kernel-internal only
> - * @total_bytes: sum of all possible capacities
> + * @total_capacity: Sum of static and dynamic capacities
> + * @total_static_capacity: Sum of RAM and PMEM capacities
> + * @total_dynamic_capacity: Complete DPA range occupied by DC regions
>    * @volatile_only_bytes: hard volatile capacity
>    * @persistent_only_bytes: hard persistent capacity
>    * @partition_align_bytes: alignment size for partition-able capacity
> @@ -344,6 +362,10 @@ struct cxl_dev_state {
>    * @active_persistent_bytes: sum of hard + soft persistent
>    * @next_volatile_bytes: volatile capacity change pending device reset
>    * @next_persistent_bytes: persistent capacity change pending device reset
> + * @nr_dc_region: number of DC regions implemented in the memory device
> + * @dc_region: array containing info about the DC regions
> + * @dc_event_log_size: The number of events the device can store in the
> + * Dynamic Capacity Event Log before it overflows
>    * @event: event log driver state
>    * @poison: poison driver state info
>    * @mbox_send: @dev specific transport for transmitting mailbox commands
> @@ -357,9 +379,13 @@ struct cxl_memdev_state {
>   	size_t lsa_size;
>   	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>   	char firmware_version[0x10];
> +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
>   	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>   	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> -	u64 total_bytes;
> +
> +	u64 total_capacity;
> +	u64 total_static_capacity;
> +	u64 total_dynamic_capacity;
>   	u64 volatile_only_bytes;
>   	u64 persistent_only_bytes;
>   	u64 partition_align_bytes;
> @@ -367,6 +393,20 @@ struct cxl_memdev_state {
>   	u64 active_persistent_bytes;
>   	u64 next_volatile_bytes;
>   	u64 next_persistent_bytes;
> +
> +	u8 nr_dc_region;
> +
> +	struct cxl_dc_region_info {
> +		u8 name[CXL_DC_REGION_SRTLEN];
> +		u64 base;
> +		u64 decode_len;
> +		u64 len;
> +		u64 blk_size;
> +		u32 dsmad_handle;
> +		u8 flags;
> +	} dc_region[CXL_MAX_DC_REGION];
> +
> +	size_t dc_event_log_size;
>   	struct cxl_event_state event;
>   	struct cxl_poison_state poison;
>   	int (*mbox_send)(struct cxl_memdev_state *mds,
> @@ -415,6 +455,10 @@ enum cxl_opcode {
>   	CXL_MBOX_OP_UNLOCK		= 0x4503,
>   	CXL_MBOX_OP_FREEZE_SECURITY	= 0x4504,
>   	CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE	= 0x4505,
> +	CXL_MBOX_OP_GET_DC_CONFIG	= 0x4800,
> +	CXL_MBOX_OP_GET_DC_EXTENT_LIST	= 0x4801,
> +	CXL_MBOX_OP_ADD_DC_RESPONSE	= 0x4802,
> +	CXL_MBOX_OP_RELEASE_DC		= 0x4803,
>   	CXL_MBOX_OP_MAX			= 0x10000
>   };
>   
> @@ -462,6 +506,7 @@ struct cxl_mbox_identify {
>   	__le16 inject_poison_limit;
>   	u8 poison_caps;
>   	u8 qos_telemetry_caps;
> +	__le16 dc_event_log_size;
>   } __packed;
>   
>   /*
> @@ -617,7 +662,27 @@ struct cxl_mbox_set_partition_info {
>   	u8 flags;
>   } __packed;
>   
> +struct cxl_mbox_get_dc_config {
> +	u8 region_count;
> +	u8 start_region_index;
> +} __packed;
> +
> +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> +struct cxl_mbox_dynamic_capacity {
> +	u8 avail_region_count;
> +	u8 rsvd[7];
> +	struct cxl_dc_region_config {
> +		__le64 region_base;
> +		__le64 region_decode_length;
> +		__le64 region_length;
> +		__le64 region_block_size;
> +		__le32 region_dsmad_handle;
> +		u8 flags;
> +		u8 rsvd[3];
> +	} __packed region[];
> +} __packed;
>   #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
>   
>   /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
>   struct cxl_mbox_set_timestamp_in {
> @@ -742,6 +807,7 @@ enum {
>   int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
>   			  struct cxl_mbox_cmd *cmd);
>   int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
>   int cxl_await_media_ready(struct cxl_dev_state *cxlds);
>   int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
>   int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 4e2845b7331a..ac1a41bc083d 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -742,6 +742,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   	if (rc)
>   		return rc;
>   
> +	rc = cxl_dev_dynamic_capacity_identify(mds);
> +	if (rc)
> +		return rc;
> +
>   	rc = cxl_mem_create_range_info(mds);
>   	if (rc)
>   		return rc;
>
Alison Schofield June 14, 2023, 11:49 p.m. UTC | #2
On Wed, Jun 14, 2023 at 12:16:28PM -0700, Ira Weiny wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Read the Dynamic capacity configuration and store dynamic capacity region
> information in the device state which driver will use to map into the HDM
> ranges.
> 
> Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> ---
> [iweiny: ensure all mds->dc_region's are named]
> ---
>  drivers/cxl/core/mbox.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/cxl/cxlmem.h    |  70 +++++++++++++++++-
>  drivers/cxl/pci.c       |   4 +
>  3 files changed, 256 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 3ca0bf12c55f..c5b696737c87 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -111,6 +111,37 @@ static u8 security_command_sets[] = {
>  	0x46, /* Security Passthrough */
>  };
>  
> +static bool cxl_is_dcd_command(u16 opcode)
> +{
> +#define CXL_MBOX_OP_DCD_CMDS 0x48
> +
> +	if ((opcode >> 8) == CXL_MBOX_OP_DCD_CMDS)
> +		return true;
> +
> +	return false;
> +}
> +
> +static void cxl_set_dcd_cmd_enabled(struct cxl_memdev_state *mds,
> +					u16 opcode)
> +{
> +	switch (opcode) {
> +	case CXL_MBOX_OP_GET_DC_CONFIG:
> +		set_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_GET_DC_EXTENT_LIST:
> +		set_bit(CXL_DCD_ENABLED_GET_EXTENT_LIST, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_ADD_DC_RESPONSE:
> +		set_bit(CXL_DCD_ENABLED_ADD_RESPONSE, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_RELEASE_DC:
> +		set_bit(CXL_DCD_ENABLED_RELEASE, mds->dcd_cmds);
> +		break;
> +	default:
> +		break;
> +	}
> +}
> +
>  static bool cxl_is_security_command(u16 opcode)
>  {
>  	int i;
> @@ -666,6 +697,7 @@ static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
>  static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  {
>  	struct cxl_cel_entry *cel_entry;
> +	struct cxl_mem_command *cmd;
>  	const int cel_entries = size / sizeof(*cel_entry);
>  	struct device *dev = mds->cxlds.dev;
>  	int i;
> @@ -674,11 +706,12 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  
>  	for (i = 0; i < cel_entries; i++) {
>  		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
> -		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
> +		cmd = cxl_mem_find_command(opcode);

Is the move of the 'cmd' define related to this patch?
Checkpatch warns on it: WARNING: Missing a blank line after declarations

>  
> -		if (!cmd && !cxl_is_poison_command(opcode)) {
> -			dev_dbg(dev,
> -				"Opcode 0x%04x unsupported by driver\n", opcode);
> +		if (!cmd && !cxl_is_poison_command(opcode) &&
> +		    !cxl_is_dcd_command(opcode)) {
> +			dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n",
> +				opcode);
>  			continue;
>  		}
>  
> @@ -688,6 +721,9 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  		if (cxl_is_poison_command(opcode))
>  			cxl_set_poison_cmd_enabled(&mds->poison, opcode);
>  
> +		if (cxl_is_dcd_command(opcode))
> +			cxl_set_dcd_cmd_enabled(mds, opcode);
> +
>  		dev_dbg(dev, "Opcode 0x%04x enabled\n", opcode);
>  	}
>  }
> @@ -1059,7 +1095,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>  	if (rc < 0)
>  		return rc;
>  
> -	mds->total_bytes =
> +	mds->total_static_capacity =
>  		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
>  	mds->volatile_only_bytes =
>  		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
> @@ -1077,10 +1113,137 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>  		mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
>  	}
>  
> +	mds->dc_event_log_size = le16_to_cpu(id.dc_event_log_size);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
>  
> +/**
> + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> + * information from the device.
> + * @mds: The memory device state
> + * Return: 0 if identify was executed successfully.
> + *
> + * This will dispatch the get_dynamic_capacity command to the device
> + * and on success populate structures to be exported to sysfs.
> + */
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> +{
> +	struct cxl_dev_state *cxlds = &mds->cxlds;
> +	struct device *dev = cxlds->dev;
> +	struct cxl_mbox_dynamic_capacity *dc;
> +	struct cxl_mbox_get_dc_config get_dc;
> +	struct cxl_mbox_cmd mbox_cmd;
> +	u64 next_dc_region_start;
> +	int rc, i;
> +
> +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> +		sprintf(mds->dc_region[i].name, "dc%d", i);
> +
> +	/* Check GET_DC_CONFIG is supported by device */
> +	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
> +		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
> +		return 0;
> +	}
> +
> +	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
> +	if (!dc)
> +		return -ENOMEM;
> +
> +	get_dc = (struct cxl_mbox_get_dc_config) {
> +		.region_count = CXL_MAX_DC_REGION,
> +		.start_region_index = 0,
> +	};
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> +		.payload_in = &get_dc,
> +		.size_in = sizeof(get_dc),
> +		.size_out = mds->payload_size,
> +		.payload_out = dc,
> +		.min_out = 1,
> +	};
> +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> +	if (rc < 0)
> +		goto dc_error;
> +
> +	mds->nr_dc_region = dc->avail_region_count;
> +
> +	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> +		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> +			mds->nr_dc_region);
> +		rc = -EINVAL;
> +		goto dc_error;
> +	}
> +
> +	for (i = 0; i < mds->nr_dc_region; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		dcr->base = le64_to_cpu(dc->region[i].region_base);
> +		dcr->decode_len =
> +			le64_to_cpu(dc->region[i].region_decode_length);
> +		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> +		dcr->len = le64_to_cpu(dc->region[i].region_length);
> +		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
> +
> +		/* Check regions are in increasing DPA order */
> +		if ((i + 1) < mds->nr_dc_region) {
> +			next_dc_region_start =
> +				le64_to_cpu(dc->region[i + 1].region_base);
> +			if ((dcr->base > next_dc_region_start) ||
> +			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {
> +				dev_err(dev,
> +					"DPA ordering violation for DC region %d and %d\n",
> +					i, i + 1);
> +				rc = -EINVAL;
> +				goto dc_error;
> +			}
> +		}
> +
> +		/* Check the region is 256 MB aligned */
> +		if (!IS_ALIGNED(dcr->base, SZ_256M)) {
> +			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		/* Check Region base and length are aligned to block size */
> +		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
> +		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> +			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
> +				dcr->blk_size);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		dcr->dsmad_handle =
> +			le32_to_cpu(dc->region[i].region_dsmad_handle);
> +		dcr->flags = dc->region[i].flags;
> +		sprintf(dcr->name, "dc%d", i);
> +
> +		dev_dbg(dev,
> +			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
> +			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> +	}
> +
> +	/*
> +	 * Calculate entire DPA range of all configured regions which will be mapped by
> +	 * one or more HDM decoders
> +	 */

Comment is needlessly going >80 chars.


> +	mds->total_dynamic_capacity =
> +		mds->dc_region[mds->nr_dc_region - 1].base +
> +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> +		mds->dc_region[0].base;
> +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> +		mds->total_dynamic_capacity);
> +
> +dc_error:
> +	kvfree(dc);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> +
>  static int add_dpa_res(struct device *dev, struct resource *parent,
>  		       struct resource *res, resource_size_t start,
>  		       resource_size_t size, const char *type)
> @@ -1112,6 +1275,11 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  	struct cxl_dev_state *cxlds = &mds->cxlds;
>  	struct device *dev = cxlds->dev;
>  	int rc;
> +	size_t untenanted_mem =
> +		mds->dc_region[0].base - mds->total_static_capacity;

Perhaps:
	size_t untenanted_mem;  (and put that in reverse x-tree order)

	untenanted_mem = mds->dc_region[0].base - mds->total_static_capacity;

> +
> +	mds->total_capacity = mds->total_static_capacity +
> +			untenanted_mem + mds->total_dynamic_capacity;
>  

Also, looking at this first patch with the long names, wondering if
there is an opportunity to (re-)define these fields in fewer chars.
Do we have to describe with 'total'? Is there a partial?

I guess I'll get to the defines further down...


>  	if (!cxlds->media_ready) {
>  		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
> @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  	}
>  
>  	cxlds->dpa_res =
> -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> +
> +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> +				 dcr->base, dcr->decode_len, dcr->name);
> +		if (rc)
> +			return rc;
> +	}
>  
>  	if (mds->partition_align_bytes == 0) {
>  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
>  				 mds->volatile_only_bytes, "ram");
>  		if (rc)
>  			return rc;
> +
>  		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
>  				   mds->volatile_only_bytes,
>  				   mds->persistent_only_bytes, "pmem");
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 89e560ea14c0..9c0b2fa72bdd 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -239,6 +239,15 @@ struct cxl_event_state {
>  	struct mutex log_lock;
>  };
>  
> +/* Device enabled DCD commands */
> +enum dcd_cmd_enabled_bits {
> +	CXL_DCD_ENABLED_GET_CONFIG,
> +	CXL_DCD_ENABLED_GET_EXTENT_LIST,
> +	CXL_DCD_ENABLED_ADD_RESPONSE,
> +	CXL_DCD_ENABLED_RELEASE,
> +	CXL_DCD_ENABLED_MAX
> +};
> +
>  /* Device enabled poison commands */
>  enum poison_cmd_enabled_bits {
>  	CXL_POISON_ENABLED_LIST,
> @@ -284,6 +293,9 @@ enum cxl_devtype {
>  	CXL_DEVTYPE_CLASSMEM,
>  };
>  
> +#define CXL_MAX_DC_REGION 8
> +#define CXL_DC_REGION_SRTLEN 8
> +
>  /**
>   * struct cxl_dev_state - The driver device state
>   *
> @@ -300,6 +312,8 @@ enum cxl_devtype {
>   * @dpa_res: Overall DPA resource tree for the device
>   * @pmem_res: Active Persistent memory capacity configuration
>   * @ram_res: Active Volatile memory capacity configuration
> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> + *          region
>   * @component_reg_phys: register base of component registers
>   * @info: Cached DVSEC information about the device.
>   * @serial: PCIe Device Serial Number
> @@ -315,6 +329,7 @@ struct cxl_dev_state {
>  	struct resource dpa_res;
>  	struct resource pmem_res;
>  	struct resource ram_res;
> +	struct resource dc_res[CXL_MAX_DC_REGION];
>  	resource_size_t component_reg_phys;
>  	u64 serial;
>  	enum cxl_devtype type;
> @@ -334,9 +349,12 @@ struct cxl_dev_state {
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
>   * @mbox_mutex: Mutex to synchronize mailbox access.
>   * @firmware_version: Firmware version for the memory device.
> + * @dcd_cmds: List of DCD commands implemented by memory device
>   * @enabled_cmds: Hardware commands found enabled in CEL.
>   * @exclusive_cmds: Commands that are kernel-internal only
> - * @total_bytes: sum of all possible capacities
> + * @total_capacity: Sum of static and dynamic capacities
> + * @total_static_capacity: Sum of RAM and PMEM capacities
> + * @total_dynamic_capacity: Complete DPA range occupied by DC regions
>   * @volatile_only_bytes: hard volatile capacity
>   * @persistent_only_bytes: hard persistent capacity
>   * @partition_align_bytes: alignment size for partition-able capacity
> @@ -344,6 +362,10 @@ struct cxl_dev_state {
>   * @active_persistent_bytes: sum of hard + soft persistent
>   * @next_volatile_bytes: volatile capacity change pending device reset
>   * @next_persistent_bytes: persistent capacity change pending device reset
> + * @nr_dc_region: number of DC regions implemented in the memory device
> + * @dc_region: array containing info about the DC regions
> + * @dc_event_log_size: The number of events the device can store in the
> + * Dynamic Capacity Event Log before it overflows
>   * @event: event log driver state
>   * @poison: poison driver state info
>   * @mbox_send: @dev specific transport for transmitting mailbox commands
> @@ -357,9 +379,13 @@ struct cxl_memdev_state {
>  	size_t lsa_size;
>  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>  	char firmware_version[0x10];
> +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> -	u64 total_bytes;
> +
> +	u64 total_capacity;
> +	u64 total_static_capacity;
> +	u64 total_dynamic_capacity;

maybe cap, static_cap, dynamic_cap

(because I think I had a hand in defining the long names that
follow and deeply regret it ;))

>  	u64 volatile_only_bytes;
>  	u64 persistent_only_bytes;
>  	u64 partition_align_bytes;
> @@ -367,6 +393,20 @@ struct cxl_memdev_state {
>  	u64 active_persistent_bytes;
>  	u64 next_volatile_bytes;
>  	u64 next_persistent_bytes;
> +
> +	u8 nr_dc_region;
> +
> +	struct cxl_dc_region_info {
> +		u8 name[CXL_DC_REGION_SRTLEN];
> +		u64 base;
> +		u64 decode_len;
> +		u64 len;
> +		u64 blk_size;
> +		u32 dsmad_handle;
> +		u8 flags;
> +	} dc_region[CXL_MAX_DC_REGION];
> +
> +	size_t dc_event_log_size;
>  	struct cxl_event_state event;
>  	struct cxl_poison_state poison;
>  	int (*mbox_send)(struct cxl_memdev_state *mds,
> @@ -415,6 +455,10 @@ enum cxl_opcode {
>  	CXL_MBOX_OP_UNLOCK		= 0x4503,
>  	CXL_MBOX_OP_FREEZE_SECURITY	= 0x4504,
>  	CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE	= 0x4505,
> +	CXL_MBOX_OP_GET_DC_CONFIG	= 0x4800,
> +	CXL_MBOX_OP_GET_DC_EXTENT_LIST	= 0x4801,
> +	CXL_MBOX_OP_ADD_DC_RESPONSE	= 0x4802,
> +	CXL_MBOX_OP_RELEASE_DC		= 0x4803,
>  	CXL_MBOX_OP_MAX			= 0x10000
>  };
>  
> @@ -462,6 +506,7 @@ struct cxl_mbox_identify {
>  	__le16 inject_poison_limit;
>  	u8 poison_caps;
>  	u8 qos_telemetry_caps;
> +	__le16 dc_event_log_size;
>  } __packed;
>  
>  /*
> @@ -617,7 +662,27 @@ struct cxl_mbox_set_partition_info {
>  	u8 flags;
>  } __packed;
>  
> +struct cxl_mbox_get_dc_config {
> +	u8 region_count;
> +	u8 start_region_index;
> +} __packed;
> +
> +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> +struct cxl_mbox_dynamic_capacity {
> +	u8 avail_region_count;
> +	u8 rsvd[7];
> +	struct cxl_dc_region_config {
> +		__le64 region_base;
> +		__le64 region_decode_length;
> +		__le64 region_length;
> +		__le64 region_block_size;
> +		__le32 region_dsmad_handle;
> +		u8 flags;
> +		u8 rsvd[3];
> +	} __packed region[];
> +} __packed;
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)

This ^ goes with the cxl_mbox_set_partition_info above.
Please don't split.

> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
>  
>  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
>  struct cxl_mbox_set_timestamp_in {
> @@ -742,6 +807,7 @@ enum {
>  int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
>  			  struct cxl_mbox_cmd *cmd);
>  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
>  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
>  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
>  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 4e2845b7331a..ac1a41bc083d 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -742,6 +742,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	rc = cxl_dev_dynamic_capacity_identify(mds);
> +	if (rc)
> +		return rc;
> +
>  	rc = cxl_mem_create_range_info(mds);
>  	if (rc)
>  		return rc;
> 
> -- 
> 2.40.0
>
Ira Weiny June 15, 2023, 3:04 p.m. UTC | #3
Dave Jiang wrote:
> 
> 
> On 6/14/23 12:16, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > Read the Dynamic capacity configuration and store dynamic capacity region
> > information in the device state which driver will use to map into the HDM
> > ranges.
> > 
> > Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> > command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > 
> > ---
> > [iweiny: ensure all mds->dc_region's are named]
> > ---
> >   drivers/cxl/core/mbox.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++--
> >   drivers/cxl/cxlmem.h    |  70 +++++++++++++++++-
> >   drivers/cxl/pci.c       |   4 +
> >   3 files changed, 256 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 3ca0bf12c55f..c5b696737c87 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -111,6 +111,37 @@ static u8 security_command_sets[] = {
> >   	0x46, /* Security Passthrough */
> >   };
> >   
> > +static bool cxl_is_dcd_command(u16 opcode)
> > +{
> > +#define CXL_MBOX_OP_DCD_CMDS 0x48
> 
> Move this to cxlmem.h?

This is only ever used in mbox.c.  So I left the scope restricted for now.
If modules need it in the future we can lift it.
cxl_is_poison_command() is handled the same way.

> 
> > +
> > +	if ((opcode >> 8) == CXL_MBOX_OP_DCD_CMDS)
> > +		return true;
> > +
> > +	return false;
> 
> I think you simplify by:
> 
> return (opcode >> 8) == CXL_MBOX_OP_DCD_CMDS;

Good catch.  I'll clean it up.

[snip]

> > @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> >   	}
> >   
> >   	cxlds->dpa_res =
> > -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> > +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> > +
> > +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
> > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +
> > +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> > +				 dcr->base, dcr->decode_len, dcr->name);
> > +		if (rc)
> > +			return rc;
> > +	}
> >   
> >   	if (mds->partition_align_bytes == 0) {
> >   		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
> >   				 mds->volatile_only_bytes, "ram");
> >   		if (rc)
> >   			return rc;
> > +
> 
> Stray blank line?

Ah yep!  Fixed!

Thanks for looking!
Ira
nifan@outlook.com June 15, 2023, 6:30 p.m. UTC | #4
The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Read the Dynamic capacity configuration and store dynamic capacity region
> information in the device state which driver will use to map into the HDM
> ranges.
> 
> Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 

See the comments below how total_dynamic_capacity is collected.


> ---
> [iweiny: ensure all mds->dc_region's are named]
> ---
>  drivers/cxl/core/mbox.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/cxl/cxlmem.h    |  70 +++++++++++++++++-
>  drivers/cxl/pci.c       |   4 +
>  3 files changed, 256 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 3ca0bf12c55f..c5b696737c87 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -111,6 +111,37 @@ static u8 security_command_sets[] = {
>  	0x46, /* Security Passthrough */
>  };
>  
> +static bool cxl_is_dcd_command(u16 opcode)
> +{
> +#define CXL_MBOX_OP_DCD_CMDS 0x48
> +
> +	if ((opcode >> 8) == CXL_MBOX_OP_DCD_CMDS)
> +		return true;
> +
> +	return false;
> +}
> +
> +static void cxl_set_dcd_cmd_enabled(struct cxl_memdev_state *mds,
> +					u16 opcode)
> +{
> +	switch (opcode) {
> +	case CXL_MBOX_OP_GET_DC_CONFIG:
> +		set_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_GET_DC_EXTENT_LIST:
> +		set_bit(CXL_DCD_ENABLED_GET_EXTENT_LIST, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_ADD_DC_RESPONSE:
> +		set_bit(CXL_DCD_ENABLED_ADD_RESPONSE, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_RELEASE_DC:
> +		set_bit(CXL_DCD_ENABLED_RELEASE, mds->dcd_cmds);
> +		break;
> +	default:
> +		break;
> +	}
> +}
> +
>  static bool cxl_is_security_command(u16 opcode)
>  {
>  	int i;
> @@ -666,6 +697,7 @@ static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
>  static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  {
>  	struct cxl_cel_entry *cel_entry;
> +	struct cxl_mem_command *cmd;
>  	const int cel_entries = size / sizeof(*cel_entry);
>  	struct device *dev = mds->cxlds.dev;
>  	int i;
> @@ -674,11 +706,12 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  
>  	for (i = 0; i < cel_entries; i++) {
>  		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
> -		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
> +		cmd = cxl_mem_find_command(opcode);
>  
> -		if (!cmd && !cxl_is_poison_command(opcode)) {
> -			dev_dbg(dev,
> -				"Opcode 0x%04x unsupported by driver\n", opcode);
> +		if (!cmd && !cxl_is_poison_command(opcode) &&
> +		    !cxl_is_dcd_command(opcode)) {
> +			dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n",
> +				opcode);
>  			continue;
>  		}
>  
> @@ -688,6 +721,9 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  		if (cxl_is_poison_command(opcode))
>  			cxl_set_poison_cmd_enabled(&mds->poison, opcode);
>  
> +		if (cxl_is_dcd_command(opcode))
> +			cxl_set_dcd_cmd_enabled(mds, opcode);
> +
>  		dev_dbg(dev, "Opcode 0x%04x enabled\n", opcode);
>  	}
>  }
> @@ -1059,7 +1095,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>  	if (rc < 0)
>  		return rc;
>  
> -	mds->total_bytes =
> +	mds->total_static_capacity =
>  		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
>  	mds->volatile_only_bytes =
>  		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
> @@ -1077,10 +1113,137 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>  		mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
>  	}
>  
> +	mds->dc_event_log_size = le16_to_cpu(id.dc_event_log_size);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
>  
> +/**
> + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> + * information from the device.
> + * @mds: The memory device state
> + * Return: 0 if identify was executed successfully.
> + *
> + * This will dispatch the get_dynamic_capacity command to the device
> + * and on success populate structures to be exported to sysfs.
> + */
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> +{
> +	struct cxl_dev_state *cxlds = &mds->cxlds;
> +	struct device *dev = cxlds->dev;
> +	struct cxl_mbox_dynamic_capacity *dc;
> +	struct cxl_mbox_get_dc_config get_dc;
> +	struct cxl_mbox_cmd mbox_cmd;
> +	u64 next_dc_region_start;
> +	int rc, i;
> +
> +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> +		sprintf(mds->dc_region[i].name, "dc%d", i);
> +
> +	/* Check GET_DC_CONFIG is supported by device */
> +	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
> +		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
> +		return 0;
> +	}
> +
> +	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
> +	if (!dc)
> +		return -ENOMEM;
> +
> +	get_dc = (struct cxl_mbox_get_dc_config) {
> +		.region_count = CXL_MAX_DC_REGION,
> +		.start_region_index = 0,
> +	};
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> +		.payload_in = &get_dc,
> +		.size_in = sizeof(get_dc),
> +		.size_out = mds->payload_size,
> +		.payload_out = dc,
> +		.min_out = 1,
> +	};
> +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> +	if (rc < 0)
> +		goto dc_error;
> +
> +	mds->nr_dc_region = dc->avail_region_count;
> +
> +	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> +		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> +			mds->nr_dc_region);
> +		rc = -EINVAL;
> +		goto dc_error;
> +	}
> +
> +	for (i = 0; i < mds->nr_dc_region; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		dcr->base = le64_to_cpu(dc->region[i].region_base);
> +		dcr->decode_len =
> +			le64_to_cpu(dc->region[i].region_decode_length);
> +		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> +		dcr->len = le64_to_cpu(dc->region[i].region_length);
> +		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
> +
> +		/* Check regions are in increasing DPA order */
> +		if ((i + 1) < mds->nr_dc_region) {
> +			next_dc_region_start =
> +				le64_to_cpu(dc->region[i + 1].region_base);
> +			if ((dcr->base > next_dc_region_start) ||
> +			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {
> +				dev_err(dev,
> +					"DPA ordering violation for DC region %d and %d\n",
> +					i, i + 1);
> +				rc = -EINVAL;
> +				goto dc_error;
> +			}
> +		}
> +
> +		/* Check the region is 256 MB aligned */
> +		if (!IS_ALIGNED(dcr->base, SZ_256M)) {
> +			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		/* Check Region base and length are aligned to block size */
> +		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
> +		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> +			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
> +				dcr->blk_size);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		dcr->dsmad_handle =
> +			le32_to_cpu(dc->region[i].region_dsmad_handle);
> +		dcr->flags = dc->region[i].flags;
> +		sprintf(dcr->name, "dc%d", i);
> +
> +		dev_dbg(dev,
> +			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
> +			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> +	}
> +
> +	/*
> +	 * Calculate entire DPA range of all configured regions which will be mapped by
> +	 * one or more HDM decoders
> +	 */
> +	mds->total_dynamic_capacity =
> +		mds->dc_region[mds->nr_dc_region - 1].base +
> +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> +		mds->dc_region[0].base;

The above code assumes there is no DPA address gap between two adjacent dc
regions. Not sure whether that will always be true; I cannot find any
specific statement about it in the spec.

Fan

> +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> +		mds->total_dynamic_capacity);
> +
> +dc_error:
> +	kvfree(dc);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> +
>  static int add_dpa_res(struct device *dev, struct resource *parent,
>  		       struct resource *res, resource_size_t start,
>  		       resource_size_t size, const char *type)
> @@ -1112,6 +1275,11 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  	struct cxl_dev_state *cxlds = &mds->cxlds;
>  	struct device *dev = cxlds->dev;
>  	int rc;
> +	size_t untenanted_mem =
> +		mds->dc_region[0].base - mds->total_static_capacity;
> +
> +	mds->total_capacity = mds->total_static_capacity +
> +			untenanted_mem + mds->total_dynamic_capacity;
>  
>  	if (!cxlds->media_ready) {
>  		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
> @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  	}
>  
>  	cxlds->dpa_res =
> -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> +
> +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> +				 dcr->base, dcr->decode_len, dcr->name);
> +		if (rc)
> +			return rc;
> +	}
>  
>  	if (mds->partition_align_bytes == 0) {
>  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
>  				 mds->volatile_only_bytes, "ram");
>  		if (rc)
>  			return rc;
> +
>  		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
>  				   mds->volatile_only_bytes,
>  				   mds->persistent_only_bytes, "pmem");
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 89e560ea14c0..9c0b2fa72bdd 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -239,6 +239,15 @@ struct cxl_event_state {
>  	struct mutex log_lock;
>  };
>  
> +/* Device enabled DCD commands */
> +enum dcd_cmd_enabled_bits {
> +	CXL_DCD_ENABLED_GET_CONFIG,
> +	CXL_DCD_ENABLED_GET_EXTENT_LIST,
> +	CXL_DCD_ENABLED_ADD_RESPONSE,
> +	CXL_DCD_ENABLED_RELEASE,
> +	CXL_DCD_ENABLED_MAX
> +};
> +
>  /* Device enabled poison commands */
>  enum poison_cmd_enabled_bits {
>  	CXL_POISON_ENABLED_LIST,
> @@ -284,6 +293,9 @@ enum cxl_devtype {
>  	CXL_DEVTYPE_CLASSMEM,
>  };
>  
> +#define CXL_MAX_DC_REGION 8
> +#define CXL_DC_REGION_SRTLEN 8
> +
>  /**
>   * struct cxl_dev_state - The driver device state
>   *
> @@ -300,6 +312,8 @@ enum cxl_devtype {
>   * @dpa_res: Overall DPA resource tree for the device
>   * @pmem_res: Active Persistent memory capacity configuration
>   * @ram_res: Active Volatile memory capacity configuration
> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> + *          region
>   * @component_reg_phys: register base of component registers
>   * @info: Cached DVSEC information about the device.
>   * @serial: PCIe Device Serial Number
> @@ -315,6 +329,7 @@ struct cxl_dev_state {
>  	struct resource dpa_res;
>  	struct resource pmem_res;
>  	struct resource ram_res;
> +	struct resource dc_res[CXL_MAX_DC_REGION];
>  	resource_size_t component_reg_phys;
>  	u64 serial;
>  	enum cxl_devtype type;
> @@ -334,9 +349,12 @@ struct cxl_dev_state {
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
>   * @mbox_mutex: Mutex to synchronize mailbox access.
>   * @firmware_version: Firmware version for the memory device.
> + * @dcd_cmds: List of DCD commands implemented by memory device
>   * @enabled_cmds: Hardware commands found enabled in CEL.
>   * @exclusive_cmds: Commands that are kernel-internal only
> - * @total_bytes: sum of all possible capacities
> + * @total_capacity: Sum of static and dynamic capacities
> + * @total_static_capacity: Sum of RAM and PMEM capacities
> + * @total_dynamic_capacity: Complete DPA range occupied by DC regions
>   * @volatile_only_bytes: hard volatile capacity
>   * @persistent_only_bytes: hard persistent capacity
>   * @partition_align_bytes: alignment size for partition-able capacity
> @@ -344,6 +362,10 @@ struct cxl_dev_state {
>   * @active_persistent_bytes: sum of hard + soft persistent
>   * @next_volatile_bytes: volatile capacity change pending device reset
>   * @next_persistent_bytes: persistent capacity change pending device reset
> + * @nr_dc_region: number of DC regions implemented in the memory device
> + * @dc_region: array containing info about the DC regions
> + * @dc_event_log_size: The number of events the device can store in the
> + * Dynamic Capacity Event Log before it overflows
>   * @event: event log driver state
>   * @poison: poison driver state info
>   * @mbox_send: @dev specific transport for transmitting mailbox commands
> @@ -357,9 +379,13 @@ struct cxl_memdev_state {
>  	size_t lsa_size;
>  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>  	char firmware_version[0x10];
> +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> -	u64 total_bytes;
> +
> +	u64 total_capacity;
> +	u64 total_static_capacity;
> +	u64 total_dynamic_capacity;
>  	u64 volatile_only_bytes;
>  	u64 persistent_only_bytes;
>  	u64 partition_align_bytes;
> @@ -367,6 +393,20 @@ struct cxl_memdev_state {
>  	u64 active_persistent_bytes;
>  	u64 next_volatile_bytes;
>  	u64 next_persistent_bytes;
> +
> +	u8 nr_dc_region;
> +
> +	struct cxl_dc_region_info {
> +		u8 name[CXL_DC_REGION_SRTLEN];
> +		u64 base;
> +		u64 decode_len;
> +		u64 len;
> +		u64 blk_size;
> +		u32 dsmad_handle;
> +		u8 flags;
> +	} dc_region[CXL_MAX_DC_REGION];
> +
> +	size_t dc_event_log_size;
>  	struct cxl_event_state event;
>  	struct cxl_poison_state poison;
>  	int (*mbox_send)(struct cxl_memdev_state *mds,
> @@ -415,6 +455,10 @@ enum cxl_opcode {
>  	CXL_MBOX_OP_UNLOCK		= 0x4503,
>  	CXL_MBOX_OP_FREEZE_SECURITY	= 0x4504,
>  	CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE	= 0x4505,
> +	CXL_MBOX_OP_GET_DC_CONFIG	= 0x4800,
> +	CXL_MBOX_OP_GET_DC_EXTENT_LIST	= 0x4801,
> +	CXL_MBOX_OP_ADD_DC_RESPONSE	= 0x4802,
> +	CXL_MBOX_OP_RELEASE_DC		= 0x4803,
>  	CXL_MBOX_OP_MAX			= 0x10000
>  };
>  
> @@ -462,6 +506,7 @@ struct cxl_mbox_identify {
>  	__le16 inject_poison_limit;
>  	u8 poison_caps;
>  	u8 qos_telemetry_caps;
> +	__le16 dc_event_log_size;
>  } __packed;
>  
>  /*
> @@ -617,7 +662,27 @@ struct cxl_mbox_set_partition_info {
>  	u8 flags;
>  } __packed;
>  
> +struct cxl_mbox_get_dc_config {
> +	u8 region_count;
> +	u8 start_region_index;
> +} __packed;
> +
> +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> +struct cxl_mbox_dynamic_capacity {
> +	u8 avail_region_count;
> +	u8 rsvd[7];
> +	struct cxl_dc_region_config {
> +		__le64 region_base;
> +		__le64 region_decode_length;
> +		__le64 region_length;
> +		__le64 region_block_size;
> +		__le32 region_dsmad_handle;
> +		u8 flags;
> +		u8 rsvd[3];
> +	} __packed region[];
> +} __packed;
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
>  
>  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
>  struct cxl_mbox_set_timestamp_in {
> @@ -742,6 +807,7 @@ enum {
>  int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
>  			  struct cxl_mbox_cmd *cmd);
>  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
>  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
>  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
>  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 4e2845b7331a..ac1a41bc083d 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -742,6 +742,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	rc = cxl_dev_dynamic_capacity_identify(mds);
> +	if (rc)
> +		return rc;
> +
>  	rc = cxl_mem_create_range_info(mds);
>  	if (rc)
>  		return rc;
> 
> -- 
> 2.40.0
>
Navneet Singh June 15, 2023, 7:17 p.m. UTC | #5
On Thu, Jun 15, 2023 at 11:30:08AM -0700, Fan Ni wrote:
> The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > Read the Dynamic capacity configuration and store dynamic capacity region
> > information in the device state which driver will use to map into the HDM
> > ranges.
> > 
> > Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> > command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > 
> 
> See the comments below how total_dynamic_capacity is collected.
> 
> 
> > ---
> > [iweiny: ensure all mds->dc_region's are named]
> > ---
> >  drivers/cxl/core/mbox.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++--
> >  drivers/cxl/cxlmem.h    |  70 +++++++++++++++++-
> >  drivers/cxl/pci.c       |   4 +
> >  3 files changed, 256 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 3ca0bf12c55f..c5b696737c87 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -111,6 +111,37 @@ static u8 security_command_sets[] = {
> >  	0x46, /* Security Passthrough */
> >  };
> >  
> > +static bool cxl_is_dcd_command(u16 opcode)
> > +{
> > +#define CXL_MBOX_OP_DCD_CMDS 0x48
> > +
> > +	if ((opcode >> 8) == CXL_MBOX_OP_DCD_CMDS)
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > +static void cxl_set_dcd_cmd_enabled(struct cxl_memdev_state *mds,
> > +					u16 opcode)
> > +{
> > +	switch (opcode) {
> > +	case CXL_MBOX_OP_GET_DC_CONFIG:
> > +		set_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> > +		break;
> > +	case CXL_MBOX_OP_GET_DC_EXTENT_LIST:
> > +		set_bit(CXL_DCD_ENABLED_GET_EXTENT_LIST, mds->dcd_cmds);
> > +		break;
> > +	case CXL_MBOX_OP_ADD_DC_RESPONSE:
> > +		set_bit(CXL_DCD_ENABLED_ADD_RESPONSE, mds->dcd_cmds);
> > +		break;
> > +	case CXL_MBOX_OP_RELEASE_DC:
> > +		set_bit(CXL_DCD_ENABLED_RELEASE, mds->dcd_cmds);
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +}
> > +
> >  static bool cxl_is_security_command(u16 opcode)
> >  {
> >  	int i;
> > @@ -666,6 +697,7 @@ static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
> >  static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
> >  {
> >  	struct cxl_cel_entry *cel_entry;
> > +	struct cxl_mem_command *cmd;
> >  	const int cel_entries = size / sizeof(*cel_entry);
> >  	struct device *dev = mds->cxlds.dev;
> >  	int i;
> > @@ -674,11 +706,12 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
> >  
> >  	for (i = 0; i < cel_entries; i++) {
> >  		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
> > -		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
> > +		cmd = cxl_mem_find_command(opcode);
> >  
> > -		if (!cmd && !cxl_is_poison_command(opcode)) {
> > -			dev_dbg(dev,
> > -				"Opcode 0x%04x unsupported by driver\n", opcode);
> > +		if (!cmd && !cxl_is_poison_command(opcode) &&
> > +		    !cxl_is_dcd_command(opcode)) {
> > +			dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n",
> > +				opcode);
> >  			continue;
> >  		}
> >  
> > @@ -688,6 +721,9 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
> >  		if (cxl_is_poison_command(opcode))
> >  			cxl_set_poison_cmd_enabled(&mds->poison, opcode);
> >  
> > +		if (cxl_is_dcd_command(opcode))
> > +			cxl_set_dcd_cmd_enabled(mds, opcode);
> > +
> >  		dev_dbg(dev, "Opcode 0x%04x enabled\n", opcode);
> >  	}
> >  }
> > @@ -1059,7 +1095,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
> >  	if (rc < 0)
> >  		return rc;
> >  
> > -	mds->total_bytes =
> > +	mds->total_static_capacity =
> >  		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
> >  	mds->volatile_only_bytes =
> >  		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
> > @@ -1077,10 +1113,137 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
> >  		mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
> >  	}
> >  
> > +	mds->dc_event_log_size = le16_to_cpu(id.dc_event_log_size);
> > +
> >  	return 0;
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
> >  
> > +/**
> > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> > + * information from the device.
> > + * @mds: The memory device state
> > + * Return: 0 if identify was executed successfully.
> > + *
> > + * This will dispatch the get_dynamic_capacity command to the device
> > + * and on success populate structures to be exported to sysfs.
> > + */
> > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> > +{
> > +	struct cxl_dev_state *cxlds = &mds->cxlds;
> > +	struct device *dev = cxlds->dev;
> > +	struct cxl_mbox_dynamic_capacity *dc;
> > +	struct cxl_mbox_get_dc_config get_dc;
> > +	struct cxl_mbox_cmd mbox_cmd;
> > +	u64 next_dc_region_start;
> > +	int rc, i;
> > +
> > +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> > +		sprintf(mds->dc_region[i].name, "dc%d", i);
> > +
> > +	/* Check GET_DC_CONFIG is supported by device */
> > +	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
> > +		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
> > +		return 0;
> > +	}
> > +
> > +	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
> > +	if (!dc)
> > +		return -ENOMEM;
> > +
> > +	get_dc = (struct cxl_mbox_get_dc_config) {
> > +		.region_count = CXL_MAX_DC_REGION,
> > +		.start_region_index = 0,
> > +	};
> > +
> > +	mbox_cmd = (struct cxl_mbox_cmd) {
> > +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> > +		.payload_in = &get_dc,
> > +		.size_in = sizeof(get_dc),
> > +		.size_out = mds->payload_size,
> > +		.payload_out = dc,
> > +		.min_out = 1,
> > +	};
> > +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> > +	if (rc < 0)
> > +		goto dc_error;
> > +
> > +	mds->nr_dc_region = dc->avail_region_count;
> > +
> > +	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> > +		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> > +			mds->nr_dc_region);
> > +		rc = -EINVAL;
> > +		goto dc_error;
> > +	}
> > +
> > +	for (i = 0; i < mds->nr_dc_region; i++) {
> > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +
> > +		dcr->base = le64_to_cpu(dc->region[i].region_base);
> > +		dcr->decode_len =
> > +			le64_to_cpu(dc->region[i].region_decode_length);
> > +		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> > +		dcr->len = le64_to_cpu(dc->region[i].region_length);
> > +		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
> > +
> > +		/* Check regions are in increasing DPA order */
> > +		if ((i + 1) < mds->nr_dc_region) {
> > +			next_dc_region_start =
> > +				le64_to_cpu(dc->region[i + 1].region_base);
> > +			if ((dcr->base > next_dc_region_start) ||
> > +			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {
> > +				dev_err(dev,
> > +					"DPA ordering violation for DC region %d and %d\n",
> > +					i, i + 1);
> > +				rc = -EINVAL;
> > +				goto dc_error;
> > +			}
> > +		}
> > +
> > +		/* Check the region is 256 MB aligned */
> > +		if (!IS_ALIGNED(dcr->base, SZ_256M)) {
> > +			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
> > +			rc = -EINVAL;
> > +			goto dc_error;
> > +		}
> > +
> > +		/* Check Region base and length are aligned to block size */
> > +		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
> > +		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> > +			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
> > +				dcr->blk_size);
> > +			rc = -EINVAL;
> > +			goto dc_error;
> > +		}
> > +
> > +		dcr->dsmad_handle =
> > +			le32_to_cpu(dc->region[i].region_dsmad_handle);
> > +		dcr->flags = dc->region[i].flags;
> > +		sprintf(dcr->name, "dc%d", i);
> > +
> > +		dev_dbg(dev,
> > +			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
> > +			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> > +	}
> > +
> > +	/*
> > +	 * Calculate entire DPA range of all configured regions which will be mapped by
> > +	 * one or more HDM decoders
> > +	 */
> > +	mds->total_dynamic_capacity =
> > +		mds->dc_region[mds->nr_dc_region - 1].base +
> > +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> > +		mds->dc_region[0].base;
> 
> The above code assumes there is no DPA address gap between two adjacent dc
> regions. Not sure whether it will always be true or not, I cannot find any
> specific statement in the spec. 
> 
> Fan
> 
Navneet - The name "total dynamic capacity" gives the perception that it's the sum of all DC
regions, but it's actually the DPA range which contains all the DC regions. Each DC region is
added as a child resource to it. When allocating, the DPA gaps between the
regions are taken care of (i.e. skipped). Maybe a better name would help here to
avoid confusion.

> > +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> > +		mds->total_dynamic_capacity);
> > +
> > +dc_error:
> > +	kvfree(dc);
> > +	return rc;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> > +
> >  static int add_dpa_res(struct device *dev, struct resource *parent,
> >  		       struct resource *res, resource_size_t start,
> >  		       resource_size_t size, const char *type)
> > @@ -1112,6 +1275,11 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> >  	struct cxl_dev_state *cxlds = &mds->cxlds;
> >  	struct device *dev = cxlds->dev;
> >  	int rc;
> > +	size_t untenanted_mem =
> > +		mds->dc_region[0].base - mds->total_static_capacity;
> > +
> > +	mds->total_capacity = mds->total_static_capacity +
> > +			untenanted_mem + mds->total_dynamic_capacity;
> >  
> >  	if (!cxlds->media_ready) {
> >  		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
> > @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> >  	}
> >  
> >  	cxlds->dpa_res =
> > -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> > +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> > +
> > +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
> > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +
> > +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> > +				 dcr->base, dcr->decode_len, dcr->name);
> > +		if (rc)
> > +			return rc;
> > +	}
> >  
> >  	if (mds->partition_align_bytes == 0) {
> >  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
> >  				 mds->volatile_only_bytes, "ram");
> >  		if (rc)
> >  			return rc;
> > +
> >  		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
> >  				   mds->volatile_only_bytes,
> >  				   mds->persistent_only_bytes, "pmem");
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 89e560ea14c0..9c0b2fa72bdd 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -239,6 +239,15 @@ struct cxl_event_state {
> >  	struct mutex log_lock;
> >  };
> >  
> > +/* Device enabled DCD commands */
> > +enum dcd_cmd_enabled_bits {
> > +	CXL_DCD_ENABLED_GET_CONFIG,
> > +	CXL_DCD_ENABLED_GET_EXTENT_LIST,
> > +	CXL_DCD_ENABLED_ADD_RESPONSE,
> > +	CXL_DCD_ENABLED_RELEASE,
> > +	CXL_DCD_ENABLED_MAX
> > +};
> > +
> >  /* Device enabled poison commands */
> >  enum poison_cmd_enabled_bits {
> >  	CXL_POISON_ENABLED_LIST,
> > @@ -284,6 +293,9 @@ enum cxl_devtype {
> >  	CXL_DEVTYPE_CLASSMEM,
> >  };
> >  
> > +#define CXL_MAX_DC_REGION 8
> > +#define CXL_DC_REGION_SRTLEN 8
> > +
> >  /**
> >   * struct cxl_dev_state - The driver device state
> >   *
> > @@ -300,6 +312,8 @@ enum cxl_devtype {
> >   * @dpa_res: Overall DPA resource tree for the device
> >   * @pmem_res: Active Persistent memory capacity configuration
> >   * @ram_res: Active Volatile memory capacity configuration
> > + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> > + *          region
> >   * @component_reg_phys: register base of component registers
> >   * @info: Cached DVSEC information about the device.
> >   * @serial: PCIe Device Serial Number
> > @@ -315,6 +329,7 @@ struct cxl_dev_state {
> >  	struct resource dpa_res;
> >  	struct resource pmem_res;
> >  	struct resource ram_res;
> > +	struct resource dc_res[CXL_MAX_DC_REGION];
> >  	resource_size_t component_reg_phys;
> >  	u64 serial;
> >  	enum cxl_devtype type;
> > @@ -334,9 +349,12 @@ struct cxl_dev_state {
> >   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
> >   * @mbox_mutex: Mutex to synchronize mailbox access.
> >   * @firmware_version: Firmware version for the memory device.
> > + * @dcd_cmds: List of DCD commands implemented by memory device
> >   * @enabled_cmds: Hardware commands found enabled in CEL.
> >   * @exclusive_cmds: Commands that are kernel-internal only
> > - * @total_bytes: sum of all possible capacities
> > + * @total_capacity: Sum of static and dynamic capacities
> > + * @total_static_capacity: Sum of RAM and PMEM capacities
> > + * @total_dynamic_capacity: Complete DPA range occupied by DC regions
> >   * @volatile_only_bytes: hard volatile capacity
> >   * @persistent_only_bytes: hard persistent capacity
> >   * @partition_align_bytes: alignment size for partition-able capacity
> > @@ -344,6 +362,10 @@ struct cxl_dev_state {
> >   * @active_persistent_bytes: sum of hard + soft persistent
> >   * @next_volatile_bytes: volatile capacity change pending device reset
> >   * @next_persistent_bytes: persistent capacity change pending device reset
> > + * @nr_dc_region: number of DC regions implemented in the memory device
> > + * @dc_region: array containing info about the DC regions
> > + * @dc_event_log_size: The number of events the device can store in the
> > + * Dynamic Capacity Event Log before it overflows
> >   * @event: event log driver state
> >   * @poison: poison driver state info
> >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> > @@ -357,9 +379,13 @@ struct cxl_memdev_state {
> >  	size_t lsa_size;
> >  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
> >  	char firmware_version[0x10];
> > +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
> >  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> >  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> > -	u64 total_bytes;
> > +
> > +	u64 total_capacity;
> > +	u64 total_static_capacity;
> > +	u64 total_dynamic_capacity;
> >  	u64 volatile_only_bytes;
> >  	u64 persistent_only_bytes;
> >  	u64 partition_align_bytes;
> > @@ -367,6 +393,20 @@ struct cxl_memdev_state {
> >  	u64 active_persistent_bytes;
> >  	u64 next_volatile_bytes;
> >  	u64 next_persistent_bytes;
> > +
> > +	u8 nr_dc_region;
> > +
> > +	struct cxl_dc_region_info {
> > +		u8 name[CXL_DC_REGION_SRTLEN];
> > +		u64 base;
> > +		u64 decode_len;
> > +		u64 len;
> > +		u64 blk_size;
> > +		u32 dsmad_handle;
> > +		u8 flags;
> > +	} dc_region[CXL_MAX_DC_REGION];
> > +
> > +	size_t dc_event_log_size;
> >  	struct cxl_event_state event;
> >  	struct cxl_poison_state poison;
> >  	int (*mbox_send)(struct cxl_memdev_state *mds,
> > @@ -415,6 +455,10 @@ enum cxl_opcode {
> >  	CXL_MBOX_OP_UNLOCK		= 0x4503,
> >  	CXL_MBOX_OP_FREEZE_SECURITY	= 0x4504,
> >  	CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE	= 0x4505,
> > +	CXL_MBOX_OP_GET_DC_CONFIG	= 0x4800,
> > +	CXL_MBOX_OP_GET_DC_EXTENT_LIST	= 0x4801,
> > +	CXL_MBOX_OP_ADD_DC_RESPONSE	= 0x4802,
> > +	CXL_MBOX_OP_RELEASE_DC		= 0x4803,
> >  	CXL_MBOX_OP_MAX			= 0x10000
> >  };
> >  
> > @@ -462,6 +506,7 @@ struct cxl_mbox_identify {
> >  	__le16 inject_poison_limit;
> >  	u8 poison_caps;
> >  	u8 qos_telemetry_caps;
> > +	__le16 dc_event_log_size;
> >  } __packed;
> >  
> >  /*
> > @@ -617,7 +662,27 @@ struct cxl_mbox_set_partition_info {
> >  	u8 flags;
> >  } __packed;
> >  
> > +struct cxl_mbox_get_dc_config {
> > +	u8 region_count;
> > +	u8 start_region_index;
> > +} __packed;
> > +
> > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> > +struct cxl_mbox_dynamic_capacity {
> > +	u8 avail_region_count;
> > +	u8 rsvd[7];
> > +	struct cxl_dc_region_config {
> > +		__le64 region_base;
> > +		__le64 region_decode_length;
> > +		__le64 region_length;
> > +		__le64 region_block_size;
> > +		__le32 region_dsmad_handle;
> > +		u8 flags;
> > +		u8 rsvd[3];
> > +	} __packed region[];
> > +} __packed;
> >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
> >  
> >  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
> >  struct cxl_mbox_set_timestamp_in {
> > @@ -742,6 +807,7 @@ enum {
> >  int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
> >  			  struct cxl_mbox_cmd *cmd);
> >  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
> >  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
> >  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
> >  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 4e2845b7331a..ac1a41bc083d 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -742,6 +742,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	if (rc)
> >  		return rc;
> >  
> > +	rc = cxl_dev_dynamic_capacity_identify(mds);
> > +	if (rc)
> > +		return rc;
> > +
> >  	rc = cxl_mem_create_range_info(mds);
> >  	if (rc)
> >  		return rc;
> > 
> > -- 
> > 2.40.0
> > 
> 
> -- 
> Fan Ni <nifan@outlook.com>
nifan@outlook.com June 15, 2023, 9:41 p.m. UTC | #6
The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Read the Dynamic capacity configuration and store dynamic capacity region
> information in the device state which driver will use to map into the HDM
> ranges.
> 
> Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> ---
> [iweiny: ensure all mds->dc_region's are named]
> ---
>  drivers/cxl/core/mbox.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/cxl/cxlmem.h    |  70 +++++++++++++++++-
>  drivers/cxl/pci.c       |   4 +
>  3 files changed, 256 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 3ca0bf12c55f..c5b696737c87 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -111,6 +111,37 @@ static u8 security_command_sets[] = {
>  	0x46, /* Security Passthrough */
>  };
>  
> +static bool cxl_is_dcd_command(u16 opcode)
> +{
> +#define CXL_MBOX_OP_DCD_CMDS 0x48
> +
> +	if ((opcode >> 8) == CXL_MBOX_OP_DCD_CMDS)
> +		return true;
> +
> +	return false;
> +}
> +
> +static void cxl_set_dcd_cmd_enabled(struct cxl_memdev_state *mds,
> +					u16 opcode)
> +{
> +	switch (opcode) {
> +	case CXL_MBOX_OP_GET_DC_CONFIG:
> +		set_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_GET_DC_EXTENT_LIST:
> +		set_bit(CXL_DCD_ENABLED_GET_EXTENT_LIST, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_ADD_DC_RESPONSE:
> +		set_bit(CXL_DCD_ENABLED_ADD_RESPONSE, mds->dcd_cmds);
> +		break;
> +	case CXL_MBOX_OP_RELEASE_DC:
> +		set_bit(CXL_DCD_ENABLED_RELEASE, mds->dcd_cmds);
> +		break;
> +	default:
> +		break;
> +	}
> +}
> +
>  static bool cxl_is_security_command(u16 opcode)
>  {
>  	int i;
> @@ -666,6 +697,7 @@ static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
>  static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  {
>  	struct cxl_cel_entry *cel_entry;
> +	struct cxl_mem_command *cmd;
>  	const int cel_entries = size / sizeof(*cel_entry);
>  	struct device *dev = mds->cxlds.dev;
>  	int i;
> @@ -674,11 +706,12 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  
>  	for (i = 0; i < cel_entries; i++) {
>  		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
> -		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
> +		cmd = cxl_mem_find_command(opcode);
>  
> -		if (!cmd && !cxl_is_poison_command(opcode)) {
> -			dev_dbg(dev,
> -				"Opcode 0x%04x unsupported by driver\n", opcode);
> +		if (!cmd && !cxl_is_poison_command(opcode) &&
> +		    !cxl_is_dcd_command(opcode)) {
> +			dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n",
> +				opcode);
>  			continue;
>  		}
>  
> @@ -688,6 +721,9 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
>  		if (cxl_is_poison_command(opcode))
>  			cxl_set_poison_cmd_enabled(&mds->poison, opcode);
>  
> +		if (cxl_is_dcd_command(opcode))
> +			cxl_set_dcd_cmd_enabled(mds, opcode);
> +
>  		dev_dbg(dev, "Opcode 0x%04x enabled\n", opcode);
>  	}
>  }
> @@ -1059,7 +1095,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>  	if (rc < 0)
>  		return rc;
>  
> -	mds->total_bytes =
> +	mds->total_static_capacity =
>  		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
>  	mds->volatile_only_bytes =
>  		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
> @@ -1077,10 +1113,137 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds)
>  		mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
>  	}
>  
> +	mds->dc_event_log_size = le16_to_cpu(id.dc_event_log_size);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
>  
> +/**
> + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> + * information from the device.
> + * @mds: The memory device state
> + * Return: 0 if identify was executed successfully.
> + *
> + * This will dispatch the get_dynamic_capacity command to the device
> + * and on success populate structures to be exported to sysfs.
> + */
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> +{
> +	struct cxl_dev_state *cxlds = &mds->cxlds;
> +	struct device *dev = cxlds->dev;
> +	struct cxl_mbox_dynamic_capacity *dc;
> +	struct cxl_mbox_get_dc_config get_dc;
> +	struct cxl_mbox_cmd mbox_cmd;
> +	u64 next_dc_region_start;
> +	int rc, i;
> +
> +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> +		sprintf(mds->dc_region[i].name, "dc%d", i);
> +
> +	/* Check GET_DC_CONFIG is supported by device */
> +	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
> +		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
> +		return 0;
> +	}
> +
> +	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
> +	if (!dc)
> +		return -ENOMEM;
> +
> +	get_dc = (struct cxl_mbox_get_dc_config) {
> +		.region_count = CXL_MAX_DC_REGION,
> +		.start_region_index = 0,
> +	};
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> +		.payload_in = &get_dc,
> +		.size_in = sizeof(get_dc),
> +		.size_out = mds->payload_size,
> +		.payload_out = dc,
> +		.min_out = 1,
> +	};
> +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> +	if (rc < 0)
> +		goto dc_error;
> +
> +	mds->nr_dc_region = dc->avail_region_count;
> +
> +	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> +		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> +			mds->nr_dc_region);
> +		rc = -EINVAL;
> +		goto dc_error;
> +	}
> +
> +	for (i = 0; i < mds->nr_dc_region; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		dcr->base = le64_to_cpu(dc->region[i].region_base);
> +		dcr->decode_len =
> +			le64_to_cpu(dc->region[i].region_decode_length);
> +		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> +		dcr->len = le64_to_cpu(dc->region[i].region_length);
> +		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
> +
> +		/* Check regions are in increasing DPA order */
> +		if ((i + 1) < mds->nr_dc_region) {
> +			next_dc_region_start =
> +				le64_to_cpu(dc->region[i + 1].region_base);
> +			if ((dcr->base > next_dc_region_start) ||
> +			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {
> +				dev_err(dev,
> +					"DPA ordering violation for DC region %d and %d\n",
> +					i, i + 1);
> +				rc = -EINVAL;
> +				goto dc_error;
> +			}
> +		}
> +
> +		/* Check the region is 256 MB aligned */
> +		if (!IS_ALIGNED(dcr->base, SZ_256M)) {
> +			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		/* Check Region base and length are aligned to block size */
> +		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
> +		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> +			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
> +				dcr->blk_size);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		dcr->dsmad_handle =
> +			le32_to_cpu(dc->region[i].region_dsmad_handle);
> +		dcr->flags = dc->region[i].flags;
> +		sprintf(dcr->name, "dc%d", i);
> +
> +		dev_dbg(dev,
> +			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
> +			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> +	}
> +
> +	/*
> +	 * Calculate entire DPA range of all configured regions which will be mapped by
> +	 * one or more HDM decoders
> +	 */
> +	mds->total_dynamic_capacity =
> +		mds->dc_region[mds->nr_dc_region - 1].base +
> +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> +		mds->dc_region[0].base;
> +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> +		mds->total_dynamic_capacity);
> +
> +dc_error:
> +	kvfree(dc);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> +
>  static int add_dpa_res(struct device *dev, struct resource *parent,
>  		       struct resource *res, resource_size_t start,
>  		       resource_size_t size, const char *type)
> @@ -1112,6 +1275,11 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  	struct cxl_dev_state *cxlds = &mds->cxlds;
>  	struct device *dev = cxlds->dev;
>  	int rc;
> +	size_t untenanted_mem =
> +		mds->dc_region[0].base - mds->total_static_capacity;
> +
> +	mds->total_capacity = mds->total_static_capacity +
> +			untenanted_mem + mds->total_dynamic_capacity;
>  
>  	if (!cxlds->media_ready) {
>  		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
> @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  	}
>  
>  	cxlds->dpa_res =
> -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> +
> +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {

should it be i < mds->nr_dc_region???

> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> +				 dcr->base, dcr->decode_len, dcr->name);
> +		if (rc)
> +			return rc;
> +	}
>  
>  	if (mds->partition_align_bytes == 0) {
>  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
>  				 mds->volatile_only_bytes, "ram");
>  		if (rc)
>  			return rc;
> +
>  		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
>  				   mds->volatile_only_bytes,
>  				   mds->persistent_only_bytes, "pmem");
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 89e560ea14c0..9c0b2fa72bdd 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -239,6 +239,15 @@ struct cxl_event_state {
>  	struct mutex log_lock;
>  };
>  
> +/* Device enabled DCD commands */
> +enum dcd_cmd_enabled_bits {
> +	CXL_DCD_ENABLED_GET_CONFIG,
> +	CXL_DCD_ENABLED_GET_EXTENT_LIST,
> +	CXL_DCD_ENABLED_ADD_RESPONSE,
> +	CXL_DCD_ENABLED_RELEASE,
> +	CXL_DCD_ENABLED_MAX
> +};
> +
>  /* Device enabled poison commands */
>  enum poison_cmd_enabled_bits {
>  	CXL_POISON_ENABLED_LIST,
> @@ -284,6 +293,9 @@ enum cxl_devtype {
>  	CXL_DEVTYPE_CLASSMEM,
>  };
>  
> +#define CXL_MAX_DC_REGION 8
> +#define CXL_DC_REGION_SRTLEN 8
> +
>  /**
>   * struct cxl_dev_state - The driver device state
>   *
> @@ -300,6 +312,8 @@ enum cxl_devtype {
>   * @dpa_res: Overall DPA resource tree for the device
>   * @pmem_res: Active Persistent memory capacity configuration
>   * @ram_res: Active Volatile memory capacity configuration
> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> + *          region
>   * @component_reg_phys: register base of component registers
>   * @info: Cached DVSEC information about the device.
>   * @serial: PCIe Device Serial Number
> @@ -315,6 +329,7 @@ struct cxl_dev_state {
>  	struct resource dpa_res;
>  	struct resource pmem_res;
>  	struct resource ram_res;
> +	struct resource dc_res[CXL_MAX_DC_REGION];
>  	resource_size_t component_reg_phys;
>  	u64 serial;
>  	enum cxl_devtype type;
> @@ -334,9 +349,12 @@ struct cxl_dev_state {
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
>   * @mbox_mutex: Mutex to synchronize mailbox access.
>   * @firmware_version: Firmware version for the memory device.
> + * @dcd_cmds: List of DCD commands implemented by memory device
>   * @enabled_cmds: Hardware commands found enabled in CEL.
>   * @exclusive_cmds: Commands that are kernel-internal only
> - * @total_bytes: sum of all possible capacities
> + * @total_capacity: Sum of static and dynamic capacities
> + * @total_static_capacity: Sum of RAM and PMEM capacities
> + * @total_dynamic_capacity: Complete DPA range occupied by DC regions
>   * @volatile_only_bytes: hard volatile capacity
>   * @persistent_only_bytes: hard persistent capacity
>   * @partition_align_bytes: alignment size for partition-able capacity
> @@ -344,6 +362,10 @@ struct cxl_dev_state {
>   * @active_persistent_bytes: sum of hard + soft persistent
>   * @next_volatile_bytes: volatile capacity change pending device reset
>   * @next_persistent_bytes: persistent capacity change pending device reset
> + * @nr_dc_region: number of DC regions implemented in the memory device
> + * @dc_region: array containing info about the DC regions
> + * @dc_event_log_size: The number of events the device can store in the
> + * Dynamic Capacity Event Log before it overflows
>   * @event: event log driver state
>   * @poison: poison driver state info
>   * @mbox_send: @dev specific transport for transmitting mailbox commands
> @@ -357,9 +379,13 @@ struct cxl_memdev_state {
>  	size_t lsa_size;
>  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>  	char firmware_version[0x10];
> +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> -	u64 total_bytes;
> +

Remove blank line ?

> +	u64 total_capacity;
> +	u64 total_static_capacity;
> +	u64 total_dynamic_capacity;
>  	u64 volatile_only_bytes;
>  	u64 persistent_only_bytes;
>  	u64 partition_align_bytes;
> @@ -367,6 +393,20 @@ struct cxl_memdev_state {
>  	u64 active_persistent_bytes;
>  	u64 next_volatile_bytes;
>  	u64 next_persistent_bytes;
> +
> +	u8 nr_dc_region;
> +

Remove blank line?

Fan
> +	struct cxl_dc_region_info {
> +		u8 name[CXL_DC_REGION_SRTLEN];
> +		u64 base;
> +		u64 decode_len;
> +		u64 len;
> +		u64 blk_size;
> +		u32 dsmad_handle;
> +		u8 flags;
> +	} dc_region[CXL_MAX_DC_REGION];
> +
> +	size_t dc_event_log_size;
>  	struct cxl_event_state event;
>  	struct cxl_poison_state poison;
>  	int (*mbox_send)(struct cxl_memdev_state *mds,
> @@ -415,6 +455,10 @@ enum cxl_opcode {
>  	CXL_MBOX_OP_UNLOCK		= 0x4503,
>  	CXL_MBOX_OP_FREEZE_SECURITY	= 0x4504,
>  	CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE	= 0x4505,
> +	CXL_MBOX_OP_GET_DC_CONFIG	= 0x4800,
> +	CXL_MBOX_OP_GET_DC_EXTENT_LIST	= 0x4801,
> +	CXL_MBOX_OP_ADD_DC_RESPONSE	= 0x4802,
> +	CXL_MBOX_OP_RELEASE_DC		= 0x4803,
>  	CXL_MBOX_OP_MAX			= 0x10000
>  };
>  
> @@ -462,6 +506,7 @@ struct cxl_mbox_identify {
>  	__le16 inject_poison_limit;
>  	u8 poison_caps;
>  	u8 qos_telemetry_caps;
> +	__le16 dc_event_log_size;
>  } __packed;
>  
>  /*
> @@ -617,7 +662,27 @@ struct cxl_mbox_set_partition_info {
>  	u8 flags;
>  } __packed;
>  
> +struct cxl_mbox_get_dc_config {
> +	u8 region_count;
> +	u8 start_region_index;
> +} __packed;
> +
> +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> +struct cxl_mbox_dynamic_capacity {
> +	u8 avail_region_count;
> +	u8 rsvd[7];
> +	struct cxl_dc_region_config {
> +		__le64 region_base;
> +		__le64 region_decode_length;
> +		__le64 region_length;
> +		__le64 region_block_size;
> +		__le32 region_dsmad_handle;
> +		u8 flags;
> +		u8 rsvd[3];
> +	} __packed region[];
> +} __packed;
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
>  
>  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
>  struct cxl_mbox_set_timestamp_in {
> @@ -742,6 +807,7 @@ enum {
>  int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
>  			  struct cxl_mbox_cmd *cmd);
>  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
>  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
>  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
>  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 4e2845b7331a..ac1a41bc083d 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -742,6 +742,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	rc = cxl_dev_dynamic_capacity_identify(mds);
> +	if (rc)
> +		return rc;
> +
>  	rc = cxl_mem_create_range_info(mds);
>  	if (rc)
>  		return rc;
> 
> -- 
> 2.40.0
>
Ira Weiny June 15, 2023, 10:46 p.m. UTC | #7
Alison Schofield wrote:
> On Wed, Jun 14, 2023 at 12:16:28PM -0700, Ira Weiny wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > Read the Dynamic capacity configuration and store dynamic capacity region
> > information in the device state which driver will use to map into the HDM
> > ranges.
> > 
> > Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> > command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > 
> > ---
> > [iweiny: ensure all mds->dc_region's are named]
> > ---

[snip]

> > @@ -666,6 +697,7 @@ static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
> >  static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
> >  {
> >  	struct cxl_cel_entry *cel_entry;
> > +	struct cxl_mem_command *cmd;
> >  	const int cel_entries = size / sizeof(*cel_entry);
> >  	struct device *dev = mds->cxlds.dev;
> >  	int i;
> > @@ -674,11 +706,12 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
> >  
> >  	for (i = 0; i < cel_entries; i++) {
> >  		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
> > -		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
> > +		cmd = cxl_mem_find_command(opcode);
> 
> Is the move of the 'cmd' define related to this patch?
> Checkpatch warns on it: WARNING: Missing a blank line after declarations

That seems unneeded.  Perhaps left over from a previous version.  I've
moved it back.

> 
> >  
> > -		if (!cmd && !cxl_is_poison_command(opcode)) {
> > -			dev_dbg(dev,
> > -				"Opcode 0x%04x unsupported by driver\n", opcode);
> > +		if (!cmd && !cxl_is_poison_command(opcode) &&
> > +		    !cxl_is_dcd_command(opcode)) {
> > +			dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n",
> > +				opcode);
> >  			continue;
> >  		}
> >  

[snip]

> > +
> > +	/*
> > +	 * Calculate entire DPA range of all configured regions which will be mapped by
> > +	 * one or more HDM decoders
> > +	 */
> 
> Comment is needlessly going >80 chars.

My checkpatch script is set to 100 lines due to the recent change.  But
this line length is unneeded here.  Thanks for noticing.

Fixed.

> 
> 
> > +	mds->total_dynamic_capacity =
> > +		mds->dc_region[mds->nr_dc_region - 1].base +
> > +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> > +		mds->dc_region[0].base;
> > +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> > +		mds->total_dynamic_capacity);
> > +
> > +dc_error:
> > +	kvfree(dc);
> > +	return rc;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> > +
> >  static int add_dpa_res(struct device *dev, struct resource *parent,
> >  		       struct resource *res, resource_size_t start,
> >  		       resource_size_t size, const char *type)
> > @@ -1112,6 +1275,11 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> >  	struct cxl_dev_state *cxlds = &mds->cxlds;
> >  	struct device *dev = cxlds->dev;
> >  	int rc;
> > +	size_t untenanted_mem =
> > +		mds->dc_region[0].base - mds->total_static_capacity;
> 
> Perhaps:
> 	size_t untenanted_mem;  (and put that in reverse x-tree order)
> 
> 	untenanted_mem = mds->dc_region[0].base - mds->total_static_capacity;

That looks good, fixed.

> 
> > +
> > +	mds->total_capacity = mds->total_static_capacity +
> > +			untenanted_mem + mds->total_dynamic_capacity;
> >  
> 
> Also, looking at this first patch with the long names, wondering if
> there is an opportunity to (re-)define these fields in fewers chars.
> Do we have to describe with 'total'? Is there a partial?
> 
> I guess I'll get to the defines further down...
> 
> 

[snip]

> > @@ -334,9 +349,12 @@ struct cxl_dev_state {
> >   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
> >   * @mbox_mutex: Mutex to synchronize mailbox access.
> >   * @firmware_version: Firmware version for the memory device.
> > + * @dcd_cmds: List of DCD commands implemented by memory device
> >   * @enabled_cmds: Hardware commands found enabled in CEL.
> >   * @exclusive_cmds: Commands that are kernel-internal only
> > - * @total_bytes: sum of all possible capacities
> > + * @total_capacity: Sum of static and dynamic capacities
> > + * @total_static_capacity: Sum of RAM and PMEM capacities
> > + * @total_dynamic_capacity: Complete DPA range occupied by DC regions
> >   * @volatile_only_bytes: hard volatile capacity
> >   * @persistent_only_bytes: hard persistent capacity
> >   * @partition_align_bytes: alignment size for partition-able capacity
> > @@ -344,6 +362,10 @@ struct cxl_dev_state {
> >   * @active_persistent_bytes: sum of hard + soft persistent
> >   * @next_volatile_bytes: volatile capacity change pending device reset
> >   * @next_persistent_bytes: persistent capacity change pending device reset
> > + * @nr_dc_region: number of DC regions implemented in the memory device
> > + * @dc_region: array containing info about the DC regions
> > + * @dc_event_log_size: The number of events the device can store in the
> > + * Dynamic Capacity Event Log before it overflows
> >   * @event: event log driver state
> >   * @poison: poison driver state info
> >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> > @@ -357,9 +379,13 @@ struct cxl_memdev_state {
> >  	size_t lsa_size;
> >  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
> >  	char firmware_version[0x10];
> > +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
> >  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> >  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> > -	u64 total_bytes;
> > +
> > +	u64 total_capacity;
> > +	u64 total_static_capacity;
> > +	u64 total_dynamic_capacity;
> 
> maybe cap, static_cap, dynamic_cap

Since these are new it is probably better to use shorter names.  Also we
have good kdocs for each above.

> 
> (because I think I had a hand in defining the long names that
> follow and deeply regret it ;))

Well no one made a comment to correct them.  So remember:  "There is no
crying in CXL!"

[snip]

> > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> > +struct cxl_mbox_dynamic_capacity {
> > +	u8 avail_region_count;
> > +	u8 rsvd[7];
> > +	struct cxl_dc_region_config {
> > +		__le64 region_base;
> > +		__le64 region_decode_length;
> > +		__le64 region_length;
> > +		__le64 region_block_size;
> > +		__le32 region_dsmad_handle;
> > +		u8 flags;
> > +		u8 rsvd[3];
> > +	} __packed region[];
> > +} __packed;
> >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> 
> This ^ goes with the cxl_mbox_set_partition_info above.
> Please don't split.

Oh yea that is bad.  Fixed.

Thanks for looking,
Ira
Jonathan Cameron June 22, 2023, 3:58 p.m. UTC | #8
On Wed, 14 Jun 2023 12:16:28 -0700
ira.weiny@intel.com wrote:

> From: Navneet Singh <navneet.singh@intel.com>
> 
> Read the Dynamic capacity configuration and store dynamic capacity region
> information in the device state which driver will use to map into the HDM
> ranges.
> 
> Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>

Hi Ira / Navneet,

I'll probably overlap with comments of others (good to see so much review!)
so feel free to ignore duplication.

Comments inline,

Jonathan



> +/**
> + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> + * information from the device.
> + * @mds: The memory device state
> + * Return: 0 if identify was executed successfully.
> + *
> + * This will dispatch the get_dynamic_capacity command to the device
> + * and on success populate structures to be exported to sysfs.
> + */
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> +{
> +	struct cxl_dev_state *cxlds = &mds->cxlds;
> +	struct device *dev = cxlds->dev;
> +	struct cxl_mbox_dynamic_capacity *dc;

Calling it dc is confusing.  I'd make it clear this is the mailbox
response. config_resp or dc_config_res.

> +	struct cxl_mbox_get_dc_config get_dc;
> +	struct cxl_mbox_cmd mbox_cmd;
> +	u64 next_dc_region_start;
> +	int rc, i;
> +
> +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> +		sprintf(mds->dc_region[i].name, "dc%d", i);
> +
> +	/* Check GET_DC_CONFIG is supported by device */
> +	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
> +		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
> +		return 0;
> +	}
> +
> +	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
> +	if (!dc)
> +		return -ENOMEM;

Response to CXL_MBOX_OP_GET_DC_CONFIG has a known maximum
size. Can we provide that instead of potentially much larger?

8 + 0x28 * 8 I think so 328 bytes. Use struct_size()


But fun corner.... Mailbox is allowed to be smaller than that (256 bytes min
I think) so need to handle multiple reads with different start regions.
Which reminds me that we need to add support for running out of space
in the mailbox to qemu... So far we've just made sure everything fitted :)


> +
> +	get_dc = (struct cxl_mbox_get_dc_config) {
> +		.region_count = CXL_MAX_DC_REGION,
> +		.start_region_index = 0,
> +	};
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> +		.payload_in = &get_dc,
> +		.size_in = sizeof(get_dc),
> +		.size_out = mds->payload_size,
> +		.payload_out = dc,
> +		.min_out = 1,
> +	};
> +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> +	if (rc < 0)
> +		goto dc_error;
The error label is a bit too generic.  Why dc_error?
"error" conveys just as little info.  I'd go for goto free_resp;

> +
> +	mds->nr_dc_region = dc->avail_region_count;
> +
> +	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> +		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> +			mds->nr_dc_region);
> +		rc = -EINVAL;
> +		goto dc_error;
> +	}
> +
> +	for (i = 0; i < mds->nr_dc_region; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		dcr->base = le64_to_cpu(dc->region[i].region_base);
> +		dcr->decode_len =
> +			le64_to_cpu(dc->region[i].region_decode_length);
> +		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> +		dcr->len = le64_to_cpu(dc->region[i].region_length);
> +		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
> +
> +		/* Check regions are in increasing DPA order */
> +		if ((i + 1) < mds->nr_dc_region) {

Feels a bit odd to look at entries we haven't seen yet.  Maybe flip this around
to check the ones we have looked at?  So don't start until 2nd region and then check
it's start against mds->dc_region[0] etc?
Or factor out this loop contents in general and just pass in the single
value needed for checking this. Biggest advantage being direct returns in
that function as allocation and free will be in caller.


> +			next_dc_region_start =
> +				le64_to_cpu(dc->region[i + 1].region_base);
> +			if ((dcr->base > next_dc_region_start) ||
> +			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {

Unless you have a negative decode length the second condition includes the first.
So just check that.

> +				dev_err(dev,
> +					"DPA ordering violation for DC region %d and %d\n",
> +					i, i + 1);
> +				rc = -EINVAL;
> +				goto dc_error;
> +			}
> +		}
> +
> +		/* Check the region is 256 MB aligned */
> +		if (!IS_ALIGNED(dcr->base, SZ_256M)) {

That's an oddity. I wonder why those lower bits were defined as reserved...
Anyhow code is right if paranoid ;)

> +			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		/* Check Region base and length are aligned to block size */
> +		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
> +		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> +			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
> +				dcr->blk_size);
> +			rc = -EINVAL;
> +			goto dc_error;
> +		}
> +
> +		dcr->dsmad_handle =
> +			le32_to_cpu(dc->region[i].region_dsmad_handle);
> +		dcr->flags = dc->region[i].flags;

I'd just grab these at same time as all the other fields above.
A pattern where you fill values in only after checking would be fine, or one
where you fill them in all in one place. The mixture of the two is less clear
than either consistent approach.

> +		sprintf(dcr->name, "dc%d", i);
> +
> +		dev_dbg(dev,
> +			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
> +			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> +	}
> +
> +	/*
> +	 * Calculate entire DPA range of all configured regions which will be mapped by
> +	 * one or more HDM decoders
> +	 */
> +	mds->total_dynamic_capacity =
> +		mds->dc_region[mds->nr_dc_region - 1].base +
> +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> +		mds->dc_region[0].base;
> +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> +		mds->total_dynamic_capacity);
> +
> +dc_error:
> +	kvfree(dc);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> +



> @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
>  	}
>  
>  	cxlds->dpa_res =
> -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> +
> +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
> +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> +
> +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> +				 dcr->base, dcr->decode_len, dcr->name);
> +		if (rc)
> +			return rc;
> +	}
>  
>  	if (mds->partition_align_bytes == 0) {
>  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
>  				 mds->volatile_only_bytes, "ram");
>  		if (rc)
>  			return rc;
> +

Scrub for this stuff before posting v2. Just noise that slows down review
a little.  If it is worth doing, do it in a separate patch.

>  		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
>  				   mds->volatile_only_bytes,
>  				   mds->persistent_only_bytes, "pmem");
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 89e560ea14c0..9c0b2fa72bdd 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
>
>  
> +#define CXL_MAX_DC_REGION 8
> +#define CXL_DC_REGION_SRTLEN 8

SRT? 

> +
>  /**
>   * struct cxl_dev_state - The driver device state
>   *
> @@ -300,6 +312,8 @@ enum cxl_devtype {
>   * @dpa_res: Overall DPA resource tree for the device
>   * @pmem_res: Active Persistent memory capacity configuration
>   * @ram_res: Active Volatile memory capacity configuration
> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> + *          region
>   * @component_reg_phys: register base of component registers
>   * @info: Cached DVSEC information about the device.
>   * @serial: PCIe Device Serial Number
> @@ -315,6 +329,7 @@ struct cxl_dev_state {
>  	struct resource dpa_res;
>  	struct resource pmem_res;
>  	struct resource ram_res;
> +	struct resource dc_res[CXL_MAX_DC_REGION];
>  	resource_size_t component_reg_phys;
>  	u64 serial;
>  	enum cxl_devtype type;

...

> @@ -357,9 +379,13 @@ struct cxl_memdev_state {
>  	size_t lsa_size;
>  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>  	char firmware_version[0x10];
> +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> -	u64 total_bytes;
> +
> +	u64 total_capacity;
> +	u64 total_static_capacity;
> +	u64 total_dynamic_capacity;
>  	u64 volatile_only_bytes;
>  	u64 persistent_only_bytes;
>  	u64 partition_align_bytes;
> @@ -367,6 +393,20 @@ struct cxl_memdev_state {
>  	u64 active_persistent_bytes;
>  	u64 next_volatile_bytes;
>  	u64 next_persistent_bytes;
> +
> +	u8 nr_dc_region;
> +
> +	struct cxl_dc_region_info {
> +		u8 name[CXL_DC_REGION_SRTLEN];

char? SRT?  Also isn't it a bit big? Looks like max 4 chars to me.
Put it next to flags and we can save some space.

> +		u64 base;
> +		u64 decode_len;
> +		u64 len;
> +		u64 blk_size;
> +		u32 dsmad_handle;
> +		u8 flags;
> +	} dc_region[CXL_MAX_DC_REGION];
> +
> +	size_t dc_event_log_size;

>  /*
> @@ -617,7 +662,27 @@ struct cxl_mbox_set_partition_info {
>  	u8 flags;
>  } __packed;
>  
> +struct cxl_mbox_get_dc_config {
> +	u8 region_count;
> +	u8 start_region_index;
> +} __packed;
> +
> +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> +struct cxl_mbox_dynamic_capacity {
> +	u8 avail_region_count;
> +	u8 rsvd[7];
> +	struct cxl_dc_region_config {
> +		__le64 region_base;
> +		__le64 region_decode_length;
> +		__le64 region_length;
> +		__le64 region_block_size;
> +		__le32 region_dsmad_handle;
> +		u8 flags;
> +		u8 rsvd[3];
> +	} __packed region[];
> +} __packed;
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)

This looks to have merged oddly with existing changes. I'd
move the define into the structure definition so it's clear which
flag it reflects and avoids this sort of interleaving in future.

> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
>
Ira Weiny June 24, 2023, 1:08 p.m. UTC | #9
Jonathan Cameron wrote:
> On Wed, 14 Jun 2023 12:16:28 -0700
> ira.weiny@intel.com wrote:
> 
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > Read the Dynamic capacity configuration and store dynamic capacity region
> > information in the device state which driver will use to map into the HDM
> > ranges.
> > 
> > Implement Get Dynamic Capacity Configuration (opcode 4800h) mailbox
> > command as specified in CXL 3.0 spec section 8.2.9.8.9.1.
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> Hi Ira / Navneet,
> 
> I'll probably overlap with comments of others (good to see so much review!)

Indeed!  Thanks!

> so feel free to ignore duplication.
> 
> Comments inline,
> 
> Jonathan
> 
> 
> 
> > +/**
> > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
> > + * information from the device.
> > + * @mds: The memory device state
> > + * Return: 0 if identify was executed successfully.
> > + *
> > + * This will dispatch the get_dynamic_capacity command to the device
> > + * and on success populate structures to be exported to sysfs.
> > + */
> > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> > +{
> > +	struct cxl_dev_state *cxlds = &mds->cxlds;
> > +	struct device *dev = cxlds->dev;
> > +	struct cxl_mbox_dynamic_capacity *dc;
> 
> Calling it dc is confusing.  I'd make it clear this is the mailbox
> response. config_resp or dc_config_res.

How about dc_resp?

> 
> > +	struct cxl_mbox_get_dc_config get_dc;
> > +	struct cxl_mbox_cmd mbox_cmd;
> > +	u64 next_dc_region_start;
> > +	int rc, i;
> > +
> > +	for (i = 0; i < CXL_MAX_DC_REGION; i++)
> > +		sprintf(mds->dc_region[i].name, "dc%d", i);
> > +
> > +	/* Check GET_DC_CONFIG is supported by device */
> > +	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
> > +		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
> > +		return 0;
> > +	}
> > +
> > +	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
> > +	if (!dc)
> > +		return -ENOMEM;
> 
> Response to CXL_MBOX_OP_GET_DC_CONFIG has a known maximum
> size. Can we provide that instead of potentially much larger?
> 
> 8 + 0x28 * 8 I think so 328 bytes. Use struct_size()

Actually yea and just putting that on the stack might also be better.

> 
> 
> But fun corner.... Mailbox is allowed to be smaller than that (256 bytes min
> I think) so need to handle multiple reads with different start regions.

Oh bother.  :-/

What are the chances a device is going to only support 256B and DC?  I think
you are correct though.  I'll add a loop to handle this possibility.

Anyway I've adjusted the algorithm...  Hopefully it will just loop 1 time.

> Which reminds me that we need to add support for running out of space
> in the mailbox to qemu... So far we've just made sure everything fitted :)

Might be nice to test stuff.

> 
> 
> > +
> > +	get_dc = (struct cxl_mbox_get_dc_config) {
> > +		.region_count = CXL_MAX_DC_REGION,
> > +		.start_region_index = 0,
> > +	};
> > +
> > +	mbox_cmd = (struct cxl_mbox_cmd) {
> > +		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
> > +		.payload_in = &get_dc,
> > +		.size_in = sizeof(get_dc),
> > +		.size_out = mds->payload_size,
> > +		.payload_out = dc,
> > +		.min_out = 1,
> > +	};
> > +	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> > +	if (rc < 0)
> > +		goto dc_error;
> The error label is a bit too generic.  Why dc_error?
> "error" conveys just as small amount of info.  I'd got for goto free_resp;

Sure.  But if we make dc on the stack then we get rid of this entirely.  I
prefer that.

> 
> > +
> > +	mds->nr_dc_region = dc->avail_region_count;
> > +
> > +	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
> > +		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
> > +			mds->nr_dc_region);
> > +		rc = -EINVAL;
> > +		goto dc_error;
> > +	}
> > +
> > +	for (i = 0; i < mds->nr_dc_region; i++) {
> > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +
> > +		dcr->base = le64_to_cpu(dc->region[i].region_base);
> > +		dcr->decode_len =
> > +			le64_to_cpu(dc->region[i].region_decode_length);
> > +		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
> > +		dcr->len = le64_to_cpu(dc->region[i].region_length);
> > +		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
> > +
> > +		/* Check regions are in increasing DPA order */
> > +		if ((i + 1) < mds->nr_dc_region) {
> 
> Feels a bit odd to look at entries we haven't seen yet.  Maybe flip this around
> to check the ones we have looked at?

Totally agree.  I already did that.

> So don't start until 2nd region and then check
> it's start against mds->dc_region[0] etc?

Yep!  It makes the check easier...

> Or factor out this loop contents in general and just pass in the single
> value needed for checking this. Biggest advantage being direct returns in
> that function as allocation and free will be in caller.

Did that too.  I did not like how big cxl_dev_dynamic_capacity_identify() was
getting, especially with possibility of having to loop for a 2nd time.

> 
> 
> > +			next_dc_region_start =
> > +				le64_to_cpu(dc->region[i + 1].region_base);
> > +			if ((dcr->base > next_dc_region_start) ||
> > +			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {
> 
> Unless you have a negative decode length the second condition includes the first.
> So just check that.

... exactly!

> 
> > +				dev_err(dev,
> > +					"DPA ordering violation for DC region %d and %d\n",
> > +					i, i + 1);
> > +				rc = -EINVAL;
> > +				goto dc_error;
> > +			}
> > +		}
> > +
> > +		/* Check the region is 256 MB aligned */
> > +		if (!IS_ALIGNED(dcr->base, SZ_256M)) {
> 
> That's an oddity. I wonder why those lower bits where defined as reserved...
> Anyhow code is right if paranoid ;)

:shrug:

> 
> > +			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
> > +			rc = -EINVAL;
> > +			goto dc_error;
> > +		}
> > +
> > +		/* Check Region base and length are aligned to block size */
> > +		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
> > +		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
> > +			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
> > +				dcr->blk_size);
> > +			rc = -EINVAL;
> > +			goto dc_error;
> > +		}
> > +
> > +		dcr->dsmad_handle =
> > +			le32_to_cpu(dc->region[i].region_dsmad_handle);
> > +		dcr->flags = dc->region[i].flags;
> 
> I'd just grab these at same time as all the other fields above.
> A pattern where you fill values in only after checking would be fine, or one
> where you fill them in all in one place. The mixture of the two is less clear
> than either consistent approach.

Ok, yea this did seem odd but I kind of ignored it.  Done.

> 
> > +		sprintf(dcr->name, "dc%d", i);

I may take this out too now that we always set the name regardless of if the
region is available.

Although...  I wonder if setting the name to something like '<nil>' by default
would be beneficial in some way?  :-/

> > +
> > +		dev_dbg(dev,
> > +			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
> > +			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
> > +	}
> > +
> > +	/*
> > +	 * Calculate entire DPA range of all configured regions which will be mapped by
> > +	 * one or more HDM decoders
> > +	 */
> > +	mds->total_dynamic_capacity =
> > +		mds->dc_region[mds->nr_dc_region - 1].base +
> > +		mds->dc_region[mds->nr_dc_region - 1].decode_len -
> > +		mds->dc_region[0].base;
> > +	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
> > +		mds->total_dynamic_capacity);
> > +
> > +dc_error:
> > +	kvfree(dc);
> > +	return rc;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
> > +
> 
> 
> 
> > @@ -1121,13 +1289,23 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
> >  	}
> >  
> >  	cxlds->dpa_res =
> > -		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
> > +		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
> > +
> > +	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
> > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +
> > +		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
> > +				 dcr->base, dcr->decode_len, dcr->name);
> > +		if (rc)
> > +			return rc;
> > +	}
> >  
> >  	if (mds->partition_align_bytes == 0) {
> >  		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
> >  				 mds->volatile_only_bytes, "ram");
> >  		if (rc)
> >  			return rc;
> > +
> 
> Scrub for this stuff before posting v2. Just noise that slows down review
> a little.  If it is worth doing, do it in a separate patch.

I believe Alison or Dave caught that already.

> 
> >  		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
> >  				   mds->volatile_only_bytes,
> >  				   mds->persistent_only_bytes, "pmem");
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 89e560ea14c0..9c0b2fa72bdd 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> >
> >  
> > +#define CXL_MAX_DC_REGION 8
> > +#define CXL_DC_REGION_SRTLEN 8
> 
> SRT? 

LOL oh yea 'STR'...  :-D

> 
> > +
> >  /**
> >   * struct cxl_dev_state - The driver device state
> >   *
> > @@ -300,6 +312,8 @@ enum cxl_devtype {
> >   * @dpa_res: Overall DPA resource tree for the device
> >   * @pmem_res: Active Persistent memory capacity configuration
> >   * @ram_res: Active Volatile memory capacity configuration
> > + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> > + *          region
> >   * @component_reg_phys: register base of component registers
> >   * @info: Cached DVSEC information about the device.
> >   * @serial: PCIe Device Serial Number
> > @@ -315,6 +329,7 @@ struct cxl_dev_state {
> >  	struct resource dpa_res;
> >  	struct resource pmem_res;
> >  	struct resource ram_res;
> > +	struct resource dc_res[CXL_MAX_DC_REGION];
> >  	resource_size_t component_reg_phys;
> >  	u64 serial;
> >  	enum cxl_devtype type;
> 
> ...
> 
> > @@ -357,9 +379,13 @@ struct cxl_memdev_state {
> >  	size_t lsa_size;
> >  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
> >  	char firmware_version[0x10];
> > +	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
> >  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> >  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> > -	u64 total_bytes;
> > +
> > +	u64 total_capacity;
> > +	u64 total_static_capacity;
> > +	u64 total_dynamic_capacity;
> >  	u64 volatile_only_bytes;
> >  	u64 persistent_only_bytes;
> >  	u64 partition_align_bytes;
> > @@ -367,6 +393,20 @@ struct cxl_memdev_state {
> >  	u64 active_persistent_bytes;
> >  	u64 next_volatile_bytes;
> >  	u64 next_persistent_bytes;
> > +
> > +	u8 nr_dc_region;
> > +
> > +	struct cxl_dc_region_info {
> > +		u8 name[CXL_DC_REGION_SRTLEN];
> 
> char? SRT?  Also isn't it a bit big? Looks like max 4 chars to me.
> Put it next to flags and we can save some space.

Well if I go forward with the idea of having them named something like '<nil>'
this would need to be longer than 4.

> 
> > +		u64 base;
> > +		u64 decode_len;
> > +		u64 len;
> > +		u64 blk_size;
> > +		u32 dsmad_handle;
> > +		u8 flags;
> > +	} dc_region[CXL_MAX_DC_REGION];
> > +
> > +	size_t dc_event_log_size;
> 
> >  /*
> > @@ -617,7 +662,27 @@ struct cxl_mbox_set_partition_info {
> >  	u8 flags;
> >  } __packed;
> >  
> > +struct cxl_mbox_get_dc_config {
> > +	u8 region_count;
> > +	u8 start_region_index;
> > +} __packed;
> > +
> > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */
> > +struct cxl_mbox_dynamic_capacity {
> > +	u8 avail_region_count;
> > +	u8 rsvd[7];
> > +	struct cxl_dc_region_config {
> > +		__le64 region_base;
> > +		__le64 region_decode_length;
> > +		__le64 region_length;
> > +		__le64 region_block_size;
> > +		__le32 region_dsmad_handle;
> > +		u8 flags;
> > +		u8 rsvd[3];
> > +	} __packed region[];
> > +} __packed;
> >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> 
> This looks to have merged oddly with existing changes. I'd
> move the define into the structure definition so ti's clear which
> flag it reflects and avoids this sort of interleaving in future.

That has not been done anywhere in this file.  I think in this case it was just
a mistake to separate the partition define from the partition structure.  We
already fixed that based on Alison's feedback.

What I did do is move CXL_DC_REGION_STRLEN next to cxl_dc_region_info and made
it 7 chars to compact the structure a bit.  I think having that define next to
the structure helps to show why the odd length.

While we are at it I'm changing the sprintf's to snprintf's.  We are not in a
fast path and I'm paranoid now.

Ira
Jonathan Cameron July 3, 2023, 2:29 a.m. UTC | #10
#> > > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
> > > +{
> > > +	struct cxl_dev_state *cxlds = &mds->cxlds;
> > > +	struct device *dev = cxlds->dev;
> > > +	struct cxl_mbox_dynamic_capacity *dc;  
> > 
> > Calling it dc is confusing.  I'd make it clear this is the mailbox
> > response. config_resp or dc_config_res.  
> 
> How about dc_resp?

That works for me as well.

...

> > 
> > 
> > But fun corner.... Mailbox is allowed to be smaller than that (256 bytes min
> > I think) so need to handle multiple reads with different start regions.  
> 
> Oh bother.  :-/
> 
> What are the chances a device is going to only support 256B and DC?  I think
> you are correct though.  I'll add a loop to handle this possibility.
> 
> Anyway I've adjusted the algorithm...  Hopefully it will just loop 1 time.
> 
> > Which reminds me that we need to add support for running out of space
> > in the mailbox to qemu... So far we've just made sure everything fitted :)  
> 
> Might be nice to test stuff.
>

Hmm. I'll try not to forget about it this time, unlike the previous 10 times
I've thought we should fix that :)
diff mbox series

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 3ca0bf12c55f..c5b696737c87 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -111,6 +111,37 @@  static u8 security_command_sets[] = {
 	0x46, /* Security Passthrough */
 };
 
+static bool cxl_is_dcd_command(u16 opcode)
+{
+#define CXL_MBOX_OP_DCD_CMDS 0x48
+
+	if ((opcode >> 8) == CXL_MBOX_OP_DCD_CMDS)
+		return true;
+
+	return false;
+}
+
+static void cxl_set_dcd_cmd_enabled(struct cxl_memdev_state *mds,
+					u16 opcode)
+{
+	switch (opcode) {
+	case CXL_MBOX_OP_GET_DC_CONFIG:
+		set_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
+		break;
+	case CXL_MBOX_OP_GET_DC_EXTENT_LIST:
+		set_bit(CXL_DCD_ENABLED_GET_EXTENT_LIST, mds->dcd_cmds);
+		break;
+	case CXL_MBOX_OP_ADD_DC_RESPONSE:
+		set_bit(CXL_DCD_ENABLED_ADD_RESPONSE, mds->dcd_cmds);
+		break;
+	case CXL_MBOX_OP_RELEASE_DC:
+		set_bit(CXL_DCD_ENABLED_RELEASE, mds->dcd_cmds);
+		break;
+	default:
+		break;
+	}
+}
+
 static bool cxl_is_security_command(u16 opcode)
 {
 	int i;
@@ -666,6 +697,7 @@  static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
 static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
 {
 	struct cxl_cel_entry *cel_entry;
+	struct cxl_mem_command *cmd;
 	const int cel_entries = size / sizeof(*cel_entry);
 	struct device *dev = mds->cxlds.dev;
 	int i;
@@ -674,11 +706,12 @@  static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
 
 	for (i = 0; i < cel_entries; i++) {
 		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
-		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
+		cmd = cxl_mem_find_command(opcode);
 
-		if (!cmd && !cxl_is_poison_command(opcode)) {
-			dev_dbg(dev,
-				"Opcode 0x%04x unsupported by driver\n", opcode);
+		if (!cmd && !cxl_is_poison_command(opcode) &&
+		    !cxl_is_dcd_command(opcode)) {
+			dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n",
+				opcode);
 			continue;
 		}
 
@@ -688,6 +721,9 @@  static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
 		if (cxl_is_poison_command(opcode))
 			cxl_set_poison_cmd_enabled(&mds->poison, opcode);
 
+		if (cxl_is_dcd_command(opcode))
+			cxl_set_dcd_cmd_enabled(mds, opcode);
+
 		dev_dbg(dev, "Opcode 0x%04x enabled\n", opcode);
 	}
 }
@@ -1059,7 +1095,7 @@  int cxl_dev_state_identify(struct cxl_memdev_state *mds)
 	if (rc < 0)
 		return rc;
 
-	mds->total_bytes =
+	mds->total_static_capacity =
 		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
 	mds->volatile_only_bytes =
 		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
@@ -1077,10 +1113,137 @@  int cxl_dev_state_identify(struct cxl_memdev_state *mds)
 		mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
 	}
 
+	mds->dc_event_log_size = le16_to_cpu(id.dc_event_log_size);
+
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
 
+/**
+ * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
+ * information from the device.
+ * @mds: The memory device state
+ * Return: 0 if identify was executed successfully.
+ *
+ * This will dispatch the get_dynamic_capacity command to the device
+ * and on success populate structures to be exported to sysfs.
+ */
+int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
+{
+	struct cxl_dev_state *cxlds = &mds->cxlds;
+	struct device *dev = cxlds->dev;
+	struct cxl_mbox_dynamic_capacity *dc;
+	struct cxl_mbox_get_dc_config get_dc;
+	struct cxl_mbox_cmd mbox_cmd;
+	u64 next_dc_region_start;
+	int rc, i;
+
+	for (i = 0; i < CXL_MAX_DC_REGION; i++)
+		sprintf(mds->dc_region[i].name, "dc%d", i);
+
+	/* Check GET_DC_CONFIG is supported by device */
+	if (!test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds)) {
+		dev_dbg(dev, "unsupported cmd: get_dynamic_capacity_config\n");
+		return 0;
+	}
+
+	dc = kvmalloc(mds->payload_size, GFP_KERNEL);
+	if (!dc)
+		return -ENOMEM;
+
+	get_dc = (struct cxl_mbox_get_dc_config) {
+		.region_count = CXL_MAX_DC_REGION,
+		.start_region_index = 0,
+	};
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
+		.payload_in = &get_dc,
+		.size_in = sizeof(get_dc),
+		.size_out = mds->payload_size,
+		.payload_out = dc,
+		.min_out = 1,
+	};
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc < 0)
+		goto dc_error;
+
+	mds->nr_dc_region = dc->avail_region_count;
+
+	if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
+		dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
+			mds->nr_dc_region);
+		rc = -EINVAL;
+		goto dc_error;
+	}
+
+	for (i = 0; i < mds->nr_dc_region; i++) {
+		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
+
+		dcr->base = le64_to_cpu(dc->region[i].region_base);
+		dcr->decode_len =
+			le64_to_cpu(dc->region[i].region_decode_length);
+		dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
+		dcr->len = le64_to_cpu(dc->region[i].region_length);
+		dcr->blk_size = le64_to_cpu(dc->region[i].region_block_size);
+
+		/* Check regions are in increasing DPA order */
+		if ((i + 1) < mds->nr_dc_region) {
+			next_dc_region_start =
+				le64_to_cpu(dc->region[i + 1].region_base);
+			if ((dcr->base > next_dc_region_start) ||
+			    ((dcr->base + dcr->decode_len) > next_dc_region_start)) {
+				dev_err(dev,
+					"DPA ordering violation for DC region %d and %d\n",
+					i, i + 1);
+				rc = -EINVAL;
+				goto dc_error;
+			}
+		}
+
+		/* Check the region is 256 MB aligned */
+		if (!IS_ALIGNED(dcr->base, SZ_256M)) {
+			dev_err(dev, "DC region %d not aligned to 256MB\n", i);
+			rc = -EINVAL;
+			goto dc_error;
+		}
+
+		/* Check Region base and length are aligned to block size */
+		if (!IS_ALIGNED(dcr->base, dcr->blk_size) ||
+		    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
+			dev_err(dev, "DC region %d not aligned to %#llx\n", i,
+				dcr->blk_size);
+			rc = -EINVAL;
+			goto dc_error;
+		}
+
+		dcr->dsmad_handle =
+			le32_to_cpu(dc->region[i].region_dsmad_handle);
+		dcr->flags = dc->region[i].flags;
+		sprintf(dcr->name, "dc%d", i);
+
+		dev_dbg(dev,
+			"DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n",
+			dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
+	}
+
+	/*
+	 * Calculate entire DPA range of all configured regions which will be mapped by
+	 * one or more HDM decoders
+	 */
+	mds->total_dynamic_capacity =
+		mds->dc_region[mds->nr_dc_region - 1].base +
+		mds->dc_region[mds->nr_dc_region - 1].decode_len -
+		mds->dc_region[0].base;
+	dev_dbg(dev, "Total dynamic capacity: %#llx\n",
+		mds->total_dynamic_capacity);
+
+dc_error:
+	kvfree(dc);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
+
 static int add_dpa_res(struct device *dev, struct resource *parent,
 		       struct resource *res, resource_size_t start,
 		       resource_size_t size, const char *type)
@@ -1112,6 +1275,11 @@  int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 	struct cxl_dev_state *cxlds = &mds->cxlds;
 	struct device *dev = cxlds->dev;
 	int rc;
+	size_t untenanted_mem =
+		mds->dc_region[0].base - mds->total_static_capacity;
+
+	mds->total_capacity = mds->total_static_capacity +
+			untenanted_mem + mds->total_dynamic_capacity;
 
 	if (!cxlds->media_ready) {
 		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
@@ -1121,13 +1289,23 @@  int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 	}
 
 	cxlds->dpa_res =
-		(struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
+		(struct resource)DEFINE_RES_MEM(0, mds->total_capacity);
+
+	for (int i = 0; i < CXL_MAX_DC_REGION; i++) {
+		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
+
+		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
+				 dcr->base, dcr->decode_len, dcr->name);
+		if (rc)
+			return rc;
+	}
 
 	if (mds->partition_align_bytes == 0) {
 		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
 				 mds->volatile_only_bytes, "ram");
 		if (rc)
 			return rc;
+
 		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
 				   mds->volatile_only_bytes,
 				   mds->persistent_only_bytes, "pmem");
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 89e560ea14c0..9c0b2fa72bdd 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -239,6 +239,15 @@  struct cxl_event_state {
 	struct mutex log_lock;
 };
 
+/* Device enabled DCD commands */
+enum dcd_cmd_enabled_bits {
+	CXL_DCD_ENABLED_GET_CONFIG,
+	CXL_DCD_ENABLED_GET_EXTENT_LIST,
+	CXL_DCD_ENABLED_ADD_RESPONSE,
+	CXL_DCD_ENABLED_RELEASE,
+	CXL_DCD_ENABLED_MAX
+};
+
 /* Device enabled poison commands */
 enum poison_cmd_enabled_bits {
 	CXL_POISON_ENABLED_LIST,
@@ -284,6 +293,9 @@  enum cxl_devtype {
 	CXL_DEVTYPE_CLASSMEM,
 };
 
+#define CXL_MAX_DC_REGION 8
+#define CXL_DC_REGION_SRTLEN 8
+
 /**
  * struct cxl_dev_state - The driver device state
  *
@@ -300,6 +312,8 @@  enum cxl_devtype {
  * @dpa_res: Overall DPA resource tree for the device
  * @pmem_res: Active Persistent memory capacity configuration
  * @ram_res: Active Volatile memory capacity configuration
+ * @dc_res: Active Dynamic Capacity memory configuration for each possible
+ *          region
  * @component_reg_phys: register base of component registers
  * @info: Cached DVSEC information about the device.
  * @serial: PCIe Device Serial Number
@@ -315,6 +329,7 @@  struct cxl_dev_state {
 	struct resource dpa_res;
 	struct resource pmem_res;
 	struct resource ram_res;
+	struct resource dc_res[CXL_MAX_DC_REGION];
 	resource_size_t component_reg_phys;
 	u64 serial;
 	enum cxl_devtype type;
@@ -334,9 +349,12 @@  struct cxl_dev_state {
  *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
  * @mbox_mutex: Mutex to synchronize mailbox access.
  * @firmware_version: Firmware version for the memory device.
+ * @dcd_cmds: List of DCD commands implemented by memory device
  * @enabled_cmds: Hardware commands found enabled in CEL.
  * @exclusive_cmds: Commands that are kernel-internal only
- * @total_bytes: sum of all possible capacities
+ * @total_capacity: Sum of static and dynamic capacities
+ * @total_static_capacity: Sum of RAM and PMEM capacities
+ * @total_dynamic_capacity: Complete DPA range occupied by DC regions
  * @volatile_only_bytes: hard volatile capacity
  * @persistent_only_bytes: hard persistent capacity
  * @partition_align_bytes: alignment size for partition-able capacity
@@ -344,6 +362,10 @@  struct cxl_dev_state {
  * @active_persistent_bytes: sum of hard + soft persistent
  * @next_volatile_bytes: volatile capacity change pending device reset
  * @next_persistent_bytes: persistent capacity change pending device reset
+ * @nr_dc_region: number of DC regions implemented in the memory device
+ * @dc_region: array containing info about the DC regions
+ * @dc_event_log_size: The number of events the device can store in the
+ * Dynamic Capacity Event Log before it overflows
  * @event: event log driver state
  * @poison: poison driver state info
  * @mbox_send: @dev specific transport for transmitting mailbox commands
@@ -357,9 +379,13 @@  struct cxl_memdev_state {
 	size_t lsa_size;
 	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
 	char firmware_version[0x10];
+	DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX);
 	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
 	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
-	u64 total_bytes;
+
+	u64 total_capacity;
+	u64 total_static_capacity;
+	u64 total_dynamic_capacity;
 	u64 volatile_only_bytes;
 	u64 persistent_only_bytes;
 	u64 partition_align_bytes;
@@ -367,6 +393,20 @@  struct cxl_memdev_state {
 	u64 active_persistent_bytes;
 	u64 next_volatile_bytes;
 	u64 next_persistent_bytes;
+
+	u8 nr_dc_region;
+
+	struct cxl_dc_region_info {
+		u8 name[CXL_DC_REGION_SRTLEN];
+		u64 base;
+		u64 decode_len;
+		u64 len;
+		u64 blk_size;
+		u32 dsmad_handle;
+		u8 flags;
+	} dc_region[CXL_MAX_DC_REGION];
+
+	size_t dc_event_log_size;
 	struct cxl_event_state event;
 	struct cxl_poison_state poison;
 	int (*mbox_send)(struct cxl_memdev_state *mds,
@@ -415,6 +455,10 @@  enum cxl_opcode {
 	CXL_MBOX_OP_UNLOCK		= 0x4503,
 	CXL_MBOX_OP_FREEZE_SECURITY	= 0x4504,
 	CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE	= 0x4505,
+	CXL_MBOX_OP_GET_DC_CONFIG	= 0x4800,
+	CXL_MBOX_OP_GET_DC_EXTENT_LIST	= 0x4801,
+	CXL_MBOX_OP_ADD_DC_RESPONSE	= 0x4802,
+	CXL_MBOX_OP_RELEASE_DC		= 0x4803,
 	CXL_MBOX_OP_MAX			= 0x10000
 };
 
@@ -462,6 +506,7 @@  struct cxl_mbox_identify {
 	__le16 inject_poison_limit;
 	u8 poison_caps;
 	u8 qos_telemetry_caps;
+	__le16 dc_event_log_size;
 } __packed;
 
 /*
@@ -617,7 +662,27 @@  struct cxl_mbox_set_partition_info {
 	u8 flags;
 } __packed;
 
+struct cxl_mbox_get_dc_config {
+	u8 region_count;
+	u8 start_region_index;
+} __packed;
+
+/* See CXL 3.0 spec, Table 125: Get Dynamic Capacity Config Output Payload */
+struct cxl_mbox_dynamic_capacity {
+	u8 avail_region_count;
+	u8 rsvd[7];
+	struct cxl_dc_region_config {
+		__le64 region_base;
+		__le64 region_decode_length;
+		__le64 region_length;
+		__le64 region_block_size;
+		__le32 region_dsmad_handle;
+		u8 flags;
+		u8 rsvd[3];
+	} __packed region[];
+} __packed;
 #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
+#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
 
 /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
 struct cxl_mbox_set_timestamp_in {
@@ -742,6 +807,7 @@  enum {
 int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
 			  struct cxl_mbox_cmd *cmd);
 int cxl_dev_state_identify(struct cxl_memdev_state *mds);
+int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
 int cxl_await_media_ready(struct cxl_dev_state *cxlds);
 int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
 int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 4e2845b7331a..ac1a41bc083d 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -742,6 +742,10 @@  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	rc = cxl_dev_dynamic_capacity_identify(mds);
+	if (rc)
+		return rc;
+
 	rc = cxl_mem_create_range_info(mds);
 	if (rc)
 		return rc;