diff mbox series

[v8,02/21] cxl/mem: Read dynamic capacity configuration from the device

Message ID 20241210-dcd-type2-upstream-v8-2-812852504400@intel.com (mailing list archive)
State New
Headers show
Series DCD: Add support for Dynamic Capacity Devices (DCD) | expand

Commit Message

Ira Weiny Dec. 11, 2024, 3:42 a.m. UTC
Devices which optionally support Dynamic Capacity (DC) are configured
via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
Configuration command in order to properly configure DCDs.  Without the
Get DC Configuration command DCD can't be supported.

Implement the DC mailbox commands as specified in CXL 3.1 section
8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
information.  Disable DCD if DCD is not supported.  Leverage the Get DC
Configuration command supported bit to indicate if DCD is supported.

Linux has no use for the trailing fields of the Get Dynamic Capacity
Configuration Output Payload (Total number of supported extents, number
of available extents, total number of supported tags, and number of
available tags).  Avoid defining those fields to use the more useful
dynamic C array.

Based on an original patch by Navneet Singh.

Cc: Li Ming <ming.li@zohomail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: linux-hardening@vger.kernel.org
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes:
[iweiny: fix EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify)]
[iweiny: limit variable scope in cxl_dev_dynamic_capacity_identify]
---
 drivers/cxl/core/mbox.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/cxl/cxlmem.h    |  64 ++++++++++++++++++-
 drivers/cxl/pci.c       |   4 ++
 3 files changed, 232 insertions(+), 2 deletions(-)

Comments

Dan Williams Jan. 15, 2025, 2:35 a.m. UTC | #1
Ira Weiny wrote:
> Devices which optionally support Dynamic Capacity (DC) are configured
> via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
> Configuration command in order to properly configure DCDs.  Without the
> Get DC Configuration command DCD can't be supported.
> 
> Implement the DC mailbox commands as specified in CXL 3.1 section
> 8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
> information.  Disable DCD if DCD is not supported.  Leverage the Get DC
> Configuration command supported bit to indicate if DCD is supported.
> 
> Linux has no use for the trailing fields of the Get Dynamic Capacity
> Configuration Output Payload (Total number of supported extents, number
> of available extents, total number of supported tags, and number of
> available tags).  Avoid defining those fields to use the more useful
> dynamic C array.
> 
> Based on an original patch by Navneet Singh.
> 
> Cc: Li Ming <ming.li@zohomail.com>
> Cc: Kees Cook <kees@kernel.org>
> Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
> Cc: linux-hardening@vger.kernel.org
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes:
> [iweiny: fix EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify)]
> [iweiny: limit variable scope in cxl_dev_dynamic_capacity_identify]
> ---
>  drivers/cxl/core/mbox.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/cxl/cxlmem.h    |  64 ++++++++++++++++++-
>  drivers/cxl/pci.c       |   4 ++
>  3 files changed, 232 insertions(+), 2 deletions(-)
> 
[snipping the C code to do a data structure review first]

> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -403,6 +403,7 @@ enum cxl_devtype {
>  	CXL_DEVTYPE_CLASSMEM,
>  };
>  
> +#define CXL_MAX_DC_REGION 8

Please no, lets not sign up to have the "which cxl 'region' concept are
you referring to?" debate in perpetuity. "DPA partition", "DPA
resource", "DPA capacity" anything but "region".


>  /**
>   * struct cxl_dpa_perf - DPA performance property entry
>   * @dpa_range: range for DPA address
> @@ -434,6 +435,8 @@ struct cxl_dpa_perf {
>   * @dpa_res: Overall DPA resource tree for the device
>   * @pmem_res: Active Persistent memory capacity configuration
>   * @ram_res: Active Volatile memory capacity configuration
> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> + *          region
>   * @serial: PCIe Device Serial Number
>   * @type: Generic Memory Class device or Vendor Specific Memory device
>   * @cxl_mbox: CXL mailbox context
> @@ -449,11 +452,23 @@ struct cxl_dev_state {
>  	struct resource dpa_res;
>  	struct resource pmem_res;
>  	struct resource ram_res;
> +	struct resource dc_res[CXL_MAX_DC_REGION];

This is throwing off cargo-cult alarms. The named pmem_res and ram_res
served us well up until the point where DPA partitions grew past 2 types
at well defined locations. I like the array of resources idea, but that
begs the question why not put all partition information into an array?

This would also head off complications later on in this series where the
DPA capacity reservation and allocation flows have "dc" sidecars bolted
on rather than general semantics like "allocating from partition index N
means that all partitions indices less than N need to be skipped and
marked reserved".

>  	u64 serial;
>  	enum cxl_devtype type;
>  	struct cxl_mailbox cxl_mbox;
>  };
>  
> +#define CXL_DC_REGION_STRLEN 8
> +struct cxl_dc_region_info {
> +	u64 base;
> +	u64 decode_len;
> +	u64 len;

Duplicating partition information in multiple places, like
mds->dc_region[X].base and cxlds->dc_res[X].start, feels like an
RFC-quality decision for expediency that needs to reconciled on the way
to upstream.

> +	u64 blk_size;
> +	u32 dsmad_handle;
> +	u8 flags;
> +	u8 name[CXL_DC_REGION_STRLEN];

No, lets not entertain:

    printk("%s\n", mds->dc_region[index].name);

...when:

    printk("dc%d\n", index);

...will do.

> +};
> +
>  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
>  {
>  	return dev_get_drvdata(cxl_mbox->host);
> @@ -473,7 +488,9 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
>   * @dcd_cmds: List of DCD commands implemented by memory device
>   * @enabled_cmds: Hardware commands found enabled in CEL.
>   * @exclusive_cmds: Commands that are kernel-internal only
> - * @total_bytes: sum of all possible capacities
> + * @total_bytes: length of all possible capacities
> + * @static_bytes: length of possible static RAM and PMEM partitions
> + * @dynamic_bytes: length of possible DC partitions (DC Regions)
>   * @volatile_only_bytes: hard volatile capacity
>   * @persistent_only_bytes: hard persistent capacity

I have regrets that cxl_memdev_state permanently carries runtime storage
for init time variables, lets not continue down that path with DCD
enabling.

>   * @partition_align_bytes: alignment size for partition-able capacity
> @@ -483,6 +500,8 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
>   * @next_persistent_bytes: persistent capacity change pending device reset
>   * @ram_perf: performance data entry matched to RAM partition
>   * @pmem_perf: performance data entry matched to PMEM partition
> + * @nr_dc_region: number of DC regions implemented in the memory device
> + * @dc_region: array containing info about the DC regions
>   * @event: event log driver state
>   * @poison: poison driver state info
>   * @security: security driver state info
> @@ -499,6 +518,8 @@ struct cxl_memdev_state {
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
>  	u64 total_bytes;
> +	u64 static_bytes;
> +	u64 dynamic_bytes;
>  	u64 volatile_only_bytes;
>  	u64 persistent_only_bytes;
>  	u64 partition_align_bytes;
> @@ -510,6 +531,9 @@ struct cxl_memdev_state {
>  	struct cxl_dpa_perf ram_perf;
>  	struct cxl_dpa_perf pmem_perf;
>  
> +	u8 nr_dc_region;
> +	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];

DPA capacity is a generic CXL.mem concern and partition information is
contained cxl_dev_state. Lets find a way to not need partially redundant
data structures across in cxl_memdev_state and cxl_dev_state.

DCD introduces the concept of "decode size vs usable capacity" into the
partition information, but I see no reason to conceptually tie that to
only DCD.  Fabio's memory hole patches show that there is already a
memory-hole concept in the CXL arena. DCD is just saying "be prepared for
the concept of DPA partitions with memory holes at the end".

> +
>  	struct cxl_event_state event;
>  	struct cxl_poison_state poison;
>  	struct cxl_security_state security;
> @@ -708,6 +732,32 @@ struct cxl_mbox_set_partition_info {
>  
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
>  
> +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
> +struct cxl_mbox_get_dc_config_in {
> +	u8 region_count;
> +	u8 start_region_index;
> +} __packed;
> +
> +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
> +struct cxl_mbox_get_dc_config_out {
> +	u8 avail_region_count;
> +	u8 regions_returned;
> +	u8 rsvd[6];
> +	/* See CXL 3.1 Table 8-165 */
> +	struct cxl_dc_region_config {
> +		__le64 region_base;
> +		__le64 region_decode_length;
> +		__le64 region_length;
> +		__le64 region_block_size;
> +		__le32 region_dsmad_handle;
> +		u8 flags;
> +		u8 rsvd[3];
> +	} __packed region[] __counted_by(regions_returned);

Yes, the spec unfortunately uses "region" for this partition info
payload. This would be a good place to say "CXL spec calls this 'region'
but Linux calls it 'partition' not to be confused with the Linux 'struct
cxl_region' or all the other usages of 'region' in the specification".

Linux is not obligated to follow the questionable naming decisions of
specifications.

> +	/* Trailing fields unused */
> +} __packed;
> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
> +#define CXL_DCD_BLOCK_LINE_SIZE 0x40
> +
>  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
>  struct cxl_mbox_set_timestamp_in {
>  	__le64 timestamp;
> @@ -831,6 +881,7 @@ enum {
>  int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
>  			  struct cxl_mbox_cmd *cmd);
>  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
>  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
>  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
>  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> @@ -844,6 +895,17 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>  			    enum cxl_event_log_type type,
>  			    enum cxl_event_type event_type,
>  			    const uuid_t *uuid, union cxl_event *evt);
> +
> +static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
> +{
> +	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +}
> +
> +static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
> +{
> +	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> +}

This hunk is out of place, and per the last patch, I think it can just be
a flag that does not need a helper.
Alejandro Lucero Palau Jan. 15, 2025, 1:55 p.m. UTC | #2
On 1/15/25 02:35, Dan Williams wrote:
> Ira Weiny wrote:
>> Devices which optionally support Dynamic Capacity (DC) are configured
>> via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
>> Configuration command in order to properly configure DCDs.  Without the
>> Get DC Configuration command DCD can't be supported.
>>
>> Implement the DC mailbox commands as specified in CXL 3.1 section
>> 8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
>> information.  Disable DCD if DCD is not supported.  Leverage the Get DC
>> Configuration command supported bit to indicate if DCD is supported.
>>
>> Linux has no use for the trailing fields of the Get Dynamic Capacity
>> Configuration Output Payload (Total number of supported extents, number
>> of available extents, total number of supported tags, and number of
>> available tags).  Avoid defining those fields to use the more useful
>> dynamic C array.
>>
>> Based on an original patch by Navneet Singh.
>>
>> Cc: Li Ming <ming.li@zohomail.com>
>> Cc: Kees Cook <kees@kernel.org>
>> Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
>> Cc: linux-hardening@vger.kernel.org
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
>>
>> ---
>> Changes:
>> [iweiny: fix EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify)]
>> [iweiny: limit variable scope in cxl_dev_dynamic_capacity_identify]
>> ---
>>   drivers/cxl/core/mbox.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++-
>>   drivers/cxl/cxlmem.h    |  64 ++++++++++++++++++-
>>   drivers/cxl/pci.c       |   4 ++
>>   3 files changed, 232 insertions(+), 2 deletions(-)
>>
> [snipping the C code to do a data structure review first]
>
>> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>> index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
>> --- a/drivers/cxl/cxlmem.h
>> +++ b/drivers/cxl/cxlmem.h
>> @@ -403,6 +403,7 @@ enum cxl_devtype {
>>   	CXL_DEVTYPE_CLASSMEM,
>>   };
>>   
>> +#define CXL_MAX_DC_REGION 8
> Please no, lets not sign up to have the "which cxl 'region' concept are
> you referring to?" debate in perpetuity. "DPA partition", "DPA
> resource", "DPA capacity" anything but "region".
>
>

This next comment is not my main point to discuss in this email 
(resources initialization is), but I seize it for giving my view in this 
one.

Dan, you say later we (Linux) are not obligated to use "questionable 
naming decisions of specifications", but we should not confuse people 
either.

Maybe CXL_MAX_DC_HW_REGION would help here, for differentiating it from 
the kernel software cxl region construct. I think we will need a CXL 
kernel dictionary sooner or later ...

>>   /**
>>    * struct cxl_dpa_perf - DPA performance property entry
>>    * @dpa_range: range for DPA address
>> @@ -434,6 +435,8 @@ struct cxl_dpa_perf {
>>    * @dpa_res: Overall DPA resource tree for the device
>>    * @pmem_res: Active Persistent memory capacity configuration
>>    * @ram_res: Active Volatile memory capacity configuration
>> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
>> + *          region
>>    * @serial: PCIe Device Serial Number
>>    * @type: Generic Memory Class device or Vendor Specific Memory device
>>    * @cxl_mbox: CXL mailbox context
>> @@ -449,11 +452,23 @@ struct cxl_dev_state {
>>   	struct resource dpa_res;
>>   	struct resource pmem_res;
>>   	struct resource ram_res;
>> +	struct resource dc_res[CXL_MAX_DC_REGION];
> This is throwing off cargo-cult alarms. The named pmem_res and ram_res
> served us well up until the point where DPA partitions grew past 2 types
> at well defined locations. I like the array of resources idea, but that
> begs the question why not put all partition information into an array?
>
> This would also head off complications later on in this series where the
> DPA capacity reservation and allocation flows have "dc" sidecars bolted
> on rather than general semantics like "allocating from partition index N
> means that all partitions indices less than N need to be skipped and
> marked reserved".


I guess this is likely how you want to change the type2 resource 
initialization issue and where I'm afraid these two patchsets are going 
to collide at.

If that is the case, both are going to miss the next kernel cycle since 
it means major changes, but let's discuss it without further delays for 
the sake of implementing the accepted changes as soon as possible, and I 
guess with a close sync between Ira and I.

BTW, in the case of the Type2, there are more things to discuss which I 
do there.

Thank you


>>   	u64 serial;
>>   	enum cxl_devtype type;
>>   	struct cxl_mailbox cxl_mbox;
>>   };
>>   
>> +#define CXL_DC_REGION_STRLEN 8
>> +struct cxl_dc_region_info {
>> +	u64 base;
>> +	u64 decode_len;
>> +	u64 len;
> Duplicating partition information in multiple places, like
> mds->dc_region[X].base and cxlds->dc_res[X].start, feels like an
> RFC-quality decision for expediency that needs to reconciled on the way
> to upstream.
>
>> +	u64 blk_size;
>> +	u32 dsmad_handle;
>> +	u8 flags;
>> +	u8 name[CXL_DC_REGION_STRLEN];
> No, lets not entertain:
>
>      printk("%s\n", mds->dc_region[index].name);
>
> ...when:
>
>      printk("dc%d\n", index);
>
> ...will do.
>
>> +};
>> +
>>   static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
>>   {
>>   	return dev_get_drvdata(cxl_mbox->host);
>> @@ -473,7 +488,9 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
>>    * @dcd_cmds: List of DCD commands implemented by memory device
>>    * @enabled_cmds: Hardware commands found enabled in CEL.
>>    * @exclusive_cmds: Commands that are kernel-internal only
>> - * @total_bytes: sum of all possible capacities
>> + * @total_bytes: length of all possible capacities
>> + * @static_bytes: length of possible static RAM and PMEM partitions
>> + * @dynamic_bytes: length of possible DC partitions (DC Regions)
>>    * @volatile_only_bytes: hard volatile capacity
>>    * @persistent_only_bytes: hard persistent capacity
> I have regrets that cxl_memdev_state permanently carries runtime storage
> for init time variables, lets not continue down that path with DCD
> enabling.
>
>>    * @partition_align_bytes: alignment size for partition-able capacity
>> @@ -483,6 +500,8 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
>>    * @next_persistent_bytes: persistent capacity change pending device reset
>>    * @ram_perf: performance data entry matched to RAM partition
>>    * @pmem_perf: performance data entry matched to PMEM partition
>> + * @nr_dc_region: number of DC regions implemented in the memory device
>> + * @dc_region: array containing info about the DC regions
>>    * @event: event log driver state
>>    * @poison: poison driver state info
>>    * @security: security driver state info
>> @@ -499,6 +518,8 @@ struct cxl_memdev_state {
>>   	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>>   	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
>>   	u64 total_bytes;
>> +	u64 static_bytes;
>> +	u64 dynamic_bytes;
>>   	u64 volatile_only_bytes;
>>   	u64 persistent_only_bytes;
>>   	u64 partition_align_bytes;
>> @@ -510,6 +531,9 @@ struct cxl_memdev_state {
>>   	struct cxl_dpa_perf ram_perf;
>>   	struct cxl_dpa_perf pmem_perf;
>>   
>> +	u8 nr_dc_region;
>> +	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];
> DPA capacity is a generic CXL.mem concern and partition information is
> contained cxl_dev_state. Lets find a way to not need partially redundant
> data structures across in cxl_memdev_state and cxl_dev_state.
>
> DCD introduces the concept of "decode size vs usable capacity" into the
> partition information, but I see no reason to conceptually tie that to
> only DCD.  Fabio's memory hole patches show that there is already a
> memory-hole concept in the CXL arena. DCD is just saying "be prepared for
> the concept of DPA partitions with memory holes at the end".
>
>> +
>>   	struct cxl_event_state event;
>>   	struct cxl_poison_state poison;
>>   	struct cxl_security_state security;
>> @@ -708,6 +732,32 @@ struct cxl_mbox_set_partition_info {
>>   
>>   #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
>>   
>> +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
>> +struct cxl_mbox_get_dc_config_in {
>> +	u8 region_count;
>> +	u8 start_region_index;
>> +} __packed;
>> +
>> +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
>> +struct cxl_mbox_get_dc_config_out {
>> +	u8 avail_region_count;
>> +	u8 regions_returned;
>> +	u8 rsvd[6];
>> +	/* See CXL 3.1 Table 8-165 */
>> +	struct cxl_dc_region_config {
>> +		__le64 region_base;
>> +		__le64 region_decode_length;
>> +		__le64 region_length;
>> +		__le64 region_block_size;
>> +		__le32 region_dsmad_handle;
>> +		u8 flags;
>> +		u8 rsvd[3];
>> +	} __packed region[] __counted_by(regions_returned);
> Yes, the spec unfortunately uses "region" for this partition info
> payload. This would be a good place to say "CXL spec calls this 'region'
> but Linux calls it 'partition' not to be confused with the Linux 'struct
> cxl_region' or all the other usages of 'region' in the specification".
>
> Linux is not obligated to follow the questionable naming decisions of
> specifications.
>
>> +	/* Trailing fields unused */
>> +} __packed;
>> +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
>> +#define CXL_DCD_BLOCK_LINE_SIZE 0x40
>> +
>>   /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
>>   struct cxl_mbox_set_timestamp_in {
>>   	__le64 timestamp;
>> @@ -831,6 +881,7 @@ enum {
>>   int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
>>   			  struct cxl_mbox_cmd *cmd);
>>   int cxl_dev_state_identify(struct cxl_memdev_state *mds);
>> +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
>>   int cxl_await_media_ready(struct cxl_dev_state *cxlds);
>>   int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
>>   int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
>> @@ -844,6 +895,17 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>>   			    enum cxl_event_log_type type,
>>   			    enum cxl_event_type event_type,
>>   			    const uuid_t *uuid, union cxl_event *evt);
>> +
>> +static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
>> +{
>> +	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
>> +}
>> +
>> +static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
>> +{
>> +	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
>> +}
> This hunk is out of place, and per the last patch, I think it can just be
> a flag that does not need a helper.
>
Ira Weiny Jan. 15, 2025, 8:32 p.m. UTC | #3
Dan Williams wrote:
> Ira Weiny wrote:

[snip]

> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -403,6 +403,7 @@ enum cxl_devtype {
> >  	CXL_DEVTYPE_CLASSMEM,
> >  };
> >  
> > +#define CXL_MAX_DC_REGION 8
> 
> Please no, lets not sign up to have the "which cxl 'region' concept are
> you referring to?" debate in perpetuity. "DPA partition", "DPA
> resource", "DPA capacity" anything but "region".
> 

I'm inclined to agree with Alejandro on this one.  I've walked this
tightrope quite a bit with this series.  But there are other places where
we have chosen to change the verbiage from the spec and it has made it
difficult for new comers to correlate the spec with the code.

So I like Alejandro's idea of adding "HW" to the name to indicate that we
are talking about a spec or hardware defined thing.

That said I am open to changing some names where it is clear it is a
software structure.  I'll audit the series for that.

> >  /**
> >   * struct cxl_dpa_perf - DPA performance property entry
> >   * @dpa_range: range for DPA address
> > @@ -434,6 +435,8 @@ struct cxl_dpa_perf {
> >   * @dpa_res: Overall DPA resource tree for the device
> >   * @pmem_res: Active Persistent memory capacity configuration
> >   * @ram_res: Active Volatile memory capacity configuration
> > + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> > + *          region
> >   * @serial: PCIe Device Serial Number
> >   * @type: Generic Memory Class device or Vendor Specific Memory device
> >   * @cxl_mbox: CXL mailbox context
> > @@ -449,11 +452,23 @@ struct cxl_dev_state {
> >  	struct resource dpa_res;
> >  	struct resource pmem_res;
> >  	struct resource ram_res;
> > +	struct resource dc_res[CXL_MAX_DC_REGION];
> 
> This is throwing off cargo-cult alarms. The named pmem_res and ram_res
> served us well up until the point where DPA partitions grew past 2 types
> at well defined locations. I like the array of resources idea, but that
> begs the question why not put all partition information into an array?

For me that keeps it clear what is pmem/ram/dc.

> 
> This would also head off complications later on in this series where the
> DPA capacity reservation and allocation flows have "dc" sidecars bolted
> on rather than general semantics like "allocating from partition index N
> means that all partitions indices less than N need to be skipped and
> marked reserved".

I assume you are speaking of this patch:

	cxl/hdm: Add dynamic capacity size support to endpoint decoders

I took some care to make the skip calculations and tracking generic.  But
I think you are correct more could be done.

We would also need to adjust cxl_dpa_alloc().

I thought there might be other places where the memdev_state would need to
be adjusted.  But it looks like those are minor issues.

Over all I did spend a lot of time making the skip generic and honestly it
is probably worth making it all generic.  However, I think it will take
careful review and testing to make sure we don't break things.

> 
> >  	u64 serial;
> >  	enum cxl_devtype type;
> >  	struct cxl_mailbox cxl_mbox;
> >  };
> >  
> > +#define CXL_DC_REGION_STRLEN 8
> > +struct cxl_dc_region_info {
> > +	u64 base;
> > +	u64 decode_len;
> > +	u64 len;
> 
> Duplicating partition information in multiple places, like
> mds->dc_region[X].base and cxlds->dc_res[X].start, feels like an
> RFC-quality decision for expediency that needs to reconciled on the way
> to upstream.

I think this was done to follow a pattern of the mds being passed around
rather than creating resources right when partitions are read.

Furthermore this stands to hold this information in CPU endianess rather
than holding an array of region info coming from the hardware.

Let see how other changes fall out before I go hacking this though.

> 
> > +	u64 blk_size;
> > +	u32 dsmad_handle;
> > +	u8 flags;
> > +	u8 name[CXL_DC_REGION_STRLEN];
> 
> No, lets not entertain:
> 
>     printk("%s\n", mds->dc_region[index].name);
> 
> ...when:
> 
>     printk("dc%d\n", index);
> 
> ...will do.

Actually these buffers provide a buffer for the (struct
resource)dc_res[x].name pointers to point to.

It could be devm_malloc'ed memory but this actually worked ok.  ram/pmem
just use static strings.

I could add a comment to that effect.  Or I could define a static array
thusly...  I think this would go against defining a resource map though.

@@ -1460,6 +1459,10 @@ static int add_dpa_res(struct device *dev, struct resource *parent,
        return 0;
 }
 
+static const char * const dc_resource_name[] = {
+       "dc0", "dc1", "dc2", "dc3", "dc4", "dc5", "dc6", "dc7",
+};
+
 int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 {
        struct cxl_dev_state *cxlds = &mds->cxlds;
@@ -1486,7 +1489,8 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
                struct cxl_dc_region_info *dcr = &mds->dc_region[i];
 
                rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
-                                dcr->base, dcr->decode_len, dcr->name);
+                                dcr->base, dcr->decode_len,
+                                dc_resource_name[i]);
                if (rc)
                        return rc;
        }

> 
> > +};
> > +
> >  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
> >  {
> >  	return dev_get_drvdata(cxl_mbox->host);
> > @@ -473,7 +488,9 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
> >   * @dcd_cmds: List of DCD commands implemented by memory device
> >   * @enabled_cmds: Hardware commands found enabled in CEL.
> >   * @exclusive_cmds: Commands that are kernel-internal only
> > - * @total_bytes: sum of all possible capacities
> > + * @total_bytes: length of all possible capacities
> > + * @static_bytes: length of possible static RAM and PMEM partitions
> > + * @dynamic_bytes: length of possible DC partitions (DC Regions)
> >   * @volatile_only_bytes: hard volatile capacity
> >   * @persistent_only_bytes: hard persistent capacity
> 
> I have regrets that cxl_memdev_state permanently carries runtime storage
> for init time variables, lets not continue down that path with DCD
> enabling.

Yea I thought these were used more than they are.  I feel like over time
they got used less and less.  Perhaps now they can be removed.

> 
> >   * @partition_align_bytes: alignment size for partition-able capacity
> > @@ -483,6 +500,8 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
> >   * @next_persistent_bytes: persistent capacity change pending device reset
> >   * @ram_perf: performance data entry matched to RAM partition
> >   * @pmem_perf: performance data entry matched to PMEM partition
> > + * @nr_dc_region: number of DC regions implemented in the memory device
> > + * @dc_region: array containing info about the DC regions
> >   * @event: event log driver state
> >   * @poison: poison driver state info
> >   * @security: security driver state info
> > @@ -499,6 +518,8 @@ struct cxl_memdev_state {
> >  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> >  	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
> >  	u64 total_bytes;
> > +	u64 static_bytes;
> > +	u64 dynamic_bytes;
> >  	u64 volatile_only_bytes;
> >  	u64 persistent_only_bytes;
> >  	u64 partition_align_bytes;
> > @@ -510,6 +531,9 @@ struct cxl_memdev_state {
> >  	struct cxl_dpa_perf ram_perf;
> >  	struct cxl_dpa_perf pmem_perf;
> >  
> > +	u8 nr_dc_region;
> > +	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];
> 
> DPA capacity is a generic CXL.mem concern and partition information is
> contained cxl_dev_state. Lets find a way to not need partially redundant
> data structures across in cxl_memdev_state and cxl_dev_state.

I'll think on it.

> 
> DCD introduces the concept of "decode size vs usable capacity" into the
> partition information, but I see no reason to conceptually tie that to
> only DCD.  Fabio's memory hole patches show that there is already a
> memory-hole concept in the CXL arena. DCD is just saying "be prepared for
> the concept of DPA partitions with memory holes at the end".

I'm not clear how this relates.  ram and pmem partitions can already have
holes at the end if not mapped.

> 
> > +
> >  	struct cxl_event_state event;
> >  	struct cxl_poison_state poison;
> >  	struct cxl_security_state security;
> > @@ -708,6 +732,32 @@ struct cxl_mbox_set_partition_info {
> >  
> >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> >  
> > +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
> > +struct cxl_mbox_get_dc_config_in {
> > +	u8 region_count;
> > +	u8 start_region_index;
> > +} __packed;
> > +
> > +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
> > +struct cxl_mbox_get_dc_config_out {
> > +	u8 avail_region_count;
> > +	u8 regions_returned;
> > +	u8 rsvd[6];
> > +	/* See CXL 3.1 Table 8-165 */
> > +	struct cxl_dc_region_config {
> > +		__le64 region_base;
> > +		__le64 region_decode_length;
> > +		__le64 region_length;
> > +		__le64 region_block_size;
> > +		__le32 region_dsmad_handle;
> > +		u8 flags;
> > +		u8 rsvd[3];
> > +	} __packed region[] __counted_by(regions_returned);
> 
> Yes, the spec unfortunately uses "region" for this partition info
> payload. This would be a good place to say "CXL spec calls this 'region'
> but Linux calls it 'partition' not to be confused with the Linux 'struct
> cxl_region' or all the other usages of 'region' in the specification".

In this case I totally disagree.  This is a structure being filled in by
the hardware and is directly related to the spec.  I think I would rather
change 

s/cxl_dc_region_info/cxl_dc_partition_info/

And leave this.  Which draws a more distinct line between what is
specified in hardware vs a software construct.

> 
> Linux is not obligated to follow the questionable naming decisions of
> specifications.

We are not.  But as Alejandro says it can be confusing if we don't make
some association to the spec.

What do you think about the HW/SW line I propose above?

> 
> > +	/* Trailing fields unused */
> > +} __packed;
> > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
> > +#define CXL_DCD_BLOCK_LINE_SIZE 0x40
> > +
> >  /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
> >  struct cxl_mbox_set_timestamp_in {
> >  	__le64 timestamp;
> > @@ -831,6 +881,7 @@ enum {
> >  int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
> >  			  struct cxl_mbox_cmd *cmd);
> >  int cxl_dev_state_identify(struct cxl_memdev_state *mds);
> > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
> >  int cxl_await_media_ready(struct cxl_dev_state *cxlds);
> >  int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
> >  int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
> > @@ -844,6 +895,17 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> >  			    enum cxl_event_log_type type,
> >  			    enum cxl_event_type event_type,
> >  			    const uuid_t *uuid, union cxl_event *evt);
> > +
> > +static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
> > +{
> > +	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> > +}
> > +
> > +static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
> > +{
> > +	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
> > +}
> 
> This hunk is out of place,

Not sure why they are out of place.  This is the first patch they are used
in so they were added here.  I could push them back a patch but you have
mentioned before you don't like to see functions defined without a use.
So I structured the series this way.

> and per the last patch, I think it can just be
> a flag that does not need a helper.

Agreed.  This has already been changed.  But these functions remain
defined here along with where they are used for proper context.

Ira
Ira Weiny Jan. 15, 2025, 8:48 p.m. UTC | #4
Alejandro Lucero Palau wrote:
> 
> On 1/15/25 02:35, Dan Williams wrote:
> > Ira Weiny wrote:

[snip]

> >> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> >> index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> >> --- a/drivers/cxl/cxlmem.h
> >> +++ b/drivers/cxl/cxlmem.h
> >> @@ -403,6 +403,7 @@ enum cxl_devtype {
> >>   	CXL_DEVTYPE_CLASSMEM,
> >>   };
> >>   
> >> +#define CXL_MAX_DC_REGION 8
> > Please no, lets not sign up to have the "which cxl 'region' concept are
> > you referring to?" debate in perpetuity. "DPA partition", "DPA
> > resource", "DPA capacity" anything but "region".
> >
> >
> 
> This next comment is not my main point to discuss in this email 
> (resources initialization is), but I seize it for giving my view in this 
> one.
> 
> Dan, you say later we (Linux) are not obligated to use "questionable 
> naming decisions of specifications", but we should not confuse people 
> either.
> 
> Maybe CXL_MAX_DC_HW_REGION would help here, for differentiating it from 
> the kernel software cxl region construct. I think we will need a CXL 
> kernel dictionary sooner or later ...

I agree.  I have had folks confused between spec and code and I'm really trying
to differentiate hardware region vs software partition.

> 
> >>   /**
> >>    * struct cxl_dpa_perf - DPA performance property entry
> >>    * @dpa_range: range for DPA address
> >> @@ -434,6 +435,8 @@ struct cxl_dpa_perf {
> >>    * @dpa_res: Overall DPA resource tree for the device
> >>    * @pmem_res: Active Persistent memory capacity configuration
> >>    * @ram_res: Active Volatile memory capacity configuration
> >> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> >> + *          region
> >>    * @serial: PCIe Device Serial Number
> >>    * @type: Generic Memory Class device or Vendor Specific Memory device
> >>    * @cxl_mbox: CXL mailbox context
> >> @@ -449,11 +452,23 @@ struct cxl_dev_state {
> >>   	struct resource dpa_res;
> >>   	struct resource pmem_res;
> >>   	struct resource ram_res;
> >> +	struct resource dc_res[CXL_MAX_DC_REGION];
> > This is throwing off cargo-cult alarms. The named pmem_res and ram_res
> > served us well up until the point where DPA partitions grew past 2 types
> > at well defined locations. I like the array of resources idea, but that
> > begs the question why not put all partition information into an array?
> >
> > This would also head off complications later on in this series where the
> > DPA capacity reservation and allocation flows have "dc" sidecars bolted
> > on rather than general semantics like "allocating from partition index N
> > means that all partitions indices less than N need to be skipped and
> > marked reserved".
> 
> 
> I guess this is likely how you want to change the type2 resource 
> initialization issue and where I'm afraid these two patchsets are going 
> to collide at.
> 
> If that is the case, both are going to miss the next kernel cycle since 
> it means major changes, but let's discuss it without further delays for 
> the sake of implementing the accepted changes as soon as possible, and I 
> guess with a close sync between Ira and I.
> 
> BTW, in the case of the Type2, there are more things to discuss which I 
> do there.

I'm looking at your set again because I think I missed this detail.

After looking into this more I think a singular array of resources could be
done without to much major surgery.

The question for type 2 is what interface does the core export for
accelerators to request these resources?  Or do we export a function like
add_dpa_res() and let drivers do that directly?

Dan is concerned about storing duplicate information about the partitions.
For DCD I think it should call add_dpa_res() to create resources on the
fly as I detect partition information from the device.  For type 2 they
can call that however/whenever they want.

We can even make this an xarray for complete flexibility with how many
partitions a device can have.  Although I'm not sure if the spec allows
for that on type 2.  Does it?

Ira

[snip]
Dan Williams Jan. 15, 2025, 10:34 p.m. UTC | #5
Ira Weiny wrote:
> Dan Williams wrote:
> > Ira Weiny wrote:
> 
> [snip]
> 
> > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> > > --- a/drivers/cxl/cxlmem.h
> > > +++ b/drivers/cxl/cxlmem.h
> > > @@ -403,6 +403,7 @@ enum cxl_devtype {
> > >  	CXL_DEVTYPE_CLASSMEM,
> > >  };
> > >  
> > > +#define CXL_MAX_DC_REGION 8
> > 
> > Please no, lets not sign up to have the "which cxl 'region' concept are
> > you referring to?" debate in perpetuity. "DPA partition", "DPA
> > resource", "DPA capacity" anything but "region".
> > 
> 
> I'm inclined to agree with Alejandro on this one.  I've walked this
> tightrope quite a bit with this series.  But there are other places where
> we have chosen to change the verbiage from the spec and it has made it
> difficult for new comers to correlate the spec with the code.
> 
> So I like Alejandro's idea of adding "HW" to the name to indicate that we
> are talking about a spec or hardware defined thing.

See below, the only people that could potentially be bothered by the
lack of spec terminology matching are the very same people that are
sophisticated enough to have read the spec to know its a problem.

> 
> That said I am open to changing some names where it is clear it is a
> software structure.  I'll audit the series for that.
> 
> > >  	u64 serial;
> > >  	enum cxl_devtype type;
> > >  	struct cxl_mailbox cxl_mbox;
> > >  };
> > >  
> > > +#define CXL_DC_REGION_STRLEN 8
> > > +struct cxl_dc_region_info {
> > > +	u64 base;
> > > +	u64 decode_len;
> > > +	u64 len;
> > 
> > Duplicating partition information in multiple places, like
> > mds->dc_region[X].base and cxlds->dc_res[X].start, feels like an
> > RFC-quality decision for expediency that needs to reconciled on the way
> > to upstream.
> 
> I think this was done to follow a pattern of the mds being passed around
> rather than creating resources right when partitions are read.
> 
> Furthermore this stands to hold this information in CPU endianess rather
> than holding an array of region info coming from the hardware.

Yes, the ask is translate all of this into common information that lives
at the cxl_dev_state level.

> 
> Let see how other changes fall out before I go hacking this though.
> 
> > 
> > > +	u64 blk_size;
> > > +	u32 dsmad_handle;
> > > +	u8 flags;
> > > +	u8 name[CXL_DC_REGION_STRLEN];
> > 
> > No, lets not entertain:
> > 
> >     printk("%s\n", mds->dc_region[index].name);
> > 
> > ...when:
> > 
> >     printk("dc%d\n", index);
> > 
> > ...will do.
> 
> Actually these buffers provide a buffer for the (struct
> resource)dc_res[x].name pointers to point to.

I missed that specific detail, but I still challenge whether this
precision is needed especially since it makes the data structure
messier. Given these names are for debug only and multi-partition DCD
devices seem unlikely to ever exist, just use a static shared name for
adding to ->dpa_res.

> 
> > 
> > DCD introduces the concept of "decode size vs usable capacity" into the
> > partition information, but I see no reason to conceptually tie that to
> > only DCD.  Fabio's memory hole patches show that there is already a
> > memory-hole concept in the CXL arena. DCD is just saying "be prepared for
> > the concept of DPA partitions with memory holes at the end".
> 
> I'm not clear how this relates.  ram and pmem partitions can already have
> holes at the end if not mapped.

The distinction is "can this DPA capacity be allocated to a region" the
new holes introduced by DCD are cases where the partition size is
greater than the allocatable size. Contrast to ram and pmem the
allocatable size is always identical to the partition size.

> > > +
> > >  	struct cxl_event_state event;
> > >  	struct cxl_poison_state poison;
> > >  	struct cxl_security_state security;
> > > @@ -708,6 +732,32 @@ struct cxl_mbox_set_partition_info {
> > >  
> > >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> > >  
> > > +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
> > > +struct cxl_mbox_get_dc_config_in {
> > > +	u8 region_count;
> > > +	u8 start_region_index;
> > > +} __packed;
> > > +
> > > +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
> > > +struct cxl_mbox_get_dc_config_out {
> > > +	u8 avail_region_count;
> > > +	u8 regions_returned;
> > > +	u8 rsvd[6];
> > > +	/* See CXL 3.1 Table 8-165 */
> > > +	struct cxl_dc_region_config {
> > > +		__le64 region_base;
> > > +		__le64 region_decode_length;
> > > +		__le64 region_length;
> > > +		__le64 region_block_size;
> > > +		__le32 region_dsmad_handle;
> > > +		u8 flags;
> > > +		u8 rsvd[3];
> > > +	} __packed region[] __counted_by(regions_returned);
> > 
> > Yes, the spec unfortunately uses "region" for this partition info
> > payload. This would be a good place to say "CXL spec calls this 'region'
> > but Linux calls it 'partition' not to be confused with the Linux 'struct
> > cxl_region' or all the other usages of 'region' in the specification".
> 
> In this case I totally disagree.  This is a structure being filled in by
> the hardware and is directly related to the spec.  I think I would rather
> change 
> 
> s/cxl_dc_region_info/cxl_dc_partition_info/
> 
> And leave this.  Which draws a more distinct line between what is
> specified in hardware vs a software construct.
> 
> > 
> > Linux is not obligated to follow the questionable naming decisions of
> > specifications.
> 
> We are not.  But as Alejandro says it can be confusing if we don't make
> some association to the spec.
> 
> What do you think about the HW/SW line I propose above?

Rename to cxl_dc_partition_info and drop the region_ prefixes, sure.

Otherwise, for this init-time only concern I would much rather deal with
the confusion of:

"why does Linux call this partition when the spec calls it region?":
which only trips up people that already know the difference because they read the
spec. In that case the comment will answer their confusion.

...versus:

"why are there multiple region concepts in the CXL subsystem": which
trips up everyone that greps through the CXL subsystem especially those
that have no intention of ever reading the spec.
Dan Williams Jan. 16, 2025, 6:33 a.m. UTC | #6
Alejandro Lucero Palau wrote:
> 
> On 1/15/25 02:35, Dan Williams wrote:
> > Ira Weiny wrote:
> >> Devices which optionally support Dynamic Capacity (DC) are configured
> >> via mailbox commands.  CXL 3.1 requires the host to issue the Get DC
> >> Configuration command in order to properly configure DCDs.  Without the
> >> Get DC Configuration command DCD can't be supported.
> >>
> >> Implement the DC mailbox commands as specified in CXL 3.1 section
> >> 8.2.9.9.9 (opcodes 48XXh) to read and store the DCD configuration
> >> information.  Disable DCD if DCD is not supported.  Leverage the Get DC
> >> Configuration command supported bit to indicate if DCD is supported.
> >>
> >> Linux has no use for the trailing fields of the Get Dynamic Capacity
> >> Configuration Output Payload (Total number of supported extents, number
> >> of available extents, total number of supported tags, and number of
> >> available tags).  Avoid defining those fields to use the more useful
> >> dynamic C array.
> >>
> >> Based on an original patch by Navneet Singh.
> >>
> >> Cc: Li Ming <ming.li@zohomail.com>
> >> Cc: Kees Cook <kees@kernel.org>
> >> Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
> >> Cc: linux-hardening@vger.kernel.org
> >> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> >> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> >>
> >> ---
> >> Changes:
> >> [iweiny: fix EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify)]
> >> [iweiny: limit variable scope in cxl_dev_dynamic_capacity_identify]
> >> ---
> >>   drivers/cxl/core/mbox.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++-
> >>   drivers/cxl/cxlmem.h    |  64 ++++++++++++++++++-
> >>   drivers/cxl/pci.c       |   4 ++
> >>   3 files changed, 232 insertions(+), 2 deletions(-)
> >>
> > [snipping the C code to do a data structure review first]
> >
> >> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> >> index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> >> --- a/drivers/cxl/cxlmem.h
> >> +++ b/drivers/cxl/cxlmem.h
> >> @@ -403,6 +403,7 @@ enum cxl_devtype {
> >>   	CXL_DEVTYPE_CLASSMEM,
> >>   };
> >>   
> >> +#define CXL_MAX_DC_REGION 8
> > Please no, lets not sign up to have the "which cxl 'region' concept are
> > you referring to?" debate in perpetuity. "DPA partition", "DPA
> > resource", "DPA capacity" anything but "region".
> >
> >
> 
> This next comment is not my main point to discuss in this email 
> (resources initialization is), but I seize it for giving my view in this 
> one.
> 
> Dan, you say later we (Linux) are not obligated to use "questionable 
> naming decisions of specifications", but we should not confuse people 
> either.
> 
> Maybe CXL_MAX_DC_HW_REGION would help here, for differentiating it from 
> the kernel software cxl region construct. I think we will need a CXL 
> kernel dictionary sooner or later ...

I addressed this on the reply to Ira, and yes one of the first entries
in a Linux CXL terminology document is that "regions" are mapped memory
and partitions are DPA capacity.

> >>   /**
> >>    * struct cxl_dpa_perf - DPA performance property entry
> >>    * @dpa_range: range for DPA address
> >> @@ -434,6 +435,8 @@ struct cxl_dpa_perf {
> >>    * @dpa_res: Overall DPA resource tree for the device
> >>    * @pmem_res: Active Persistent memory capacity configuration
> >>    * @ram_res: Active Volatile memory capacity configuration
> >> + * @dc_res: Active Dynamic Capacity memory configuration for each possible
> >> + *          region
> >>    * @serial: PCIe Device Serial Number
> >>    * @type: Generic Memory Class device or Vendor Specific Memory device
> >>    * @cxl_mbox: CXL mailbox context
> >> @@ -449,11 +452,23 @@ struct cxl_dev_state {
> >>   	struct resource dpa_res;
> >>   	struct resource pmem_res;
> >>   	struct resource ram_res;
> >> +	struct resource dc_res[CXL_MAX_DC_REGION];
> > This is throwing off cargo-cult alarms. The named pmem_res and ram_res
> > served us well up until the point where DPA partitions grew past 2 types
> > at well defined locations. I like the array of resources idea, but that
> > begs the question why not put all partition information into an array?
> >
> > This would also head off complications later on in this series where the
> > DPA capacity reservation and allocation flows have "dc" sidecars bolted
> > on rather than general semantics like "allocating from partition index N
> > means that all partitions indices less than N need to be skipped and
> > marked reserved".
> 
> 
> I guess this is likely how you want to change the type2 resource 
> initialization issue and where I'm afraid these two patchsets are going 
> to collide at.
> 
> If that is the case, both are going to miss the next kernel cycle since 
> it means major changes, but let's discuss it without further delays for 
> the sake of implementing the accepted changes as soon as possible, and I 
> guess with a close sync between Ira and I.

Type-2, as far as I can see, is a priority because it is in support of a
real device that end users can get their hands on today, right?

DCD, as far as I know, has no known product intercepts, just QEMU
emulation. So there is no rush there. If someone has information to the
contrary please share, if able.

The Type-2 series can also be prioritized because it is something we can get
done without cross-subsystem entanglements. So, no I do not think the
door is closed on Type-2 for v6.14, but it is certainly close which is
why I am throwing out code suggestions along with the review.

Otherwise, when 2 patch series trip over the same design wart (i.e. the
no longer suitable explicit ->ram_res and ->pmem_res members of 'struct
cxl_dev_state'), the cleanup needs to come first.

> BTW, in the case of the Type2, there are more things to discuss which I 
> do there.

Yes, hopefully it goes smoother after this point.
Jonathan Cameron Jan. 16, 2025, 10:32 a.m. UTC | #7
On Wed, 15 Jan 2025 14:34:36 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> Ira Weiny wrote:
> > Dan Williams wrote:  
> > > Ira Weiny wrote:  
> > 
> > [snip]
> >   
> > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > > index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> > > > --- a/drivers/cxl/cxlmem.h
> > > > +++ b/drivers/cxl/cxlmem.h
> > > > @@ -403,6 +403,7 @@ enum cxl_devtype {
> > > >  	CXL_DEVTYPE_CLASSMEM,
> > > >  };
> > > >  
> > > > +#define CXL_MAX_DC_REGION 8  
> > > 
> > > Please no, lets not sign up to have the "which cxl 'region' concept are
> > > you referring to?" debate in perpetuity. "DPA partition", "DPA
> > > resource", "DPA capacity" anything but "region".
> > >   
> > 
> > I'm inclined to agree with Alejandro on this one.  I've walked this
> > tightrope quite a bit with this series.  But there are other places where
> > we have chosen to change the verbiage from the spec and it has made it
> > difficult for new comers to correlate the spec with the code.
> > 
> > So I like Alejandro's idea of adding "HW" to the name to indicate that we
> > are talking about a spec or hardware defined thing.  
> 
> See below, the only people that could potentially be bothered by the
> lack of spec terminology matching are the very same people that are
> sophisticated enough to have read the spec to know its a problem.

It's confusing me.  :)  I know the confusion source exists but
that doesn't mean I remember how all the terms match up.

> 
> > 
> > That said I am open to changing some names where it is clear it is a
> > software structure.  I'll audit the series for that.
> >   
> > > >  	u64 serial;
> > > >  	enum cxl_devtype type;
> > > >  	struct cxl_mailbox cxl_mbox;
> > > >  };
> > > >  
> > > > +#define CXL_DC_REGION_STRLEN 8
> > > > +struct cxl_dc_region_info {
> > > > +	u64 base;
> > > > +	u64 decode_len;
> > > > +	u64 len;  
> > > 
> > > Duplicating partition information in multiple places, like
> > > mds->dc_region[X].base and cxlds->dc_res[X].start, feels like an
> > > RFC-quality decision for expediency that needs to reconciled on the way
> > > to upstream.  
> > 
> > I think this was done to follow a pattern of the mds being passed around
> > rather than creating resources right when partitions are read.
> > 
> > Furthermore this stands to hold this information in CPU endianess rather
> > than holding an array of region info coming from the hardware.  
> 
> Yes, the ask is translate all of this into common information that lives
> at the cxl_dev_state level.
> 
> > 
> > Let see how other changes fall out before I go hacking this though.
> >   
> > >   
> > > > +	u64 blk_size;
> > > > +	u32 dsmad_handle;
> > > > +	u8 flags;
> > > > +	u8 name[CXL_DC_REGION_STRLEN];  
> > > 
> > > No, lets not entertain:
> > > 
> > >     printk("%s\n", mds->dc_region[index].name);
> > > 
> > > ...when:
> > > 
> > >     printk("dc%d\n", index);
> > > 
> > > ...will do.  
> > 
> > Actually these buffers provide a buffer for the (struct
> > resource)dc_res[x].name pointers to point to.  
> 
> I missed that specific detail, but I still challenge whether this
> precision is needed especially since it makes the data structure
> messier. Given these names are for debug only and multi-partition DCD
> devices seem unlikely to ever exist, just use a static shared name for
> adding to ->dpa_res.

Given the read only shared concept relies on multiple hardware dc regions
(I think they map to partitions) then we are very likely to see
multiples.  (maybe I'm lost in terminology as well).


...

> > > 
> > > Linux is not obligated to follow the questionable naming decisions of
> > > specifications.  
> > 
> > We are not.  But as Alejandro says it can be confusing if we don't make
> > some association to the spec.
> > 
> > What do you think about the HW/SW line I propose above?  
> 
> Rename to cxl_dc_partition_info and drop the region_ prefixes, sure.
> 
> Otherwise, for this init-time only concern I would much rather deal with
> the confusion of:
> 
> "why does Linux call this partition when the spec calls it region?":
> which only trips up people that already know the difference because they read the
> spec. In that case the comment will answer their confusion.
> 
> ...versus:
> 
> "why are there multiple region concepts in the CXL subsystem": which
> trips up everyone that greps through the CXL subsystem especially those
> that have no intention of ever reading the spec.

versus one time rename of all internal infrastructure to align to the spec
and only keep the confusion at the boundaries where we have ABI.

Horrible option but how often are those diving in the code that bothered
about the userspace /kernel interaction terminology?

Anyhow, they are all horrible choices.

Jonathan

>
Ira Weiny Jan. 22, 2025, 6:02 p.m. UTC | #8
Dan Williams wrote:
> Ira Weiny wrote:
> > Dan Williams wrote:
> > > Ira Weiny wrote:
> > 
> > [snip]
> > 
> > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > > index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> > > > --- a/drivers/cxl/cxlmem.h
> > > > +++ b/drivers/cxl/cxlmem.h
> > > > @@ -403,6 +403,7 @@ enum cxl_devtype {
> > > >  	CXL_DEVTYPE_CLASSMEM,
> > > >  };
> > > >  
> > > > +#define CXL_MAX_DC_REGION 8
> > > 
> > > Please no, lets not sign up to have the "which cxl 'region' concept are
> > > you referring to?" debate in perpetuity. "DPA partition", "DPA
> > > resource", "DPA capacity" anything but "region".
> > > 
> > 
> > I'm inclined to agree with Alejandro on this one.  I've walked this
> > tightrope quite a bit with this series.  But there are other places where
> > we have chosen to change the verbiage from the spec and it has made it
> > difficult for new comers to correlate the spec with the code.
> > 
> > So I like Alejandro's idea of adding "HW" to the name to indicate that we
> > are talking about a spec or hardware defined thing.
> 
> See below, the only people that could potentially be bothered by the
> lack of spec terminology matching are the very same people that are
> sophisticated enough to have read the spec to know its a problem.

Honestly at this point I think the code has deviated enough from the spec
that it is just not worth me saying any more.  I'll change everything to
partition and field the questions later as they come.

> 
> > 
> > That said I am open to changing some names where it is clear it is a
> > software structure.  I'll audit the series for that.
> > 
> > > >  	u64 serial;
> > > >  	enum cxl_devtype type;
> > > >  	struct cxl_mailbox cxl_mbox;
> > > >  };
> > > >  
> > > > +#define CXL_DC_REGION_STRLEN 8
> > > > +struct cxl_dc_region_info {
> > > > +	u64 base;
> > > > +	u64 decode_len;
> > > > +	u64 len;
> > > 
> > > Duplicating partition information in multiple places, like
> > > mds->dc_region[X].base and cxlds->dc_res[X].start, feels like an
> > > RFC-quality decision for expediency that needs to reconciled on the way
> > > to upstream.
> > 
> > I think this was done to follow a pattern of the mds being passed around
> > rather than creating resources right when partitions are read.
> > 
> > Furthermore this stands to hold this information in CPU endianess rather
> > than holding an array of region info coming from the hardware.
> 
> Yes, the ask is translate all of this into common information that lives
> at the cxl_dev_state level.

yea.  And build on what you have in the DPA rework.

> 
> > 
> > Let see how other changes fall out before I go hacking this though.
> > 
> > > 
> > > > +	u64 blk_size;
> > > > +	u32 dsmad_handle;
> > > > +	u8 flags;
> > > > +	u8 name[CXL_DC_REGION_STRLEN];
> > > 
> > > No, lets not entertain:
> > > 
> > >     printk("%s\n", mds->dc_region[index].name);
> > > 
> > > ...when:
> > > 
> > >     printk("dc%d\n", index);
> > > 
> > > ...will do.
> > 
> > Actually these buffers provide a buffer for the (struct
> > resource)dc_res[x].name pointers to point to.
> 
> I missed that specific detail, but I still challenge whether this
> precision is needed especially since it makes the data structure
> messier. Given these names are for debug only and multi-partition DCD
> devices seem unlikely to ever exist, just use a static shared name for
> adding to ->dpa_res.

Using a static name is good.

> 
> > 
> > > 
> > > DCD introduces the concept of "decode size vs usable capacity" into the
> > > partition information, but I see no reason to conceptually tie that to
> > > only DCD.  Fabio's memory hole patches show that there is already a
> > > memory-hole concept in the CXL arena. DCD is just saying "be prepared for
> > > the concept of DPA partitions with memory holes at the end".
> > 
> > I'm not clear how this relates.  ram and pmem partitions can already have
> > holes at the end if not mapped.
> 
> The distinction is "can this DPA capacity be allocated to a region" the
> new holes introduced by DCD are cases where the partition size is
> greater than the allocatable size. Contrast to ram and pmem the
> allocatable size is always identical to the partition size.

I still don't quite get what you are saying.  The user can always allocate
a region of the full DCD partition size.  It is just that the memory
within that region may not be backed yet (no extents).

> 
> > > > +
> > > >  	struct cxl_event_state event;
> > > >  	struct cxl_poison_state poison;
> > > >  	struct cxl_security_state security;
> > > > @@ -708,6 +732,32 @@ struct cxl_mbox_set_partition_info {
> > > >  
> > > >  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
> > > >  
> > > > +/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
> > > > +struct cxl_mbox_get_dc_config_in {
> > > > +	u8 region_count;
> > > > +	u8 start_region_index;
> > > > +} __packed;
> > > > +
> > > > +/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
> > > > +struct cxl_mbox_get_dc_config_out {
> > > > +	u8 avail_region_count;
> > > > +	u8 regions_returned;
> > > > +	u8 rsvd[6];
> > > > +	/* See CXL 3.1 Table 8-165 */
> > > > +	struct cxl_dc_region_config {
> > > > +		__le64 region_base;
> > > > +		__le64 region_decode_length;
> > > > +		__le64 region_length;
> > > > +		__le64 region_block_size;
> > > > +		__le32 region_dsmad_handle;
> > > > +		u8 flags;
> > > > +		u8 rsvd[3];
> > > > +	} __packed region[] __counted_by(regions_returned);
> > > 
> > > Yes, the spec unfortunately uses "region" for this partition info
> > > payload. This would be a good place to say "CXL spec calls this 'region'
> > > but Linux calls it 'partition' not to be confused with the Linux 'struct
> > > cxl_region' or all the other usages of 'region' in the specification".
> > 
> > In this case I totally disagree.  This is a structure being filled in by
> > the hardware and is directly related to the spec.  I think I would rather
> > change 
> > 
> > s/cxl_dc_region_info/cxl_dc_partition_info/
> > 
> > And leave this.  Which draws a more distinct line between what is
> > specified in hardware vs a software construct.
> > 
> > > 
> > > Linux is not obligated to follow the questionable naming decisions of
> > > specifications.
> > 
> > We are not.  But as Alejandro says it can be confusing if we don't make
> > some association to the spec.
> > 
> > What do you think about the HW/SW line I propose above?
> 
> Rename to cxl_dc_partition_info and drop the region_ prefixes, sure.
> 
> Otherwise, for this init-time only concern I would much rather deal with
> the confusion of:
> 
> "why does Linux call this partition when the spec calls it region?":

But this is not the question I will get.  The question will be.

"Where is DCD region processed in the code?  I grepped for region and
found nothing."

Or

"I'm searching the spec PDF for DCD partition and can't find that.  Where
is DCD partition specified?"

> which only trips up people that already know the difference because they read the
> spec. In that case the comment will answer their confusion.
> 
> ...versus:
> 
> "why are there multiple region concepts in the CXL subsystem": which
> trips up everyone that greps through the CXL subsystem especially those
> that have no intention of ever reading the spec.

Ok I've already said more than I intended.  I will change everything to
partition.

Ira
Dan Williams Jan. 22, 2025, 9:02 p.m. UTC | #9
Jonathan Cameron wrote:
> On Wed, 15 Jan 2025 14:34:36 -0800
> Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > Ira Weiny wrote:
> > > Dan Williams wrote:  
> > > > Ira Weiny wrote:  
> > > 
> > > [snip]
> > >   
> > > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > > > index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
> > > > > --- a/drivers/cxl/cxlmem.h
> > > > > +++ b/drivers/cxl/cxlmem.h
> > > > > @@ -403,6 +403,7 @@ enum cxl_devtype {
> > > > >  	CXL_DEVTYPE_CLASSMEM,
> > > > >  };
> > > > >  
> > > > > +#define CXL_MAX_DC_REGION 8  
> > > > 
> > > > Please no, lets not sign up to have the "which cxl 'region' concept are
> > > > you referring to?" debate in perpetuity. "DPA partition", "DPA
> > > > resource", "DPA capacity" anything but "region".
> > > >   
> > > 
> > > I'm inclined to agree with Alejandro on this one.  I've walked this
> > > tightrope quite a bit with this series.  But there are other places where
> > > we have chosen to change the verbiage from the spec and it has made it
> > > difficult for new comers to correlate the spec with the code.
> > > 
> > > So I like Alejandro's idea of adding "HW" to the name to indicate that we
> > > are talking about a spec or hardware defined thing.  
> > 
> > See below, the only people that could potentially be bothered by the
> > lack of spec terminology matching are the very same people that are
> > sophisticated enough to have read the spec to know its a problem.
> 
> It's confusing me.  :)  I know the confusion source exists but
> that doesn't mean I remember how all the terms match up.

CXL 3.1 Figure 9-24 DCD DPA Space Example

In that one diagram it uses "space", "capacity", "partition", and
"region". Linux is free to say "let's just pick one term and stick to
it". "Region" is already oversubscribed.

I agree with Alejandro that a glossary of Linux terms added to the
Documentation is overdue and would help people orient to what maps
where. That would be needed even if the "continue to oversubscribe
'region'" proposal went through to explain "oh, no, not that 'region'
*this* 'region'".

[..]
> > Actually these buffers provide a buffer for the (struct
> > > resource)dc_res[x].name pointers to point to.  
> > 
> > I missed that specific detail, but I still challenge whether this
> > precision is needed especially since it makes the data structure
> > messier. Given these names are for debug only and multi-partition DCD
> > devices seem unlikely to ever exist, just use a static shared name for
> > adding to ->dpa_res.
> 
> Given the read only shared concept relies on multiple hardware dc regions
> (I think they map to partitions) then we are very likely to see
> multiples.  (maybe I'm lost in terminology as well).

Ah, good point. I was focusing on "devices with DPA partitions of
different performance characteristics within the same operation mode" as
being unlikely, but "devices with both shared and non-shared capacity"
indeed seems more likely.

Now, part of the code smell that made me fall out of love with 'enum
cxl_decoder_mode' was this continued confusion between mode names and
partition ids, where printing "dc%d" to the resource name was part of
that smell.

The proposal for what goes into the "name" field of partition resources
in the "DPA metadata is a mess..." series is to disconnect operation
modes from partition indices. A natural consequence of allowing "pmem"
to be partition 0, is that a dynamic device may also have 0 static
capacity, or other arrangements that make the partition-id less
meaningful to userspace.

So instead of needing to print "dc%d" into the resource name field, the
resource name is simply the operation mode: ram, pmem, dynamic ram,
dynamic pmem*, shared ram, shared pmem*.

The implication is that userspace does not need to care about partition
ids, unless and until a device shows up that ships multiple partitions
with the same operation mode, but different performance characteristics.
If that happens userspace would need a knob to disambiguate partitions
with the same operation mode. That does not feel like something that is
threatening to become real in the near term, and partition ids can
continue to be hidden from userspace.

* I doubt we will see dynamic pmem or shared pmem.

[..]
> > > > Linux is not obligated to follow the questionable naming decisions of
> > > > specifications.  
> > > 
> > > We are not.  But as Alejandro says it can be confusing if we don't make
> > > some association to the spec.
> > > 
> > > What do you think about the HW/SW line I propose above?  
> > 
> > Rename to cxl_dc_partition_info and drop the region_ prefixes, sure.
> > 
> > Otherwise, for this init-time only concern I would much rather deal with
> > the confusion of:
> > 
> > "why does Linux call this partition when the spec calls it region?":
> > which only trips up people that already know the difference because they read the
> > spec. In that case the comment will answer their confusion.
> > 
> > ...versus:
> > 
> > "why are there multiple region concepts in the CXL subsystem": which
> > trips up everyone that greps through the CXL subsystem especially those
> > that have no intention of ever reading the spec.
> 
> versus one time rename of all internal infrastructure to align to the spec
> and only keep the confusion at the boundaries where we have ABI.

That's just it, to date 'region' has always meant 'struct cxl_region' in
drivers/cxl/, so there is no one time rename to be had. The decision is
whether to decline new claimers of the 'region' moniker and create a
document to explain that term, or play the "dc region" ambiguity game
for the duration.

I vote "diverge from spec and document".

> Horrible option but how often are those diving in the code that bothered
> about the userspace /kernel interaction terminology?
> 
> Anyhow, they are all horrible choices.

Agree!
Dan Williams Jan. 22, 2025, 9:30 p.m. UTC | #10
Ira Weiny wrote:
[..]
> > The distinction is "can this DPA capacity be allocated to a region" the
> > new holes introduced by DCD are cases where the partition size is
> > greater than the allocatable size. Contrast to ram and pmem the
> > allocatable size is always identical to the partition size.
> 
> I still don't quite get what you are saying.  The user can always allocate
> a region of the full DCD partition size.  It is just that the memory

Quick note: "region of the full DCD partition size" means something to
me immediately where "region of full DC region size" would tempt a
clarifying question.

> within that region may not be backed yet (no extents).

A partition is a boundary between DPA capacity ranges of different
operation modes / performance characteristics. A region is constructed
of decoders that reference decode length. The usable capacity of that
decode range, even when fully populated with extents, may be less than
the decode range of the decoder / partition. That is similar in concept
to the low-memory-hole where the decoders map more in their range than
the usable capacity seen by the region.

[..]
> > > > Linux is not obligated to follow the questionable naming decisions of
> > > > specifications.
> > > 
> > > We are not.  But as Alejandro says it can be confusing if we don't make
> > > some association to the spec.
> > > 
> > > What do you think about the HW/SW line I propose above?
> > 
> > Rename to cxl_dc_partition_info and drop the region_ prefixes, sure.
> > 
> > Otherwise, for this init-time only concern I would much rather deal with
> > the confusion of:
> > 
> > "why does Linux call this partition when the spec calls it region?":
> 
> But this is not the question I will get.  The question will be.
> 
> "Where is DCD region processed in the code?  I grepped for region and
> found nothing."

Make grep find that definition:

$ git grep -n -C2 -i "dc region"
Documentation/driver-api/cxl/memory-devices.rst-387-
Documentation/driver-api/cxl/memory-devices.rst-388-partition: a span of DPA capacity delineated by "Get Partition Info", or
Documentation/driver-api/cxl/memory-devices.rst:389:"Get Dynamic Capacity Configuration". A "DC Region" is equivalent to a

> Or
> 
> "I'm searching the spec PDF for DCD partition and can't find that.  Where
> is DCD partition specified?"

If they are coming from the code first that's why we have spec reference
comments in the code.
diff mbox series

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 599934d066518341eb6ea9fc3319cd7098cbc2f3..a4cf9fbb1edfa275e8566bfacea03a49d68f9319 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1168,7 +1168,7 @@  int cxl_dev_state_identify(struct cxl_memdev_state *mds)
 	if (rc < 0)
 		return rc;
 
-	mds->total_bytes =
+	mds->static_bytes =
 		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
 	mds->volatile_only_bytes =
 		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
@@ -1274,6 +1274,154 @@  int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
 	return rc;
 }
 
+static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index,
+				   struct cxl_dc_region_config *region_config)
+{
+	struct cxl_dc_region_info *dcr = &mds->dc_region[index];
+	struct device *dev = mds->cxlds.dev;
+
+	dcr->base = le64_to_cpu(region_config->region_base);
+	dcr->decode_len = le64_to_cpu(region_config->region_decode_length);
+	dcr->decode_len *= CXL_CAPACITY_MULTIPLIER;
+	dcr->len = le64_to_cpu(region_config->region_length);
+	dcr->blk_size = le64_to_cpu(region_config->region_block_size);
+	dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle);
+	dcr->flags = region_config->flags;
+	snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index);
+
+	/* Check regions are in increasing DPA order */
+	if (index > 0) {
+		struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1];
+
+		if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) {
+			dev_err(dev,
+				"DPA ordering violation for DC region %d and %d\n",
+				index - 1, index);
+			return -EINVAL;
+		}
+	}
+
+	if (!IS_ALIGNED(dcr->base, SZ_256M) ||
+	    !IS_ALIGNED(dcr->base, dcr->blk_size)) {
+		dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n",
+			index, dcr->base, dcr->blk_size);
+		return -EINVAL;
+	}
+
+	if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len ||
+	    !IS_ALIGNED(dcr->len, dcr->blk_size)) {
+		dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n",
+			index, dcr->decode_len, dcr->len, dcr->blk_size);
+		return -EINVAL;
+	}
+
+	if (dcr->blk_size == 0 || dcr->blk_size % CXL_DCD_BLOCK_LINE_SIZE ||
+	    !is_power_of_2(dcr->blk_size)) {
+		dev_err(dev, "DC region %d invalid block size; %#llx\n",
+			index, dcr->blk_size);
+		return -EINVAL;
+	}
+
+	dev_dbg(dev,
+		"DC region %s base %#llx length %#llx block size %#llx\n",
+		dcr->name, dcr->base, dcr->decode_len, dcr->blk_size);
+
+	return 0;
+}
+
+/* Returns the number of regions in dc_resp or -ERRNO */
+static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region,
+			     struct cxl_mbox_get_dc_config_out *dc_resp,
+			     size_t dc_resp_size)
+{
+	struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) {
+		.region_count = CXL_MAX_DC_REGION,
+		.start_region_index = start_region,
+	};
+	struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_GET_DC_CONFIG,
+		.payload_in = &get_dc,
+		.size_in = sizeof(get_dc),
+		.size_out = dc_resp_size,
+		.payload_out = dc_resp,
+		.min_out = 1,
+	};
+	struct device *dev = mds->cxlds.dev;
+	int rc;
+
+	rc = cxl_internal_send_cmd(&mds->cxlds.cxl_mbox, &mbox_cmd);
+	if (rc < 0)
+		return rc;
+
+	dev_dbg(dev, "Read %d/%d DC regions\n",
+		dc_resp->regions_returned, dc_resp->avail_region_count);
+	return dc_resp->regions_returned;
+}
+
+/**
+ * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity
+ *					 information from the device.
+ * @mds: The memory device state
+ *
+ * Read Dynamic Capacity information from the device and populate the state
+ * structures for later use.
+ *
+ * Return: 0 if identify was executed successfully, -ERRNO on error.
+ */
+int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds)
+{
+	size_t dc_resp_size = mds->cxlds.cxl_mbox.payload_size;
+	struct device *dev = mds->cxlds.dev;
+	u8 start_region;
+
+	if (!cxl_dcd_supported(mds)) {
+		dev_dbg(dev, "DCD not supported\n");
+		return 0;
+	}
+
+	struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) =
+					kvmalloc(dc_resp_size, GFP_KERNEL);
+	if (!dc_resp)
+		return -ENOMEM;
+
+	start_region = 0;
+	do {
+		int rc, i, j;
+
+		rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size);
+		if (rc < 0) {
+			dev_err(dev, "Failed to get DC config: %d\n", rc);
+			return rc;
+		}
+
+		mds->nr_dc_region += rc;
+
+		if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) {
+			dev_err(dev, "Invalid num of dynamic capacity regions %d\n",
+				mds->nr_dc_region);
+			return -EINVAL;
+		}
+
+		for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) {
+			rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]);
+			if (rc)
+				return rc;
+		}
+
+		start_region = mds->nr_dc_region;
+
+	} while (mds->nr_dc_region < dc_resp->avail_region_count);
+
+	mds->dynamic_bytes =
+		mds->dc_region[mds->nr_dc_region - 1].base +
+		mds->dc_region[mds->nr_dc_region - 1].decode_len -
+		mds->dc_region[0].base;
+	dev_dbg(dev, "Total dynamic range: %#llx\n", mds->dynamic_bytes);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, "CXL");
+
 static int add_dpa_res(struct device *dev, struct resource *parent,
 		       struct resource *res, resource_size_t start,
 		       resource_size_t size, const char *type)
@@ -1304,8 +1452,15 @@  int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 {
 	struct cxl_dev_state *cxlds = &mds->cxlds;
 	struct device *dev = cxlds->dev;
+	size_t untenanted_mem;
 	int rc;
 
+	mds->total_bytes = mds->static_bytes;
+	if (mds->nr_dc_region) {
+		untenanted_mem = mds->dc_region[0].base - mds->static_bytes;
+		mds->total_bytes += untenanted_mem + mds->dynamic_bytes;
+	}
+
 	if (!cxlds->media_ready) {
 		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
 		cxlds->ram_res = DEFINE_RES_MEM(0, 0);
@@ -1315,6 +1470,15 @@  int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 
 	cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes);
 
+	for (int i = 0; i < mds->nr_dc_region; i++) {
+		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
+
+		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i],
+				 dcr->base, dcr->decode_len, dcr->name);
+		if (rc)
+			return rc;
+	}
+
 	if (mds->partition_align_bytes == 0) {
 		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
 				 mds->volatile_only_bytes, "ram");
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index e8907c403edbd83c8a36b8d013c6bc3391207ee6..05a0718aea73b3b2a02c608bae198eac7c462523 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -403,6 +403,7 @@  enum cxl_devtype {
 	CXL_DEVTYPE_CLASSMEM,
 };
 
+#define CXL_MAX_DC_REGION 8
 /**
  * struct cxl_dpa_perf - DPA performance property entry
  * @dpa_range: range for DPA address
@@ -434,6 +435,8 @@  struct cxl_dpa_perf {
  * @dpa_res: Overall DPA resource tree for the device
  * @pmem_res: Active Persistent memory capacity configuration
  * @ram_res: Active Volatile memory capacity configuration
+ * @dc_res: Active Dynamic Capacity memory configuration for each possible
+ *          region
  * @serial: PCIe Device Serial Number
  * @type: Generic Memory Class device or Vendor Specific Memory device
  * @cxl_mbox: CXL mailbox context
@@ -449,11 +452,23 @@  struct cxl_dev_state {
 	struct resource dpa_res;
 	struct resource pmem_res;
 	struct resource ram_res;
+	struct resource dc_res[CXL_MAX_DC_REGION];
 	u64 serial;
 	enum cxl_devtype type;
 	struct cxl_mailbox cxl_mbox;
 };
 
+#define CXL_DC_REGION_STRLEN 8
+struct cxl_dc_region_info {
+	u64 base;
+	u64 decode_len;
+	u64 len;
+	u64 blk_size;
+	u32 dsmad_handle;
+	u8 flags;
+	u8 name[CXL_DC_REGION_STRLEN];
+};
+
 static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
 {
 	return dev_get_drvdata(cxl_mbox->host);
@@ -473,7 +488,9 @@  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
  * @dcd_cmds: List of DCD commands implemented by memory device
  * @enabled_cmds: Hardware commands found enabled in CEL.
  * @exclusive_cmds: Commands that are kernel-internal only
- * @total_bytes: sum of all possible capacities
+ * @total_bytes: length of all possible capacities
+ * @static_bytes: length of possible static RAM and PMEM partitions
+ * @dynamic_bytes: length of possible DC partitions (DC Regions)
  * @volatile_only_bytes: hard volatile capacity
  * @persistent_only_bytes: hard persistent capacity
  * @partition_align_bytes: alignment size for partition-able capacity
@@ -483,6 +500,8 @@  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
  * @next_persistent_bytes: persistent capacity change pending device reset
  * @ram_perf: performance data entry matched to RAM partition
  * @pmem_perf: performance data entry matched to PMEM partition
+ * @nr_dc_region: number of DC regions implemented in the memory device
+ * @dc_region: array containing info about the DC regions
  * @event: event log driver state
  * @poison: poison driver state info
  * @security: security driver state info
@@ -499,6 +518,8 @@  struct cxl_memdev_state {
 	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
 	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
 	u64 total_bytes;
+	u64 static_bytes;
+	u64 dynamic_bytes;
 	u64 volatile_only_bytes;
 	u64 persistent_only_bytes;
 	u64 partition_align_bytes;
@@ -510,6 +531,9 @@  struct cxl_memdev_state {
 	struct cxl_dpa_perf ram_perf;
 	struct cxl_dpa_perf pmem_perf;
 
+	u8 nr_dc_region;
+	struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION];
+
 	struct cxl_event_state event;
 	struct cxl_poison_state poison;
 	struct cxl_security_state security;
@@ -708,6 +732,32 @@  struct cxl_mbox_set_partition_info {
 
 #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
 
+/* See CXL 3.1 Table 8-163 get dynamic capacity config Input Payload */
+struct cxl_mbox_get_dc_config_in {
+	u8 region_count;
+	u8 start_region_index;
+} __packed;
+
+/* See CXL 3.1 Table 8-164 get dynamic capacity config Output Payload */
+struct cxl_mbox_get_dc_config_out {
+	u8 avail_region_count;
+	u8 regions_returned;
+	u8 rsvd[6];
+	/* See CXL 3.1 Table 8-165 */
+	struct cxl_dc_region_config {
+		__le64 region_base;
+		__le64 region_decode_length;
+		__le64 region_length;
+		__le64 region_block_size;
+		__le32 region_dsmad_handle;
+		u8 flags;
+		u8 rsvd[3];
+	} __packed region[] __counted_by(regions_returned);
+	/* Trailing fields unused */
+} __packed;
+#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0)
+#define CXL_DCD_BLOCK_LINE_SIZE 0x40
+
 /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
 struct cxl_mbox_set_timestamp_in {
 	__le64 timestamp;
@@ -831,6 +881,7 @@  enum {
 int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
 			  struct cxl_mbox_cmd *cmd);
 int cxl_dev_state_identify(struct cxl_memdev_state *mds);
+int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds);
 int cxl_await_media_ready(struct cxl_dev_state *cxlds);
 int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
 int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
@@ -844,6 +895,17 @@  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 			    enum cxl_event_log_type type,
 			    enum cxl_event_type event_type,
 			    const uuid_t *uuid, union cxl_event *evt);
+
+static inline bool cxl_dcd_supported(struct cxl_memdev_state *mds)
+{
+	return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
+}
+
+static inline void cxl_disable_dcd(struct cxl_memdev_state *mds)
+{
+	clear_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds);
+}
+
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 0241d1d7133a4b9c3fe3fddfdc0bcc9cf807ee11..5082625a7b3f51a84f894a3265e922e51b794b68 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -989,6 +989,10 @@  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	rc = cxl_dev_dynamic_capacity_identify(mds);
+	if (rc)
+		cxl_disable_dcd(mds);
+
 	rc = cxl_mem_create_range_info(mds);
 	if (rc)
 		return rc;