diff mbox series

[2/5] cxl/region: Add dynamic capacity cxl region support.

Message ID 20230604-dcd-type2-upstream-v1-2-71b6341bae54@intel.com
State New, archived
Headers show
Series cxl/dcd: Add support for Dynamic Capacity Devices (DCD) | expand

Commit Message

Ira Weiny June 14, 2023, 7:16 p.m. UTC
From: Navneet Singh <navneet.singh@intel.com>

CXL devices optionally support dynamic capacity. CXL Regions must be
created to access this capacity.

Add sysfs entries to create dynamic capacity cxl regions. Provide a new
Dynamic Capacity decoder mode which targets dynamic capacity on devices
which are added to that region.

Below are the steps to create and delete dynamic capacity region0
(example).

    region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
    echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
    echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
    echo 1 > /sys/bus/cxl/devices/$region/interleave_ways

    echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
    echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size

    echo 0x400000000 > /sys/bus/cxl/devices/$region/size
    echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
    echo 1 > /sys/bus/cxl/devices/$region/commit
    echo $region > /sys/bus/cxl/drivers/cxl_region/bind

    echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region

Signed-off-by: Navneet Singh <navneet.singh@intel.com>

---
[iweiny: fixups]
[iweiny: remove unused CXL_DC_REGION_MODE macro]
[iweiny: Make dc_mode_to_region_index static]
[iweiny: simplify <sysfs>/create_dc_region]
[iweiny: introduce decoder_mode_is_dc]
[djbw: fixups, no sign-off: preview only]
---
 drivers/cxl/Kconfig       |  11 +++
 drivers/cxl/core/core.h   |   7 ++
 drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
 drivers/cxl/core/port.c   |  18 ++++
 drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
 drivers/cxl/cxl.h         |  28 ++++++
 drivers/dax/cxl.c         |   4 +
 7 files changed, 409 insertions(+), 28 deletions(-)

Comments

Dave Jiang June 14, 2023, 11:37 p.m. UTC | #1
On 6/14/23 12:16, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity. CXL Regions must be
> created to access this capacity.
> 
> Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> Dynamic Capacity decoder mode which targets dynamic capacity on devices
> which are added to that region.
> 
> Below are the steps to create and delete dynamic capacity region0
> (example).
> 
>      region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
>      echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
>      echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
>      echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> 
>      echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
>      echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> 
>      echo 0x400000000 > /sys/bus/cxl/devices/$region/size
>      echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
>      echo 1 > /sys/bus/cxl/devices/$region/commit
>      echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> 
>      echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> ---
> [iweiny: fixups]
> [iweiny: remove unused CXL_DC_REGION_MODE macro]
> [iweiny: Make dc_mode_to_region_index static]
> [iweiny: simplify <sysfs>/create_dc_region]
> [iweiny: introduce decoder_mode_is_dc]
> [djbw: fixups, no sign-off: preview only]
> ---
>   drivers/cxl/Kconfig       |  11 +++
>   drivers/cxl/core/core.h   |   7 ++
>   drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
>   drivers/cxl/core/port.c   |  18 ++++
>   drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
>   drivers/cxl/cxl.h         |  28 ++++++
>   drivers/dax/cxl.c         |   4 +
>   7 files changed, 409 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index ff4e78117b31..df034889d053 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -121,6 +121,17 @@ config CXL_REGION
>   
>   	  If unsure say 'y'
>   
> +config CXL_DCD
> +	bool "CXL: DCD Support"
> +	default CXL_BUS
> +	depends on CXL_REGION
> +	help
> +	  Enable the CXL core to provision CXL DCD regions.
> +	  CXL devices optionally support dynamic capacity and DCD region
> +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> +
> +	  If unsure say 'y'
> +
>   config CXL_REGION_INVALIDATION_TEST
>   	bool "CXL: Region Cache Management Bypass (TEST)"
>   	depends on CXL_REGION
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 27f0968449de..725700ab5973 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
>   
>   extern struct attribute_group cxl_base_attribute_group;
>   
> +#ifdef CONFIG_CXL_DCD
> +extern struct device_attribute dev_attr_create_dc_region;
> +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> +#else
> +#define SET_CXL_DC_REGION_ATTR(x)
> +#endif
> +
>   #ifdef CONFIG_CXL_REGION
>   extern struct device_attribute dev_attr_create_pmem_region;
>   extern struct device_attribute dev_attr_create_ram_region;
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 514d30131d92..29649b47d177 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>   	struct resource *res = cxled->dpa_res;
>   	resource_size_t skip_start;
> +	resource_size_t skipped = cxled->skip;
>   
>   	lockdep_assert_held_write(&cxl_dpa_rwsem);
>   
>   	/* save @skip_start, before @res is released */
> -	skip_start = res->start - cxled->skip;
> +	skip_start = res->start - skipped;
>   	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> -	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +	if (cxled->skip != 0) {
> +		while (skipped != 0) {
> +			res = xa_load(&cxled->skip_res, skip_start);
> +			__release_region(&cxlds->dpa_res, skip_start,
> +							resource_size(res));
> +			xa_erase(&cxled->skip_res, skip_start);
> +			skip_start += resource_size(res);
> +			skipped -= resource_size(res);
> +			}
> +	}
>   	cxled->skip = 0;
>   	cxled->dpa_res = NULL;
>   	put_device(&cxled->cxld.dev);
> @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>   	__cxl_dpa_release(cxled);
>   }
>   
> +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> +{
> +	int index = 0;
> +
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		if (mode == i)
> +			return index;
> +		index++;
> +	}
> +
> +	return -EINVAL;
> +}
> +
>   static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   			     resource_size_t base, resource_size_t len,
>   			     resource_size_t skipped)
> @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   	struct cxl_port *port = cxled_to_port(cxled);
>   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>   	struct device *dev = &port->dev;
> +	struct device *ed_dev = &cxled->cxld.dev;
> +	struct resource *dpa_res = &cxlds->dpa_res;
> +	resource_size_t skip_len = 0;
>   	struct resource *res;
> +	int rc, index;
>   
>   	lockdep_assert_held_write(&cxl_dpa_rwsem);
>   
> @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   	}
>   
>   	if (skipped) {
> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> -				       dev_name(&cxled->cxld.dev), 0);
> -		if (!res) {
> -			dev_dbg(dev,
> -				"decoder%d.%d: failed to reserve skipped space\n",
> -				port->id, cxled->cxld.id);
> -			return -EBUSY;
> +		resource_size_t skip_base = base - skipped;
> +
> +		if (decoder_mode_is_dc(cxled->mode)) {

Maybe move this entire block to a helper function to reduce the size of
the current function, reduce indent levels, and improve readability?

> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->ram_res.end) {
> +				skip_len = cxlds->ram_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->pmem_res.end) {
> +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			index = dc_mode_to_region_index(cxled->mode);
> +			for (int i = 0; i <= index; i++) {
> +				struct resource *dcr = &cxlds->dc_res[i];
> +
> +				if (skip_base < dcr->start) {
> +					skip_len = dcr->start - skip_base;
> +					res = __request_region(dpa_res,
> +							skip_base, skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +
> +				if (skip_base == base) {
> +					dev_dbg(dev, "skip done!\n");
> +					break;
> +				}
> +
> +				if (resource_size(dcr) &&
> +						skip_base <= dcr->end) {
> +					if (skip_base > base)
> +						dev_err(dev, "Skip error\n");
> +
> +					skip_len = dcr->end - skip_base + 1;
> +					res = __request_region(dpa_res, skip_base,
> +							skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +			}
> +		} else	{
> +			res = __request_region(dpa_res, base - skipped, skipped,
> +							dev_name(ed_dev), 0);
> +			if (!res)
> +				goto error;
> +
> +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
>   		}
>   	}
> -	res = __request_region(&cxlds->dpa_res, base, len,
> -			       dev_name(&cxled->cxld.dev), 0);
> +
> +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
>   	if (!res) {
>   		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> -			port->id, cxled->cxld.id);
> -		if (skipped)
> -			__release_region(&cxlds->dpa_res, base - skipped,
> -					 skipped);
> +				port->id, cxled->cxld.id);
> +		if (skipped) {
> +			resource_size_t skip_base = base - skipped;
> +
> +			while (skipped != 0) {
> +				if (skip_base > base)
> +					dev_err(dev, "Skip error\n");
> +
> +				res = xa_load(&cxled->skip_res, skip_base);
> +				__release_region(dpa_res, skip_base,
> +							resource_size(res));
> +				xa_erase(&cxled->skip_res, skip_base);
> +				skip_base += resource_size(res);
> +				skipped -= resource_size(res);
> +			}
> +		}
>   		return -EBUSY;
>   	}
>   	cxled->dpa_res = res;
>   	cxled->skip = skipped;
>   
> +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> +		int index = dc_mode_to_region_index(mode);
> +
> +		if (resource_contains(&cxlds->dc_res[index], res)) {
> +			cxled->mode = mode;
> +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> +			goto success;
> +		}
> +	}

This block should only happen if decoder_mode_is_dc(), right? If that's
the case, you might be able to refactor it so the 'goto success' isn't
necessary.

>   	if (resource_contains(&cxlds->pmem_res, res))
>   		cxled->mode = CXL_DECODER_PMEM;
>   	else if (resource_contains(&cxlds->ram_res, res))
> @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   		cxled->mode = CXL_DECODER_MIXED;
>   	}
>   
> +success:
>   	port->hdm_end++;
>   	get_device(&cxled->cxld.dev);
>   	return 0;
> +
> +error:
> +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> +			port->id, cxled->cxld.id);
> +	return -EBUSY;
> +
>   }
>   
>   int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>   	switch (mode) {
>   	case CXL_DECODER_RAM:
>   	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>   		break;
>   	default:
>   		dev_dbg(dev, "unsupported mode: %d\n", mode);
> @@ -456,6 +588,16 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>   		goto out;
>   	}
>   
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		int index = dc_mode_to_region_index(i);
> +
> +		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
> +			dev_dbg(dev, "no available dynamic capacity\n");
> +			rc = -ENXIO;
> +			goto out;
> +		}
> +	}
> +
>   	cxled->mode = mode;
>   	rc = 0;
>   out:
> @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>   					 resource_size_t *skip_out)
>   {
>   	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> -	resource_size_t free_ram_start, free_pmem_start;
> +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
>   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct device *dev = &cxled->cxld.dev;
>   	resource_size_t start, avail, skip;
>   	struct resource *p, *last;
> +	int index;
>   
>   	lockdep_assert_held(&cxl_dpa_rwsem);
>   
> @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>   	else
>   		free_pmem_start = cxlds->pmem_res.start;
>   
> +	/*
> +	 * One HDM Decoder per DC region to map memory with different
> +	 * DSMAS entry.
> +	 */
> +	index = dc_mode_to_region_index(cxled->mode);
> +	if (index >= 0) {
> +		if (cxlds->dc_res[index].child) {
> +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n",
> +					index);
> +			return -EINVAL;
> +		}
> +		free_dc_start = cxlds->dc_res[index].start;
> +	}
> +
>   	if (cxled->mode == CXL_DECODER_RAM) {
>   		start = free_ram_start;
>   		avail = cxlds->ram_res.end - start + 1;
> @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>   		else
>   			skip_end = start - 1;
>   		skip = skip_end - skip_start + 1;
> +	} else if (decoder_mode_is_dc(cxled->mode)) {
> +		resource_size_t skip_start, skip_end;
> +
> +		start = free_dc_start;
> +		avail = cxlds->dc_res[index].end - start + 1;
> +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> +			skip_start = free_ram_start;
> +		else
> +			skip_start = free_pmem_start;
> +		/*
> +		 * If some dc region is already mapped, then that allocation
> +		 * already handled the RAM and PMEM skip.Check for DC region
> +		 * skip.
> +		 */
> +		for (int i = index - 1; i >= 0 ; i--) {
> +			if (cxlds->dc_res[i].child) {
> +				skip_start = cxlds->dc_res[i].child->end + 1;
> +				break;
> +			}
> +		}
> +
> +		skip_end = start - 1;
> +		skip = skip_end - skip_start + 1;
>   	} else {
>   		dev_dbg(cxled_dev(cxled), "mode not set\n");
>   		avail = 0;
> @@ -548,10 +729,25 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>   
>   	avail = cxl_dpa_freespace(cxled, &start, &skip);
>   
> +	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
> +						start, size, skip);
>   	if (size > avail) {
> +		static const char * const names[] = {
> +			[CXL_DECODER_NONE] = "none",
> +			[CXL_DECODER_RAM] = "ram",
> +			[CXL_DECODER_PMEM] = "pmem",
> +			[CXL_DECODER_MIXED] = "mixed",
> +			[CXL_DECODER_DC0] = "dc0",
> +			[CXL_DECODER_DC1] = "dc1",
> +			[CXL_DECODER_DC2] = "dc2",
> +			[CXL_DECODER_DC3] = "dc3",
> +			[CXL_DECODER_DC4] = "dc4",
> +			[CXL_DECODER_DC5] = "dc5",
> +			[CXL_DECODER_DC6] = "dc6",
> +			[CXL_DECODER_DC7] = "dc7",
> +		};
>   		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
> -			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> -			&avail);
> +			names[cxled->mode], &avail);
>   		rc = -ENOSPC;
>   		goto out;
>   	}
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 5e21b53362e6..a1a98aba24ed 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -195,6 +195,22 @@ static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
>   		mode = CXL_DECODER_PMEM;
>   	else if (sysfs_streq(buf, "ram"))
>   		mode = CXL_DECODER_RAM;
> +	else if (sysfs_streq(buf, "dc0"))
> +		mode = CXL_DECODER_DC0;
> +	else if (sysfs_streq(buf, "dc1"))
> +		mode = CXL_DECODER_DC1;
> +	else if (sysfs_streq(buf, "dc2"))
> +		mode = CXL_DECODER_DC2;
> +	else if (sysfs_streq(buf, "dc3"))
> +		mode = CXL_DECODER_DC3;
> +	else if (sysfs_streq(buf, "dc4"))
> +		mode = CXL_DECODER_DC4;
> +	else if (sysfs_streq(buf, "dc5"))
> +		mode = CXL_DECODER_DC5;
> +	else if (sysfs_streq(buf, "dc6"))
> +		mode = CXL_DECODER_DC6;
> +	else if (sysfs_streq(buf, "dc7"))
> +		mode = CXL_DECODER_DC7;
>   	else
>   		return -EINVAL;
>   
> @@ -296,6 +312,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
>   	&dev_attr_target_list.attr,
>   	SET_CXL_REGION_ATTR(create_pmem_region)
>   	SET_CXL_REGION_ATTR(create_ram_region)
> +	SET_CXL_DC_REGION_ATTR(create_dc_region)
>   	SET_CXL_REGION_ATTR(delete_region)
>   	NULL,
>   };
> @@ -1691,6 +1708,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
>   		return ERR_PTR(-ENOMEM);
>   
>   	cxled->pos = -1;
> +	xa_init(&cxled->skip_res);
>   	cxld = &cxled->cxld;
>   	rc = cxl_decoder_init(port, cxld);
>   	if (rc)	 {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 543c4499379e..144232c8305e 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
>   	lockdep_assert_held_write(&cxl_region_rwsem);
>   	lockdep_assert_held_read(&cxl_dpa_rwsem);
>   
> -	if (cxled->mode != cxlr->mode) {
> +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
>   		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
>   			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
>   		return -EINVAL;
> @@ -2211,6 +2211,14 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>   	switch (mode) {
>   	case CXL_DECODER_RAM:
>   	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>   		break;
>   	default:
>   		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
> @@ -2321,6 +2329,43 @@ static ssize_t create_ram_region_store(struct device *dev,
>   }
>   DEVICE_ATTR_RW(create_ram_region);
>   
> +static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
> +				const char *buf, enum cxl_decoder_mode mode,
> +				size_t len)
> +{
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	/*
> +	 * All DC regions use decoder mode DC0 as the region does not need the
> +	 * index information
> +	 */
> +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> +				CXL_DECODER_DC0, len);
> +}
> +DEVICE_ATTR_RW(create_dc_region);
> +
>   static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>   			   char *buf)
>   {
> @@ -2799,6 +2844,61 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>   	return rc;
>   }
>   
> +static void cxl_dc_region_release(void *data)
> +{
> +	struct cxl_region *cxlr = data;
> +	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
> +
> +	xa_destroy(&cxlr_dc->dax_dev_list);
> +	kfree(cxlr_dc);
> +}
> +
> +static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
> +{
> +	struct cxl_dc_region *cxlr_dc;
> +	struct cxl_dax_region *cxlr_dax;
> +	struct device *dev;
> +	int rc = 0;
> +
> +	cxlr_dax = cxl_dax_region_alloc(cxlr);
> +	if (IS_ERR(cxlr_dax))
> +		return PTR_ERR(cxlr_dax);
> +
> +	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
> +	if (!cxlr_dc) {
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +
> +	dev = &cxlr_dax->dev;
> +	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> +		dev_name(dev));
> +
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
> +					cxlr_dax);
> +	if (rc)
> +		goto err;
> +
> +	cxlr_dc->cxlr_dax = cxlr_dax;
> +	xa_init(&cxlr_dc->dax_dev_list);
> +	cxlr->cxlr_dc = cxlr_dc;
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
> +	if (!rc)
> +		return 0;
> +err:
> +	put_device(dev);
> +	kfree(cxlr_dc);
> +	return rc;
> +}
> +
>   static int match_decoder_by_range(struct device *dev, void *data)
>   {
>   	struct range *r1, *r2 = data;
> @@ -3140,6 +3240,19 @@ static int is_system_ram(struct resource *res, void *arg)
>   	return 1;
>   }
>   
> +/*
> + * The region can not be manged by CXL if any portion of
> + * it is already online as 'System RAM'
> + */
> +static bool region_is_system_ram(struct cxl_region *cxlr,
> +				 struct cxl_region_params *p)
> +{
> +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> +				    p->res->start, p->res->end, cxlr,
> +				    is_system_ram) > 0);
> +}
> +
>   static int cxl_region_probe(struct device *dev)
>   {
>   	struct cxl_region *cxlr = to_cxl_region(dev);
> @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
>   	case CXL_DECODER_PMEM:
>   		return devm_cxl_add_pmem_region(cxlr);
>   	case CXL_DECODER_RAM:
> -		/*
> -		 * The region can not be manged by CXL if any portion of
> -		 * it is already online as 'System RAM'
> -		 */
> -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> -					p->res->start, p->res->end, cxlr,
> -					is_system_ram) > 0)
> +		if (region_is_system_ram(cxlr, p))

Maybe split this change out as a prep patch before the current patch.

>   			return 0;
>   
>   		/*
> @@ -3193,6 +3299,17 @@ static int cxl_region_probe(struct device *dev)
>   
>   		/* HDM-H routes to device-dax */
>   		return devm_cxl_add_dax_region(cxlr);
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
> +		if (region_is_system_ram(cxlr, p))
> +			return 0;
> +		return devm_cxl_add_dc_region(cxlr);
>   	default:
>   		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
>   			cxlr->mode);
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 8400af85d99f..7ac1237938b7 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -335,6 +335,14 @@ enum cxl_decoder_mode {
>   	CXL_DECODER_NONE,
>   	CXL_DECODER_RAM,
>   	CXL_DECODER_PMEM,
> +	CXL_DECODER_DC0,
> +	CXL_DECODER_DC1,
> +	CXL_DECODER_DC2,
> +	CXL_DECODER_DC3,
> +	CXL_DECODER_DC4,
> +	CXL_DECODER_DC5,
> +	CXL_DECODER_DC6,
> +	CXL_DECODER_DC7,
>   	CXL_DECODER_MIXED,
>   	CXL_DECODER_DEAD,
>   };
> @@ -345,6 +353,14 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>   		[CXL_DECODER_NONE] = "none",
>   		[CXL_DECODER_RAM] = "ram",
>   		[CXL_DECODER_PMEM] = "pmem",
> +		[CXL_DECODER_DC0] = "dc0",
> +		[CXL_DECODER_DC1] = "dc1",
> +		[CXL_DECODER_DC2] = "dc2",
> +		[CXL_DECODER_DC3] = "dc3",
> +		[CXL_DECODER_DC4] = "dc4",
> +		[CXL_DECODER_DC5] = "dc5",
> +		[CXL_DECODER_DC6] = "dc6",
> +		[CXL_DECODER_DC7] = "dc7",
>   		[CXL_DECODER_MIXED] = "mixed",
>   	};
>   
> @@ -353,6 +369,11 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>   	return "mixed";
>   }
>   
> +static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
> +{
> +	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
> +}
> +
>   /*
>    * Track whether this decoder is reserved for region autodiscovery, or
>    * free for userspace provisioning.
> @@ -375,6 +396,7 @@ struct cxl_endpoint_decoder {
>   	struct cxl_decoder cxld;
>   	struct resource *dpa_res;
>   	resource_size_t skip;
> +	struct xarray skip_res;
>   	enum cxl_decoder_mode mode;
>   	enum cxl_decoder_state state;
>   	int pos;
> @@ -475,6 +497,11 @@ struct cxl_region_params {
>    */
>   #define CXL_REGION_F_AUTO 1
>   
> +struct cxl_dc_region {
> +	struct xarray dax_dev_list;
> +	struct cxl_dax_region *cxlr_dax;
> +};
> +
>   /**
>    * struct cxl_region - CXL region
>    * @dev: This region's device
> @@ -493,6 +520,7 @@ struct cxl_region {
>   	enum cxl_decoder_type type;
>   	struct cxl_nvdimm_bridge *cxl_nvb;
>   	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dc_region *cxlr_dc;
>   	unsigned long flags;
>   	struct cxl_region_params params;
>   };
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index ccdf8de85bd5..eb5eb81bfbd7 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
>   	if (!dax_region)
>   		return -ENOMEM;
>   
> +	if (decoder_mode_is_dc(cxlr->mode))
> +		return 0;
> +
>   	data = (struct dev_dax_data) {
>   		.dax_region = dax_region,
>   		.id = -1,
>   		.size = range_len(&cxlr_dax->hpa_range),
>   	};
> +

Stray blank line?

>   	dev_dax = devm_create_dev_dax(&data);
>   	if (IS_ERR(dev_dax))
>   		return PTR_ERR(dev_dax);
>
Alison Schofield June 15, 2023, 12:21 a.m. UTC | #2
On Wed, Jun 14, 2023 at 12:16:29PM -0700, Ira Weiny wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity. CXL Regions must be
> created to access this capacity.
> 
> Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> Dynamic Capacity decoder mode which targets dynamic capacity on devices
> which are added to that region.

This is a lot in one patch, especially where it weaves in and out of
existing code. I'm wondering if this can be introduced in smaller
pieces (patches). An introductory patch explaining the DC DPA 
allocations might be a useful chunk to pull forward. 

Alison

> 
> Below are the steps to create and delete dynamic capacity region0
> (example).
> 
>     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
>     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
>     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
>     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> 
>     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
>     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> 
>     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
>     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
>     echo 1 > /sys/bus/cxl/devices/$region/commit
>     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> 
>     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> ---
> [iweiny: fixups]
> [iweiny: remove unused CXL_DC_REGION_MODE macro]
> [iweiny: Make dc_mode_to_region_index static]
> [iweiny: simplify <sysfs>/create_dc_region]
> [iweiny: introduce decoder_mode_is_dc]
> [djbw: fixups, no sign-off: preview only]
> ---
>  drivers/cxl/Kconfig       |  11 +++
>  drivers/cxl/core/core.h   |   7 ++
>  drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
>  drivers/cxl/core/port.c   |  18 ++++
>  drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
>  drivers/cxl/cxl.h         |  28 ++++++
>  drivers/dax/cxl.c         |   4 +
>  7 files changed, 409 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index ff4e78117b31..df034889d053 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -121,6 +121,17 @@ config CXL_REGION
>  
>  	  If unsure say 'y'
>  
> +config CXL_DCD
> +	bool "CXL: DCD Support"
> +	default CXL_BUS
> +	depends on CXL_REGION
> +	help
> +	  Enable the CXL core to provision CXL DCD regions.
> +	  CXL devices optionally support dynamic capacity and DCD region
> +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> +
> +	  If unsure say 'y'
> +
>  config CXL_REGION_INVALIDATION_TEST
>  	bool "CXL: Region Cache Management Bypass (TEST)"
>  	depends on CXL_REGION
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 27f0968449de..725700ab5973 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
>  
>  extern struct attribute_group cxl_base_attribute_group;
>  
> +#ifdef CONFIG_CXL_DCD
> +extern struct device_attribute dev_attr_create_dc_region;
> +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> +#else
> +#define SET_CXL_DC_REGION_ATTR(x)
> +#endif
> +
>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 514d30131d92..29649b47d177 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct resource *res = cxled->dpa_res;
>  	resource_size_t skip_start;
> +	resource_size_t skipped = cxled->skip;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
>  	/* save @skip_start, before @res is released */
> -	skip_start = res->start - cxled->skip;
> +	skip_start = res->start - skipped;
>  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> -	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +	if (cxled->skip != 0) {
> +		while (skipped != 0) {
> +			res = xa_load(&cxled->skip_res, skip_start);
> +			__release_region(&cxlds->dpa_res, skip_start,
> +							resource_size(res));
> +			xa_erase(&cxled->skip_res, skip_start);
> +			skip_start += resource_size(res);
> +			skipped -= resource_size(res);
> +			}
> +	}
>  	cxled->skip = 0;
>  	cxled->dpa_res = NULL;
>  	put_device(&cxled->cxld.dev);
> @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	__cxl_dpa_release(cxled);
>  }
>  
> +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> +{
> +	int index = 0;
> +
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		if (mode == i)
> +			return index;
> +		index++;
> +	}
> +
> +	return -EINVAL;
> +}
> +
>  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  			     resource_size_t base, resource_size_t len,
>  			     resource_size_t skipped)
> @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	struct cxl_port *port = cxled_to_port(cxled);
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct device *dev = &port->dev;
> +	struct device *ed_dev = &cxled->cxld.dev;
> +	struct resource *dpa_res = &cxlds->dpa_res;
> +	resource_size_t skip_len = 0;
>  	struct resource *res;
> +	int rc, index;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
> @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	}
>  
>  	if (skipped) {
> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> -				       dev_name(&cxled->cxld.dev), 0);
> -		if (!res) {
> -			dev_dbg(dev,
> -				"decoder%d.%d: failed to reserve skipped space\n",
> -				port->id, cxled->cxld.id);
> -			return -EBUSY;
> +		resource_size_t skip_base = base - skipped;
> +
> +		if (decoder_mode_is_dc(cxled->mode)) {
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->ram_res.end) {
> +				skip_len = cxlds->ram_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->pmem_res.end) {
> +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			index = dc_mode_to_region_index(cxled->mode);
> +			for (int i = 0; i <= index; i++) {
> +				struct resource *dcr = &cxlds->dc_res[i];
> +
> +				if (skip_base < dcr->start) {
> +					skip_len = dcr->start - skip_base;
> +					res = __request_region(dpa_res,
> +							skip_base, skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +
> +				if (skip_base == base) {
> +					dev_dbg(dev, "skip done!\n");
> +					break;
> +				}
> +
> +				if (resource_size(dcr) &&
> +						skip_base <= dcr->end) {
> +					if (skip_base > base)
> +						dev_err(dev, "Skip error\n");
> +
> +					skip_len = dcr->end - skip_base + 1;
> +					res = __request_region(dpa_res, skip_base,
> +							skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +			}
> +		} else	{
> +			res = __request_region(dpa_res, base - skipped, skipped,
> +							dev_name(ed_dev), 0);
> +			if (!res)
> +				goto error;
> +
> +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
>  		}
>  	}
> -	res = __request_region(&cxlds->dpa_res, base, len,
> -			       dev_name(&cxled->cxld.dev), 0);
> +
> +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
>  	if (!res) {
>  		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> -			port->id, cxled->cxld.id);
> -		if (skipped)
> -			__release_region(&cxlds->dpa_res, base - skipped,
> -					 skipped);
> +				port->id, cxled->cxld.id);
> +		if (skipped) {
> +			resource_size_t skip_base = base - skipped;
> +
> +			while (skipped != 0) {
> +				if (skip_base > base)
> +					dev_err(dev, "Skip error\n");
> +
> +				res = xa_load(&cxled->skip_res, skip_base);
> +				__release_region(dpa_res, skip_base,
> +							resource_size(res));
> +				xa_erase(&cxled->skip_res, skip_base);
> +				skip_base += resource_size(res);
> +				skipped -= resource_size(res);
> +			}
> +		}
>  		return -EBUSY;
>  	}
>  	cxled->dpa_res = res;
>  	cxled->skip = skipped;
>  
> +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> +		int index = dc_mode_to_region_index(mode);
> +
> +		if (resource_contains(&cxlds->dc_res[index], res)) {
> +			cxled->mode = mode;
> +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> +			goto success;
> +		}
> +	}
>  	if (resource_contains(&cxlds->pmem_res, res))
>  		cxled->mode = CXL_DECODER_PMEM;
>  	else if (resource_contains(&cxlds->ram_res, res))
> @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  		cxled->mode = CXL_DECODER_MIXED;
>  	}
>  
> +success:
>  	port->hdm_end++;
>  	get_device(&cxled->cxld.dev);
>  	return 0;
> +
> +error:
> +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> +			port->id, cxled->cxld.id);
> +	return -EBUSY;
> +
>  }
>  
>  int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_dbg(dev, "unsupported mode: %d\n", mode);
> @@ -456,6 +588,16 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  		goto out;
>  	}
>  
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		int index = dc_mode_to_region_index(i);
> +
> +		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
> +			dev_dbg(dev, "no available dynamic capacity\n");
> +			rc = -ENXIO;
> +			goto out;
> +		}
> +	}
> +
>  	cxled->mode = mode;
>  	rc = 0;
>  out:
> @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  					 resource_size_t *skip_out)
>  {
>  	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> -	resource_size_t free_ram_start, free_pmem_start;
> +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct device *dev = &cxled->cxld.dev;
>  	resource_size_t start, avail, skip;
>  	struct resource *p, *last;
> +	int index;
>  
>  	lockdep_assert_held(&cxl_dpa_rwsem);
>  
> @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  	else
>  		free_pmem_start = cxlds->pmem_res.start;
>  
> +	/*
> +	 * One HDM Decoder per DC region to map memory with different
> +	 * DSMAS entry.
> +	 */
> +	index = dc_mode_to_region_index(cxled->mode);
> +	if (index >= 0) {
> +		if (cxlds->dc_res[index].child) {
> +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n",
> +					index);
> +			return -EINVAL;
> +		}
> +		free_dc_start = cxlds->dc_res[index].start;
> +	}
> +
>  	if (cxled->mode == CXL_DECODER_RAM) {
>  		start = free_ram_start;
>  		avail = cxlds->ram_res.end - start + 1;
> @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  		else
>  			skip_end = start - 1;
>  		skip = skip_end - skip_start + 1;
> +	} else if (decoder_mode_is_dc(cxled->mode)) {
> +		resource_size_t skip_start, skip_end;
> +
> +		start = free_dc_start;
> +		avail = cxlds->dc_res[index].end - start + 1;
> +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> +			skip_start = free_ram_start;
> +		else
> +			skip_start = free_pmem_start;
> +		/*
> +		 * If some dc region is already mapped, then that allocation
> +		 * already handled the RAM and PMEM skip. Check for DC region
> +		 * skip.
> +		 */
> +		for (int i = index - 1; i >= 0 ; i--) {
> +			if (cxlds->dc_res[i].child) {
> +				skip_start = cxlds->dc_res[i].child->end + 1;
> +				break;
> +			}
> +		}
> +
> +		skip_end = start - 1;
> +		skip = skip_end - skip_start + 1;
>  	} else {
>  		dev_dbg(cxled_dev(cxled), "mode not set\n");
>  		avail = 0;
> @@ -548,10 +729,25 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>  
>  	avail = cxl_dpa_freespace(cxled, &start, &skip);
>  
> +	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
> +						start, size, skip);
>  	if (size > avail) {
> +		static const char * const names[] = {
> +			[CXL_DECODER_NONE] = "none",
> +			[CXL_DECODER_RAM] = "ram",
> +			[CXL_DECODER_PMEM] = "pmem",
> +			[CXL_DECODER_MIXED] = "mixed",
> +			[CXL_DECODER_DC0] = "dc0",
> +			[CXL_DECODER_DC1] = "dc1",
> +			[CXL_DECODER_DC2] = "dc2",
> +			[CXL_DECODER_DC3] = "dc3",
> +			[CXL_DECODER_DC4] = "dc4",
> +			[CXL_DECODER_DC5] = "dc5",
> +			[CXL_DECODER_DC6] = "dc6",
> +			[CXL_DECODER_DC7] = "dc7",
> +		};
>  		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
> -			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> -			&avail);
> +			names[cxled->mode], &avail);
>  		rc = -ENOSPC;
>  		goto out;
>  	}
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 5e21b53362e6..a1a98aba24ed 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -195,6 +195,22 @@ static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
>  		mode = CXL_DECODER_PMEM;
>  	else if (sysfs_streq(buf, "ram"))
>  		mode = CXL_DECODER_RAM;
> +	else if (sysfs_streq(buf, "dc0"))
> +		mode = CXL_DECODER_DC0;
> +	else if (sysfs_streq(buf, "dc1"))
> +		mode = CXL_DECODER_DC1;
> +	else if (sysfs_streq(buf, "dc2"))
> +		mode = CXL_DECODER_DC2;
> +	else if (sysfs_streq(buf, "dc3"))
> +		mode = CXL_DECODER_DC3;
> +	else if (sysfs_streq(buf, "dc4"))
> +		mode = CXL_DECODER_DC4;
> +	else if (sysfs_streq(buf, "dc5"))
> +		mode = CXL_DECODER_DC5;
> +	else if (sysfs_streq(buf, "dc6"))
> +		mode = CXL_DECODER_DC6;
> +	else if (sysfs_streq(buf, "dc7"))
> +		mode = CXL_DECODER_DC7;
>  	else
>  		return -EINVAL;
>  
> @@ -296,6 +312,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
>  	&dev_attr_target_list.attr,
>  	SET_CXL_REGION_ATTR(create_pmem_region)
>  	SET_CXL_REGION_ATTR(create_ram_region)
> +	SET_CXL_DC_REGION_ATTR(create_dc_region)
>  	SET_CXL_REGION_ATTR(delete_region)
>  	NULL,
>  };
> @@ -1691,6 +1708,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
>  		return ERR_PTR(-ENOMEM);
>  
>  	cxled->pos = -1;
> +	xa_init(&cxled->skip_res);
>  	cxld = &cxled->cxld;
>  	rc = cxl_decoder_init(port, cxld);
>  	if (rc)	 {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 543c4499379e..144232c8305e 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
>  	lockdep_assert_held_write(&cxl_region_rwsem);
>  	lockdep_assert_held_read(&cxl_dpa_rwsem);
>  
> -	if (cxled->mode != cxlr->mode) {
> +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
>  		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
>  			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
>  		return -EINVAL;
> @@ -2211,6 +2211,14 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
> @@ -2321,6 +2329,43 @@ static ssize_t create_ram_region_store(struct device *dev,
>  }
>  DEVICE_ATTR_RW(create_ram_region);
>  
> +static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
> +				const char *buf, enum cxl_decoder_mode mode,
> +				size_t len)
> +{
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	/*
> +	 * All DC regions use decoder mode DC0 as the region does not need the
> +	 * index information
> +	 */
> +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> +				CXL_DECODER_DC0, len);
> +}
> +DEVICE_ATTR_RW(create_dc_region);
> +
>  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>  			   char *buf)
>  {
> @@ -2799,6 +2844,61 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>  	return rc;
>  }
>  
> +static void cxl_dc_region_release(void *data)
> +{
> +	struct cxl_region *cxlr = data;
> +	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
> +
> +	xa_destroy(&cxlr_dc->dax_dev_list);
> +	kfree(cxlr_dc);
> +}
> +
> +static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
> +{
> +	struct cxl_dc_region *cxlr_dc;
> +	struct cxl_dax_region *cxlr_dax;
> +	struct device *dev;
> +	int rc = 0;
> +
> +	cxlr_dax = cxl_dax_region_alloc(cxlr);
> +	if (IS_ERR(cxlr_dax))
> +		return PTR_ERR(cxlr_dax);
> +
> +	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
> +	if (!cxlr_dc) {
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +
> +	dev = &cxlr_dax->dev;
> +	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> +		dev_name(dev));
> +
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
> +					cxlr_dax);
> +	if (rc)
> +		goto err;
> +
> +	cxlr_dc->cxlr_dax = cxlr_dax;
> +	xa_init(&cxlr_dc->dax_dev_list);
> +	cxlr->cxlr_dc = cxlr_dc;
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
> +	if (!rc)
> +		return 0;
> +err:
> +	put_device(dev);
> +	kfree(cxlr_dc);
> +	return rc;
> +}
> +
>  static int match_decoder_by_range(struct device *dev, void *data)
>  {
>  	struct range *r1, *r2 = data;
> @@ -3140,6 +3240,19 @@ static int is_system_ram(struct resource *res, void *arg)
>  	return 1;
>  }
>  
> +/*
> + * The region can not be managed by CXL if any portion of
> + * it is already online as 'System RAM'
> + */
> +static bool region_is_system_ram(struct cxl_region *cxlr,
> +				 struct cxl_region_params *p)
> +{
> +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> +				    p->res->start, p->res->end, cxlr,
> +				    is_system_ram) > 0);
> +}
> +
>  static int cxl_region_probe(struct device *dev)
>  {
>  	struct cxl_region *cxlr = to_cxl_region(dev);
> @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
>  	case CXL_DECODER_PMEM:
>  		return devm_cxl_add_pmem_region(cxlr);
>  	case CXL_DECODER_RAM:
> -		/*
> -		 * The region can not be manged by CXL if any portion of
> -		 * it is already online as 'System RAM'
> -		 */
> -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> -					p->res->start, p->res->end, cxlr,
> -					is_system_ram) > 0)
> +		if (region_is_system_ram(cxlr, p))
>  			return 0;
>  
>  		/*
> @@ -3193,6 +3299,17 @@ static int cxl_region_probe(struct device *dev)
>  
>  		/* HDM-H routes to device-dax */
>  		return devm_cxl_add_dax_region(cxlr);
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
> +		if (region_is_system_ram(cxlr, p))
> +			return 0;
> +		return devm_cxl_add_dc_region(cxlr);
>  	default:
>  		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
>  			cxlr->mode);
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 8400af85d99f..7ac1237938b7 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -335,6 +335,14 @@ enum cxl_decoder_mode {
>  	CXL_DECODER_NONE,
>  	CXL_DECODER_RAM,
>  	CXL_DECODER_PMEM,
> +	CXL_DECODER_DC0,
> +	CXL_DECODER_DC1,
> +	CXL_DECODER_DC2,
> +	CXL_DECODER_DC3,
> +	CXL_DECODER_DC4,
> +	CXL_DECODER_DC5,
> +	CXL_DECODER_DC6,
> +	CXL_DECODER_DC7,
>  	CXL_DECODER_MIXED,
>  	CXL_DECODER_DEAD,
>  };
> @@ -345,6 +353,14 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  		[CXL_DECODER_NONE] = "none",
>  		[CXL_DECODER_RAM] = "ram",
>  		[CXL_DECODER_PMEM] = "pmem",
> +		[CXL_DECODER_DC0] = "dc0",
> +		[CXL_DECODER_DC1] = "dc1",
> +		[CXL_DECODER_DC2] = "dc2",
> +		[CXL_DECODER_DC3] = "dc3",
> +		[CXL_DECODER_DC4] = "dc4",
> +		[CXL_DECODER_DC5] = "dc5",
> +		[CXL_DECODER_DC6] = "dc6",
> +		[CXL_DECODER_DC7] = "dc7",
>  		[CXL_DECODER_MIXED] = "mixed",
>  	};
>  
> @@ -353,6 +369,11 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  	return "mixed";
>  }
>  
> +static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
> +{
> +	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
> +}
> +
>  /*
>   * Track whether this decoder is reserved for region autodiscovery, or
>   * free for userspace provisioning.
> @@ -375,6 +396,7 @@ struct cxl_endpoint_decoder {
>  	struct cxl_decoder cxld;
>  	struct resource *dpa_res;
>  	resource_size_t skip;
> +	struct xarray skip_res;
>  	enum cxl_decoder_mode mode;
>  	enum cxl_decoder_state state;
>  	int pos;
> @@ -475,6 +497,11 @@ struct cxl_region_params {
>   */
>  #define CXL_REGION_F_AUTO 1
>  
> +struct cxl_dc_region {
> +	struct xarray dax_dev_list;
> +	struct cxl_dax_region *cxlr_dax;
> +};
> +
>  /**
>   * struct cxl_region - CXL region
>   * @dev: This region's device
> @@ -493,6 +520,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dc_region *cxlr_dc;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index ccdf8de85bd5..eb5eb81bfbd7 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	if (!dax_region)
>  		return -ENOMEM;
>  
> +	if (decoder_mode_is_dc(cxlr->mode))
> +		return 0;
> +
>  	data = (struct dev_dax_data) {
>  		.dax_region = dax_region,
>  		.id = -1,
>  		.size = range_len(&cxlr_dax->hpa_range),
>  	};
> +
>  	dev_dax = devm_create_dev_dax(&data);
>  	if (IS_ERR(dev_dax))
>  		return PTR_ERR(dev_dax);
> 
> -- 
> 2.40.0
>
Ira Weiny June 15, 2023, 6:12 p.m. UTC | #3
Dave Jiang wrote:
> 
> 
> On 6/14/23 12:16, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > CXL devices optionally support dynamic capacity. CXL Regions must be
> > created to access this capacity.
> > 
> > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > which are added to that region.
> > 
> > Below are the steps to create and delete dynamic capacity region0
> > (example).
> > 
> >      region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
> >      echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
> >      echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
> >      echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> > 
> >      echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
> >      echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> > 
> >      echo 0x400000000 > /sys/bus/cxl/devices/$region/size
> >      echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
> >      echo 1 > /sys/bus/cxl/devices/$region/commit
> >      echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> > 
> >      echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > 
> > ---
> > [iweiny: fixups]
> > [iweiny: remove unused CXL_DC_REGION_MODE macro]
> > [iweiny: Make dc_mode_to_region_index static]
> > [iweiny: simplify <sysfs>/create_dc_region]
> > [iweiny: introduce decoder_mode_is_dc]
> > [djbw: fixups, no sign-off: preview only]
> > ---
> >   drivers/cxl/Kconfig       |  11 +++
> >   drivers/cxl/core/core.h   |   7 ++
> >   drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
> >   drivers/cxl/core/port.c   |  18 ++++
> >   drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
> >   drivers/cxl/cxl.h         |  28 ++++++
> >   drivers/dax/cxl.c         |   4 +
> >   7 files changed, 409 insertions(+), 28 deletions(-)
> > 
> > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > index ff4e78117b31..df034889d053 100644
> > --- a/drivers/cxl/Kconfig
> > +++ b/drivers/cxl/Kconfig
> > @@ -121,6 +121,17 @@ config CXL_REGION
> >   
> >   	  If unsure say 'y'
> >   
> > +config CXL_DCD
> > +	bool "CXL: DCD Support"
> > +	default CXL_BUS
> > +	depends on CXL_REGION
> > +	help
> > +	  Enable the CXL core to provision CXL DCD regions.
> > +	  CXL devices optionally support dynamic capacity and DCD region
> > +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> > +
> > +	  If unsure say 'y'
> > +
> >   config CXL_REGION_INVALIDATION_TEST
> >   	bool "CXL: Region Cache Management Bypass (TEST)"
> >   	depends on CXL_REGION
> > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> > index 27f0968449de..725700ab5973 100644
> > --- a/drivers/cxl/core/core.h
> > +++ b/drivers/cxl/core/core.h
> > @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
> >   
> >   extern struct attribute_group cxl_base_attribute_group;
> >   
> > +#ifdef CONFIG_CXL_DCD
> > +extern struct device_attribute dev_attr_create_dc_region;
> > +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> > +#else
> > +#define SET_CXL_DC_REGION_ATTR(x)
> > +#endif
> > +
> >   #ifdef CONFIG_CXL_REGION
> >   extern struct device_attribute dev_attr_create_pmem_region;
> >   extern struct device_attribute dev_attr_create_ram_region;
> > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> > index 514d30131d92..29649b47d177 100644
> > --- a/drivers/cxl/core/hdm.c
> > +++ b/drivers/cxl/core/hdm.c
> > @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >   	struct resource *res = cxled->dpa_res;
> >   	resource_size_t skip_start;
> > +	resource_size_t skipped = cxled->skip;
> >   
> >   	lockdep_assert_held_write(&cxl_dpa_rwsem);
> >   
> >   	/* save @skip_start, before @res is released */
> > -	skip_start = res->start - cxled->skip;
> > +	skip_start = res->start - skipped;
> >   	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> > -	if (cxled->skip)
> > -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> > +	if (cxled->skip != 0) {
> > +		while (skipped != 0) {
> > +			res = xa_load(&cxled->skip_res, skip_start);
> > +			__release_region(&cxlds->dpa_res, skip_start,
> > +							resource_size(res));
> > +			xa_erase(&cxled->skip_res, skip_start);
> > +			skip_start += resource_size(res);
> > +			skipped -= resource_size(res);
> > +			}
> > +	}
> >   	cxled->skip = 0;
> >   	cxled->dpa_res = NULL;
> >   	put_device(&cxled->cxld.dev);
> > @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >   	__cxl_dpa_release(cxled);
> >   }
> >   
> > +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> > +{
> > +	int index = 0;
> > +
> > +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> > +		if (mode == i)
> > +			return index;
> > +		index++;
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> > +
> >   static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >   			     resource_size_t base, resource_size_t len,
> >   			     resource_size_t skipped)
> > @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >   	struct cxl_port *port = cxled_to_port(cxled);
> >   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >   	struct device *dev = &port->dev;
> > +	struct device *ed_dev = &cxled->cxld.dev;
> > +	struct resource *dpa_res = &cxlds->dpa_res;
> > +	resource_size_t skip_len = 0;
> >   	struct resource *res;
> > +	int rc, index;
> >   
> >   	lockdep_assert_held_write(&cxl_dpa_rwsem);
> >   
> > @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >   	}
> >   
> >   	if (skipped) {
> > -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> > -				       dev_name(&cxled->cxld.dev), 0);
> > -		if (!res) {
> > -			dev_dbg(dev,
> > -				"decoder%d.%d: failed to reserve skipped space\n",
> > -				port->id, cxled->cxld.id);
> > -			return -EBUSY;
> > +		resource_size_t skip_base = base - skipped;
> > +
> > +		if (decoder_mode_is_dc(cxled->mode)) {
> 
> Maybe move this entire block to a helper function to reduce the size of 
> the current function and reduce indent levels and improve readability?

:-/

I'll work on breaking it out more.  The logic here is getting kind of
crazy.

> 
> > +			if (resource_size(&cxlds->ram_res) &&
> > +					skip_base <= cxlds->ram_res.end) {
> > +				skip_len = cxlds->ram_res.end - skip_base + 1;
> > +				res = __request_region(dpa_res, skip_base,
> > +						skip_len, dev_name(ed_dev), 0);
> > +				if (!res)
> > +					goto error;
> > +
> > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> > +				skip_base += skip_len;
> > +			}
> > +
> > +			if (resource_size(&cxlds->ram_res) &&
                                                  ^^^^^^^
						  pmem_res?

> > +					skip_base <= cxlds->pmem_res.end) {

The two if statements here are almost exactly the same, to the point that
I wonder if there is a bug.

Navneet,

Why does the code check ram_res the second time but go on to use pmem_res
in the block?

> > +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> > +				res = __request_region(dpa_res, skip_base,
> > +						skip_len, dev_name(ed_dev), 0);
> > +				if (!res)
> > +					goto error;
> > +
> > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> > +				skip_base += skip_len;
> > +			}
> > +
> > +			index = dc_mode_to_region_index(cxled->mode);
> > +			for (int i = 0; i <= index; i++) {
> > +				struct resource *dcr = &cxlds->dc_res[i];
> > +
> > +				if (skip_base < dcr->start) {
> > +					skip_len = dcr->start - skip_base;
> > +					res = __request_region(dpa_res,
> > +							skip_base, skip_len,
> > +							dev_name(ed_dev), 0);
> > +					if (!res)
> > +						goto error;
> > +
> > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > +							res, GFP_KERNEL);
> > +					skip_base += skip_len;
> > +				}
> > +
> > +				if (skip_base == base) {
> > +					dev_dbg(dev, "skip done!\n");
> > +					break;
> > +				}
> > +
> > +				if (resource_size(dcr) &&
> > +						skip_base <= dcr->end) {
> > +					if (skip_base > base)
> > +						dev_err(dev, "Skip error\n");
> > +
> > +					skip_len = dcr->end - skip_base + 1;
> > +					res = __request_region(dpa_res, skip_base,
> > +							skip_len,
> > +							dev_name(ed_dev), 0);
> > +					if (!res)
> > +						goto error;
> > +
> > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > +							res, GFP_KERNEL);
> > +					skip_base += skip_len;
> > +				}
> > +			}
> > +		} else	{
> > +			res = __request_region(dpa_res, base - skipped, skipped,
> > +							dev_name(ed_dev), 0);
> > +			if (!res)
> > +				goto error;
> > +
> > +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> >   		}
> >   	}
> > -	res = __request_region(&cxlds->dpa_res, base, len,
> > -			       dev_name(&cxled->cxld.dev), 0);
> > +
> > +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
> >   	if (!res) {
> >   		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> > -			port->id, cxled->cxld.id);
> > -		if (skipped)
> > -			__release_region(&cxlds->dpa_res, base - skipped,
> > -					 skipped);
> > +				port->id, cxled->cxld.id);
> > +		if (skipped) {
> > +			resource_size_t skip_base = base - skipped;
> > +
> > +			while (skipped != 0) {
> > +				if (skip_base > base)
> > +					dev_err(dev, "Skip error\n");
> > +
> > +				res = xa_load(&cxled->skip_res, skip_base);
> > +				__release_region(dpa_res, skip_base,
> > +							resource_size(res));
> > +				xa_erase(&cxled->skip_res, skip_base);
> > +				skip_base += resource_size(res);
> > +				skipped -= resource_size(res);
> > +			}
> > +		}
> >   		return -EBUSY;
> >   	}
> >   	cxled->dpa_res = res;
> >   	cxled->skip = skipped;
> >   
> > +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> > +		int index = dc_mode_to_region_index(mode);
> > +
> > +		if (resource_contains(&cxlds->dc_res[index], res)) {
> > +			cxled->mode = mode;
> > +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> > +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> > > +			goto success;
> > > +		}
> > +	}
> 
> This block should only happen if decoder_mode_is_dc() right? If that's 
> the case, you might be able to refactor it so the 'goto success' isn't 
> necessary.

I'll check.  I looked through this code a couple of times in my review
before posting because I'm not 100% sure I want to see 8 different modes
for DC decoders and regions.

I think the 'mode' should be 'DC' with an index in the endpoint decoder to
identify which DC region that decoder is mapping.  But that would have been
a much bigger change to Navneet's code and I wanted to see how others felt
about having DC0 - DC7 modes.  My compromise was creating decoder_mode_is_dc().

> 
> >   	if (resource_contains(&cxlds->pmem_res, res))
> >   		cxled->mode = CXL_DECODER_PMEM;
> >   	else if (resource_contains(&cxlds->ram_res, res))
> > @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >   		cxled->mode = CXL_DECODER_MIXED;
> >   	}
> >   
> > +success:
> >   	port->hdm_end++;
> >   	get_device(&cxled->cxld.dev);
> >   	return 0;
> > +
> > +error:
> > +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> > +			port->id, cxled->cxld.id);
> > +	return -EBUSY;
> > +
> >   }
> >   
> >   int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
> >   	switch (mode) {
> >   	case CXL_DECODER_RAM:
> >   	case CXL_DECODER_PMEM:
> > +	case CXL_DECODER_DC0:
> > +	case CXL_DECODER_DC1:
> > +	case CXL_DECODER_DC2:
> > +	case CXL_DECODER_DC3:
> > +	case CXL_DECODER_DC4:
> > +	case CXL_DECODER_DC5:
> > +	case CXL_DECODER_DC6:
> > +	case CXL_DECODER_DC7:

For example this seems very hacky...

[snip]

> >   
> > +/*
> > + * The region can not be manged by CXL if any portion of
> > + * it is already online as 'System RAM'
> > + */
> > +static bool region_is_system_ram(struct cxl_region *cxlr,
> > +				 struct cxl_region_params *p)
> > +{
> > +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> > +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > +				    p->res->start, p->res->end, cxlr,
> > +				    is_system_ram) > 0);
> > +}
> > +
> >   static int cxl_region_probe(struct device *dev)
> >   {
> >   	struct cxl_region *cxlr = to_cxl_region(dev);
> > @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
> >   	case CXL_DECODER_PMEM:
> >   		return devm_cxl_add_pmem_region(cxlr);
> >   	case CXL_DECODER_RAM:
> > -		/*
> > -		 * The region can not be manged by CXL if any portion of
> > -		 * it is already online as 'System RAM'
> > -		 */
> > -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> > -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > -					p->res->start, p->res->end, cxlr,
> > -					is_system_ram) > 0)
> > +		if (region_is_system_ram(cxlr, p))
> 
> Maybe split this change out as a prep patch before the current patch.

That seems reasonable.  But the patch is not so large and the
justification for creating a helper is that we need this same check for DC
regions.  So it seemed ok to leave it like this.  Let me see about
splitting it out.

[snip]

> > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > index ccdf8de85bd5..eb5eb81bfbd7 100644
> > --- a/drivers/dax/cxl.c
> > +++ b/drivers/dax/cxl.c
> > @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
> >   	if (!dax_region)
> >   		return -ENOMEM;
> >   
> > +	if (decoder_mode_is_dc(cxlr->mode))
> > +		return 0;
> > +
> >   	data = (struct dev_dax_data) {
> >   		.dax_region = dax_region,
> >   		.id = -1,
> >   		.size = range_len(&cxlr_dax->hpa_range),
> >   	};
> > +
> 
> Stray blank line?

Oops!  Fixed!

Ira
Dave Jiang June 15, 2023, 6:28 p.m. UTC | #4
On 6/15/23 11:12, Ira Weiny wrote:
> Dave Jiang wrote:
>>
>>
>> On 6/14/23 12:16, ira.weiny@intel.com wrote:
>>> From: Navneet Singh <navneet.singh@intel.com>
>>>
>>> CXL devices optionally support dynamic capacity. CXL Regions must be
>>> created to access this capacity.
>>>
>>> Add sysfs entries to create dynamic capacity cxl regions. Provide a new
>>> Dynamic Capacity decoder mode which targets dynamic capacity on devices
>>> which are added to that region.
>>>
>>> Below are the steps to create and delete dynamic capacity region0
>>> (example).
>>>
>>>       region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
>>>       echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
>>>       echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
>>>       echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
>>>
>>>       echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
>>>       echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
>>>
>>>       echo 0x400000000 > /sys/bus/cxl/devices/$region/size
>>>       echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
>>>       echo 1 > /sys/bus/cxl/devices/$region/commit
>>>       echo $region > /sys/bus/cxl/drivers/cxl_region/bind
>>>
>>>       echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
>>>
>>> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
>>>
>>> ---
>>> [iweiny: fixups]
>>> [iweiny: remove unused CXL_DC_REGION_MODE macro]
>>> [iweiny: Make dc_mode_to_region_index static]
>>> [iweiny: simplify <sysfs>/create_dc_region]
>>> [iweiny: introduce decoder_mode_is_dc]
>>> [djbw: fixups, no sign-off: preview only]
>>> ---
>>>    drivers/cxl/Kconfig       |  11 +++
>>>    drivers/cxl/core/core.h   |   7 ++
>>>    drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
>>>    drivers/cxl/core/port.c   |  18 ++++
>>>    drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
>>>    drivers/cxl/cxl.h         |  28 ++++++
>>>    drivers/dax/cxl.c         |   4 +
>>>    7 files changed, 409 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
>>> index ff4e78117b31..df034889d053 100644
>>> --- a/drivers/cxl/Kconfig
>>> +++ b/drivers/cxl/Kconfig
>>> @@ -121,6 +121,17 @@ config CXL_REGION
>>>    
>>>    	  If unsure say 'y'
>>>    
>>> +config CXL_DCD
>>> +	bool "CXL: DCD Support"
>>> +	default CXL_BUS
>>> +	depends on CXL_REGION
>>> +	help
>>> +	  Enable the CXL core to provision CXL DCD regions.
>>> +	  CXL devices optionally support dynamic capacity and DCD region
>>> +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
>>> +
>>> +	  If unsure say 'y'
>>> +
>>>    config CXL_REGION_INVALIDATION_TEST
>>>    	bool "CXL: Region Cache Management Bypass (TEST)"
>>>    	depends on CXL_REGION
>>> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
>>> index 27f0968449de..725700ab5973 100644
>>> --- a/drivers/cxl/core/core.h
>>> +++ b/drivers/cxl/core/core.h
>>> @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
>>>    
>>>    extern struct attribute_group cxl_base_attribute_group;
>>>    
>>> +#ifdef CONFIG_CXL_DCD
>>> +extern struct device_attribute dev_attr_create_dc_region;
>>> +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
>>> +#else
>>> +#define SET_CXL_DC_REGION_ATTR(x)
>>> +#endif
>>> +
>>>    #ifdef CONFIG_CXL_REGION
>>>    extern struct device_attribute dev_attr_create_pmem_region;
>>>    extern struct device_attribute dev_attr_create_ram_region;
>>> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
>>> index 514d30131d92..29649b47d177 100644
>>> --- a/drivers/cxl/core/hdm.c
>>> +++ b/drivers/cxl/core/hdm.c
>>> @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>>>    	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>>>    	struct resource *res = cxled->dpa_res;
>>>    	resource_size_t skip_start;
>>> +	resource_size_t skipped = cxled->skip;
>>>    
>>>    	lockdep_assert_held_write(&cxl_dpa_rwsem);
>>>    
>>>    	/* save @skip_start, before @res is released */
>>> -	skip_start = res->start - cxled->skip;
>>> +	skip_start = res->start - skipped;
>>>    	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
>>> -	if (cxled->skip)
>>> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
>>> +	if (cxled->skip != 0) {
>>> +		while (skipped != 0) {
>>> +			res = xa_load(&cxled->skip_res, skip_start);
>>> +			__release_region(&cxlds->dpa_res, skip_start,
>>> +							resource_size(res));
>>> +			xa_erase(&cxled->skip_res, skip_start);
>>> +			skip_start += resource_size(res);
>>> +			skipped -= resource_size(res);
>>> +			}
>>> +	}
>>>    	cxled->skip = 0;
>>>    	cxled->dpa_res = NULL;
>>>    	put_device(&cxled->cxld.dev);
>>> @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>>>    	__cxl_dpa_release(cxled);
>>>    }
>>>    
>>> +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
>>> +{
>>> +	int index = 0;
>>> +
>>> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
>>> +		if (mode == i)
>>> +			return index;
>>> +		index++;
>>> +	}
>>> +
>>> +	return -EINVAL;
>>> +}
>>> +
>>>    static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>>>    			     resource_size_t base, resource_size_t len,
>>>    			     resource_size_t skipped)
>>> @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>>>    	struct cxl_port *port = cxled_to_port(cxled);
>>>    	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>>>    	struct device *dev = &port->dev;
>>> +	struct device *ed_dev = &cxled->cxld.dev;
>>> +	struct resource *dpa_res = &cxlds->dpa_res;
>>> +	resource_size_t skip_len = 0;
>>>    	struct resource *res;
>>> +	int rc, index;
>>>    
>>>    	lockdep_assert_held_write(&cxl_dpa_rwsem);
>>>    
>>> @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>>>    	}
>>>    
>>>    	if (skipped) {
>>> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
>>> -				       dev_name(&cxled->cxld.dev), 0);
>>> -		if (!res) {
>>> -			dev_dbg(dev,
>>> -				"decoder%d.%d: failed to reserve skipped space\n",
>>> -				port->id, cxled->cxld.id);
>>> -			return -EBUSY;
>>> +		resource_size_t skip_base = base - skipped;
>>> +
>>> +		if (decoder_mode_is_dc(cxled->mode)) {
>>
>> Maybe move this entire block to a helper function to reduce the size of
>> the current function and reduce indent levels and improve readability?
> 
> :-/
> 
> I'll work on breaking it out more.  The logic here is getting kind of
> crazy.
> 
>>
>>> +			if (resource_size(&cxlds->ram_res) &&
>>> +					skip_base <= cxlds->ram_res.end) {
>>> +				skip_len = cxlds->ram_res.end - skip_base + 1;
>>> +				res = __request_region(dpa_res, skip_base,
>>> +						skip_len, dev_name(ed_dev), 0);
>>> +				if (!res)
>>> +					goto error;
>>> +
>>> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
>>> +								GFP_KERNEL);
>>> +				skip_base += skip_len;
>>> +			}
>>> +
>>> +			if (resource_size(&cxlds->ram_res) &&
>                                                    ^^^^^^^
> 						  pmem_res?
> 
>>> +					skip_base <= cxlds->pmem_res.end) {
> 
> The 2 if statements here are almost exactly the same.  To the point I
> wonder if there is a bug.
> 
> Navneet,
> 
> Why does the code check ram_res the second time but go on to use pmem_res
> in the block?
> 
>>> +				skip_len = cxlds->pmem_res.end - skip_base + 1;
>>> +				res = __request_region(dpa_res, skip_base,
>>> +						skip_len, dev_name(ed_dev), 0);
>>> +				if (!res)
>>> +					goto error;
>>> +
>>> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
>>> +								GFP_KERNEL);
>>> +				skip_base += skip_len;
>>> +			}
>>> +
>>> +			index = dc_mode_to_region_index(cxled->mode);
>>> +			for (int i = 0; i <= index; i++) {
>>> +				struct resource *dcr = &cxlds->dc_res[i];
>>> +
>>> +				if (skip_base < dcr->start) {
>>> +					skip_len = dcr->start - skip_base;
>>> +					res = __request_region(dpa_res,
>>> +							skip_base, skip_len,
>>> +							dev_name(ed_dev), 0);
>>> +					if (!res)
>>> +						goto error;
>>> +
>>> +					rc = xa_insert(&cxled->skip_res, skip_base,
>>> +							res, GFP_KERNEL);
>>> +					skip_base += skip_len;
>>> +				}
>>> +
>>> +				if (skip_base == base) {
>>> +					dev_dbg(dev, "skip done!\n");
>>> +					break;
>>> +				}
>>> +
>>> +				if (resource_size(dcr) &&
>>> +						skip_base <= dcr->end) {
>>> +					if (skip_base > base)
>>> +						dev_err(dev, "Skip error\n");
>>> +
>>> +					skip_len = dcr->end - skip_base + 1;
>>> +					res = __request_region(dpa_res, skip_base,
>>> +							skip_len,
>>> +							dev_name(ed_dev), 0);
>>> +					if (!res)
>>> +						goto error;
>>> +
>>> +					rc = xa_insert(&cxled->skip_res, skip_base,
>>> +							res, GFP_KERNEL);
>>> +					skip_base += skip_len;
>>> +				}
>>> +			}
>>> +		} else	{
>>> +			res = __request_region(dpa_res, base - skipped, skipped,
>>> +							dev_name(ed_dev), 0);
>>> +			if (!res)
>>> +				goto error;
>>> +
>>> +			rc = xa_insert(&cxled->skip_res, skip_base, res,
>>> +								GFP_KERNEL);
>>>    		}
>>>    	}
>>> -	res = __request_region(&cxlds->dpa_res, base, len,
>>> -			       dev_name(&cxled->cxld.dev), 0);
>>> +
>>> +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
>>>    	if (!res) {
>>>    		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
>>> -			port->id, cxled->cxld.id);
>>> -		if (skipped)
>>> -			__release_region(&cxlds->dpa_res, base - skipped,
>>> -					 skipped);
>>> +				port->id, cxled->cxld.id);
>>> +		if (skipped) {
>>> +			resource_size_t skip_base = base - skipped;
>>> +
>>> +			while (skipped != 0) {
>>> +				if (skip_base > base)
>>> +					dev_err(dev, "Skip error\n");
>>> +
>>> +				res = xa_load(&cxled->skip_res, skip_base);
>>> +				__release_region(dpa_res, skip_base,
>>> +							resource_size(res));
>>> +				xa_erase(&cxled->skip_res, skip_base);
>>> +				skip_base += resource_size(res);
>>> +				skipped -= resource_size(res);
>>> +			}
>>> +		}
>>>    		return -EBUSY;
>>>    	}
>>>    	cxled->dpa_res = res;
>>>    	cxled->skip = skipped;
>>>    
>>> +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
>>> +		int index = dc_mode_to_region_index(mode);
>>> +
>>> +		if (resource_contains(&cxlds->dc_res[index], res)) {
>>> +			cxled->mode = mode;
>>> +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
>>> +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
>>> +			goto success;
>>> +		}
>>> +	}
>>
>> This block should only happen if decoder_mode_is_dc() right? If that's
>> the case, you might be able to refactor it so the 'goto success' isn't
>> necessary.
> 
> I'll check.  I looked through this code a couple of times in my review
> before posting because I'm not 100% sure I want to see 8 different modes
> DC decoders and regions.
> 
> I think the 'mode' should be 'DC' with an index in the endpoint decoder to
> map DC region that decoder is mapping.  But that change was much bigger to
> Navneets code and I wanted to see how others felt about having DC0 - DC7
> modes.  My compromise was creating decoder_mode_is_dc().
> 
>>
>>>    	if (resource_contains(&cxlds->pmem_res, res))
>>>    		cxled->mode = CXL_DECODER_PMEM;
>>>    	else if (resource_contains(&cxlds->ram_res, res))
>>> @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>>>    		cxled->mode = CXL_DECODER_MIXED;
>>>    	}
>>>    
>>> +success:
>>>    	port->hdm_end++;
>>>    	get_device(&cxled->cxld.dev);
>>>    	return 0;
>>> +
>>> +error:
>>> +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
>>> +			port->id, cxled->cxld.id);
>>> +	return -EBUSY;
>>> +
>>>    }
>>>    
>>>    int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>>> @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>>>    	switch (mode) {
>>>    	case CXL_DECODER_RAM:
>>>    	case CXL_DECODER_PMEM:
>>> +	case CXL_DECODER_DC0:
>>> +	case CXL_DECODER_DC1:
>>> +	case CXL_DECODER_DC2:
>>> +	case CXL_DECODER_DC3:
>>> +	case CXL_DECODER_DC4:
>>> +	case CXL_DECODER_DC5:
>>> +	case CXL_DECODER_DC6:
>>> +	case CXL_DECODER_DC7:
> 
> For example this seems very hacky...

Not sure if it helps, but you can always do:
case CXL_DECODER_DC0 ... CXL_DECODER_DC7:

DJ

> 
> [snip]
> 
>>>    
>>> +/*
>>> + * The region can not be manged by CXL if any portion of
>>> + * it is already online as 'System RAM'
>>> + */
>>> +static bool region_is_system_ram(struct cxl_region *cxlr,
>>> +				 struct cxl_region_params *p)
>>> +{
>>> +	return (walk_iomem_res_desc(IORES_DESC_NONE,
>>> +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
>>> +				    p->res->start, p->res->end, cxlr,
>>> +				    is_system_ram) > 0);
>>> +}
>>> +
>>>    static int cxl_region_probe(struct device *dev)
>>>    {
>>>    	struct cxl_region *cxlr = to_cxl_region(dev);
>>> @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
>>>    	case CXL_DECODER_PMEM:
>>>    		return devm_cxl_add_pmem_region(cxlr);
>>>    	case CXL_DECODER_RAM:
>>> -		/*
>>> -		 * The region can not be manged by CXL if any portion of
>>> -		 * it is already online as 'System RAM'
>>> -		 */
>>> -		if (walk_iomem_res_desc(IORES_DESC_NONE,
>>> -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
>>> -					p->res->start, p->res->end, cxlr,
>>> -					is_system_ram) > 0)
>>> +		if (region_is_system_ram(cxlr, p))
>>
>> Maybe split this change out as a prep patch before the current patch.
> 
> That seems reasonable.  But the patch is not so large and the
> justification for creating a helper is that we need this same check for DC
> regions.  So it seemed ok to leave it like this.  Let me see about
> splitting it out.
> 
> [snip]
> 
>>> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
>>> index ccdf8de85bd5..eb5eb81bfbd7 100644
>>> --- a/drivers/dax/cxl.c
>>> +++ b/drivers/dax/cxl.c
>>> @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
>>>    	if (!dax_region)
>>>    		return -ENOMEM;
>>>    
>>> +	if (decoder_mode_is_dc(cxlr->mode))
>>> +		return 0;
>>> +
>>>    	data = (struct dev_dax_data) {
>>>    		.dax_region = dax_region,
>>>    		.id = -1,
>>>    		.size = range_len(&cxlr_dax->hpa_range),
>>>    	};
>>> +
>>
>> Stray blank line?
> 
> Opps!  Fixed!
> 
> Ira
Navneet Singh June 15, 2023, 6:56 p.m. UTC | #5
On Thu, Jun 15, 2023 at 11:12:29AM -0700, Ira Weiny wrote:
> Dave Jiang wrote:
> > 
> > 
> > On 6/14/23 12:16, ira.weiny@intel.com wrote:
> > > From: Navneet Singh <navneet.singh@intel.com>
> > > 
> > > CXL devices optionally support dynamic capacity. CXL Regions must be
> > > created to access this capacity.
> > > 
> > > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > > which are added to that region.
> > > 
> > > Below are the steps to create and delete dynamic capacity region0
> > > (example).
> > > 
> > >      region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
> > >      echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
> > >      echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
> > >      echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> > > 
> > >      echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
> > >      echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> > > 
> > >      echo 0x400000000 > /sys/bus/cxl/devices/$region/size
> > >      echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
> > >      echo 1 > /sys/bus/cxl/devices/$region/commit
> > >      echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> > > 
> > >      echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> > > 
> > > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > > 
> > > ---
> > > [iweiny: fixups]
> > > [iweiny: remove unused CXL_DC_REGION_MODE macro]
> > > [iweiny: Make dc_mode_to_region_index static]
> > > [iweiny: simplify <sysfs>/create_dc_region]
> > > [iweiny: introduce decoder_mode_is_dc]
> > > [djbw: fixups, no sign-off: preview only]
> > > ---
> > >   drivers/cxl/Kconfig       |  11 +++
> > >   drivers/cxl/core/core.h   |   7 ++
> > >   drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
> > >   drivers/cxl/core/port.c   |  18 ++++
> > >   drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
> > >   drivers/cxl/cxl.h         |  28 ++++++
> > >   drivers/dax/cxl.c         |   4 +
> > >   7 files changed, 409 insertions(+), 28 deletions(-)
> > > 
> > > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > > index ff4e78117b31..df034889d053 100644
> > > --- a/drivers/cxl/Kconfig
> > > +++ b/drivers/cxl/Kconfig
> > > @@ -121,6 +121,17 @@ config CXL_REGION
> > >   
> > >   	  If unsure say 'y'
> > >   
> > > +config CXL_DCD
> > > +	bool "CXL: DCD Support"
> > > +	default CXL_BUS
> > > +	depends on CXL_REGION
> > > +	help
> > > +	  Enable the CXL core to provision CXL DCD regions.
> > > +	  CXL devices optionally support dynamic capacity and DCD region
> > > +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> > > +
> > > +	  If unsure say 'y'
> > > +
> > >   config CXL_REGION_INVALIDATION_TEST
> > >   	bool "CXL: Region Cache Management Bypass (TEST)"
> > >   	depends on CXL_REGION
> > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> > > index 27f0968449de..725700ab5973 100644
> > > --- a/drivers/cxl/core/core.h
> > > +++ b/drivers/cxl/core/core.h
> > > @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
> > >   
> > >   extern struct attribute_group cxl_base_attribute_group;
> > >   
> > > +#ifdef CONFIG_CXL_DCD
> > > +extern struct device_attribute dev_attr_create_dc_region;
> > > +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> > > +#else
> > > +#define SET_CXL_DC_REGION_ATTR(x)
> > > +#endif
> > > +
> > >   #ifdef CONFIG_CXL_REGION
> > >   extern struct device_attribute dev_attr_create_pmem_region;
> > >   extern struct device_attribute dev_attr_create_ram_region;
> > > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> > > index 514d30131d92..29649b47d177 100644
> > > --- a/drivers/cxl/core/hdm.c
> > > +++ b/drivers/cxl/core/hdm.c
> > > @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> > >   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > >   	struct resource *res = cxled->dpa_res;
> > >   	resource_size_t skip_start;
> > > +	resource_size_t skipped = cxled->skip;
> > >   
> > >   	lockdep_assert_held_write(&cxl_dpa_rwsem);
> > >   
> > >   	/* save @skip_start, before @res is released */
> > > -	skip_start = res->start - cxled->skip;
> > > +	skip_start = res->start - skipped;
> > >   	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> > > -	if (cxled->skip)
> > > -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> > > +	if (cxled->skip != 0) {
> > > +		while (skipped != 0) {
> > > +			res = xa_load(&cxled->skip_res, skip_start);
> > > +			__release_region(&cxlds->dpa_res, skip_start,
> > > +							resource_size(res));
> > > +			xa_erase(&cxled->skip_res, skip_start);
> > > +			skip_start += resource_size(res);
> > > +			skipped -= resource_size(res);
> > > +			}
> > > +	}
> > >   	cxled->skip = 0;
> > >   	cxled->dpa_res = NULL;
> > >   	put_device(&cxled->cxld.dev);
> > > @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> > >   	__cxl_dpa_release(cxled);
> > >   }
> > >   
> > > +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> > > +{
> > > +	int index = 0;
> > > +
> > > +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> > > +		if (mode == i)
> > > +			return index;
> > > +		index++;
> > > +	}
> > > +
> > > +	return -EINVAL;
> > > +}
> > > +
> > >   static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > >   			     resource_size_t base, resource_size_t len,
> > >   			     resource_size_t skipped)
> > > @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > >   	struct cxl_port *port = cxled_to_port(cxled);
> > >   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > >   	struct device *dev = &port->dev;
> > > +	struct device *ed_dev = &cxled->cxld.dev;
> > > +	struct resource *dpa_res = &cxlds->dpa_res;
> > > +	resource_size_t skip_len = 0;
> > >   	struct resource *res;
> > > +	int rc, index;
> > >   
> > >   	lockdep_assert_held_write(&cxl_dpa_rwsem);
> > >   
> > > @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > >   	}
> > >   
> > >   	if (skipped) {
> > > -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> > > -				       dev_name(&cxled->cxld.dev), 0);
> > > -		if (!res) {
> > > -			dev_dbg(dev,
> > > -				"decoder%d.%d: failed to reserve skipped space\n",
> > > -				port->id, cxled->cxld.id);
> > > -			return -EBUSY;
> > > +		resource_size_t skip_base = base - skipped;
> > > +
> > > +		if (decoder_mode_is_dc(cxled->mode)) {
> > 
> > Maybe move this entire block to a helper function to reduce the size of 
> > the current function and reduce indent levels and improve readability?
> 
> :-/
> 
> I'll work on breaking it out more.  The logic here is getting kind of
> crazy.
> 
> > 
> > > +			if (resource_size(&cxlds->ram_res) &&
> > > +					skip_base <= cxlds->ram_res.end) {
> > > +				skip_len = cxlds->ram_res.end - skip_base + 1;
> > > +				res = __request_region(dpa_res, skip_base,
> > > +						skip_len, dev_name(ed_dev), 0);
> > > +				if (!res)
> > > +					goto error;
> > > +
> > > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > > +								GFP_KERNEL);
> > > +				skip_base += skip_len;
> > > +			}
> > > +
> > > +			if (resource_size(&cxlds->ram_res) &&
>                                                   ^^^^^^^
> 						  pmem_res?
> 
> > > +					skip_base <= cxlds->pmem_res.end) {
> 
> The 2 if statements here are almost exactly the same.  To the point I
> wonder if there is a bug.
> 
> Navneet,
> 
> Why does the code check ram_res the second time but go on to use pmem_res
> in the block?

Navneet- Thanks for pointing out it should be pmem_res instead of
ram_res. I will fix it.
> 
> > > +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> > > +				res = __request_region(dpa_res, skip_base,
> > > +						skip_len, dev_name(ed_dev), 0);
> > > +				if (!res)
> > > +					goto error;
> > > +
> > > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > > +								GFP_KERNEL);
> > > +				skip_base += skip_len;
> > > +			}
> > > +
> > > +			index = dc_mode_to_region_index(cxled->mode);
> > > +			for (int i = 0; i <= index; i++) {
> > > +				struct resource *dcr = &cxlds->dc_res[i];
> > > +
> > > +				if (skip_base < dcr->start) {
> > > +					skip_len = dcr->start - skip_base;
> > > +					res = __request_region(dpa_res,
> > > +							skip_base, skip_len,
> > > +							dev_name(ed_dev), 0);
> > > +					if (!res)
> > > +						goto error;
> > > +
> > > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > > +							res, GFP_KERNEL);
> > > +					skip_base += skip_len;
> > > +				}
> > > +
> > > +				if (skip_base == base) {
> > > +					dev_dbg(dev, "skip done!\n");
> > > +					break;
> > > +				}
> > > +
> > > +				if (resource_size(dcr) &&
> > > +						skip_base <= dcr->end) {
> > > +					if (skip_base > base)
> > > +						dev_err(dev, "Skip error\n");
> > > +
> > > +					skip_len = dcr->end - skip_base + 1;
> > > +					res = __request_region(dpa_res, skip_base,
> > > +							skip_len,
> > > +							dev_name(ed_dev), 0);
> > > +					if (!res)
> > > +						goto error;
> > > +
> > > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > > +							res, GFP_KERNEL);
> > > +					skip_base += skip_len;
> > > +				}
> > > +			}
> > > +		} else	{
> > > +			res = __request_region(dpa_res, base - skipped, skipped,
> > > +							dev_name(ed_dev), 0);
> > > +			if (!res)
> > > +				goto error;
> > > +
> > > +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> > > +								GFP_KERNEL);
> > >   		}
> > >   	}
> > > -	res = __request_region(&cxlds->dpa_res, base, len,
> > > -			       dev_name(&cxled->cxld.dev), 0);
> > > +
> > > +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
> > >   	if (!res) {
> > >   		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> > > -			port->id, cxled->cxld.id);
> > > -		if (skipped)
> > > -			__release_region(&cxlds->dpa_res, base - skipped,
> > > -					 skipped);
> > > +				port->id, cxled->cxld.id);
> > > +		if (skipped) {
> > > +			resource_size_t skip_base = base - skipped;
> > > +
> > > +			while (skipped != 0) {
> > > +				if (skip_base > base)
> > > +					dev_err(dev, "Skip error\n");
> > > +
> > > +				res = xa_load(&cxled->skip_res, skip_base);
> > > +				__release_region(dpa_res, skip_base,
> > > +							resource_size(res));
> > > +				xa_erase(&cxled->skip_res, skip_base);
> > > +				skip_base += resource_size(res);
> > > +				skipped -= resource_size(res);
> > > +			}
> > > +		}
> > >   		return -EBUSY;
> > >   	}
> > >   	cxled->dpa_res = res;
> > >   	cxled->skip = skipped;
> > >   
> > > +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> > > +		int index = dc_mode_to_region_index(mode);
> > > +
> > > +		if (resource_contains(&cxlds->dc_res[index], res)) {
> > > +			cxled->mode = mode;
> > > +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> > > +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> > > +			goto success > +		}
> > > +	}
> > 
> > This block should only happen if decoder_mode_is_dc() right? If that's 
> > the case, you might be able to refactor it so the 'goto success' isn't 
> > necessary.
> 
> I'll check.  I looked through this code a couple of times in my review
> before posting because I'm not 100% sure I want to see 8 different modes
> DC decoders and regions.
> 
> I think the 'mode' should be 'DC' with an index in the endpoint decoder to
> map DC region that decoder is mapping.  But that change was much bigger to
> Navneets code and I wanted to see how others felt about having DC0 - DC7
> modes.  My compromise was creating decoder_mode_is_dc().
> 
> > 
> > >   	if (resource_contains(&cxlds->pmem_res, res))
> > >   		cxled->mode = CXL_DECODER_PMEM;
> > >   	else if (resource_contains(&cxlds->ram_res, res))
> > > @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > >   		cxled->mode = CXL_DECODER_MIXED;
> > >   	}
> > >   
> > > +success:
> > >   	port->hdm_end++;
> > >   	get_device(&cxled->cxld.dev);
> > >   	return 0;
> > > +
> > > +error:
> > > +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> > > +			port->id, cxled->cxld.id);
> > > +	return -EBUSY;
> > > +
> > >   }
> > >   
> > >   int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > > @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
> > >   	switch (mode) {
> > >   	case CXL_DECODER_RAM:
> > >   	case CXL_DECODER_PMEM:
> > > +	case CXL_DECODER_DC0:
> > > +	case CXL_DECODER_DC1:
> > > +	case CXL_DECODER_DC2:
> > > +	case CXL_DECODER_DC3:
> > > +	case CXL_DECODER_DC4:
> > > +	case CXL_DECODER_DC5:
> > > +	case CXL_DECODER_DC6:
> > > +	case CXL_DECODER_DC7:
> 
> For example this seems very hacky...
> 
> [snip]
> 
> > >   
> > > +/*
> > > + * The region can not be manged by CXL if any portion of
> > > + * it is already online as 'System RAM'
> > > + */
> > > +static bool region_is_system_ram(struct cxl_region *cxlr,
> > > +				 struct cxl_region_params *p)
> > > +{
> > > +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> > > +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > > +				    p->res->start, p->res->end, cxlr,
> > > +				    is_system_ram) > 0);
> > > +}
> > > +
> > >   static int cxl_region_probe(struct device *dev)
> > >   {
> > >   	struct cxl_region *cxlr = to_cxl_region(dev);
> > > @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
> > >   	case CXL_DECODER_PMEM:
> > >   		return devm_cxl_add_pmem_region(cxlr);
> > >   	case CXL_DECODER_RAM:
> > > -		/*
> > > -		 * The region can not be manged by CXL if any portion of
> > > -		 * it is already online as 'System RAM'
> > > -		 */
> > > -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> > > -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > > -					p->res->start, p->res->end, cxlr,
> > > -					is_system_ram) > 0)
> > > +		if (region_is_system_ram(cxlr, p))
> > 
> > Maybe split this change out as a prep patch before the current patch.
> 
> That seems reasonable.  But the patch is not so large and the
> justification for creating a helper is that we need this same check for DC
> regions.  So it seemed ok to leave it like this.  Let me see about
> splitting it out.
> 
> [snip]
> 
> > > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > > index ccdf8de85bd5..eb5eb81bfbd7 100644
> > > --- a/drivers/dax/cxl.c
> > > +++ b/drivers/dax/cxl.c
> > > @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
> > >   	if (!dax_region)
> > >   		return -ENOMEM;
> > >   
> > > +	if (decoder_mode_is_dc(cxlr->mode))
> > > +		return 0;
> > > +
> > >   	data = (struct dev_dax_data) {
> > >   		.dax_region = dax_region,
> > >   		.id = -1,
> > >   		.size = range_len(&cxlr_dax->hpa_range),
> > >   	};
> > > +
> > 
> > Stray blank line?
> 
> Opps!  Fixed!
> 
> Ira
Ira Weiny June 16, 2023, 2:06 a.m. UTC | #6
Alison Schofield wrote:
> On Wed, Jun 14, 2023 at 12:16:29PM -0700, Ira Weiny wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > CXL devices optionally support dynamic capacity. CXL Regions must be
> > created to access this capacity.
> > 
> > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > which are added to that region.
> 
> This is a lot in one patch, especially where it weaves in and out of
> existing code. I'm wondering if this can be introduced in smaller
> pieces (patches). An introductory patch explaining the DC DPA 
> allocations might be a useful chunk to pull forward. 

The patch is < 800 lines long, and it would be closer to 700 lines if there
were not 8 different 'modes' for the various DC regions.

It is also very self-contained in that it fully implements region creation
for DC DPAs.  And I know that Dan prefers larger patches when they are
all part of the same functionality.

Dan?

Ira
Navneet Singh June 16, 2023, 3:52 a.m. UTC | #7
On Thu, Jun 15, 2023 at 11:28:26AM -0700, Dave Jiang wrote:
> 
> 
> On 6/15/23 11:12, Ira Weiny wrote:
> > Dave Jiang wrote:
> > > 
> > > 
> > > On 6/14/23 12:16, ira.weiny@intel.com wrote:
> > > > From: Navneet Singh <navneet.singh@intel.com>
> > > > 
> > > > CXL devices optionally support dynamic capacity. CXL Regions must be
> > > > created to access this capacity.
> > > > 
> > > > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > > > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > > > which are added to that region.
> > > > 
> > > > Below are the steps to create and delete dynamic capacity region0
> > > > (example).
> > > > 
> > > >       region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
> > > >       echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
> > > >       echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
> > > >       echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> > > > 
> > > >       echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
> > > >       echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> > > > 
> > > >       echo 0x400000000 > /sys/bus/cxl/devices/$region/size
> > > >       echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
> > > >       echo 1 > /sys/bus/cxl/devices/$region/commit
> > > >       echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> > > > 
> > > >       echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> > > > 
> > > > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > > > 
> > > > ---
> > > > [iweiny: fixups]
> > > > [iweiny: remove unused CXL_DC_REGION_MODE macro]
> > > > [iweiny: Make dc_mode_to_region_index static]
> > > > [iweiny: simplify <sysfs>/create_dc_region]
> > > > [iweiny: introduce decoder_mode_is_dc]
> > > > [djbw: fixups, no sign-off: preview only]
> > > > ---
> > > >    drivers/cxl/Kconfig       |  11 +++
> > > >    drivers/cxl/core/core.h   |   7 ++
> > > >    drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
> > > >    drivers/cxl/core/port.c   |  18 ++++
> > > >    drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
> > > >    drivers/cxl/cxl.h         |  28 ++++++
> > > >    drivers/dax/cxl.c         |   4 +
> > > >    7 files changed, 409 insertions(+), 28 deletions(-)
> > > > 
> > > > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > > > index ff4e78117b31..df034889d053 100644
> > > > --- a/drivers/cxl/Kconfig
> > > > +++ b/drivers/cxl/Kconfig
> > > > @@ -121,6 +121,17 @@ config CXL_REGION
> > > >    	  If unsure say 'y'
> > > > +config CXL_DCD
> > > > +	bool "CXL: DCD Support"
> > > > +	default CXL_BUS
> > > > +	depends on CXL_REGION
> > > > +	help
> > > > +	  Enable the CXL core to provision CXL DCD regions.
> > > > +	  CXL devices optionally support dynamic capacity and DCD region
> > > > +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> > > > +
> > > > +	  If unsure say 'y'
> > > > +
> > > >    config CXL_REGION_INVALIDATION_TEST
> > > >    	bool "CXL: Region Cache Management Bypass (TEST)"
> > > >    	depends on CXL_REGION
> > > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> > > > index 27f0968449de..725700ab5973 100644
> > > > --- a/drivers/cxl/core/core.h
> > > > +++ b/drivers/cxl/core/core.h
> > > > @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
> > > >    extern struct attribute_group cxl_base_attribute_group;
> > > > +#ifdef CONFIG_CXL_DCD
> > > > +extern struct device_attribute dev_attr_create_dc_region;
> > > > +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> > > > +#else
> > > > +#define SET_CXL_DC_REGION_ATTR(x)
> > > > +#endif
> > > > +
> > > >    #ifdef CONFIG_CXL_REGION
> > > >    extern struct device_attribute dev_attr_create_pmem_region;
> > > >    extern struct device_attribute dev_attr_create_ram_region;
> > > > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> > > > index 514d30131d92..29649b47d177 100644
> > > > --- a/drivers/cxl/core/hdm.c
> > > > +++ b/drivers/cxl/core/hdm.c
> > > > @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> > > >    	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > > >    	struct resource *res = cxled->dpa_res;
> > > >    	resource_size_t skip_start;
> > > > +	resource_size_t skipped = cxled->skip;
> > > >    	lockdep_assert_held_write(&cxl_dpa_rwsem);
> > > >    	/* save @skip_start, before @res is released */
> > > > -	skip_start = res->start - cxled->skip;
> > > > +	skip_start = res->start - skipped;
> > > >    	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> > > > -	if (cxled->skip)
> > > > -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> > > > +	if (cxled->skip != 0) {
> > > > +		while (skipped != 0) {
> > > > +			res = xa_load(&cxled->skip_res, skip_start);
> > > > +			__release_region(&cxlds->dpa_res, skip_start,
> > > > +							resource_size(res));
> > > > +			xa_erase(&cxled->skip_res, skip_start);
> > > > +			skip_start += resource_size(res);
> > > > +			skipped -= resource_size(res);
> > > > +			}
> > > > +	}
> > > >    	cxled->skip = 0;
> > > >    	cxled->dpa_res = NULL;
> > > >    	put_device(&cxled->cxld.dev);
> > > > @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> > > >    	__cxl_dpa_release(cxled);
> > > >    }
> > > > +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> > > > +{
> > > > +	int index = 0;
> > > > +
> > > > +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> > > > +		if (mode == i)
> > > > +			return index;
> > > > +		index++;
> > > > +	}
> > > > +
> > > > +	return -EINVAL;
> > > > +}
> > > > +
> > > >    static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > > >    			     resource_size_t base, resource_size_t len,
> > > >    			     resource_size_t skipped)
> > > > @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > > >    	struct cxl_port *port = cxled_to_port(cxled);
> > > >    	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > > >    	struct device *dev = &port->dev;
> > > > +	struct device *ed_dev = &cxled->cxld.dev;
> > > > +	struct resource *dpa_res = &cxlds->dpa_res;
> > > > +	resource_size_t skip_len = 0;
> > > >    	struct resource *res;
> > > > +	int rc, index;
> > > >    	lockdep_assert_held_write(&cxl_dpa_rwsem);
> > > > @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > > >    	}
> > > >    	if (skipped) {
> > > > -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> > > > -				       dev_name(&cxled->cxld.dev), 0);
> > > > -		if (!res) {
> > > > -			dev_dbg(dev,
> > > > -				"decoder%d.%d: failed to reserve skipped space\n",
> > > > -				port->id, cxled->cxld.id);
> > > > -			return -EBUSY;
> > > > +		resource_size_t skip_base = base - skipped;
> > > > +
> > > > +		if (decoder_mode_is_dc(cxled->mode)) {
> > > 
> > > Maybe move this entire block to a helper function to reduce the size of
> > > the current function and reduce indent levels and improve readability?
> > 
> > :-/
> > 
> > I'll work on breaking it out more.  The logic here is getting kind of
> > crazy.
Navneet - yeah, it's like splitting the skip into ram, pmem, and dc
regions and the gaps between the regions. A helper can be done.
> > 
> > > 
> > > > +			if (resource_size(&cxlds->ram_res) &&
> > > > +					skip_base <= cxlds->ram_res.end) {
> > > > +				skip_len = cxlds->ram_res.end - skip_base + 1;
> > > > +				res = __request_region(dpa_res, skip_base,
> > > > +						skip_len, dev_name(ed_dev), 0);
> > > > +				if (!res)
> > > > +					goto error;
> > > > +
> > > > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > > > +								GFP_KERNEL);
> > > > +				skip_base += skip_len;
> > > > +			}
> > > > +
> > > > +			if (resource_size(&cxlds->ram_res) &&
> >                                                    ^^^^^^^
> > 						  pmem_res?
> > 
> > > > +					skip_base <= cxlds->pmem_res.end) {
> > 
> > The 2 if statements here are almost exactly the same.  To the point I
> > wonder if there is a bug.
> > 
> > Navneet,
> > 
> > Why does the code check ram_res the second time but go on to use pmem_res
> > in the block?
> > 
> > > > +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> > > > +				res = __request_region(dpa_res, skip_base,
> > > > +						skip_len, dev_name(ed_dev), 0);
> > > > +				if (!res)
> > > > +					goto error;
> > > > +
> > > > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > > > +								GFP_KERNEL);
> > > > +				skip_base += skip_len;
> > > > +			}
> > > > +
> > > > +			index = dc_mode_to_region_index(cxled->mode);
> > > > +			for (int i = 0; i <= index; i++) {
> > > > +				struct resource *dcr = &cxlds->dc_res[i];
> > > > +
> > > > +				if (skip_base < dcr->start) {
> > > > +					skip_len = dcr->start - skip_base;
> > > > +					res = __request_region(dpa_res,
> > > > +							skip_base, skip_len,
> > > > +							dev_name(ed_dev), 0);
> > > > +					if (!res)
> > > > +						goto error;
> > > > +
> > > > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > > > +							res, GFP_KERNEL);
> > > > +					skip_base += skip_len;
> > > > +				}
> > > > +
> > > > +				if (skip_base == base) {
> > > > +					dev_dbg(dev, "skip done!\n");
> > > > +					break;
> > > > +				}
> > > > +
> > > > +				if (resource_size(dcr) &&
> > > > +						skip_base <= dcr->end) {
> > > > +					if (skip_base > base)
> > > > +						dev_err(dev, "Skip error\n");
> > > > +
> > > > +					skip_len = dcr->end - skip_base + 1;
> > > > +					res = __request_region(dpa_res, skip_base,
> > > > +							skip_len,
> > > > +							dev_name(ed_dev), 0);
> > > > +					if (!res)
> > > > +						goto error;
> > > > +
> > > > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > > > +							res, GFP_KERNEL);
> > > > +					skip_base += skip_len;
> > > > +				}
> > > > +			}
> > > > +		} else	{
> > > > +			res = __request_region(dpa_res, base - skipped, skipped,
> > > > +							dev_name(ed_dev), 0);
> > > > +			if (!res)
> > > > +				goto error;
> > > > +
> > > > +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> > > > +								GFP_KERNEL);
> > > >    		}
> > > >    	}
> > > > -	res = __request_region(&cxlds->dpa_res, base, len,
> > > > -			       dev_name(&cxled->cxld.dev), 0);
> > > > +
> > > > +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
> > > >    	if (!res) {
> > > >    		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> > > > -			port->id, cxled->cxld.id);
> > > > -		if (skipped)
> > > > -			__release_region(&cxlds->dpa_res, base - skipped,
> > > > -					 skipped);
> > > > +				port->id, cxled->cxld.id);
> > > > +		if (skipped) {
> > > > +			resource_size_t skip_base = base - skipped;
> > > > +
> > > > +			while (skipped != 0) {
> > > > +				if (skip_base > base)
> > > > +					dev_err(dev, "Skip error\n");
> > > > +
> > > > +				res = xa_load(&cxled->skip_res, skip_base);
> > > > +				__release_region(dpa_res, skip_base,
> > > > +							resource_size(res));
> > > > +				xa_erase(&cxled->skip_res, skip_base);
> > > > +				skip_base += resource_size(res);
> > > > +				skipped -= resource_size(res);
> > > > +			}
> > > > +		}
> > > >    		return -EBUSY;
> > > >    	}
> > > >    	cxled->dpa_res = res;
> > > >    	cxled->skip = skipped;
> > > > +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> > > > +		int index = dc_mode_to_region_index(mode);
> > > > +
> > > > +		if (resource_contains(&cxlds->dc_res[index], res)) {
> > > > +			cxled->mode = mode;
> > > > +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> > > > +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> > > > +			goto success;
> > > > +		}
> > > > +	}
> > > 
> > > This block should only happen if decoder_mode_is_dc() right? If that's
> > > the case, you might be able to refactor it so the 'goto success' isn't
> > > necessary.
> > 
> > I'll check.  I looked through this code a couple of times in my review
> > before posting because I'm not 100% sure I want to see 8 different modes
> > DC decoders and regions.
> > 
> > I think the 'mode' should be 'DC' with an index in the endpoint decoder to
> > map DC region that decoder is mapping.  But that change was much bigger to
> > Navneets code and I wanted to see how others felt about having DC0 - DC7
> > modes.  My compromise was creating decoder_mode_is_dc().
> > 
Navneet - Discussed with Dan before splitting the modes into dc0-dc7.
The intent is to keep the Linux definition simple and enforce one decoder per DC region.
Each DC region will have its own DSMAS entry. The primary reason to
have multiple DC regions is to have different performance properties.


> > > 
> > > >    	if (resource_contains(&cxlds->pmem_res, res))
> > > >    		cxled->mode = CXL_DECODER_PMEM;
> > > >    	else if (resource_contains(&cxlds->ram_res, res))
> > > > @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > > >    		cxled->mode = CXL_DECODER_MIXED;
> > > >    	}
> > > > +success:
> > > >    	port->hdm_end++;
> > > >    	get_device(&cxled->cxld.dev);
> > > >    	return 0;
> > > > +
> > > > +error:
> > > > +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> > > > +			port->id, cxled->cxld.id);
> > > > +	return -EBUSY;
> > > > +
> > > >    }
> > > >    int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > > > @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
> > > >    	switch (mode) {
> > > >    	case CXL_DECODER_RAM:
> > > >    	case CXL_DECODER_PMEM:
> > > > +	case CXL_DECODER_DC0:
> > > > +	case CXL_DECODER_DC1:
> > > > +	case CXL_DECODER_DC2:
> > > > +	case CXL_DECODER_DC3:
> > > > +	case CXL_DECODER_DC4:
> > > > +	case CXL_DECODER_DC5:
> > > > +	case CXL_DECODER_DC6:
> > > > +	case CXL_DECODER_DC7:
> > 
> > For example this seems very hacky...
> 
> Not sure if it helps, but you can always do:
> case CXL_DECODER_DC0 ... CXL_DECODER_DC7:
> 
> DJ
> 
Navneet - Definitely, thanks.
> > 
> > [snip]
> > 
> > > > +/*
> > > > + * The region can not be manged by CXL if any portion of
> > > > + * it is already online as 'System RAM'
> > > > + */
> > > > +static bool region_is_system_ram(struct cxl_region *cxlr,
> > > > +				 struct cxl_region_params *p)
> > > > +{
> > > > +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> > > > +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > > > +				    p->res->start, p->res->end, cxlr,
> > > > +				    is_system_ram) > 0);
> > > > +}
> > > > +
> > > >    static int cxl_region_probe(struct device *dev)
> > > >    {
> > > >    	struct cxl_region *cxlr = to_cxl_region(dev);
> > > > @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
> > > >    	case CXL_DECODER_PMEM:
> > > >    		return devm_cxl_add_pmem_region(cxlr);
> > > >    	case CXL_DECODER_RAM:
> > > > -		/*
> > > > -		 * The region can not be manged by CXL if any portion of
> > > > -		 * it is already online as 'System RAM'
> > > > -		 */
> > > > -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> > > > -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > > > -					p->res->start, p->res->end, cxlr,
> > > > -					is_system_ram) > 0)
> > > > +		if (region_is_system_ram(cxlr, p))
> > > 
> > > Maybe split this change out as a prep patch before the current patch.
> > 
> > That seems reasonable.  But the patch is not so large and the
> > justification for creating a helper is that we need this same check for DC
> > regions.  So it seemed ok to leave it like this.  Let me see about
> > splitting it out.
> > 
> > [snip]
> > 
> > > > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > > > index ccdf8de85bd5..eb5eb81bfbd7 100644
> > > > --- a/drivers/dax/cxl.c
> > > > +++ b/drivers/dax/cxl.c
> > > > @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
> > > >    	if (!dax_region)
> > > >    		return -ENOMEM;
> > > > +	if (decoder_mode_is_dc(cxlr->mode))
> > > > +		return 0;
> > > > +
> > > >    	data = (struct dev_dax_data) {
> > > >    		.dax_region = dax_region,
> > > >    		.id = -1,
> > > >    		.size = range_len(&cxlr_dax->hpa_range),
> > > >    	};
> > > > +
> > > 
> > > Stray blank line?
> > 
> > Opps!  Fixed!
> > 
> > Ira
Alison Schofield June 16, 2023, 3:56 p.m. UTC | #8
On Thu, Jun 15, 2023 at 07:06:15PM -0700, Ira Weiny wrote:
> Alison Schofield wrote:
> > On Wed, Jun 14, 2023 at 12:16:29PM -0700, Ira Weiny wrote:
> > > From: Navneet Singh <navneet.singh@intel.com>
> > > 
> > > CXL devices optionally support dynamic capacity. CXL Regions must be
> > > created to access this capacity.
> > > 
> > > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > > which are added to that region.
> > 
> > This is a lot in one patch, especially where it weaves in and out of
> > existing code. I'm wondering if this can be introduced in smaller
> > pieces (patches). An introductory patch explaining the DC DPA 
> > allocations might be a useful chunk to pull forward. 
> 
> The patch is < 800 lines long.  And would be closer to 700 lines if there
> were not 8 different 'modes' for the various DC regions.
> 
> It is also very self contained in that it implements the region creation
> for DC DPAs fully.  And I know that Dan prefers patches larger if they are
> all part of the same functionality.
> 
> Dan?

Ira,
I found the patch difficult to review, and hope that it can be 
presented in a way that is easier to review. I don't know if
that results in separate patches.

It's hard for me to imagine that it wasn't conceived of in
smaller chunks that could be presented, but I don't know.

I'll go back and review the patch now, and point out where
I found it difficult to follow.

Alison




> 
> Ira
Alison Schofield June 16, 2023, 4:51 p.m. UTC | #9
On Wed, Jun 14, 2023 at 12:16:29PM -0700, Ira Weiny wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity. CXL Regions must be
> created to access this capacity.
> 
> Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> Dynamic Capacity decoder mode which targets dynamic capacity on devices
> which are added to that region.
> 
> Below are the steps to create and delete dynamic capacity region0
> (example).
> 
>     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
>     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
>     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
>     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> 
>     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
>     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> 
>     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
>     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
>     echo 1 > /sys/bus/cxl/devices/$region/commit
>     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> 
>     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>

Hi,
I took another pass at this and offered more feedback.
I do think that if the big part - the cxl_dpa_reserve()
was more 'chunkified' it would be easier to review for
actual functionality.

I'd also like to see the commit log be a bit more specific
in enumerating the things this patch intends to do.

Many of my comments are about style. Some of them checkpatch --strict
would call out, and some are addressed in the kernel coding
style document - Documentation/process/coding-style.rst

But really, my goal is that when this code merges and
I scroll through a file, say region.c, I see a consistent
coding style. I shouldn't be able to notice that oh, Dan
wrote that, and Ira that, and Navneet wrote that piece.

I think it's important because differences in style distract
from focusing on the functionality of the code.

(off my soap box now ;)

Alison


> 
> ---
> [iweiny: fixups]
> [iweiny: remove unused CXL_DC_REGION_MODE macro]
> [iweiny: Make dc_mode_to_region_index static]
> [iweiny: simplify <sysfs>/create_dc_region]
> [iweiny: introduce decoder_mode_is_dc]
> [djbw: fixups, no sign-off: preview only]
> ---
>  drivers/cxl/Kconfig       |  11 +++
>  drivers/cxl/core/core.h   |   7 ++
>  drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
>  drivers/cxl/core/port.c   |  18 ++++
>  drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
>  drivers/cxl/cxl.h         |  28 ++++++
>  drivers/dax/cxl.c         |   4 +
>  7 files changed, 409 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index ff4e78117b31..df034889d053 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -121,6 +121,17 @@ config CXL_REGION
>  
>  	  If unsure say 'y'
>  
> +config CXL_DCD
> +	bool "CXL: DCD Support"

"CXL DCD: Dynamic Capacity Device Support"
is more in line with others in this file, and expands the acronym one time.

> +	default CXL_BUS
> +	depends on CXL_REGION
> +	help
> +	  Enable the CXL core to provision CXL DCD regions.
> +	  CXL devices optionally support dynamic capacity and DCD region
> +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> +
> +	  If unsure say 'y'
> +
>  config CXL_REGION_INVALIDATION_TEST
>  	bool "CXL: Region Cache Management Bypass (TEST)"
>  	depends on CXL_REGION
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 27f0968449de..725700ab5973 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
>  
>  extern struct attribute_group cxl_base_attribute_group;
>  
> +#ifdef CONFIG_CXL_DCD
> +extern struct device_attribute dev_attr_create_dc_region;
> +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> +#else
> +#define SET_CXL_DC_REGION_ATTR(x)
> +#endif
> +
>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 514d30131d92..29649b47d177 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct resource *res = cxled->dpa_res;
>  	resource_size_t skip_start;
> +	resource_size_t skipped = cxled->skip;

Reverse xmas tree ordering for the declarations.

>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
>  	/* save @skip_start, before @res is released */
> -	skip_start = res->start - cxled->skip;
> +	skip_start = res->start - skipped;

Why did the assignment of skip_start need to change here?

>  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> -	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +	if (cxled->skip != 0) {
> +		while (skipped != 0) {
> +			res = xa_load(&cxled->skip_res, skip_start);
> +			__release_region(&cxlds->dpa_res, skip_start,
> +							resource_size(res));

The above appears poorly aligned.

> +			xa_erase(&cxled->skip_res, skip_start);
> +			skip_start += resource_size(res);
> +			skipped -= resource_size(res);
> +			}

This bracket appears poorly aligned.

> +	}
>  	cxled->skip = 0;
>  	cxled->dpa_res = NULL;
>  	put_device(&cxled->cxld.dev);
> @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	__cxl_dpa_release(cxled);
>  }
>  
> +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> +{
> +	int index = 0;
> +
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		if (mode == i)
> +			return index;
> +		index++;
> +	}
> +
> +	return -EINVAL;
> +}
> +
>  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  			     resource_size_t base, resource_size_t len,
>  			     resource_size_t skipped)
> @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	struct cxl_port *port = cxled_to_port(cxled);
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct device *dev = &port->dev;
> +	struct device *ed_dev = &cxled->cxld.dev;
> +	struct resource *dpa_res = &cxlds->dpa_res;
> +	resource_size_t skip_len = 0;
>  	struct resource *res;
> +	int rc, index;
>  

Above poorly aligned.

>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
> @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	}
>  
>  	if (skipped) {

This has excessive indentation; what started out as a monster
'if (skipped)' is begging for a refactoring.

I find it odd that the DCD case got inserted before the 'default'
or non-DCD case here.


> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> -				       dev_name(&cxled->cxld.dev), 0);
> -		if (!res) {
> -			dev_dbg(dev,
> -				"decoder%d.%d: failed to reserve skipped space\n",
> -				port->id, cxled->cxld.id);
> -			return -EBUSY;
> +		resource_size_t skip_base = base - skipped;
> +
> +		if (decoder_mode_is_dc(cxled->mode)) {

This may be cleaner to introduce as a separate function for
handling the decoder_mode_is_dc() case.

> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->ram_res.end) {
> +				skip_len = cxlds->ram_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->pmem_res.end) {
> +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}

The above 2 if (resource_size()) cases have redundant code.
Pull it out, refactor.

> +
> +			index = dc_mode_to_region_index(cxled->mode);
> +			for (int i = 0; i <= index; i++) {
> +				struct resource *dcr = &cxlds->dc_res[i];
> +
> +				if (skip_base < dcr->start) {
> +					skip_len = dcr->start - skip_base;
> +					res = __request_region(dpa_res,
> +							skip_base, skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +
> +				if (skip_base == base) {
> +					dev_dbg(dev, "skip done!\n");
> +					break;
> +				}
> +
> +				if (resource_size(dcr) &&
> +						skip_base <= dcr->end) {
> +					if (skip_base > base)
> +						dev_err(dev, "Skip error\n");
> +
> +					skip_len = dcr->end - skip_base + 1;
> +					res = __request_region(dpa_res, skip_base,
> +							skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +			}


And, below, we are back to the original code.
This would be more readable, reviewable if the DCD support was
added in separate functions that are then called from here.

> +		} else	{
> +			res = __request_region(dpa_res, base - skipped, skipped,
> +							dev_name(ed_dev), 0);
> +			if (!res)
> +				goto error;
> +
> +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
>  		}
>  	}
> -	res = __request_region(&cxlds->dpa_res, base, len,
> -			       dev_name(&cxled->cxld.dev), 0);
> +
> +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
>  	if (!res) {
>  		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> -			port->id, cxled->cxld.id);

General comment - look over the dev_dbg() messages and consider placing
them after the code. I recall others that were needlessly placed between
lines of code.


> -		if (skipped)
> -			__release_region(&cxlds->dpa_res, base - skipped,
> -					 skipped);
> +				port->id, cxled->cxld.id);
> +		if (skipped) {
> +			resource_size_t skip_base = base - skipped;
> +
> +			while (skipped != 0) {
> +				if (skip_base > base)
> +					dev_err(dev, "Skip error\n");
> +
> +				res = xa_load(&cxled->skip_res, skip_base);
> +				__release_region(dpa_res, skip_base,
> +							resource_size(res));
> +				xa_erase(&cxled->skip_res, skip_base);
> +				skip_base += resource_size(res);
> +				skipped -= resource_size(res);
> +			}
> +		}

		Can that debug message go here ?

>  		return -EBUSY;
>  	}
>  	cxled->dpa_res = res;
>  	cxled->skip = skipped;
>  
> +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> +		int index = dc_mode_to_region_index(mode);
> +
> +		if (resource_contains(&cxlds->dc_res[index], res)) {
> +			cxled->mode = mode;
> +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> +				cxled->cxld.id, cxled->dpa_res, cxled->mode);

Can this move to ....


> +			goto success;
> +		}
> +	}
>  	if (resource_contains(&cxlds->pmem_res, res))
>  		cxled->mode = CXL_DECODER_PMEM;
>  	else if (resource_contains(&cxlds->ram_res, res))
> @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  		cxled->mode = CXL_DECODER_MIXED;
>  	}
>  
> +success:
>  	port->hdm_end++;
>  	get_device(&cxled->cxld.dev);

here...dev_dbg() success message. That pairs it nicely with the
error message below.

>  	return 0;
> +
> +error:
> +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> +			port->id, cxled->cxld.id);
> +	return -EBUSY;
> +
>  }
>  
>  int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_dbg(dev, "unsupported mode: %d\n", mode);
> @@ -456,6 +588,16 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  		goto out;
>  	}
>  
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		int index = dc_mode_to_region_index(i);
> +
> +		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
> +			dev_dbg(dev, "no available dynamic capacity\n");

I see this one is following the pattern in the function :)


> +			rc = -ENXIO;
> +			goto out;
> +		}
> +	}
> +
>  	cxled->mode = mode;
>  	rc = 0;
>  out:
> @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,

Hmmm...I don't have cxl_dpa_freespace() in my cxl/next? Where's that?


>  					 resource_size_t *skip_out)
>  {
>  	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> -	resource_size_t free_ram_start, free_pmem_start;
> +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct device *dev = &cxled->cxld.dev;
>  	resource_size_t start, avail, skip;
>  	struct resource *p, *last;
> +	int index;

Why break the alignment above?

>  
>  	lockdep_assert_held(&cxl_dpa_rwsem);
>  
> @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  	else
>  		free_pmem_start = cxlds->pmem_res.start;
>  
> +	/*
> +	 * One HDM Decoder per DC region to map memory with different
> +	 * DSMAS entry.
> +	 */

It seems this comment is missing a verb. Why not align?

> +	index = dc_mode_to_region_index(cxled->mode);
> +	if (index >= 0) {
> +		if (cxlds->dc_res[index].child) {
> +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n"

s/allocated/allocate

,
> +					index);
> +			return -EINVAL;
> +		}
> +		free_dc_start = cxlds->dc_res[index].start;
> +	}
> +
>  	if (cxled->mode == CXL_DECODER_RAM) {
>  		start = free_ram_start;
>  		avail = cxlds->ram_res.end - start + 1;
> @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  		else
>  			skip_end = start - 1;
>  		skip = skip_end - skip_start + 1;
> +	} else if (decoder_mode_is_dc(cxled->mode)) {
> +		resource_size_t skip_start, skip_end;
> +
> +		start = free_dc_start;
> +		avail = cxlds->dc_res[index].end - start + 1;
> +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> +			skip_start = free_ram_start;
> +		else
> +			skip_start = free_pmem_start;
> +		/*
> +		 * If some dc region is already mapped, then that allocation

maybe s/some/any ?

> +		 * already handled the RAM and PMEM skip.Check for DC region
> +		 * skip.
> +		 */
> +		for (int i = index - 1; i >= 0 ; i--) {
> +			if (cxlds->dc_res[i].child) {
> +				skip_start = cxlds->dc_res[i].child->end + 1;
> +				break;
> +			}
> +		}
> +
> +		skip_end = start - 1;
> +		skip = skip_end - skip_start + 1;
>  	} else {
>  		dev_dbg(cxled_dev(cxled), "mode not set\n");
>  		avail = 0;
> @@ -548,10 +729,25 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>  
>  	avail = cxl_dpa_freespace(cxled, &start, &skip);
>  
> +	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
> +						start, size, skip);
>  	if (size > avail) {
> +		static const char * const names[] = {
> +			[CXL_DECODER_NONE] = "none",
> +			[CXL_DECODER_RAM] = "ram",
> +			[CXL_DECODER_PMEM] = "pmem",
> +			[CXL_DECODER_MIXED] = "mixed",
> +			[CXL_DECODER_DC0] = "dc0",
> +			[CXL_DECODER_DC1] = "dc1",
> +			[CXL_DECODER_DC2] = "dc2",
> +			[CXL_DECODER_DC3] = "dc3",
> +			[CXL_DECODER_DC4] = "dc4",
> +			[CXL_DECODER_DC5] = "dc5",
> +			[CXL_DECODER_DC6] = "dc6",
> +			[CXL_DECODER_DC7] = "dc7",
> +		};
>  		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
> -			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> -			&avail);
> +			names[cxled->mode], &avail);
>  		rc = -ENOSPC;
>  		goto out;
>  	}
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 5e21b53362e6..a1a98aba24ed 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -195,6 +195,22 @@ static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
>  		mode = CXL_DECODER_PMEM;
>  	else if (sysfs_streq(buf, "ram"))
>  		mode = CXL_DECODER_RAM;
> +	else if (sysfs_streq(buf, "dc0"))
> +		mode = CXL_DECODER_DC0;
> +	else if (sysfs_streq(buf, "dc1"))
> +		mode = CXL_DECODER_DC1;
> +	else if (sysfs_streq(buf, "dc2"))
> +		mode = CXL_DECODER_DC2;
> +	else if (sysfs_streq(buf, "dc3"))
> +		mode = CXL_DECODER_DC3;
> +	else if (sysfs_streq(buf, "dc4"))
> +		mode = CXL_DECODER_DC4;
> +	else if (sysfs_streq(buf, "dc5"))
> +		mode = CXL_DECODER_DC5;
> +	else if (sysfs_streq(buf, "dc6"))
> +		mode = CXL_DECODER_DC6;
> +	else if (sysfs_streq(buf, "dc7"))
> +		mode = CXL_DECODER_DC7;
>  	else
>  		return -EINVAL;
>  
> @@ -296,6 +312,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
>  	&dev_attr_target_list.attr,
>  	SET_CXL_REGION_ATTR(create_pmem_region)
>  	SET_CXL_REGION_ATTR(create_ram_region)
> +	SET_CXL_DC_REGION_ATTR(create_dc_region)
>  	SET_CXL_REGION_ATTR(delete_region)
>  	NULL,
>  };
> @@ -1691,6 +1708,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
>  		return ERR_PTR(-ENOMEM);
>  
>  	cxled->pos = -1;
> +	xa_init(&cxled->skip_res);
>  	cxld = &cxled->cxld;
>  	rc = cxl_decoder_init(port, cxld);
>  	if (rc)	 {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 543c4499379e..144232c8305e 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
>  	lockdep_assert_held_write(&cxl_region_rwsem);
>  	lockdep_assert_held_read(&cxl_dpa_rwsem);
>  
> -	if (cxled->mode != cxlr->mode) {
> +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
>  		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
>  			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
>  		return -EINVAL;
> @@ -2211,6 +2211,14 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
> @@ -2321,6 +2329,43 @@ static ssize_t create_ram_region_store(struct device *dev,
>  }
>  DEVICE_ATTR_RW(create_ram_region);
>  
> +static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
> +				const char *buf, enum cxl_decoder_mode mode,
> +				size_t len)
> +{
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	/*
> +	 * All DC regions use decoder mode DC0 as the region does not need the
> +	 * index information
> +	 */
> +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> +				CXL_DECODER_DC0, len);
> +}
> +DEVICE_ATTR_RW(create_dc_region);
> +
>  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>  			   char *buf)
>  {
> @@ -2799,6 +2844,61 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>  	return rc;
>  }
>  
> +static void cxl_dc_region_release(void *data)
> +{
> +	struct cxl_region *cxlr = data;
> +	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
> +
> +	xa_destroy(&cxlr_dc->dax_dev_list);
> +	kfree(cxlr_dc);
> +}
> +
> +static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
> +{
> +	struct cxl_dc_region *cxlr_dc;
> +	struct cxl_dax_region *cxlr_dax;
> +	struct device *dev;
> +	int rc = 0;
> +
> +	cxlr_dax = cxl_dax_region_alloc(cxlr);
> +	if (IS_ERR(cxlr_dax))
> +		return PTR_ERR(cxlr_dax);
> +
> +	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
> +	if (!cxlr_dc) {
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +
> +	dev = &cxlr_dax->dev;
> +	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> +		dev_name(dev));
> +
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
> +					cxlr_dax);
> +	if (rc)
> +		goto err;
> +
> +	cxlr_dc->cxlr_dax = cxlr_dax;
> +	xa_init(&cxlr_dc->dax_dev_list);
> +	cxlr->cxlr_dc = cxlr_dc;
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
> +	if (!rc)
> +		return 0;
> +err:
> +	put_device(dev);
> +	kfree(cxlr_dc);
> +	return rc;
> +}
> +
>  static int match_decoder_by_range(struct device *dev, void *data)
>  {
>  	struct range *r1, *r2 = data;
> @@ -3140,6 +3240,19 @@ static int is_system_ram(struct resource *res, void *arg)
>  	return 1;
>  }
>  
> +/*
> + * The region can not be manged by CXL if any portion of
> + * it is already online as 'System RAM'
> + */
> +static bool region_is_system_ram(struct cxl_region *cxlr,
> +				 struct cxl_region_params *p)
> +{
> +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> +				    p->res->start, p->res->end, cxlr,
> +				    is_system_ram) > 0);
> +}
> +
>  static int cxl_region_probe(struct device *dev)
>  {
>  	struct cxl_region *cxlr = to_cxl_region(dev);
> @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
>  	case CXL_DECODER_PMEM:
>  		return devm_cxl_add_pmem_region(cxlr);
>  	case CXL_DECODER_RAM:
> -		/*
> -		 * The region can not be manged by CXL if any portion of
> -		 * it is already online as 'System RAM'
> -		 */
> -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> -					p->res->start, p->res->end, cxlr,
> -					is_system_ram) > 0)
> +		if (region_is_system_ram(cxlr, p))
>  			return 0;
>  
>  		/*
> @@ -3193,6 +3299,17 @@ static int cxl_region_probe(struct device *dev)
>  
>  		/* HDM-H routes to device-dax */
>  		return devm_cxl_add_dax_region(cxlr);
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
> +		if (region_is_system_ram(cxlr, p))
> +			return 0;
> +		return devm_cxl_add_dc_region(cxlr);
>  	default:
>  		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
>  			cxlr->mode);
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 8400af85d99f..7ac1237938b7 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -335,6 +335,14 @@ enum cxl_decoder_mode {
>  	CXL_DECODER_NONE,
>  	CXL_DECODER_RAM,
>  	CXL_DECODER_PMEM,
> +	CXL_DECODER_DC0,
> +	CXL_DECODER_DC1,
> +	CXL_DECODER_DC2,
> +	CXL_DECODER_DC3,
> +	CXL_DECODER_DC4,
> +	CXL_DECODER_DC5,
> +	CXL_DECODER_DC6,
> +	CXL_DECODER_DC7,
>  	CXL_DECODER_MIXED,
>  	CXL_DECODER_DEAD,
>  };
> @@ -345,6 +353,14 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  		[CXL_DECODER_NONE] = "none",
>  		[CXL_DECODER_RAM] = "ram",
>  		[CXL_DECODER_PMEM] = "pmem",
> +		[CXL_DECODER_DC0] = "dc0",
> +		[CXL_DECODER_DC1] = "dc1",
> +		[CXL_DECODER_DC2] = "dc2",
> +		[CXL_DECODER_DC3] = "dc3",
> +		[CXL_DECODER_DC4] = "dc4",
> +		[CXL_DECODER_DC5] = "dc5",
> +		[CXL_DECODER_DC6] = "dc6",
> +		[CXL_DECODER_DC7] = "dc7",
>  		[CXL_DECODER_MIXED] = "mixed",
>  	};
>  
> @@ -353,6 +369,11 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  	return "mixed";
>  }
>  
> +static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
> +{
> +	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
> +}
> +
>  /*
>   * Track whether this decoder is reserved for region autodiscovery, or
>   * free for userspace provisioning.
> @@ -375,6 +396,7 @@ struct cxl_endpoint_decoder {
>  	struct cxl_decoder cxld;
>  	struct resource *dpa_res;
>  	resource_size_t skip;
> +	struct xarray skip_res;
>  	enum cxl_decoder_mode mode;
>  	enum cxl_decoder_state state;
>  	int pos;
> @@ -475,6 +497,11 @@ struct cxl_region_params {
>   */
>  #define CXL_REGION_F_AUTO 1
>  
> +struct cxl_dc_region {
> +	struct xarray dax_dev_list;
> +	struct cxl_dax_region *cxlr_dax;
> +};
> +
>  /**
>   * struct cxl_region - CXL region
>   * @dev: This region's device
> @@ -493,6 +520,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dc_region *cxlr_dc;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index ccdf8de85bd5..eb5eb81bfbd7 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	if (!dax_region)
>  		return -ENOMEM;
>  
> +	if (decoder_mode_is_dc(cxlr->mode))
> +		return 0;
> +
>  	data = (struct dev_dax_data) {
>  		.dax_region = dax_region,
>  		.id = -1,
>  		.size = range_len(&cxlr_dax->hpa_range),
>  	};
> +
>  	dev_dax = devm_create_dev_dax(&data);
>  	if (IS_ERR(dev_dax))
>  		return PTR_ERR(dev_dax);
> 
> -- 
> 2.40.0
>
nifan@outlook.com June 20, 2023, 5:55 p.m. UTC | #10
The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity. CXL Regions must be
> created to access this capacity.
> 
> Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> Dynamic Capacity decoder mode which targets dynamic capacity on devices
> which are added to that region.
> 
> Below are the steps to create and delete dynamic capacity region0
> (example).
> 
>     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
>     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
>     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
>     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> 
>     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
>     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> 
>     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
>     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
>     echo 1 > /sys/bus/cxl/devices/$region/commit
>     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> 
>     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> ---
> [iweiny: fixups]
> [iweiny: remove unused CXL_DC_REGION_MODE macro]
> [iweiny: Make dc_mode_to_region_index static]
> [iweiny: simplify <sysfs>/create_dc_region]
> [iweiny: introduce decoder_mode_is_dc]
> [djbw: fixups, no sign-off: preview only]
> ---
>  drivers/cxl/Kconfig       |  11 +++
>  drivers/cxl/core/core.h   |   7 ++
>  drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
>  drivers/cxl/core/port.c   |  18 ++++
>  drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
>  drivers/cxl/cxl.h         |  28 ++++++
>  drivers/dax/cxl.c         |   4 +
>  7 files changed, 409 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index ff4e78117b31..df034889d053 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -121,6 +121,17 @@ config CXL_REGION
>  
>  	  If unsure say 'y'
>  
> +config CXL_DCD
> +	bool "CXL: DCD Support"
> +	default CXL_BUS
> +	depends on CXL_REGION
> +	help
> +	  Enable the CXL core to provision CXL DCD regions.
> +	  CXL devices optionally support dynamic capacity and DCD region
> +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> +
> +	  If unsure say 'y'
> +
>  config CXL_REGION_INVALIDATION_TEST
>  	bool "CXL: Region Cache Management Bypass (TEST)"
>  	depends on CXL_REGION
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 27f0968449de..725700ab5973 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
>  
>  extern struct attribute_group cxl_base_attribute_group;
>  
> +#ifdef CONFIG_CXL_DCD
> +extern struct device_attribute dev_attr_create_dc_region;
> +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> +#else
> +#define SET_CXL_DC_REGION_ATTR(x)
> +#endif
> +
>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 514d30131d92..29649b47d177 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct resource *res = cxled->dpa_res;
>  	resource_size_t skip_start;
> +	resource_size_t skipped = cxled->skip;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
>  	/* save @skip_start, before @res is released */
> -	skip_start = res->start - cxled->skip;
> +	skip_start = res->start - skipped;
>  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> -	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +	if (cxled->skip != 0) {
> +		while (skipped != 0) {
> +			res = xa_load(&cxled->skip_res, skip_start);
> +			__release_region(&cxlds->dpa_res, skip_start,
> +							resource_size(res));
> +			xa_erase(&cxled->skip_res, skip_start);
> +			skip_start += resource_size(res);
> +			skipped -= resource_size(res);
> +			}
> +	}
>  	cxled->skip = 0;
>  	cxled->dpa_res = NULL;
>  	put_device(&cxled->cxld.dev);
> @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	__cxl_dpa_release(cxled);
>  }
>  
> +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> +{
> +	int index = 0;
> +
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		if (mode == i)
> +			return index;
> +		index++;
> +	}
> +
> +	return -EINVAL;
> +}
> +
>  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  			     resource_size_t base, resource_size_t len,
>  			     resource_size_t skipped)
> @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	struct cxl_port *port = cxled_to_port(cxled);
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct device *dev = &port->dev;
> +	struct device *ed_dev = &cxled->cxld.dev;
> +	struct resource *dpa_res = &cxlds->dpa_res;
> +	resource_size_t skip_len = 0;
>  	struct resource *res;
> +	int rc, index;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
> @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	}
>  
>  	if (skipped) {
> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> -				       dev_name(&cxled->cxld.dev), 0);
> -		if (!res) {
> -			dev_dbg(dev,
> -				"decoder%d.%d: failed to reserve skipped space\n",
> -				port->id, cxled->cxld.id);
> -			return -EBUSY;
> +		resource_size_t skip_base = base - skipped;
> +
> +		if (decoder_mode_is_dc(cxled->mode)) {
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->ram_res.end) {
> +				skip_len = cxlds->ram_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			if (resource_size(&cxlds->ram_res) &&
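Should the resource_size() check here be on cxlds->pmem_res rather than cxlds->ram_res? The block that follows skips over the pmem range.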

Fan
> +					skip_base <= cxlds->pmem_res.end) {
> +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			index = dc_mode_to_region_index(cxled->mode);
> +			for (int i = 0; i <= index; i++) {
> +				struct resource *dcr = &cxlds->dc_res[i];
> +
> +				if (skip_base < dcr->start) {
> +					skip_len = dcr->start - skip_base;
> +					res = __request_region(dpa_res,
> +							skip_base, skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +
> +				if (skip_base == base) {
> +					dev_dbg(dev, "skip done!\n");
> +					break;
> +				}
> +
> +				if (resource_size(dcr) &&
> +						skip_base <= dcr->end) {
> +					if (skip_base > base)
> +						dev_err(dev, "Skip error\n");
> +
> +					skip_len = dcr->end - skip_base + 1;
> +					res = __request_region(dpa_res, skip_base,
> +							skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +			}
> +		} else	{
> +			res = __request_region(dpa_res, base - skipped, skipped,
> +							dev_name(ed_dev), 0);
> +			if (!res)
> +				goto error;
> +
> +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
>  		}
>  	}
> -	res = __request_region(&cxlds->dpa_res, base, len,
> -			       dev_name(&cxled->cxld.dev), 0);
> +
> +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
>  	if (!res) {
>  		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> -			port->id, cxled->cxld.id);
> -		if (skipped)
> -			__release_region(&cxlds->dpa_res, base - skipped,
> -					 skipped);
> +				port->id, cxled->cxld.id);
> +		if (skipped) {
> +			resource_size_t skip_base = base - skipped;
> +
> +			while (skipped != 0) {
> +				if (skip_base > base)
> +					dev_err(dev, "Skip error\n");
> +
> +				res = xa_load(&cxled->skip_res, skip_base);
> +				__release_region(dpa_res, skip_base,
> +							resource_size(res));
> +				xa_erase(&cxled->skip_res, skip_base);
> +				skip_base += resource_size(res);
> +				skipped -= resource_size(res);
> +			}
> +		}
>  		return -EBUSY;
>  	}
>  	cxled->dpa_res = res;
>  	cxled->skip = skipped;
>  
> +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> +		int index = dc_mode_to_region_index(mode);
> +
> +		if (resource_contains(&cxlds->dc_res[index], res)) {
> +			cxled->mode = mode;
> +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> +			goto success;
> +		}
> +	}
>  	if (resource_contains(&cxlds->pmem_res, res))
>  		cxled->mode = CXL_DECODER_PMEM;
>  	else if (resource_contains(&cxlds->ram_res, res))
> @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  		cxled->mode = CXL_DECODER_MIXED;
>  	}
>  
> +success:
>  	port->hdm_end++;
>  	get_device(&cxled->cxld.dev);
>  	return 0;
> +
> +error:
> +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> +			port->id, cxled->cxld.id);
> +	return -EBUSY;
> +
>  }
>  
>  int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_dbg(dev, "unsupported mode: %d\n", mode);
> @@ -456,6 +588,16 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  		goto out;
>  	}
>  
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		int index = dc_mode_to_region_index(i);
> +
> +		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
> +			dev_dbg(dev, "no available dynamic capacity\n");
> +			rc = -ENXIO;
> +			goto out;
> +		}
> +	}
> +
>  	cxled->mode = mode;
>  	rc = 0;
>  out:
> @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  					 resource_size_t *skip_out)
>  {
>  	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> -	resource_size_t free_ram_start, free_pmem_start;
> +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct device *dev = &cxled->cxld.dev;
>  	resource_size_t start, avail, skip;
>  	struct resource *p, *last;
> +	int index;
>  
>  	lockdep_assert_held(&cxl_dpa_rwsem);
>  
> @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  	else
>  		free_pmem_start = cxlds->pmem_res.start;
>  
> +	/*
> +	 * One HDM Decoder per DC region to map memory with different
> +	 * DSMAS entry.
> +	 */
> +	index = dc_mode_to_region_index(cxled->mode);
> +	if (index >= 0) {
> +		if (cxlds->dc_res[index].child) {
> +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n",
> +					index);
> +			return -EINVAL;
> +		}
> +		free_dc_start = cxlds->dc_res[index].start;
> +	}
> +
>  	if (cxled->mode == CXL_DECODER_RAM) {
>  		start = free_ram_start;
>  		avail = cxlds->ram_res.end - start + 1;
> @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  		else
>  			skip_end = start - 1;
>  		skip = skip_end - skip_start + 1;
> +	} else if (decoder_mode_is_dc(cxled->mode)) {
> +		resource_size_t skip_start, skip_end;
> +
> +		start = free_dc_start;
> +		avail = cxlds->dc_res[index].end - start + 1;
> +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> +			skip_start = free_ram_start;
> +		else
> +			skip_start = free_pmem_start;
> +		/*
> +		 * If some dc region is already mapped, then that allocation
> +		 * already handled the RAM and PMEM skip.Check for DC region
> +		 * skip.
> +		 */
> +		for (int i = index - 1; i >= 0 ; i--) {
> +			if (cxlds->dc_res[i].child) {
> +				skip_start = cxlds->dc_res[i].child->end + 1;
> +				break;
> +			}
> +		}
> +
> +		skip_end = start - 1;
> +		skip = skip_end - skip_start + 1;
>  	} else {
>  		dev_dbg(cxled_dev(cxled), "mode not set\n");
>  		avail = 0;
> @@ -548,10 +729,25 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>  
>  	avail = cxl_dpa_freespace(cxled, &start, &skip);
>  
> +	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
> +						start, size, skip);
>  	if (size > avail) {
> +		static const char * const names[] = {
> +			[CXL_DECODER_NONE] = "none",
> +			[CXL_DECODER_RAM] = "ram",
> +			[CXL_DECODER_PMEM] = "pmem",
> +			[CXL_DECODER_MIXED] = "mixed",
> +			[CXL_DECODER_DC0] = "dc0",
> +			[CXL_DECODER_DC1] = "dc1",
> +			[CXL_DECODER_DC2] = "dc2",
> +			[CXL_DECODER_DC3] = "dc3",
> +			[CXL_DECODER_DC4] = "dc4",
> +			[CXL_DECODER_DC5] = "dc5",
> +			[CXL_DECODER_DC6] = "dc6",
> +			[CXL_DECODER_DC7] = "dc7",
> +		};
>  		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
> -			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> -			&avail);
> +			names[cxled->mode], &avail);
>  		rc = -ENOSPC;
>  		goto out;
>  	}
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 5e21b53362e6..a1a98aba24ed 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -195,6 +195,22 @@ static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
>  		mode = CXL_DECODER_PMEM;
>  	else if (sysfs_streq(buf, "ram"))
>  		mode = CXL_DECODER_RAM;
> +	else if (sysfs_streq(buf, "dc0"))
> +		mode = CXL_DECODER_DC0;
> +	else if (sysfs_streq(buf, "dc1"))
> +		mode = CXL_DECODER_DC1;
> +	else if (sysfs_streq(buf, "dc2"))
> +		mode = CXL_DECODER_DC2;
> +	else if (sysfs_streq(buf, "dc3"))
> +		mode = CXL_DECODER_DC3;
> +	else if (sysfs_streq(buf, "dc4"))
> +		mode = CXL_DECODER_DC4;
> +	else if (sysfs_streq(buf, "dc5"))
> +		mode = CXL_DECODER_DC5;
> +	else if (sysfs_streq(buf, "dc6"))
> +		mode = CXL_DECODER_DC6;
> +	else if (sysfs_streq(buf, "dc7"))
> +		mode = CXL_DECODER_DC7;
>  	else
>  		return -EINVAL;
>  
> @@ -296,6 +312,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
>  	&dev_attr_target_list.attr,
>  	SET_CXL_REGION_ATTR(create_pmem_region)
>  	SET_CXL_REGION_ATTR(create_ram_region)
> +	SET_CXL_DC_REGION_ATTR(create_dc_region)
>  	SET_CXL_REGION_ATTR(delete_region)
>  	NULL,
>  };
> @@ -1691,6 +1708,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
>  		return ERR_PTR(-ENOMEM);
>  
>  	cxled->pos = -1;
> +	xa_init(&cxled->skip_res);
>  	cxld = &cxled->cxld;
>  	rc = cxl_decoder_init(port, cxld);
>  	if (rc)	 {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 543c4499379e..144232c8305e 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
>  	lockdep_assert_held_write(&cxl_region_rwsem);
>  	lockdep_assert_held_read(&cxl_dpa_rwsem);
>  
> -	if (cxled->mode != cxlr->mode) {
> +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
>  		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
>  			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
>  		return -EINVAL;
> @@ -2211,6 +2211,14 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
> @@ -2321,6 +2329,43 @@ static ssize_t create_ram_region_store(struct device *dev,
>  }
>  DEVICE_ATTR_RW(create_ram_region);
>  
> +static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
> +				const char *buf, enum cxl_decoder_mode mode,
> +				size_t len)
> +{
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	/*
> +	 * All DC regions use decoder mode DC0 as the region does not need the
> +	 * index information
> +	 */
> +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> +				CXL_DECODER_DC0, len);
> +}
> +DEVICE_ATTR_RW(create_dc_region);
> +
>  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>  			   char *buf)
>  {
> @@ -2799,6 +2844,61 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>  	return rc;
>  }
>  
> +static void cxl_dc_region_release(void *data)
> +{
> +	struct cxl_region *cxlr = data;
> +	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
> +
> +	xa_destroy(&cxlr_dc->dax_dev_list);
> +	kfree(cxlr_dc);
> +}
> +
> +static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
> +{
> +	struct cxl_dc_region *cxlr_dc;
> +	struct cxl_dax_region *cxlr_dax;
> +	struct device *dev;
> +	int rc = 0;
> +
> +	cxlr_dax = cxl_dax_region_alloc(cxlr);
> +	if (IS_ERR(cxlr_dax))
> +		return PTR_ERR(cxlr_dax);
> +
> +	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
> +	if (!cxlr_dc) {
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +
> +	dev = &cxlr_dax->dev;
> +	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> +		dev_name(dev));
> +
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
> +					cxlr_dax);
> +	if (rc)
> +		goto err;
> +
> +	cxlr_dc->cxlr_dax = cxlr_dax;
> +	xa_init(&cxlr_dc->dax_dev_list);
> +	cxlr->cxlr_dc = cxlr_dc;
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
> +	if (!rc)
> +		return 0;
> +err:
> +	put_device(dev);
> +	kfree(cxlr_dc);
> +	return rc;
> +}
> +
>  static int match_decoder_by_range(struct device *dev, void *data)
>  {
>  	struct range *r1, *r2 = data;
> @@ -3140,6 +3240,19 @@ static int is_system_ram(struct resource *res, void *arg)
>  	return 1;
>  }
>  
> +/*
> + * The region can not be manged by CXL if any portion of
> + * it is already online as 'System RAM'
> + */
> +static bool region_is_system_ram(struct cxl_region *cxlr,
> +				 struct cxl_region_params *p)
> +{
> +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> +				    p->res->start, p->res->end, cxlr,
> +				    is_system_ram) > 0);
> +}
> +
>  static int cxl_region_probe(struct device *dev)
>  {
>  	struct cxl_region *cxlr = to_cxl_region(dev);
> @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
>  	case CXL_DECODER_PMEM:
>  		return devm_cxl_add_pmem_region(cxlr);
>  	case CXL_DECODER_RAM:
> -		/*
> -		 * The region can not be manged by CXL if any portion of
> -		 * it is already online as 'System RAM'
> -		 */
> -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> -					p->res->start, p->res->end, cxlr,
> -					is_system_ram) > 0)
> +		if (region_is_system_ram(cxlr, p))
>  			return 0;
>  
>  		/*
> @@ -3193,6 +3299,17 @@ static int cxl_region_probe(struct device *dev)
>  
>  		/* HDM-H routes to device-dax */
>  		return devm_cxl_add_dax_region(cxlr);
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
> +		if (region_is_system_ram(cxlr, p))
> +			return 0;
> +		return devm_cxl_add_dc_region(cxlr);
>  	default:
>  		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
>  			cxlr->mode);
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 8400af85d99f..7ac1237938b7 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -335,6 +335,14 @@ enum cxl_decoder_mode {
>  	CXL_DECODER_NONE,
>  	CXL_DECODER_RAM,
>  	CXL_DECODER_PMEM,
> +	CXL_DECODER_DC0,
> +	CXL_DECODER_DC1,
> +	CXL_DECODER_DC2,
> +	CXL_DECODER_DC3,
> +	CXL_DECODER_DC4,
> +	CXL_DECODER_DC5,
> +	CXL_DECODER_DC6,
> +	CXL_DECODER_DC7,
>  	CXL_DECODER_MIXED,
>  	CXL_DECODER_DEAD,
>  };
> @@ -345,6 +353,14 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  		[CXL_DECODER_NONE] = "none",
>  		[CXL_DECODER_RAM] = "ram",
>  		[CXL_DECODER_PMEM] = "pmem",
> +		[CXL_DECODER_DC0] = "dc0",
> +		[CXL_DECODER_DC1] = "dc1",
> +		[CXL_DECODER_DC2] = "dc2",
> +		[CXL_DECODER_DC3] = "dc3",
> +		[CXL_DECODER_DC4] = "dc4",
> +		[CXL_DECODER_DC5] = "dc5",
> +		[CXL_DECODER_DC6] = "dc6",
> +		[CXL_DECODER_DC7] = "dc7",
>  		[CXL_DECODER_MIXED] = "mixed",
>  	};
>  
> @@ -353,6 +369,11 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  	return "mixed";
>  }
>  
> +static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
> +{
> +	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
> +}
> +
>  /*
>   * Track whether this decoder is reserved for region autodiscovery, or
>   * free for userspace provisioning.
> @@ -375,6 +396,7 @@ struct cxl_endpoint_decoder {
>  	struct cxl_decoder cxld;
>  	struct resource *dpa_res;
>  	resource_size_t skip;
> +	struct xarray skip_res;
>  	enum cxl_decoder_mode mode;
>  	enum cxl_decoder_state state;
>  	int pos;
> @@ -475,6 +497,11 @@ struct cxl_region_params {
>   */
>  #define CXL_REGION_F_AUTO 1
>  
> +struct cxl_dc_region {
> +	struct xarray dax_dev_list;
> +	struct cxl_dax_region *cxlr_dax;
> +};
> +
>  /**
>   * struct cxl_region - CXL region
>   * @dev: This region's device
> @@ -493,6 +520,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dc_region *cxlr_dc;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index ccdf8de85bd5..eb5eb81bfbd7 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	if (!dax_region)
>  		return -ENOMEM;
>  
> +	if (decoder_mode_is_dc(cxlr->mode))
> +		return 0;
> +
>  	data = (struct dev_dax_data) {
>  		.dax_region = dax_region,
>  		.id = -1,
>  		.size = range_len(&cxlr_dax->hpa_range),
>  	};
> +
>  	dev_dax = devm_create_dev_dax(&data);
>  	if (IS_ERR(dev_dax))
>  		return PTR_ERR(dev_dax);
> 
> -- 
> 2.40.0
>
Ira Weiny June 20, 2023, 8:33 p.m. UTC | #11
Fan Ni wrote:
> The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > CXL devices optionally support dynamic capacity. CXL Regions must be
> > created to access this capacity.
> > 
> > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > which are added to that region.
> > 
> > Below are the steps to create and delete dynamic capacity region0
> > (example).
> > 
> >     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
> >     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
> >     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
> >     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> > 
> >     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
> >     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> > 
> >     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
> >     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
> >     echo 1 > /sys/bus/cxl/devices/$region/commit
> >     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> > 
> >     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > 

[snip]

> > @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  	}
> >  
> >  	if (skipped) {
> > -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> > -				       dev_name(&cxled->cxld.dev), 0);
> > -		if (!res) {
> > -			dev_dbg(dev,
> > -				"decoder%d.%d: failed to reserve skipped space\n",
> > -				port->id, cxled->cxld.id);
> > -			return -EBUSY;
> > +		resource_size_t skip_base = base - skipped;
> > +
> > +		if (decoder_mode_is_dc(cxled->mode)) {
> > +			if (resource_size(&cxlds->ram_res) &&
> > +					skip_base <= cxlds->ram_res.end) {
> > +				skip_len = cxlds->ram_res.end - skip_base + 1;
> > +				res = __request_region(dpa_res, skip_base,
> > +						skip_len, dev_name(ed_dev), 0);
> > +				if (!res)
> > +					goto error;
> > +
> > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> > +				skip_base += skip_len;
> > +			}
> > +
> > +			if (resource_size(&cxlds->ram_res) &&
> Should it be cxlds->pmem_res here?

Yep.  I think I mentioned that in the thread somewhere...

yea here it is: https://lore.kernel.org/all/648b548db05f5_1c7ab42944a@iweiny-mobl.notmuch/

And Navneet agreed:  https://lore.kernel.org/all/ZIte4QozSm+n2zI3@fedora/

Thanks for looking,
Ira

> 
> Fan
Ira Weiny June 21, 2023, 2:44 a.m. UTC | #12
Alison Schofield wrote:
> On Wed, Jun 14, 2023 at 12:16:29PM -0700, Ira Weiny wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > CXL devices optionally support dynamic capacity. CXL Regions must be
> > created to access this capacity.
> > 
> > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > which are added to that region.
> > 
> > Below are the steps to create and delete dynamic capacity region0
> > (example).
> > 
> >     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
> >     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
> >     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
> >     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> > 
> >     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
> >     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> > 
> >     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
> >     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
> >     echo 1 > /sys/bus/cxl/devices/$region/commit
> >     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> > 
> >     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> Hi,
> I took another pass at this and offered more feedback.
> I do think that if the big part - the cxl_dpa_reserve()
> was more 'chunkified' it would be easier to review for
> actual functionality.
> 
> I'd also like to see the commit log be a bit more specific
> in enumerated the things this patch intends to do.
> 
> Many of my comments are about style. Some checkpatch --strict
> would call out and some are addressed in the kernel coding
> style - Documentation/process/coding-style.rst

As I said before, I did not run with --strict.

I've done a quick run through with --strict and will ensure it is done
again after I refactor the code.

> 
> But really, my goal is that when this code merges, that as
> I scroll through a file, say region.c, I see a consistent
> coding style. I shouldn't be able to notice that oh, Dan
> wrote that, and Ira that, and Navneet wrote that piece.

I agree.

> 
> I think it's important because differences in style distract
> from focusing on the functionality of the code.
> 
> (off my soap box now ;)
> 
> Alison
> 
> 
> > 
> > ---
> > [iweiny: fixups]
> > [iweiny: remove unused CXL_DC_REGION_MODE macro]
> > [iweiny: Make dc_mode_to_region_index static]
> > [iweiny: simplify <sysfs>/create_dc_region]
> > [iweiny: introduce decoder_mode_is_dc]
> > [djbw: fixups, no sign-off: preview only]
> > ---
> >  drivers/cxl/Kconfig       |  11 +++
> >  drivers/cxl/core/core.h   |   7 ++
> >  drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
> >  drivers/cxl/core/port.c   |  18 ++++
> >  drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
> >  drivers/cxl/cxl.h         |  28 ++++++
> >  drivers/dax/cxl.c         |   4 +
> >  7 files changed, 409 insertions(+), 28 deletions(-)
> > 
> > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > index ff4e78117b31..df034889d053 100644
> > --- a/drivers/cxl/Kconfig
> > +++ b/drivers/cxl/Kconfig
> > @@ -121,6 +121,17 @@ config CXL_REGION
> >  
> >  	  If unsure say 'y'
> >  
> > +config CXL_DCD
> > +	bool "CXL: DCD Support"
> 
> "CXL DCD: Dynamic Capacity Device Support"
> is more in line with others in this file, and expands the acronym onetime.

done.

> 
> > +	default CXL_BUS
> > +	depends on CXL_REGION
> > +	help
> > +	  Enable the CXL core to provision CXL DCD regions.
> > +	  CXL devices optionally support dynamic capacity and DCD region
> > +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> > +
> > +	  If unsure say 'y'
> > +
> >  config CXL_REGION_INVALIDATION_TEST
> >  	bool "CXL: Region Cache Management Bypass (TEST)"
> >  	depends on CXL_REGION
> > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> > index 27f0968449de..725700ab5973 100644
> > --- a/drivers/cxl/core/core.h
> > +++ b/drivers/cxl/core/core.h
> > @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
> >  
> >  extern struct attribute_group cxl_base_attribute_group;
> >  
> > +#ifdef CONFIG_CXL_DCD
> > +extern struct device_attribute dev_attr_create_dc_region;
> > +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> > +#else
> > +#define SET_CXL_DC_REGION_ATTR(x)
> > +#endif
> > +
> >  #ifdef CONFIG_CXL_REGION
> >  extern struct device_attribute dev_attr_create_pmem_region;
> >  extern struct device_attribute dev_attr_create_ram_region;
> > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> > index 514d30131d92..29649b47d177 100644
> > --- a/drivers/cxl/core/hdm.c
> > +++ b/drivers/cxl/core/hdm.c
> > @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >  	struct resource *res = cxled->dpa_res;
> >  	resource_size_t skip_start;
> > +	resource_size_t skipped = cxled->skip;
> 
> Reverse x-tree.

Done.

> 
> >  
> >  	lockdep_assert_held_write(&cxl_dpa_rwsem);
> >  
> >  	/* save @skip_start, before @res is released */
> > -	skip_start = res->start - cxled->skip;
> > +	skip_start = res->start - skipped;
> 
> Why did the assignment of skip_start need to change here?

I believe this was done for consistency because skipped now represents
cxled->skip, however...

> 
> >  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> > -	if (cxled->skip)
> > -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> > +	if (cxled->skip != 0) {
> > +		while (skipped != 0) {

... what is more concerning is that we now effectively have:

	if (skipped != 0) {
		while (skipped != 0) {
			...

:-(

> > +			res = xa_load(&cxled->skip_res, skip_start);
> > +			__release_region(&cxlds->dpa_res, skip_start,
> > +							resource_size(res));
> 
> The above appears poorlty aligned.

fixed.

> 
> > +			xa_erase(&cxled->skip_res, skip_start);
> > +			skip_start += resource_size(res);
> > +			skipped -= resource_size(res);
> > +			}
> 
> This bracket appears poorly aligned.

This is very poorly aligned.  I'll run --strict before sending V2.

> 
> > +	}
> >  	cxled->skip = 0;
> >  	cxled->dpa_res = NULL;
> >  	put_device(&cxled->cxld.dev);
> > @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >  	__cxl_dpa_release(cxled);
> >  }
> >  
> > +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> > +{
> > +	int index = 0;
> > +
> > +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> > +		if (mode == i)
> > +			return index;
> > +		index++;
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> > +
> >  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  			     resource_size_t base, resource_size_t len,
> >  			     resource_size_t skipped)
> > @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  	struct cxl_port *port = cxled_to_port(cxled);
> >  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >  	struct device *dev = &port->dev;
> > +	struct device *ed_dev = &cxled->cxld.dev;
> > +	struct resource *dpa_res = &cxlds->dpa_res;
> > +	resource_size_t skip_len = 0;
> >  	struct resource *res;
> > +	int rc, index;
> >  
> 
> Above poorly aligned.

Do you mean reverse x-tree?

> 
> >  	lockdep_assert_held_write(&cxl_dpa_rwsem);
> >  
> > @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  	}
> >  
> >  	if (skipped) {
> 
> This has excessive indentation, so started out with a monster
> if skipped is begging for a refactoring.

Yea I agree.  Dave pointed this out as well.  What I have to be sure about
is the logic here.

> 
> I find it odd that the DCD case got inserted before the 'default'
> or non-DCD case here.

Yea I'm working with Navneet on this.

> 
> 
> > -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> > -				       dev_name(&cxled->cxld.dev), 0);
> > -		if (!res) {
> > -			dev_dbg(dev,
> > -				"decoder%d.%d: failed to reserve skipped space\n",
> > -				port->id, cxled->cxld.id);
> > -			return -EBUSY;
> > +		resource_size_t skip_base = base - skipped;
> > +
> > +		if (decoder_mode_is_dc(cxled->mode)) {
> 
> This may be cleaner to introduce as a separate function for
> handling _mode_id_dc.

Yes, I think all the DC handling in this function should be moved into its
own function to clarify how that is handled.

I think it will make the diff/review easier as well because it will be
more clear how things are with and without DCD.

> 
> > +			if (resource_size(&cxlds->ram_res) &&
> > +					skip_base <= cxlds->ram_res.end) {
> > +				skip_len = cxlds->ram_res.end - skip_base + 1;
> > +				res = __request_region(dpa_res, skip_base,
> > +						skip_len, dev_name(ed_dev), 0);
> > +				if (!res)
> > +					goto error;
> > +
> > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> > +				skip_base += skip_len;
> > +			}
> > +
> > +			if (resource_size(&cxlds->ram_res) &&
> > +					skip_base <= cxlds->pmem_res.end) {
> > +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> > +				res = __request_region(dpa_res, skip_base,
> > +						skip_len, dev_name(ed_dev), 0);
> > +				if (!res)
> > +					goto error;
> > +
> > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> > +				skip_base += skip_len;
> > +			}
> 
> The above 2 if (resource_size() cases have redundant code. 
> Pull it out, refactor.

Redundant except that the second ram_res needs to be pmem_res.  After
that change I'll have to evaluate how much is duplicated.  As I said above
I'm working with Navneet to see how this logic can be broken down.  It is
a big function now.

> 
> > +
> > +			index = dc_mode_to_region_index(cxled->mode);
> > +			for (int i = 0; i <= index; i++) {
> > +				struct resource *dcr = &cxlds->dc_res[i];
> > +
> > +				if (skip_base < dcr->start) {
> > +					skip_len = dcr->start - skip_base;
> > +					res = __request_region(dpa_res,
> > +							skip_base, skip_len,
> > +							dev_name(ed_dev), 0);
> > +					if (!res)
> > +						goto error;
> > +
> > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > +							res, GFP_KERNEL);
> > +					skip_base += skip_len;
> > +				}
> > +
> > +				if (skip_base == base) {
> > +					dev_dbg(dev, "skip done!\n");
> > +					break;
> > +				}
> > +
> > +				if (resource_size(dcr) &&
> > +						skip_base <= dcr->end) {
> > +					if (skip_base > base)
> > +						dev_err(dev, "Skip error\n");
> > +
> > +					skip_len = dcr->end - skip_base + 1;
> > +					res = __request_region(dpa_res, skip_base,
> > +							skip_len,
> > +							dev_name(ed_dev), 0);
> > +					if (!res)
> > +						goto error;
> > +
> > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > +							res, GFP_KERNEL);
> > +					skip_base += skip_len;
> > +				}
> > +			}
> 
> 
> And, below,we are back to the original code.
> This would be more readable, reviewable if the DCD support was
> added in separate function that are then called from here.

Yep!

> 
> > +		} else	{
> > +			res = __request_region(dpa_res, base - skipped, skipped,
> > +							dev_name(ed_dev), 0);
> > +			if (!res)
> > +				goto error;
> > +
> > +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> >  		}
> >  	}
> > -	res = __request_region(&cxlds->dpa_res, base, len,
> > -			       dev_name(&cxled->cxld.dev), 0);
> > +
> > +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
> >  	if (!res) {
> >  		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> > -			port->id, cxled->cxld.id);
> 
> General comment - look over the dev_dbg() messages and consider placing
> them after the code. I recall, others that were needlessly between lines
> of code.

At the end of the block?

> 
> 
> > -		if (skipped)
> > -			__release_region(&cxlds->dpa_res, base - skipped,
> > -					 skipped);
> > +				port->id, cxled->cxld.id);
> > +		if (skipped) {
> > +			resource_size_t skip_base = base - skipped;
> > +
> > +			while (skipped != 0) {
> > +				if (skip_base > base)
> > +					dev_err(dev, "Skip error\n");
> > +
> > +				res = xa_load(&cxled->skip_res, skip_base);
> > +				__release_region(dpa_res, skip_base,
> > +							resource_size(res));
> > +				xa_erase(&cxled->skip_res, skip_base);
> > +				skip_base += resource_size(res);
> > +				skipped -= resource_size(res);
> > +			}
> > +		}
> 
> 		Can that debug message go here ?

Not sure.  But we have another issue of:

if (skipped) {
	while (skipped) {
	...

which is redundant I think.  I'll have to see about skip_base.

> 
> >  		return -EBUSY;
> >  	}
> >  	cxled->dpa_res = res;
> >  	cxled->skip = skipped;
> >  
> > +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> > +		int index = dc_mode_to_region_index(mode);
> > +
> > +		if (resource_contains(&cxlds->dc_res[index], res)) {
> > +			cxled->mode = mode;
> > +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> > +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> 
> Can this move to ....
> 
> 
> > +			goto success;
> > +		}
> > +	}
> >  	if (resource_contains(&cxlds->pmem_res, res))
> >  		cxled->mode = CXL_DECODER_PMEM;
> >  	else if (resource_contains(&cxlds->ram_res, res))
> > @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  		cxled->mode = CXL_DECODER_MIXED;
> >  	}
> >  
> > +success:
> >  	port->hdm_end++;
> >  	get_device(&cxled->cxld.dev);
> 
> here...dev_dbg() success message. That pairs it nicely with the
> error message below.

I think it can.  I think we have a case here where there was an attempt
not to change the initial behavior of the code even so much as adding a
debug message.  But I think the overall flow would be better with the
debug here.

> 
> >  	return 0;
> > +
> > +error:
> > +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> > +			port->id, cxled->cxld.id);
> > +	return -EBUSY;
> > +
> >  }
> >  
> >  int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
> >  	switch (mode) {
> >  	case CXL_DECODER_RAM:
> >  	case CXL_DECODER_PMEM:
> > +	case CXL_DECODER_DC0:
> > +	case CXL_DECODER_DC1:
> > +	case CXL_DECODER_DC2:
> > +	case CXL_DECODER_DC3:
> > +	case CXL_DECODER_DC4:
> > +	case CXL_DECODER_DC5:
> > +	case CXL_DECODER_DC6:
> > +	case CXL_DECODER_DC7:
> >  		break;
> >  	default:
> >  		dev_dbg(dev, "unsupported mode: %d\n", mode);
> > @@ -456,6 +588,16 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
> >  		goto out;
> >  	}
> >  
> > +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> > +		int index = dc_mode_to_region_index(i);
> > +
> > +		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
> > +			dev_dbg(dev, "no available dynamic capacity\n");
> 
> I see this one is following the pattern in the function :)
> 
> 
> > +			rc = -ENXIO;
> > +			goto out;
> > +		}
> > +	}
> > +
> >  	cxled->mode = mode;
> >  	rc = 0;
> >  out:
> > @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
> 
> Hmmm...I don't have cxl_dpa_freespace() in my cxl/next? Where's that?

That was in the patches from Dan which this series depends on.

https://lore.kernel.org/all/168592158743.1948938.7622563891193802610.stgit@dwillia2-xfh.jf.intel.com/

> 
> 
> >  					 resource_size_t *skip_out)
> >  {
> >  	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> > -	resource_size_t free_ram_start, free_pmem_start;
> > +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
> >  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	struct device *dev = &cxled->cxld.dev;
> >  	resource_size_t start, avail, skip;
> >  	struct resource *p, *last;
> > +	int index;
> 
> Why break the alignment above?

What do you mean?

> 
> >  
> >  	lockdep_assert_held(&cxl_dpa_rwsem);
> >  
> > @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
> >  	else
> >  		free_pmem_start = cxlds->pmem_res.start;
> >  
> > +	/*
> > +	 * One HDM Decoder per DC region to map memory with different
> > +	 * DSMAS entry.
> > +	 */
> 
> It seems this comment is missing a verb. Why not align?

align?

with DSMAS on the end like this?

	/*
	 * One HDM Decoder per DC region to map memory with different DSMAS
	 * entry.
	 */

> > +	index = dc_mode_to_region_index(cxled->mode);
> > +	if (index >= 0) {
> > +		if (cxlds->dc_res[index].child) {
> > +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n"
> 
> s/allocated/allocate

Fixed.

> 
> ,
> > +					index);
> > +			return -EINVAL;
> > +		}
> > +		free_dc_start = cxlds->dc_res[index].start;
> > +	}
> > +
> >  	if (cxled->mode == CXL_DECODER_RAM) {
> >  		start = free_ram_start;
> >  		avail = cxlds->ram_res.end - start + 1;
> > @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
> >  		else
> >  			skip_end = start - 1;
> >  		skip = skip_end - skip_start + 1;
> > +	} else if (decoder_mode_is_dc(cxled->mode)) {
> > +		resource_size_t skip_start, skip_end;
> > +
> > +		start = free_dc_start;
> > +		avail = cxlds->dc_res[index].end - start + 1;
> > +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> > +			skip_start = free_ram_start;
> > +		else
> > +			skip_start = free_pmem_start;
> > +		/*
> > +		 * If some dc region is already mapped, then that allocation
> 
> maybe s/some/any ?

Fixed.

Ira
Navneet Singh June 21, 2023, 3:13 a.m. UTC | #13
On Tue, Jun 20, 2023 at 10:55:15AM -0700, Fan Ni wrote:
> The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > CXL devices optionally support dynamic capacity. CXL Regions must be
> > created to access this capacity.
> > 
> > Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> > Dynamic Capacity decoder mode which targets dynamic capacity on devices
> > which are added to that region.
> > 
> > Below are the steps to create and delete dynamic capacity region0
> > (example).
> > 
> >     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
> >     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
> >     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
> >     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> > 
> >     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
> >     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> > 
> >     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
> >     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
> >     echo 1 > /sys/bus/cxl/devices/$region/commit
> >     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> > 
> >     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > 
> > ---
> > [iweiny: fixups]
> > [iweiny: remove unused CXL_DC_REGION_MODE macro]
> > [iweiny: Make dc_mode_to_region_index static]
> > [iweiny: simplify <sysfs>/create_dc_region]
> > [iweiny: introduce decoder_mode_is_dc]
> > [djbw: fixups, no sign-off: preview only]
> > ---
> >  drivers/cxl/Kconfig       |  11 +++
> >  drivers/cxl/core/core.h   |   7 ++
> >  drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
> >  drivers/cxl/core/port.c   |  18 ++++
> >  drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
> >  drivers/cxl/cxl.h         |  28 ++++++
> >  drivers/dax/cxl.c         |   4 +
> >  7 files changed, 409 insertions(+), 28 deletions(-)
> > 
> > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > index ff4e78117b31..df034889d053 100644
> > --- a/drivers/cxl/Kconfig
> > +++ b/drivers/cxl/Kconfig
> > @@ -121,6 +121,17 @@ config CXL_REGION
> >  
> >  	  If unsure say 'y'
> >  
> > +config CXL_DCD
> > +	bool "CXL: DCD Support"
> > +	default CXL_BUS
> > +	depends on CXL_REGION
> > +	help
> > +	  Enable the CXL core to provision CXL DCD regions.
> > +	  CXL devices optionally support dynamic capacity and DCD region
> > +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> > +
> > +	  If unsure say 'y'
> > +
> >  config CXL_REGION_INVALIDATION_TEST
> >  	bool "CXL: Region Cache Management Bypass (TEST)"
> >  	depends on CXL_REGION
> > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> > index 27f0968449de..725700ab5973 100644
> > --- a/drivers/cxl/core/core.h
> > +++ b/drivers/cxl/core/core.h
> > @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
> >  
> >  extern struct attribute_group cxl_base_attribute_group;
> >  
> > +#ifdef CONFIG_CXL_DCD
> > +extern struct device_attribute dev_attr_create_dc_region;
> > +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> > +#else
> > +#define SET_CXL_DC_REGION_ATTR(x)
> > +#endif
> > +
> >  #ifdef CONFIG_CXL_REGION
> >  extern struct device_attribute dev_attr_create_pmem_region;
> >  extern struct device_attribute dev_attr_create_ram_region;
> > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> > index 514d30131d92..29649b47d177 100644
> > --- a/drivers/cxl/core/hdm.c
> > +++ b/drivers/cxl/core/hdm.c
> > @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >  	struct resource *res = cxled->dpa_res;
> >  	resource_size_t skip_start;
> > +	resource_size_t skipped = cxled->skip;
> >  
> >  	lockdep_assert_held_write(&cxl_dpa_rwsem);
> >  
> >  	/* save @skip_start, before @res is released */
> > -	skip_start = res->start - cxled->skip;
> > +	skip_start = res->start - skipped;
> >  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> > -	if (cxled->skip)
> > -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> > +	if (cxled->skip != 0) {
> > +		while (skipped != 0) {
> > +			res = xa_load(&cxled->skip_res, skip_start);
> > +			__release_region(&cxlds->dpa_res, skip_start,
> > +							resource_size(res));
> > +			xa_erase(&cxled->skip_res, skip_start);
> > +			skip_start += resource_size(res);
> > +			skipped -= resource_size(res);
> > +			}
> > +	}
> >  	cxled->skip = 0;
> >  	cxled->dpa_res = NULL;
> >  	put_device(&cxled->cxld.dev);
> > @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >  	__cxl_dpa_release(cxled);
> >  }
> >  
> > +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> > +{
> > +	int index = 0;
> > +
> > +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> > +		if (mode == i)
> > +			return index;
> > +		index++;
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> > +
> >  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  			     resource_size_t base, resource_size_t len,
> >  			     resource_size_t skipped)
> > @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  	struct cxl_port *port = cxled_to_port(cxled);
> >  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >  	struct device *dev = &port->dev;
> > +	struct device *ed_dev = &cxled->cxld.dev;
> > +	struct resource *dpa_res = &cxlds->dpa_res;
> > +	resource_size_t skip_len = 0;
> >  	struct resource *res;
> > +	int rc, index;
> >  
> >  	lockdep_assert_held_write(&cxl_dpa_rwsem);
> >  
> > @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  	}
> >  
> >  	if (skipped) {
> > -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> > -				       dev_name(&cxled->cxld.dev), 0);
> > -		if (!res) {
> > -			dev_dbg(dev,
> > -				"decoder%d.%d: failed to reserve skipped space\n",
> > -				port->id, cxled->cxld.id);
> > -			return -EBUSY;
> > +		resource_size_t skip_base = base - skipped;
> > +
> > +		if (decoder_mode_is_dc(cxled->mode)) {
> > +			if (resource_size(&cxlds->ram_res) &&
> > +					skip_base <= cxlds->ram_res.end) {
> > +				skip_len = cxlds->ram_res.end - skip_base + 1;
> > +				res = __request_region(dpa_res, skip_base,
> > +						skip_len, dev_name(ed_dev), 0);
> > +				if (!res)
> > +					goto error;
> > +
> > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> > +				skip_base += skip_len;
> > +			}
> > +
> > +			if (resource_size(&cxlds->ram_res) &&
> Should it be cxlds->pmem_res here?
> 
> Fan
Navneet - Yes, this is already in the change list.
> > +					skip_base <= cxlds->pmem_res.end) {
> > +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> > +				res = __request_region(dpa_res, skip_base,
> > +						skip_len, dev_name(ed_dev), 0);
> > +				if (!res)
> > +					goto error;
> > +
> > +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> > +				skip_base += skip_len;
> > +			}
> > +
> > +			index = dc_mode_to_region_index(cxled->mode);
> > +			for (int i = 0; i <= index; i++) {
> > +				struct resource *dcr = &cxlds->dc_res[i];
> > +
> > +				if (skip_base < dcr->start) {
> > +					skip_len = dcr->start - skip_base;
> > +					res = __request_region(dpa_res,
> > +							skip_base, skip_len,
> > +							dev_name(ed_dev), 0);
> > +					if (!res)
> > +						goto error;
> > +
> > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > +							res, GFP_KERNEL);
> > +					skip_base += skip_len;
> > +				}
> > +
> > +				if (skip_base == base) {
> > +					dev_dbg(dev, "skip done!\n");
> > +					break;
> > +				}
> > +
> > +				if (resource_size(dcr) &&
> > +						skip_base <= dcr->end) {
> > +					if (skip_base > base)
> > +						dev_err(dev, "Skip error\n");
> > +
> > +					skip_len = dcr->end - skip_base + 1;
> > +					res = __request_region(dpa_res, skip_base,
> > +							skip_len,
> > +							dev_name(ed_dev), 0);
> > +					if (!res)
> > +						goto error;
> > +
> > +					rc = xa_insert(&cxled->skip_res, skip_base,
> > +							res, GFP_KERNEL);
> > +					skip_base += skip_len;
> > +				}
> > +			}
> > +		} else	{
> > +			res = __request_region(dpa_res, base - skipped, skipped,
> > +							dev_name(ed_dev), 0);
> > +			if (!res)
> > +				goto error;
> > +
> > +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> > +								GFP_KERNEL);
> >  		}
> >  	}
> > -	res = __request_region(&cxlds->dpa_res, base, len,
> > -			       dev_name(&cxled->cxld.dev), 0);
> > +
> > +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
> >  	if (!res) {
> >  		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> > -			port->id, cxled->cxld.id);
> > -		if (skipped)
> > -			__release_region(&cxlds->dpa_res, base - skipped,
> > -					 skipped);
> > +				port->id, cxled->cxld.id);
> > +		if (skipped) {
> > +			resource_size_t skip_base = base - skipped;
> > +
> > +			while (skipped != 0) {
> > +				if (skip_base > base)
> > +					dev_err(dev, "Skip error\n");
> > +
> > +				res = xa_load(&cxled->skip_res, skip_base);
> > +				__release_region(dpa_res, skip_base,
> > +							resource_size(res));
> > +				xa_erase(&cxled->skip_res, skip_base);
> > +				skip_base += resource_size(res);
> > +				skipped -= resource_size(res);
> > +			}
> > +		}
> >  		return -EBUSY;
> >  	}
> >  	cxled->dpa_res = res;
> >  	cxled->skip = skipped;
> >  
> > +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> > +		int index = dc_mode_to_region_index(mode);
> > +
> > +		if (resource_contains(&cxlds->dc_res[index], res)) {
> > +			cxled->mode = mode;
> > +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> > +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> > +			goto success;
> > +		}
> > +	}
> >  	if (resource_contains(&cxlds->pmem_res, res))
> >  		cxled->mode = CXL_DECODER_PMEM;
> >  	else if (resource_contains(&cxlds->ram_res, res))
> > @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> >  		cxled->mode = CXL_DECODER_MIXED;
> >  	}
> >  
> > +success:
> >  	port->hdm_end++;
> >  	get_device(&cxled->cxld.dev);
> >  	return 0;
> > +
> > +error:
> > +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> > +			port->id, cxled->cxld.id);
> > +	return -EBUSY;
> > +
> >  }
> >  
> >  int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> > @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
> >  	switch (mode) {
> >  	case CXL_DECODER_RAM:
> >  	case CXL_DECODER_PMEM:
> > +	case CXL_DECODER_DC0:
> > +	case CXL_DECODER_DC1:
> > +	case CXL_DECODER_DC2:
> > +	case CXL_DECODER_DC3:
> > +	case CXL_DECODER_DC4:
> > +	case CXL_DECODER_DC5:
> > +	case CXL_DECODER_DC6:
> > +	case CXL_DECODER_DC7:
> >  		break;
> >  	default:
> >  		dev_dbg(dev, "unsupported mode: %d\n", mode);
> > @@ -456,6 +588,16 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
> >  		goto out;
> >  	}
> >  
> > +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> > +		int index = dc_mode_to_region_index(i);
> > +
> > +		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
> > +			dev_dbg(dev, "no available dynamic capacity\n");
> > +			rc = -ENXIO;
> > +			goto out;
> > +		}
> > +	}
> > +
> >  	cxled->mode = mode;
> >  	rc = 0;
> >  out:
> > @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
> >  					 resource_size_t *skip_out)
> >  {
> >  	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> > -	resource_size_t free_ram_start, free_pmem_start;
> > +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
> >  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	struct device *dev = &cxled->cxld.dev;
> >  	resource_size_t start, avail, skip;
> >  	struct resource *p, *last;
> > +	int index;
> >  
> >  	lockdep_assert_held(&cxl_dpa_rwsem);
> >  
> > @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
> >  	else
> >  		free_pmem_start = cxlds->pmem_res.start;
> >  
> > +	/*
> > +	 * One HDM Decoder per DC region to map memory with different
> > +	 * DSMAS entry.
> > +	 */
> > +	index = dc_mode_to_region_index(cxled->mode);
> > +	if (index >= 0) {
> > +		if (cxlds->dc_res[index].child) {
> > +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n",
> > +					index);
> > +			return -EINVAL;
> > +		}
> > +		free_dc_start = cxlds->dc_res[index].start;
> > +	}
> > +
> >  	if (cxled->mode == CXL_DECODER_RAM) {
> >  		start = free_ram_start;
> >  		avail = cxlds->ram_res.end - start + 1;
> > @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
> >  		else
> >  			skip_end = start - 1;
> >  		skip = skip_end - skip_start + 1;
> > +	} else if (decoder_mode_is_dc(cxled->mode)) {
> > +		resource_size_t skip_start, skip_end;
> > +
> > +		start = free_dc_start;
> > +		avail = cxlds->dc_res[index].end - start + 1;
> > +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> > +			skip_start = free_ram_start;
> > +		else
> > +			skip_start = free_pmem_start;
> > +		/*
> > +		 * If some dc region is already mapped, then that allocation
> > +		 * already handled the RAM and PMEM skip.Check for DC region
> > +		 * skip.
> > +		 */
> > +		for (int i = index - 1; i >= 0 ; i--) {
> > +			if (cxlds->dc_res[i].child) {
> > +				skip_start = cxlds->dc_res[i].child->end + 1;
> > +				break;
> > +			}
> > +		}
> > +
> > +		skip_end = start - 1;
> > +		skip = skip_end - skip_start + 1;
> >  	} else {
> >  		dev_dbg(cxled_dev(cxled), "mode not set\n");
> >  		avail = 0;
> > @@ -548,10 +729,25 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
> >  
> >  	avail = cxl_dpa_freespace(cxled, &start, &skip);
> >  
> > +	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
> > +						start, size, skip);
> >  	if (size > avail) {
> > +		static const char * const names[] = {
> > +			[CXL_DECODER_NONE] = "none",
> > +			[CXL_DECODER_RAM] = "ram",
> > +			[CXL_DECODER_PMEM] = "pmem",
> > +			[CXL_DECODER_MIXED] = "mixed",
> > +			[CXL_DECODER_DC0] = "dc0",
> > +			[CXL_DECODER_DC1] = "dc1",
> > +			[CXL_DECODER_DC2] = "dc2",
> > +			[CXL_DECODER_DC3] = "dc3",
> > +			[CXL_DECODER_DC4] = "dc4",
> > +			[CXL_DECODER_DC5] = "dc5",
> > +			[CXL_DECODER_DC6] = "dc6",
> > +			[CXL_DECODER_DC7] = "dc7",
> > +		};
> >  		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
> > -			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> > -			&avail);
> > +			names[cxled->mode], &avail);
> >  		rc = -ENOSPC;
> >  		goto out;
> >  	}
> > diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> > index 5e21b53362e6..a1a98aba24ed 100644
> > --- a/drivers/cxl/core/port.c
> > +++ b/drivers/cxl/core/port.c
> > @@ -195,6 +195,22 @@ static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
> >  		mode = CXL_DECODER_PMEM;
> >  	else if (sysfs_streq(buf, "ram"))
> >  		mode = CXL_DECODER_RAM;
> > +	else if (sysfs_streq(buf, "dc0"))
> > +		mode = CXL_DECODER_DC0;
> > +	else if (sysfs_streq(buf, "dc1"))
> > +		mode = CXL_DECODER_DC1;
> > +	else if (sysfs_streq(buf, "dc2"))
> > +		mode = CXL_DECODER_DC2;
> > +	else if (sysfs_streq(buf, "dc3"))
> > +		mode = CXL_DECODER_DC3;
> > +	else if (sysfs_streq(buf, "dc4"))
> > +		mode = CXL_DECODER_DC4;
> > +	else if (sysfs_streq(buf, "dc5"))
> > +		mode = CXL_DECODER_DC5;
> > +	else if (sysfs_streq(buf, "dc6"))
> > +		mode = CXL_DECODER_DC6;
> > +	else if (sysfs_streq(buf, "dc7"))
> > +		mode = CXL_DECODER_DC7;
> >  	else
> >  		return -EINVAL;
> >  
> > @@ -296,6 +312,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
> >  	&dev_attr_target_list.attr,
> >  	SET_CXL_REGION_ATTR(create_pmem_region)
> >  	SET_CXL_REGION_ATTR(create_ram_region)
> > +	SET_CXL_DC_REGION_ATTR(create_dc_region)
> >  	SET_CXL_REGION_ATTR(delete_region)
> >  	NULL,
> >  };
> > @@ -1691,6 +1708,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
> >  		return ERR_PTR(-ENOMEM);
> >  
> >  	cxled->pos = -1;
> > +	xa_init(&cxled->skip_res);
> >  	cxld = &cxled->cxld;
> >  	rc = cxl_decoder_init(port, cxld);
> >  	if (rc)	 {
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index 543c4499379e..144232c8305e 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
> >  	lockdep_assert_held_write(&cxl_region_rwsem);
> >  	lockdep_assert_held_read(&cxl_dpa_rwsem);
> >  
> > -	if (cxled->mode != cxlr->mode) {
> > +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
> >  		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
> >  			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
> >  		return -EINVAL;
> > @@ -2211,6 +2211,14 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
> >  	switch (mode) {
> >  	case CXL_DECODER_RAM:
> >  	case CXL_DECODER_PMEM:
> > +	case CXL_DECODER_DC0:
> > +	case CXL_DECODER_DC1:
> > +	case CXL_DECODER_DC2:
> > +	case CXL_DECODER_DC3:
> > +	case CXL_DECODER_DC4:
> > +	case CXL_DECODER_DC5:
> > +	case CXL_DECODER_DC6:
> > +	case CXL_DECODER_DC7:
> >  		break;
> >  	default:
> >  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
> > @@ -2321,6 +2329,43 @@ static ssize_t create_ram_region_store(struct device *dev,
> >  }
> >  DEVICE_ATTR_RW(create_ram_region);
> >  
> > +static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
> > +				const char *buf, enum cxl_decoder_mode mode,
> > +				size_t len)
> > +{
> > +	struct cxl_region *cxlr;
> > +	int rc, id;
> > +
> > +	rc = sscanf(buf, "region%d\n", &id);
> > +	if (rc != 1)
> > +		return -EINVAL;
> > +
> > +	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
> > +	if (IS_ERR(cxlr))
> > +		return PTR_ERR(cxlr);
> > +
> > +	return len;
> > +}
> > +
> > +static ssize_t create_dc_region_show(struct device *dev,
> > +				     struct device_attribute *attr, char *buf)
> > +{
> > +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> > +}
> > +
> > +static ssize_t create_dc_region_store(struct device *dev,
> > +				      struct device_attribute *attr,
> > +				      const char *buf, size_t len)
> > +{
> > +	/*
> > +	 * All DC regions use decoder mode DC0 as the region does not need the
> > +	 * index information
> > +	 */
> > +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> > +				CXL_DECODER_DC0, len);
> > +}
> > +DEVICE_ATTR_RW(create_dc_region);
> > +
> >  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
> >  			   char *buf)
> >  {
> > @@ -2799,6 +2844,61 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
> >  	return rc;
> >  }
> >  
> > +static void cxl_dc_region_release(void *data)
> > +{
> > +	struct cxl_region *cxlr = data;
> > +	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
> > +
> > +	xa_destroy(&cxlr_dc->dax_dev_list);
> > +	kfree(cxlr_dc);
> > +}
> > +
> > +static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
> > +{
> > +	struct cxl_dc_region *cxlr_dc;
> > +	struct cxl_dax_region *cxlr_dax;
> > +	struct device *dev;
> > +	int rc = 0;
> > +
> > +	cxlr_dax = cxl_dax_region_alloc(cxlr);
> > +	if (IS_ERR(cxlr_dax))
> > +		return PTR_ERR(cxlr_dax);
> > +
> > +	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
> > +	if (!cxlr_dc) {
> > +		rc = -ENOMEM;
> > +		goto err;
> > +	}
> > +
> > +	dev = &cxlr_dax->dev;
> > +	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
> > +	if (rc)
> > +		goto err;
> > +
> > +	rc = device_add(dev);
> > +	if (rc)
> > +		goto err;
> > +
> > +	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> > +		dev_name(dev));
> > +
> > +	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
> > +					cxlr_dax);
> > +	if (rc)
> > +		goto err;
> > +
> > +	cxlr_dc->cxlr_dax = cxlr_dax;
> > +	xa_init(&cxlr_dc->dax_dev_list);
> > +	cxlr->cxlr_dc = cxlr_dc;
> > +	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
> > +	if (!rc)
> > +		return 0;
> > +err:
> > +	put_device(dev);
> > +	kfree(cxlr_dc);
> > +	return rc;
> > +}
> > +
> >  static int match_decoder_by_range(struct device *dev, void *data)
> >  {
> >  	struct range *r1, *r2 = data;
> > @@ -3140,6 +3240,19 @@ static int is_system_ram(struct resource *res, void *arg)
> >  	return 1;
> >  }
> >  
> > +/*
> > + * The region can not be manged by CXL if any portion of
> > + * it is already online as 'System RAM'
> > + */
> > +static bool region_is_system_ram(struct cxl_region *cxlr,
> > +				 struct cxl_region_params *p)
> > +{
> > +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> > +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > +				    p->res->start, p->res->end, cxlr,
> > +				    is_system_ram) > 0);
> > +}
> > +
> >  static int cxl_region_probe(struct device *dev)
> >  {
> >  	struct cxl_region *cxlr = to_cxl_region(dev);
> > @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
> >  	case CXL_DECODER_PMEM:
> >  		return devm_cxl_add_pmem_region(cxlr);
> >  	case CXL_DECODER_RAM:
> > -		/*
> > -		 * The region can not be manged by CXL if any portion of
> > -		 * it is already online as 'System RAM'
> > -		 */
> > -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> > -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> > -					p->res->start, p->res->end, cxlr,
> > -					is_system_ram) > 0)
> > +		if (region_is_system_ram(cxlr, p))
> >  			return 0;
> >  
> >  		/*
> > @@ -3193,6 +3299,17 @@ static int cxl_region_probe(struct device *dev)
> >  
> >  		/* HDM-H routes to device-dax */
> >  		return devm_cxl_add_dax_region(cxlr);
> > +	case CXL_DECODER_DC0:
> > +	case CXL_DECODER_DC1:
> > +	case CXL_DECODER_DC2:
> > +	case CXL_DECODER_DC3:
> > +	case CXL_DECODER_DC4:
> > +	case CXL_DECODER_DC5:
> > +	case CXL_DECODER_DC6:
> > +	case CXL_DECODER_DC7:
> > +		if (region_is_system_ram(cxlr, p))
> > +			return 0;
> > +		return devm_cxl_add_dc_region(cxlr);
> >  	default:
> >  		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
> >  			cxlr->mode);
> > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > index 8400af85d99f..7ac1237938b7 100644
> > --- a/drivers/cxl/cxl.h
> > +++ b/drivers/cxl/cxl.h
> > @@ -335,6 +335,14 @@ enum cxl_decoder_mode {
> >  	CXL_DECODER_NONE,
> >  	CXL_DECODER_RAM,
> >  	CXL_DECODER_PMEM,
> > +	CXL_DECODER_DC0,
> > +	CXL_DECODER_DC1,
> > +	CXL_DECODER_DC2,
> > +	CXL_DECODER_DC3,
> > +	CXL_DECODER_DC4,
> > +	CXL_DECODER_DC5,
> > +	CXL_DECODER_DC6,
> > +	CXL_DECODER_DC7,
> >  	CXL_DECODER_MIXED,
> >  	CXL_DECODER_DEAD,
> >  };
> > @@ -345,6 +353,14 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
> >  		[CXL_DECODER_NONE] = "none",
> >  		[CXL_DECODER_RAM] = "ram",
> >  		[CXL_DECODER_PMEM] = "pmem",
> > +		[CXL_DECODER_DC0] = "dc0",
> > +		[CXL_DECODER_DC1] = "dc1",
> > +		[CXL_DECODER_DC2] = "dc2",
> > +		[CXL_DECODER_DC3] = "dc3",
> > +		[CXL_DECODER_DC4] = "dc4",
> > +		[CXL_DECODER_DC5] = "dc5",
> > +		[CXL_DECODER_DC6] = "dc6",
> > +		[CXL_DECODER_DC7] = "dc7",
> >  		[CXL_DECODER_MIXED] = "mixed",
> >  	};
> >  
> > @@ -353,6 +369,11 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
> >  	return "mixed";
> >  }
> >  
> > +static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
> > +{
> > +	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
> > +}
> > +
> >  /*
> >   * Track whether this decoder is reserved for region autodiscovery, or
> >   * free for userspace provisioning.
> > @@ -375,6 +396,7 @@ struct cxl_endpoint_decoder {
> >  	struct cxl_decoder cxld;
> >  	struct resource *dpa_res;
> >  	resource_size_t skip;
> > +	struct xarray skip_res;
> >  	enum cxl_decoder_mode mode;
> >  	enum cxl_decoder_state state;
> >  	int pos;
> > @@ -475,6 +497,11 @@ struct cxl_region_params {
> >   */
> >  #define CXL_REGION_F_AUTO 1
> >  
> > +struct cxl_dc_region {
> > +	struct xarray dax_dev_list;
> > +	struct cxl_dax_region *cxlr_dax;
> > +};
> > +
> >  /**
> >   * struct cxl_region - CXL region
> >   * @dev: This region's device
> > @@ -493,6 +520,7 @@ struct cxl_region {
> >  	enum cxl_decoder_type type;
> >  	struct cxl_nvdimm_bridge *cxl_nvb;
> >  	struct cxl_pmem_region *cxlr_pmem;
> > +	struct cxl_dc_region *cxlr_dc;
> >  	unsigned long flags;
> >  	struct cxl_region_params params;
> >  };
> > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > index ccdf8de85bd5..eb5eb81bfbd7 100644
> > --- a/drivers/dax/cxl.c
> > +++ b/drivers/dax/cxl.c
> > @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
> >  	if (!dax_region)
> >  		return -ENOMEM;
> >  
> > +	if (decoder_mode_is_dc(cxlr->mode))
> > +		return 0;
> > +
> >  	data = (struct dev_dax_data) {
> >  		.dax_region = dax_region,
> >  		.id = -1,
> >  		.size = range_len(&cxlr_dax->hpa_range),
> >  	};
> > +
> >  	dev_dax = devm_create_dev_dax(&data);
> >  	if (IS_ERR(dev_dax))
> >  		return PTR_ERR(dev_dax);
> > 
> > -- 
> > 2.40.0
> > 
> 
> -- 
> Fan Ni <nifan@outlook.com>
nifan@outlook.com June 21, 2023, 5:20 p.m. UTC | #14
The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity. CXL Regions must be
> created to access this capacity.
> 
> Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> Dynamic Capacity decoder mode which targets dynamic capacity on devices
> which are added to that region.
> 
> Below are the steps to create and delete dynamic capacity region0
> (example).
> 
>     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
>     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
>     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
>     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> 
>     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
>     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> 
>     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
>     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
>     echo 1 > /sys/bus/cxl/devices/$region/commit
>     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> 
>     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 
> ---
> [iweiny: fixups]
> [iweiny: remove unused CXL_DC_REGION_MODE macro]
> [iweiny: Make dc_mode_to_region_index static]
> [iweiny: simplify <sysfs>/create_dc_region]
> [iweiny: introduce decoder_mode_is_dc]
> [djbw: fixups, no sign-off: preview only]
> ---
>  drivers/cxl/Kconfig       |  11 +++
>  drivers/cxl/core/core.h   |   7 ++
>  drivers/cxl/core/hdm.c    | 234 ++++++++++++++++++++++++++++++++++++++++++----
>  drivers/cxl/core/port.c   |  18 ++++
>  drivers/cxl/core/region.c | 135 ++++++++++++++++++++++++--
>  drivers/cxl/cxl.h         |  28 ++++++
>  drivers/dax/cxl.c         |   4 +
>  7 files changed, 409 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index ff4e78117b31..df034889d053 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -121,6 +121,17 @@ config CXL_REGION
>  
>  	  If unsure say 'y'
>  
> +config CXL_DCD
> +	bool "CXL: DCD Support"
> +	default CXL_BUS
> +	depends on CXL_REGION
> +	help
> +	  Enable the CXL core to provision CXL DCD regions.
> +	  CXL devices optionally support dynamic capacity and DCD region
> +	  maps the dynamic capacity regions DPA's into Host HPA ranges.
> +
> +	  If unsure say 'y'
> +
>  config CXL_REGION_INVALIDATION_TEST
>  	bool "CXL: Region Cache Management Bypass (TEST)"
>  	depends on CXL_REGION
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 27f0968449de..725700ab5973 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -9,6 +9,13 @@ extern const struct device_type cxl_nvdimm_type;
>  
>  extern struct attribute_group cxl_base_attribute_group;
>  
> +#ifdef CONFIG_CXL_DCD
> +extern struct device_attribute dev_attr_create_dc_region;
> +#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
> +#else
> +#define SET_CXL_DC_REGION_ATTR(x)
> +#endif
> +
>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 514d30131d92..29649b47d177 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct resource *res = cxled->dpa_res;
>  	resource_size_t skip_start;
> +	resource_size_t skipped = cxled->skip;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
>  	/* save @skip_start, before @res is released */
> -	skip_start = res->start - cxled->skip;
> +	skip_start = res->start - skipped;
>  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> -	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +	if (cxled->skip != 0) {
> +		while (skipped != 0) {
> +			res = xa_load(&cxled->skip_res, skip_start);
> +			__release_region(&cxlds->dpa_res, skip_start,
> +							resource_size(res));
> +			xa_erase(&cxled->skip_res, skip_start);
> +			skip_start += resource_size(res);
> +			skipped -= resource_size(res);
> +			}
> +	}
>  	cxled->skip = 0;
>  	cxled->dpa_res = NULL;
>  	put_device(&cxled->cxld.dev);
> @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	__cxl_dpa_release(cxled);
>  }
>  
> +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> +{
> +	int index = 0;
> +
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		if (mode == i)
> +			return index;
> +		index++;
> +	}
> +
> +	return -EINVAL;
> +}
> +
>  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  			     resource_size_t base, resource_size_t len,
>  			     resource_size_t skipped)
> @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	struct cxl_port *port = cxled_to_port(cxled);
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct device *dev = &port->dev;
> +	struct device *ed_dev = &cxled->cxld.dev;
> +	struct resource *dpa_res = &cxlds->dpa_res;
> +	resource_size_t skip_len = 0;
>  	struct resource *res;
> +	int rc, index;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
> @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	}
>  
>  	if (skipped) {
> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> -				       dev_name(&cxled->cxld.dev), 0);
> -		if (!res) {
> -			dev_dbg(dev,
> -				"decoder%d.%d: failed to reserve skipped space\n",
> -				port->id, cxled->cxld.id);
> -			return -EBUSY;
> +		resource_size_t skip_base = base - skipped;
> +
> +		if (decoder_mode_is_dc(cxled->mode)) {
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->ram_res.end) {
> +				skip_len = cxlds->ram_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->pmem_res.end) {
> +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			index = dc_mode_to_region_index(cxled->mode);
> +			for (int i = 0; i <= index; i++) {
> +				struct resource *dcr = &cxlds->dc_res[i];
> +
> +				if (skip_base < dcr->start) {
> +					skip_len = dcr->start - skip_base;
> +					res = __request_region(dpa_res,
> +							skip_base, skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +
> +				if (skip_base == base) {
> +					dev_dbg(dev, "skip done!\n");
> +					break;
> +				}
> +
> +				if (resource_size(dcr) &&
> +						skip_base <= dcr->end) {
> +					if (skip_base > base)
> +						dev_err(dev, "Skip error\n");
> +
> +					skip_len = dcr->end - skip_base + 1;
> +					res = __request_region(dpa_res, skip_base,
> +							skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +			}
> +		} else	{
> +			res = __request_region(dpa_res, base - skipped, skipped,
> +							dev_name(ed_dev), 0);
> +			if (!res)
> +				goto error;
> +
> +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
>  		}
>  	}
> -	res = __request_region(&cxlds->dpa_res, base, len,
> -			       dev_name(&cxled->cxld.dev), 0);
> +
> +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
>  	if (!res) {
>  		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> -			port->id, cxled->cxld.id);
> -		if (skipped)
> -			__release_region(&cxlds->dpa_res, base - skipped,
> -					 skipped);
> +				port->id, cxled->cxld.id);
> +		if (skipped) {
> +			resource_size_t skip_base = base - skipped;
> +
> +			while (skipped != 0) {
> +				if (skip_base > base)
> +					dev_err(dev, "Skip error\n");
> +
> +				res = xa_load(&cxled->skip_res, skip_base);
> +				__release_region(dpa_res, skip_base,
> +							resource_size(res));
> +				xa_erase(&cxled->skip_res, skip_base);
> +				skip_base += resource_size(res);
> +				skipped -= resource_size(res);
> +			}
> +		}
>  		return -EBUSY;
>  	}
>  	cxled->dpa_res = res;
>  	cxled->skip = skipped;
>  
> +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> +		int index = dc_mode_to_region_index(mode);
> +
> +		if (resource_contains(&cxlds->dc_res[index], res)) {
> +			cxled->mode = mode;
> +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> +			goto success;
> +		}
> +	}
>  	if (resource_contains(&cxlds->pmem_res, res))
>  		cxled->mode = CXL_DECODER_PMEM;
>  	else if (resource_contains(&cxlds->ram_res, res))
> @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  		cxled->mode = CXL_DECODER_MIXED;
>  	}
>  
> +success:
>  	port->hdm_end++;
>  	get_device(&cxled->cxld.dev);
>  	return 0;
> +
> +error:
> +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> +			port->id, cxled->cxld.id);
> +	return -EBUSY;
> +
>  }
>  
>  int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
> @@ -429,6 +553,14 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_dbg(dev, "unsupported mode: %d\n", mode);
> @@ -456,6 +588,16 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>  		goto out;
>  	}
>  
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		int index = dc_mode_to_region_index(i);
> +
> +		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
> +			dev_dbg(dev, "no available dynamic capacity\n");
> +			rc = -ENXIO;
> +			goto out;
> +		}
> +	}
> +
>  	cxled->mode = mode;
>  	rc = 0;
>  out:
> @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  					 resource_size_t *skip_out)
>  {
>  	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> -	resource_size_t free_ram_start, free_pmem_start;
> +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct device *dev = &cxled->cxld.dev;
>  	resource_size_t start, avail, skip;
>  	struct resource *p, *last;
> +	int index;
>  
>  	lockdep_assert_held(&cxl_dpa_rwsem);
>  
> @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  	else
>  		free_pmem_start = cxlds->pmem_res.start;
>  
> +	/*
> +	 * One HDM Decoder per DC region to map memory with different
> +	 * DSMAS entry.
> +	 */
> +	index = dc_mode_to_region_index(cxled->mode);
> +	if (index >= 0) {
> +		if (cxlds->dc_res[index].child) {
> +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n",
> +					index);
> +			return -EINVAL;
> +		}
> +		free_dc_start = cxlds->dc_res[index].start;
> +	}
> +
>  	if (cxled->mode == CXL_DECODER_RAM) {
>  		start = free_ram_start;
>  		avail = cxlds->ram_res.end - start + 1;
> @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  		else
>  			skip_end = start - 1;
>  		skip = skip_end - skip_start + 1;
> +	} else if (decoder_mode_is_dc(cxled->mode)) {
> +		resource_size_t skip_start, skip_end;
> +
> +		start = free_dc_start;
> +		avail = cxlds->dc_res[index].end - start + 1;
> +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> +			skip_start = free_ram_start;
> +		else
> +			skip_start = free_pmem_start;
> +		/*
> +		 * If some dc region is already mapped, then that allocation
> +		 * already handled the RAM and PMEM skip.Check for DC region
> +		 * skip.
> +		 */
> +		for (int i = index - 1; i >= 0 ; i--) {
> +			if (cxlds->dc_res[i].child) {
> +				skip_start = cxlds->dc_res[i].child->end + 1;
> +				break;
> +			}
> +		}
> +
> +		skip_end = start - 1;
> +		skip = skip_end - skip_start + 1;
>  	} else {
>  		dev_dbg(cxled_dev(cxled), "mode not set\n");
>  		avail = 0;
> @@ -548,10 +729,25 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>  
>  	avail = cxl_dpa_freespace(cxled, &start, &skip);
>  
> +	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
> +						start, size, skip);
>  	if (size > avail) {
> +		static const char * const names[] = {
> +			[CXL_DECODER_NONE] = "none",
> +			[CXL_DECODER_RAM] = "ram",
> +			[CXL_DECODER_PMEM] = "pmem",
> +			[CXL_DECODER_MIXED] = "mixed",
> +			[CXL_DECODER_DC0] = "dc0",
> +			[CXL_DECODER_DC1] = "dc1",
> +			[CXL_DECODER_DC2] = "dc2",
> +			[CXL_DECODER_DC3] = "dc3",
> +			[CXL_DECODER_DC4] = "dc4",
> +			[CXL_DECODER_DC5] = "dc5",
> +			[CXL_DECODER_DC6] = "dc6",
> +			[CXL_DECODER_DC7] = "dc7",
> +		};
>  		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
> -			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> -			&avail);
> +			names[cxled->mode], &avail);
>  		rc = -ENOSPC;
>  		goto out;
>  	}
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 5e21b53362e6..a1a98aba24ed 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -195,6 +195,22 @@ static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
>  		mode = CXL_DECODER_PMEM;
>  	else if (sysfs_streq(buf, "ram"))
>  		mode = CXL_DECODER_RAM;
> +	else if (sysfs_streq(buf, "dc0"))
> +		mode = CXL_DECODER_DC0;
> +	else if (sysfs_streq(buf, "dc1"))
> +		mode = CXL_DECODER_DC1;
> +	else if (sysfs_streq(buf, "dc2"))
> +		mode = CXL_DECODER_DC2;
> +	else if (sysfs_streq(buf, "dc3"))
> +		mode = CXL_DECODER_DC3;
> +	else if (sysfs_streq(buf, "dc4"))
> +		mode = CXL_DECODER_DC4;
> +	else if (sysfs_streq(buf, "dc5"))
> +		mode = CXL_DECODER_DC5;
> +	else if (sysfs_streq(buf, "dc6"))
> +		mode = CXL_DECODER_DC6;
> +	else if (sysfs_streq(buf, "dc7"))
> +		mode = CXL_DECODER_DC7;
>  	else
>  		return -EINVAL;
>  
> @@ -296,6 +312,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
>  	&dev_attr_target_list.attr,
>  	SET_CXL_REGION_ATTR(create_pmem_region)
>  	SET_CXL_REGION_ATTR(create_ram_region)
> +	SET_CXL_DC_REGION_ATTR(create_dc_region)
>  	SET_CXL_REGION_ATTR(delete_region)
>  	NULL,
>  };
> @@ -1691,6 +1708,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
>  		return ERR_PTR(-ENOMEM);
>  
>  	cxled->pos = -1;
> +	xa_init(&cxled->skip_res);
>  	cxld = &cxled->cxld;
>  	rc = cxl_decoder_init(port, cxld);
>  	if (rc)	 {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 543c4499379e..144232c8305e 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
>  	lockdep_assert_held_write(&cxl_region_rwsem);
>  	lockdep_assert_held_read(&cxl_dpa_rwsem);
>  
> -	if (cxled->mode != cxlr->mode) {
> +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
When the region mode is not one of the dc modes, this mode-mismatch check is
skipped entirely (e.g. for ram and pmem regions). Is that what we want?


>  		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
>  			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
>  		return -EINVAL;
> @@ -2211,6 +2211,14 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
> @@ -2321,6 +2329,43 @@ static ssize_t create_ram_region_store(struct device *dev,
>  }
>  DEVICE_ATTR_RW(create_ram_region);
>  
> +static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
> +				const char *buf, enum cxl_decoder_mode mode,
> +				size_t len)
> +{
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	/*
> +	 * All DC regions use decoder mode DC0 as the region does not need the
> +	 * index information
> +	 */
> +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> +				CXL_DECODER_DC0, len);
If all DC regions use DC0, what will CXL_DECODER_DC1~7 be used for?

Fan
> +}
> +DEVICE_ATTR_RW(create_dc_region);
> +
>  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>  			   char *buf)
>  {
> @@ -2799,6 +2844,61 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>  	return rc;
>  }
>  
> +static void cxl_dc_region_release(void *data)
> +{
> +	struct cxl_region *cxlr = data;
> +	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
> +
> +	xa_destroy(&cxlr_dc->dax_dev_list);
> +	kfree(cxlr_dc);
> +}
> +
> +static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
> +{
> +	struct cxl_dc_region *cxlr_dc;
> +	struct cxl_dax_region *cxlr_dax;
> +	struct device *dev;
> +	int rc = 0;
> +
> +	cxlr_dax = cxl_dax_region_alloc(cxlr);
> +	if (IS_ERR(cxlr_dax))
> +		return PTR_ERR(cxlr_dax);
> +
> +	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
> +	if (!cxlr_dc) {
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +
> +	dev = &cxlr_dax->dev;
> +	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> +		dev_name(dev));
> +
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
> +					cxlr_dax);
> +	if (rc)
> +		goto err;
> +
> +	cxlr_dc->cxlr_dax = cxlr_dax;
> +	xa_init(&cxlr_dc->dax_dev_list);
> +	cxlr->cxlr_dc = cxlr_dc;
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
> +	if (!rc)
> +		return 0;
> +err:
> +	put_device(dev);
> +	kfree(cxlr_dc);
> +	return rc;
> +}
> +
>  static int match_decoder_by_range(struct device *dev, void *data)
>  {
>  	struct range *r1, *r2 = data;
> @@ -3140,6 +3240,19 @@ static int is_system_ram(struct resource *res, void *arg)
>  	return 1;
>  }
>  
> +/*
> + * The region can not be manged by CXL if any portion of
> + * it is already online as 'System RAM'
> + */
> +static bool region_is_system_ram(struct cxl_region *cxlr,
> +				 struct cxl_region_params *p)
> +{
> +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> +				    p->res->start, p->res->end, cxlr,
> +				    is_system_ram) > 0);
> +}
> +
>  static int cxl_region_probe(struct device *dev)
>  {
>  	struct cxl_region *cxlr = to_cxl_region(dev);
> @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
>  	case CXL_DECODER_PMEM:
>  		return devm_cxl_add_pmem_region(cxlr);
>  	case CXL_DECODER_RAM:
> -		/*
> -		 * The region can not be manged by CXL if any portion of
> -		 * it is already online as 'System RAM'
> -		 */
> -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> -					p->res->start, p->res->end, cxlr,
> -					is_system_ram) > 0)
> +		if (region_is_system_ram(cxlr, p))
>  			return 0;
>  
>  		/*
> @@ -3193,6 +3299,17 @@ static int cxl_region_probe(struct device *dev)
>  
>  		/* HDM-H routes to device-dax */
>  		return devm_cxl_add_dax_region(cxlr);
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
> +		if (region_is_system_ram(cxlr, p))
> +			return 0;
> +		return devm_cxl_add_dc_region(cxlr);
>  	default:
>  		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
>  			cxlr->mode);
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 8400af85d99f..7ac1237938b7 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -335,6 +335,14 @@ enum cxl_decoder_mode {
>  	CXL_DECODER_NONE,
>  	CXL_DECODER_RAM,
>  	CXL_DECODER_PMEM,
> +	CXL_DECODER_DC0,
> +	CXL_DECODER_DC1,
> +	CXL_DECODER_DC2,
> +	CXL_DECODER_DC3,
> +	CXL_DECODER_DC4,
> +	CXL_DECODER_DC5,
> +	CXL_DECODER_DC6,
> +	CXL_DECODER_DC7,
>  	CXL_DECODER_MIXED,
>  	CXL_DECODER_DEAD,
>  };
> @@ -345,6 +353,14 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  		[CXL_DECODER_NONE] = "none",
>  		[CXL_DECODER_RAM] = "ram",
>  		[CXL_DECODER_PMEM] = "pmem",
> +		[CXL_DECODER_DC0] = "dc0",
> +		[CXL_DECODER_DC1] = "dc1",
> +		[CXL_DECODER_DC2] = "dc2",
> +		[CXL_DECODER_DC3] = "dc3",
> +		[CXL_DECODER_DC4] = "dc4",
> +		[CXL_DECODER_DC5] = "dc5",
> +		[CXL_DECODER_DC6] = "dc6",
> +		[CXL_DECODER_DC7] = "dc7",
>  		[CXL_DECODER_MIXED] = "mixed",
>  	};
>  
> @@ -353,6 +369,11 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  	return "mixed";
>  }
>  
> +static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
> +{
> +	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
> +}
> +
>  /*
>   * Track whether this decoder is reserved for region autodiscovery, or
>   * free for userspace provisioning.
> @@ -375,6 +396,7 @@ struct cxl_endpoint_decoder {
>  	struct cxl_decoder cxld;
>  	struct resource *dpa_res;
>  	resource_size_t skip;
> +	struct xarray skip_res;
>  	enum cxl_decoder_mode mode;
>  	enum cxl_decoder_state state;
>  	int pos;
> @@ -475,6 +497,11 @@ struct cxl_region_params {
>   */
>  #define CXL_REGION_F_AUTO 1
>  
> +struct cxl_dc_region {
> +	struct xarray dax_dev_list;
> +	struct cxl_dax_region *cxlr_dax;
> +};
> +
>  /**
>   * struct cxl_region - CXL region
>   * @dev: This region's device
> @@ -493,6 +520,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dc_region *cxlr_dc;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index ccdf8de85bd5..eb5eb81bfbd7 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	if (!dax_region)
>  		return -ENOMEM;
>  
> +	if (decoder_mode_is_dc(cxlr->mode))
> +		return 0;
> +
>  	data = (struct dev_dax_data) {
>  		.dax_region = dax_region,
>  		.id = -1,
>  		.size = range_len(&cxlr_dax->hpa_range),
>  	};
> +
>  	dev_dax = devm_create_dev_dax(&data);
>  	if (IS_ERR(dev_dax))
>  		return PTR_ERR(dev_dax);
> 
> -- 
> 2.40.0
>
Jonathan Cameron June 22, 2023, 4:34 p.m. UTC | #15
On Wed, 14 Jun 2023 12:16:29 -0700
ira.weiny@intel.com wrote:

> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity. CXL Regions must be
> created to access this capacity.
> 
> Add sysfs entries to create dynamic capacity cxl regions. Provide a new
> Dynamic Capacity decoder mode which targets dynamic capacity on devices
> which are added to that region.
> 
> Below are the steps to create and delete dynamic capacity region0
> (example).
> 
>     region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
>     echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
>     echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
>     echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
> 
>     echo "dc0" >/sys/bus/cxl/devices/decoder1.0/mode
>     echo 0x400000000 >/sys/bus/cxl/devices/decoder1.0/dpa_size
> 
>     echo 0x400000000 > /sys/bus/cxl/devices/$region/size
>     echo  "decoder1.0" > /sys/bus/cxl/devices/$region/target0
>     echo 1 > /sys/bus/cxl/devices/$region/commit
>     echo $region > /sys/bus/cxl/drivers/cxl_region/bind
> 
>     echo $region> /sys/bus/cxl/devices/decoder0.0/delete_region
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> 

I'd like some additional info here on why the skip stuff needs to be
so complicated.  I think it's just a way of tracking the skip value
needed in the HDM decoders and that's just one value. So why can't
we just have one resource reservation for the skip?  Is it related to them needing
to be nested in some way?

Jonathan



>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 514d30131d92..29649b47d177 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -233,14 +233,23 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct resource *res = cxled->dpa_res;
>  	resource_size_t skip_start;
> +	resource_size_t skipped = cxled->skip;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
>  	/* save @skip_start, before @res is released */
> -	skip_start = res->start - cxled->skip;
> +	skip_start = res->start - skipped;
>  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> -	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +	if (cxled->skip != 0) {
> +		while (skipped != 0) {
> +			res = xa_load(&cxled->skip_res, skip_start);
> +			__release_region(&cxlds->dpa_res, skip_start,
> +							resource_size(res));
> +			xa_erase(&cxled->skip_res, skip_start);
> +			skip_start += resource_size(res);
> +			skipped -= resource_size(res);
> +			}
} indented too far..

> +	}
>  	cxled->skip = 0;
>  	cxled->dpa_res = NULL;
>  	put_device(&cxled->cxld.dev);
> @@ -267,6 +276,19 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	__cxl_dpa_release(cxled);
>  }
>  
> +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> +{
> +	int index = 0;
> +
> +	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
> +		if (mode == i)
> +			return index;
> +		index++;

Might as well increment index in the loop as well.
i++, index++;  Though given you are looping over a bunch of enum
entries and relying on them being in a row...

	if (mode < CXL_DECODER_DC0 || mode > CXL_DECODER_DC7)
		return -EINVAL;
	return mode - CXL_DECODER_DC0;


> +	}
> +
> +	return -EINVAL;
> +}
> +
>  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  			     resource_size_t base, resource_size_t len,
>  			     resource_size_t skipped)
> @@ -275,7 +297,11 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	struct cxl_port *port = cxled_to_port(cxled);
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  	struct device *dev = &port->dev;
> +	struct device *ed_dev = &cxled->cxld.dev;
> +	struct resource *dpa_res = &cxlds->dpa_res;
> +	resource_size_t skip_len = 0;
>  	struct resource *res;
> +	int rc, index;
>  
>  	lockdep_assert_held_write(&cxl_dpa_rwsem);
>  
> @@ -304,28 +330,119 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  	}
>  
>  	if (skipped) {
> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> -				       dev_name(&cxled->cxld.dev), 0);
> -		if (!res) {
> -			dev_dbg(dev,
> -				"decoder%d.%d: failed to reserve skipped space\n",
> -				port->id, cxled->cxld.id);
> -			return -EBUSY;
> +		resource_size_t skip_base = base - skipped;
> +
> +		if (decoder_mode_is_dc(cxled->mode)) {
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->ram_res.end) {

Fix alignment to after (

> +				skip_len = cxlds->ram_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);


Does it make sense to have all these potential skip regions in a row?
Why not just add one potentially including ram, pmem, and some of the
dc regions and remove one below?

I may be missing some subtlety here though.

> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			if (resource_size(&cxlds->ram_res) &&
> +					skip_base <= cxlds->pmem_res.end) {
> +				skip_len = cxlds->pmem_res.end - skip_base + 1;
> +				res = __request_region(dpa_res, skip_base,
> +						skip_len, dev_name(ed_dev), 0);
> +				if (!res)
> +					goto error;
> +
> +				rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
> +				skip_base += skip_len;
> +			}
> +
> +			index = dc_mode_to_region_index(cxled->mode);
> +			for (int i = 0; i <= index; i++) {
> +				struct resource *dcr = &cxlds->dc_res[i];
> +
> +				if (skip_base < dcr->start) {
> +					skip_len = dcr->start - skip_base;
> +					res = __request_region(dpa_res,
> +							skip_base, skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +
> +				if (skip_base == base) {
> +					dev_dbg(dev, "skip done!\n");
> +					break;
> +				}
> +
> +				if (resource_size(dcr) &&
> +						skip_base <= dcr->end) {
> +					if (skip_base > base)
> +						dev_err(dev, "Skip error\n");
> +
> +					skip_len = dcr->end - skip_base + 1;
> +					res = __request_region(dpa_res, skip_base,
> +							skip_len,
> +							dev_name(ed_dev), 0);
> +					if (!res)
> +						goto error;
> +
> +					rc = xa_insert(&cxled->skip_res, skip_base,
> +							res, GFP_KERNEL);
> +					skip_base += skip_len;
> +				}
> +			}
> +		} else	{
> +			res = __request_region(dpa_res, base - skipped, skipped,
> +							dev_name(ed_dev), 0);
> +			if (!res)
> +				goto error;
> +
> +			rc = xa_insert(&cxled->skip_res, skip_base, res,
> +								GFP_KERNEL);
Can we have a precursor patch introducing the xarray for skip res?
Might make that bit easier to understand even if it starts with only a few entries.

Also, is rc checked?


>  		}
>  	}
> -	res = __request_region(&cxlds->dpa_res, base, len,
> -			       dev_name(&cxled->cxld.dev), 0);
> +
> +	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
>  	if (!res) {
>  		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
> -			port->id, cxled->cxld.id);
> -		if (skipped)
> -			__release_region(&cxlds->dpa_res, base - skipped,
> -					 skipped);
> +				port->id, cxled->cxld.id);

Odd indent of line above that is making this noisier than it needs to be.

> +		if (skipped) {

I'd invert at cost of two places you exit.
		if (!skipped)
			return -EBUSY;

		skip_base = base - skipped;
		...

> +			resource_size_t skip_base = base - skipped;
> +
> +			while (skipped != 0) {
> +				if (skip_base > base)
> +					dev_err(dev, "Skip error\n");
> +
> +				res = xa_load(&cxled->skip_res, skip_base);
> +				__release_region(dpa_res, skip_base,
> +							resource_size(res));
> +				xa_erase(&cxled->skip_res, skip_base);
> +				skip_base += resource_size(res);
> +				skipped -= resource_size(res);
> +			}
> +		}
>  		return -EBUSY;
>  	}
>  	cxled->dpa_res = res;
>  	cxled->skip = skipped;
>  
> +	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
> +		int index = dc_mode_to_region_index(mode);
> +
> +		if (resource_contains(&cxlds->dc_res[index], res)) {
> +			cxled->mode = mode;
> +			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
> +				cxled->cxld.id, cxled->dpa_res, cxled->mode);
> +			goto success;
> +		}
> +	}
>  	if (resource_contains(&cxlds->pmem_res, res))
>  		cxled->mode = CXL_DECODER_PMEM;
>  	else if (resource_contains(&cxlds->ram_res, res))
> @@ -336,9 +453,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>  		cxled->mode = CXL_DECODER_MIXED;
>  	}
>  
> +success:
>  	port->hdm_end++;
>  	get_device(&cxled->cxld.dev);
>  	return 0;
> +
> +error:

Unless other stuff is coming here, drag the debug print up to callers, make
it more specific and return directly.  Makes for an easier flow to read.


> +	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
> +			port->id, cxled->cxld.id);
> +	return -EBUSY;
> +
>  }


> @@ -469,10 +611,12 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  					 resource_size_t *skip_out)
>  {
>  	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> -	resource_size_t free_ram_start, free_pmem_start;
> +	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct device *dev = &cxled->cxld.dev;

Pull this out as a precursor.  Also note Dan used cxled_dev() in the patch adding cxl_dpa_freespace.
Probably best bet is just push this change into Dan's patch on basis it'll make the history neater.


>  	resource_size_t start, avail, skip;
>  	struct resource *p, *last;
> +	int index;
>  
>  	lockdep_assert_held(&cxl_dpa_rwsem);
>  
> @@ -490,6 +634,20 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  	else
>  		free_pmem_start = cxlds->pmem_res.start;
>  
> +	/*
> +	 * One HDM Decoder per DC region to map memory with different
> +	 * DSMAS entry.
> +	 */

Push all the dc stuff into one place?  Perhaps that becomes impossible
in later patches...

> +	index = dc_mode_to_region_index(cxled->mode);
> +	if (index >= 0) {
> +		if (cxlds->dc_res[index].child) {
> +			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n",
> +					index);
> +			return -EINVAL;
> +		}
> +		free_dc_start = cxlds->dc_res[index].start;
> +	}
> +
>  	if (cxled->mode == CXL_DECODER_RAM) {
>  		start = free_ram_start;
>  		avail = cxlds->ram_res.end - start + 1;
> @@ -511,6 +669,29 @@ static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
>  		else
>  			skip_end = start - 1;
>  		skip = skip_end - skip_start + 1;
> +	} else if (decoder_mode_is_dc(cxled->mode)) {
> +		resource_size_t skip_start, skip_end;
> +
> +		start = free_dc_start;
> +		avail = cxlds->dc_res[index].end - start + 1;
> +		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
> +			skip_start = free_ram_start;
> +		else
> +			skip_start = free_pmem_start;
> +		/*
> +		 * If some dc region is already mapped, then that allocation
> +		 * already handled the RAM and PMEM skip.Check for DC region
> +		 * skip.
> +		 */
> +		for (int i = index - 1; i >= 0 ; i--) {
> +			if (cxlds->dc_res[i].child) {
> +				skip_start = cxlds->dc_res[i].child->end + 1;
> +				break;
> +			}
> +		}
> +
> +		skip_end = start - 1;
> +		skip = skip_end - skip_start + 1;
>  	} else {
>  		dev_dbg(cxled_dev(cxled), "mode not set\n");
>  		avail = 0;
> @@ -548,10 +729,25 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>  
>  	avail = cxl_dpa_freespace(cxled, &start, &skip);
>  
> +	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
> +						start, size, skip);
>  	if (size > avail) {
> +		static const char * const names[] = {
> +			[CXL_DECODER_NONE] = "none",
> +			[CXL_DECODER_RAM] = "ram",
> +			[CXL_DECODER_PMEM] = "pmem",
> +			[CXL_DECODER_MIXED] = "mixed",
> +			[CXL_DECODER_DC0] = "dc0",
> +			[CXL_DECODER_DC1] = "dc1",
> +			[CXL_DECODER_DC2] = "dc2",
> +			[CXL_DECODER_DC3] = "dc3",
> +			[CXL_DECODER_DC4] = "dc4",
> +			[CXL_DECODER_DC5] = "dc5",
> +			[CXL_DECODER_DC6] = "dc6",
> +			[CXL_DECODER_DC7] = "dc7",

Hmm. 8 is on the boundary for being better to just do this programmatically.
I guess it's fine though and is nice and easy to follow.

> +		};
>  		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
> -			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> -			&avail);
> +			names[cxled->mode], &avail);
>  		rc = -ENOSPC;
>  		goto out;
>  	}


> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 543c4499379e..144232c8305e 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
>  	lockdep_assert_held_write(&cxl_region_rwsem);
>  	lockdep_assert_held_read(&cxl_dpa_rwsem);
>  
> -	if (cxled->mode != cxlr->mode) {
> +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
>  		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
>  			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
>  		return -EINVAL;
> @@ -2211,6 +2211,14 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	switch (mode) {
>  	case CXL_DECODER_RAM:
>  	case CXL_DECODER_PMEM:
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
>  		break;
>  	default:
>  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
> @@ -2321,6 +2329,43 @@ static ssize_t create_ram_region_store(struct device *dev,
>  }
>  DEVICE_ATTR_RW(create_ram_region);
>  
> +static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
> +				const char *buf, enum cxl_decoder_mode mode,
> +				size_t len)
> +{
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	/*
> +	 * All DC regions use decoder mode DC0 as the region does not need the
> +	 * index information
> +	 */
> +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> +				CXL_DECODER_DC0, len);
> +}
> +DEVICE_ATTR_RW(create_dc_region);
> +
>  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>  			   char *buf)
>  {
> @@ -2799,6 +2844,61 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>  	return rc;
>  }
>  
> +static void cxl_dc_region_release(void *data)
> +{
> +	struct cxl_region *cxlr = data;
> +	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
> +
> +	xa_destroy(&cxlr_dc->dax_dev_list);
> +	kfree(cxlr_dc);
> +}
> +
> +static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
> +{
> +	struct cxl_dc_region *cxlr_dc;
> +	struct cxl_dax_region *cxlr_dax;
> +	struct device *dev;
> +	int rc = 0;
> +
> +	cxlr_dax = cxl_dax_region_alloc(cxlr);
> +	if (IS_ERR(cxlr_dax))
> +		return PTR_ERR(cxlr_dax);
> +
> +	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
> +	if (!cxlr_dc) {
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +
> +	dev = &cxlr_dax->dev;
> +	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> +		dev_name(dev));
> +
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
> +					cxlr_dax);
> +	if (rc)
> +		goto err;
> +
> +	cxlr_dc->cxlr_dax = cxlr_dax;
> +	xa_init(&cxlr_dc->dax_dev_list);
> +	cxlr->cxlr_dc = cxlr_dc;
> +	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
> +	if (!rc)
> +		return 0;
> +err:
> +	put_device(dev);
> +	kfree(cxlr_dc);
> +	return rc;
> +}
> +
>  static int match_decoder_by_range(struct device *dev, void *data)
>  {
>  	struct range *r1, *r2 = data;
> @@ -3140,6 +3240,19 @@ static int is_system_ram(struct resource *res, void *arg)
>  	return 1;
>  }
>  
> +/*
> + * The region can not be manged by CXL if any portion of
> + * it is already online as 'System RAM'
> + */
> +static bool region_is_system_ram(struct cxl_region *cxlr,
> +				 struct cxl_region_params *p)
> +{
> +	return (walk_iomem_res_desc(IORES_DESC_NONE,
> +				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> +				    p->res->start, p->res->end, cxlr,
> +				    is_system_ram) > 0);
> +}
> +
>  static int cxl_region_probe(struct device *dev)
>  {
>  	struct cxl_region *cxlr = to_cxl_region(dev);
> @@ -3174,14 +3287,7 @@ static int cxl_region_probe(struct device *dev)
>  	case CXL_DECODER_PMEM:
>  		return devm_cxl_add_pmem_region(cxlr);
>  	case CXL_DECODER_RAM:
> -		/*
> -		 * The region can not be manged by CXL if any portion of
> -		 * it is already online as 'System RAM'
> -		 */
> -		if (walk_iomem_res_desc(IORES_DESC_NONE,
> -					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
> -					p->res->start, p->res->end, cxlr,
> -					is_system_ram) > 0)
> +		if (region_is_system_ram(cxlr, p))
>  			return 0;
>  
>  		/*
> @@ -3193,6 +3299,17 @@ static int cxl_region_probe(struct device *dev)
>  
>  		/* HDM-H routes to device-dax */
>  		return devm_cxl_add_dax_region(cxlr);
> +	case CXL_DECODER_DC0:
> +	case CXL_DECODER_DC1:
> +	case CXL_DECODER_DC2:
> +	case CXL_DECODER_DC3:
> +	case CXL_DECODER_DC4:
> +	case CXL_DECODER_DC5:
> +	case CXL_DECODER_DC6:
> +	case CXL_DECODER_DC7:
> +		if (region_is_system_ram(cxlr, p))
> +			return 0;
> +		return devm_cxl_add_dc_region(cxlr);
>  	default:
>  		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
>  			cxlr->mode);
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 8400af85d99f..7ac1237938b7 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -335,6 +335,14 @@ enum cxl_decoder_mode {
>  	CXL_DECODER_NONE,
>  	CXL_DECODER_RAM,
>  	CXL_DECODER_PMEM,
> +	CXL_DECODER_DC0,
> +	CXL_DECODER_DC1,
> +	CXL_DECODER_DC2,
> +	CXL_DECODER_DC3,
> +	CXL_DECODER_DC4,
> +	CXL_DECODER_DC5,
> +	CXL_DECODER_DC6,
> +	CXL_DECODER_DC7,
>  	CXL_DECODER_MIXED,
>  	CXL_DECODER_DEAD,
>  };
> @@ -345,6 +353,14 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  		[CXL_DECODER_NONE] = "none",
>  		[CXL_DECODER_RAM] = "ram",
>  		[CXL_DECODER_PMEM] = "pmem",
> +		[CXL_DECODER_DC0] = "dc0",
> +		[CXL_DECODER_DC1] = "dc1",
> +		[CXL_DECODER_DC2] = "dc2",
> +		[CXL_DECODER_DC3] = "dc3",
> +		[CXL_DECODER_DC4] = "dc4",
> +		[CXL_DECODER_DC5] = "dc5",
> +		[CXL_DECODER_DC6] = "dc6",
> +		[CXL_DECODER_DC7] = "dc7",
>  		[CXL_DECODER_MIXED] = "mixed",
>  	};
>  
> @@ -353,6 +369,11 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
>  	return "mixed";
>  }
>  
> +static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
> +{
> +	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
> +}
> +
>  /*
>   * Track whether this decoder is reserved for region autodiscovery, or
>   * free for userspace provisioning.
> @@ -375,6 +396,7 @@ struct cxl_endpoint_decoder {
>  	struct cxl_decoder cxld;
>  	struct resource *dpa_res;
>  	resource_size_t skip;
> +	struct xarray skip_res;
>  	enum cxl_decoder_mode mode;
>  	enum cxl_decoder_state state;
>  	int pos;
> @@ -475,6 +497,11 @@ struct cxl_region_params {
>   */

> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index ccdf8de85bd5..eb5eb81bfbd7 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -23,11 +23,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	if (!dax_region)
>  		return -ENOMEM;
>  
> +	if (decoder_mode_is_dc(cxlr->mode))
Comment for this would be good to let people know why (even if
it goes away in the future).

> +		return 0;
> +
>  	data = (struct dev_dax_data) {
>  		.dax_region = dax_region,
>  		.id = -1,
>  		.size = range_len(&cxlr_dax->hpa_range),
>  	};
> +

*grumble*

>  	dev_dax = devm_create_dev_dax(&data);
>  	if (IS_ERR(dev_dax))
>  		return PTR_ERR(dev_dax);
>
Ira Weiny June 23, 2023, 6:02 p.m. UTC | #16
Fan Ni wrote:
> The 06/14/2023 12:16, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 

[snip]

> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index 543c4499379e..144232c8305e 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -1733,7 +1733,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
> >  	lockdep_assert_held_write(&cxl_region_rwsem);
> >  	lockdep_assert_held_read(&cxl_dpa_rwsem);
> >  
> > -	if (cxled->mode != cxlr->mode) {
> > +	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
> For mode other than dc, no check will be performed, is that what we
> want?
> 

:-/  Yes, looks like I may have screwed up the logic here thanks.  But this
code is changing because after this thread Navneet and I decided to introduce a
new cxl_region_mode enum which should clarify this check.

[snip]

> > +
> > +static ssize_t create_dc_region_store(struct device *dev,
> > +				      struct device_attribute *attr,
> > +				      const char *buf, size_t len)
> > +{
> > +	/*
> > +	 * All DC regions use decoder mode DC0 as the region does not need the
> > +	 * index information
> > +	 */
> > +	return store_dcN_region(to_cxl_root_decoder(dev), buf,
> > +				CXL_DECODER_DC0, len);
> If all DC regions use DC0, what will CXL_DECODER_DC1~7 be used for?

Before sending the patches it did not sit well with me that the mode for cxl
region was no longer 1:1 with endpoint decoder mode.  I basically hacked in
the idea that DC0 decoder mode would represent DC region mode.  But this is
really hacky.  So this is why we have introduced cxl_region_mode which
represents ram, pmem, or DC in v2.  I'm still squashing in all the changes and
clean ups and should post something soon.

Ira
Davidlohr Bueso July 5, 2023, 2:49 p.m. UTC | #17
On Wed, 14 Jun 2023, ira.weiny@intel.com wrote:

>+config CXL_DCD
>+	bool "CXL: DCD Support"
>+	default CXL_BUS
>+	depends on CXL_REGION
>+	help
>+	  Enable the CXL core to provision CXL DCD regions.
>+	  CXL devices optionally support dynamic capacity and DCD region
>+	  maps the dynamic capacity regions DPA's into Host HPA ranges.
>+
>+	  If unsure say 'y'

Does this really merit another Kconfig option? What are the usecases for
this ever to be shipped as disabled?

Thanks,
Davidlohr
diff mbox series

Patch

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index ff4e78117b31..df034889d053 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -121,6 +121,17 @@  config CXL_REGION
 
 	  If unsure say 'y'
 
+config CXL_DCD
+	bool "CXL: DCD Support"
+	default CXL_BUS
+	depends on CXL_REGION
+	help
+	  Enable the CXL core to provision CXL DCD regions.
+	  CXL devices optionally support dynamic capacity and DCD region
+	  maps the dynamic capacity regions DPA's into Host HPA ranges.
+
+	  If unsure say 'y'
+
 config CXL_REGION_INVALIDATION_TEST
 	bool "CXL: Region Cache Management Bypass (TEST)"
 	depends on CXL_REGION
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 27f0968449de..725700ab5973 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -9,6 +9,13 @@  extern const struct device_type cxl_nvdimm_type;
 
 extern struct attribute_group cxl_base_attribute_group;
 
+#ifdef CONFIG_CXL_DCD
+extern struct device_attribute dev_attr_create_dc_region;
+#define SET_CXL_DC_REGION_ATTR(x) (&dev_attr_##x.attr),
+#else
+#define SET_CXL_DC_REGION_ATTR(x)
+#endif
+
 #ifdef CONFIG_CXL_REGION
 extern struct device_attribute dev_attr_create_pmem_region;
 extern struct device_attribute dev_attr_create_ram_region;
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 514d30131d92..29649b47d177 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -233,14 +233,23 @@  static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct resource *res = cxled->dpa_res;
 	resource_size_t skip_start;
+	resource_size_t skipped = cxled->skip;
 
 	lockdep_assert_held_write(&cxl_dpa_rwsem);
 
 	/* save @skip_start, before @res is released */
-	skip_start = res->start - cxled->skip;
+	skip_start = res->start - skipped;
 	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
-	if (cxled->skip)
-		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
+	if (cxled->skip != 0) {
+		while (skipped != 0) {
+			res = xa_load(&cxled->skip_res, skip_start);
+			__release_region(&cxlds->dpa_res, skip_start,
+							resource_size(res));
+			xa_erase(&cxled->skip_res, skip_start);
+			skip_start += resource_size(res);
+			skipped -= resource_size(res);
+			}
+	}
 	cxled->skip = 0;
 	cxled->dpa_res = NULL;
 	put_device(&cxled->cxld.dev);
@@ -267,6 +276,19 @@  static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 	__cxl_dpa_release(cxled);
 }
 
+static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
+{
+	int index = 0;
+
+	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
+		if (mode == i)
+			return index;
+		index++;
+	}
+
+	return -EINVAL;
+}
+
 static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 			     resource_size_t base, resource_size_t len,
 			     resource_size_t skipped)
@@ -275,7 +297,11 @@  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 	struct cxl_port *port = cxled_to_port(cxled);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct device *dev = &port->dev;
+	struct device *ed_dev = &cxled->cxld.dev;
+	struct resource *dpa_res = &cxlds->dpa_res;
+	resource_size_t skip_len = 0;
 	struct resource *res;
+	int rc, index;
 
 	lockdep_assert_held_write(&cxl_dpa_rwsem);
 
@@ -304,28 +330,119 @@  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 	}
 
 	if (skipped) {
-		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
-				       dev_name(&cxled->cxld.dev), 0);
-		if (!res) {
-			dev_dbg(dev,
-				"decoder%d.%d: failed to reserve skipped space\n",
-				port->id, cxled->cxld.id);
-			return -EBUSY;
+		resource_size_t skip_base = base - skipped;
+
+		if (decoder_mode_is_dc(cxled->mode)) {
+			if (resource_size(&cxlds->ram_res) &&
+					skip_base <= cxlds->ram_res.end) {
+				skip_len = cxlds->ram_res.end - skip_base + 1;
+				res = __request_region(dpa_res, skip_base,
+						skip_len, dev_name(ed_dev), 0);
+				if (!res)
+					goto error;
+
+				rc = xa_insert(&cxled->skip_res, skip_base, res,
+								GFP_KERNEL);
+				skip_base += skip_len;
+			}
+
+			if (resource_size(&cxlds->ram_res) &&
+					skip_base <= cxlds->pmem_res.end) {
+				skip_len = cxlds->pmem_res.end - skip_base + 1;
+				res = __request_region(dpa_res, skip_base,
+						skip_len, dev_name(ed_dev), 0);
+				if (!res)
+					goto error;
+
+				rc = xa_insert(&cxled->skip_res, skip_base, res,
+								GFP_KERNEL);
+				skip_base += skip_len;
+			}
+
+			index = dc_mode_to_region_index(cxled->mode);
+			for (int i = 0; i <= index; i++) {
+				struct resource *dcr = &cxlds->dc_res[i];
+
+				if (skip_base < dcr->start) {
+					skip_len = dcr->start - skip_base;
+					res = __request_region(dpa_res,
+							skip_base, skip_len,
+							dev_name(ed_dev), 0);
+					if (!res)
+						goto error;
+
+					rc = xa_insert(&cxled->skip_res, skip_base,
+							res, GFP_KERNEL);
+					skip_base += skip_len;
+				}
+
+				if (skip_base == base) {
+					dev_dbg(dev, "skip done!\n");
+					break;
+				}
+
+				if (resource_size(dcr) &&
+						skip_base <= dcr->end) {
+					if (skip_base > base)
+						dev_err(dev, "Skip error\n");
+
+					skip_len = dcr->end - skip_base + 1;
+					res = __request_region(dpa_res, skip_base,
+							skip_len,
+							dev_name(ed_dev), 0);
+					if (!res)
+						goto error;
+
+					rc = xa_insert(&cxled->skip_res, skip_base,
+							res, GFP_KERNEL);
+					skip_base += skip_len;
+				}
+			}
+		} else	{
+			res = __request_region(dpa_res, base - skipped, skipped,
+							dev_name(ed_dev), 0);
+			if (!res)
+				goto error;
+
+			rc = xa_insert(&cxled->skip_res, skip_base, res,
+								GFP_KERNEL);
 		}
 	}
-	res = __request_region(&cxlds->dpa_res, base, len,
-			       dev_name(&cxled->cxld.dev), 0);
+
+	res = __request_region(dpa_res, base, len, dev_name(ed_dev), 0);
 	if (!res) {
 		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
-			port->id, cxled->cxld.id);
-		if (skipped)
-			__release_region(&cxlds->dpa_res, base - skipped,
-					 skipped);
+				port->id, cxled->cxld.id);
+		if (skipped) {
+			resource_size_t skip_base = base - skipped;
+
+			while (skipped != 0) {
+				if (skip_base > base)
+					dev_err(dev, "Skip error\n");
+
+				res = xa_load(&cxled->skip_res, skip_base);
+				__release_region(dpa_res, skip_base,
+							resource_size(res));
+				xa_erase(&cxled->skip_res, skip_base);
+				skip_base += resource_size(res);
+				skipped -= resource_size(res);
+			}
+		}
 		return -EBUSY;
 	}
 	cxled->dpa_res = res;
 	cxled->skip = skipped;
 
+	for (int mode = CXL_DECODER_DC0; mode <= CXL_DECODER_DC7; mode++) {
+		int index = dc_mode_to_region_index(mode);
+
+		if (resource_contains(&cxlds->dc_res[index], res)) {
+			cxled->mode = mode;
+			dev_dbg(dev, "decoder%d.%d: %pr mode: %d\n", port->id,
+				cxled->cxld.id, cxled->dpa_res, cxled->mode);
+			goto success;
+		}
+	}
 	if (resource_contains(&cxlds->pmem_res, res))
 		cxled->mode = CXL_DECODER_PMEM;
 	else if (resource_contains(&cxlds->ram_res, res))
@@ -336,9 +453,16 @@  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 		cxled->mode = CXL_DECODER_MIXED;
 	}
 
+success:
 	port->hdm_end++;
 	get_device(&cxled->cxld.dev);
 	return 0;
+
+error:
+	dev_dbg(dev, "decoder%d.%d: failed to reserve skipped space\n",
+			port->id, cxled->cxld.id);
+	return -EBUSY;
+
 }
 
 int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
@@ -429,6 +553,14 @@  int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
 	switch (mode) {
 	case CXL_DECODER_RAM:
 	case CXL_DECODER_PMEM:
+	case CXL_DECODER_DC0:
+	case CXL_DECODER_DC1:
+	case CXL_DECODER_DC2:
+	case CXL_DECODER_DC3:
+	case CXL_DECODER_DC4:
+	case CXL_DECODER_DC5:
+	case CXL_DECODER_DC6:
+	case CXL_DECODER_DC7:
 		break;
 	default:
 		dev_dbg(dev, "unsupported mode: %d\n", mode);
@@ -456,6 +588,16 @@  int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
 		goto out;
 	}
 
+	for (int i = CXL_DECODER_DC0; i <= CXL_DECODER_DC7; i++) {
+		int index = dc_mode_to_region_index(i);
+
+		if (mode == i && !resource_size(&cxlds->dc_res[index])) {
+			dev_dbg(dev, "no available dynamic capacity\n");
+			rc = -ENXIO;
+			goto out;
+		}
+	}
+
 	cxled->mode = mode;
 	rc = 0;
 out:
@@ -469,10 +611,12 @@  static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
 					 resource_size_t *skip_out)
 {
 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
-	resource_size_t free_ram_start, free_pmem_start;
+	resource_size_t free_ram_start, free_pmem_start, free_dc_start;
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *dev = &cxled->cxld.dev;
 	resource_size_t start, avail, skip;
 	struct resource *p, *last;
+	int index;
 
 	lockdep_assert_held(&cxl_dpa_rwsem);
 
@@ -490,6 +634,20 @@  static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
 	else
 		free_pmem_start = cxlds->pmem_res.start;
 
+	/*
+	 * One HDM Decoder per DC region to map memory with different
+	 * DSMAS entry.
+	 */
+	index = dc_mode_to_region_index(cxled->mode);
+	if (index >= 0) {
+		if (cxlds->dc_res[index].child) {
+			dev_err(dev, "Cannot allocated DPA from DC Region: %d\n",
+					index);
+			return -EINVAL;
+		}
+		free_dc_start = cxlds->dc_res[index].start;
+	}
+
 	if (cxled->mode == CXL_DECODER_RAM) {
 		start = free_ram_start;
 		avail = cxlds->ram_res.end - start + 1;
@@ -511,6 +669,29 @@  static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,
 		else
 			skip_end = start - 1;
 		skip = skip_end - skip_start + 1;
+	} else if (decoder_mode_is_dc(cxled->mode)) {
+		resource_size_t skip_start, skip_end;
+
+		start = free_dc_start;
+		avail = cxlds->dc_res[index].end - start + 1;
+		if ((resource_size(&cxlds->pmem_res) == 0) || !cxlds->pmem_res.child)
+			skip_start = free_ram_start;
+		else
+			skip_start = free_pmem_start;
+		/*
+		 * If some dc region is already mapped, then that allocation
+		 * already handled the RAM and PMEM skip. Check for DC region
+		 * skip.
+		 */
+		for (int i = index - 1; i >= 0 ; i--) {
+			if (cxlds->dc_res[i].child) {
+				skip_start = cxlds->dc_res[i].child->end + 1;
+				break;
+			}
+		}
+
+		skip_end = start - 1;
+		skip = skip_end - skip_start + 1;
 	} else {
 		dev_dbg(cxled_dev(cxled), "mode not set\n");
 		avail = 0;
@@ -548,10 +729,25 @@  int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
 
 	avail = cxl_dpa_freespace(cxled, &start, &skip);
 
+	dev_dbg(dev, "DPA Allocation start: %llx len: %llx Skip: %llx\n",
+						start, size, skip);
 	if (size > avail) {
+		static const char * const names[] = {
+			[CXL_DECODER_NONE] = "none",
+			[CXL_DECODER_RAM] = "ram",
+			[CXL_DECODER_PMEM] = "pmem",
+			[CXL_DECODER_MIXED] = "mixed",
+			[CXL_DECODER_DC0] = "dc0",
+			[CXL_DECODER_DC1] = "dc1",
+			[CXL_DECODER_DC2] = "dc2",
+			[CXL_DECODER_DC3] = "dc3",
+			[CXL_DECODER_DC4] = "dc4",
+			[CXL_DECODER_DC5] = "dc5",
+			[CXL_DECODER_DC6] = "dc6",
+			[CXL_DECODER_DC7] = "dc7",
+		};
 		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
-			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
-			&avail);
+			names[cxled->mode], &avail);
 		rc = -ENOSPC;
 		goto out;
 	}
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 5e21b53362e6..a1a98aba24ed 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -195,6 +195,22 @@  static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
 		mode = CXL_DECODER_PMEM;
 	else if (sysfs_streq(buf, "ram"))
 		mode = CXL_DECODER_RAM;
+	else if (sysfs_streq(buf, "dc0"))
+		mode = CXL_DECODER_DC0;
+	else if (sysfs_streq(buf, "dc1"))
+		mode = CXL_DECODER_DC1;
+	else if (sysfs_streq(buf, "dc2"))
+		mode = CXL_DECODER_DC2;
+	else if (sysfs_streq(buf, "dc3"))
+		mode = CXL_DECODER_DC3;
+	else if (sysfs_streq(buf, "dc4"))
+		mode = CXL_DECODER_DC4;
+	else if (sysfs_streq(buf, "dc5"))
+		mode = CXL_DECODER_DC5;
+	else if (sysfs_streq(buf, "dc6"))
+		mode = CXL_DECODER_DC6;
+	else if (sysfs_streq(buf, "dc7"))
+		mode = CXL_DECODER_DC7;
 	else
 		return -EINVAL;
 
@@ -296,6 +312,7 @@  static struct attribute *cxl_decoder_root_attrs[] = {
 	&dev_attr_target_list.attr,
 	SET_CXL_REGION_ATTR(create_pmem_region)
 	SET_CXL_REGION_ATTR(create_ram_region)
+	SET_CXL_DC_REGION_ATTR(create_dc_region)
 	SET_CXL_REGION_ATTR(delete_region)
 	NULL,
 };
@@ -1691,6 +1708,7 @@  struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
 		return ERR_PTR(-ENOMEM);
 
 	cxled->pos = -1;
+	xa_init(&cxled->skip_res);
 	cxld = &cxled->cxld;
 	rc = cxl_decoder_init(port, cxld);
 	if (rc)	 {
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 543c4499379e..144232c8305e 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1733,7 +1733,7 @@  static int cxl_region_attach(struct cxl_region *cxlr,
 	lockdep_assert_held_write(&cxl_region_rwsem);
 	lockdep_assert_held_read(&cxl_dpa_rwsem);
 
-	if (cxled->mode != cxlr->mode) {
+	if (decoder_mode_is_dc(cxlr->mode) && !decoder_mode_is_dc(cxled->mode)) {
 		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
 			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
 		return -EINVAL;
@@ -2211,6 +2211,14 @@  static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
 	switch (mode) {
 	case CXL_DECODER_RAM:
 	case CXL_DECODER_PMEM:
+	case CXL_DECODER_DC0:
+	case CXL_DECODER_DC1:
+	case CXL_DECODER_DC2:
+	case CXL_DECODER_DC3:
+	case CXL_DECODER_DC4:
+	case CXL_DECODER_DC5:
+	case CXL_DECODER_DC6:
+	case CXL_DECODER_DC7:
 		break;
 	default:
 		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
@@ -2321,6 +2329,43 @@  static ssize_t create_ram_region_store(struct device *dev,
 }
 DEVICE_ATTR_RW(create_ram_region);
 
+static ssize_t store_dcN_region(struct cxl_root_decoder *cxlrd,
+				const char *buf, enum cxl_decoder_mode mode,
+				size_t len)
+{
+	struct cxl_region *cxlr;
+	int rc, id;
+
+	rc = sscanf(buf, "region%d\n", &id);
+	if (rc != 1)
+		return -EINVAL;
+
+	cxlr = __create_region(cxlrd, id, mode, CXL_DECODER_HOSTMEM);
+	if (IS_ERR(cxlr))
+		return PTR_ERR(cxlr);
+
+	return len;
+}
+
+static ssize_t create_dc_region_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return __create_region_show(to_cxl_root_decoder(dev), buf);
+}
+
+static ssize_t create_dc_region_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t len)
+{
+	/*
+	 * All DC regions use decoder mode DC0 as the region does not need the
+	 * index information
+	 */
+	return store_dcN_region(to_cxl_root_decoder(dev), buf,
+				CXL_DECODER_DC0, len);
+}
+DEVICE_ATTR_RW(create_dc_region);
+
 static ssize_t region_show(struct device *dev, struct device_attribute *attr,
 			   char *buf)
 {
@@ -2799,6 +2844,61 @@  static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
 	return rc;
 }
 
+static void cxl_dc_region_release(void *data)
+{
+	struct cxl_region *cxlr = data;
+	struct cxl_dc_region *cxlr_dc = cxlr->cxlr_dc;
+
+	xa_destroy(&cxlr_dc->dax_dev_list);
+	kfree(cxlr_dc);
+}
+
+static int devm_cxl_add_dc_region(struct cxl_region *cxlr)
+{
+	struct cxl_dc_region *cxlr_dc;
+	struct cxl_dax_region *cxlr_dax;
+	struct device *dev;
+	int rc = 0;
+
+	cxlr_dax = cxl_dax_region_alloc(cxlr);
+	if (IS_ERR(cxlr_dax))
+		return PTR_ERR(cxlr_dax);
+
+	cxlr_dc = kzalloc(sizeof(*cxlr_dc), GFP_KERNEL);
+	if (!cxlr_dc) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	dev = &cxlr_dax->dev;
+	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+		dev_name(dev));
+
+	rc = devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
+					cxlr_dax);
+	if (rc)
+		goto err;
+
+	cxlr_dc->cxlr_dax = cxlr_dax;
+	xa_init(&cxlr_dc->dax_dev_list);
+	cxlr->cxlr_dc = cxlr_dc;
+	rc = devm_add_action_or_reset(&cxlr->dev, cxl_dc_region_release, cxlr);
+	if (!rc)
+		return 0;
+err:
+	put_device(dev);
+	kfree(cxlr_dc);
+	return rc;
+}
+
 static int match_decoder_by_range(struct device *dev, void *data)
 {
 	struct range *r1, *r2 = data;
@@ -3140,6 +3240,19 @@  static int is_system_ram(struct resource *res, void *arg)
 	return 1;
 }
 
+/*
+ * The region can not be managed by CXL if any portion of
+ * it is already online as 'System RAM'
+ */
+static bool region_is_system_ram(struct cxl_region *cxlr,
+				 struct cxl_region_params *p)
+{
+	return (walk_iomem_res_desc(IORES_DESC_NONE,
+				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+				    p->res->start, p->res->end, cxlr,
+				    is_system_ram) > 0);
+}
+
 static int cxl_region_probe(struct device *dev)
 {
 	struct cxl_region *cxlr = to_cxl_region(dev);
@@ -3174,14 +3287,7 @@  static int cxl_region_probe(struct device *dev)
 	case CXL_DECODER_PMEM:
 		return devm_cxl_add_pmem_region(cxlr);
 	case CXL_DECODER_RAM:
-		/*
-		 * The region can not be manged by CXL if any portion of
-		 * it is already online as 'System RAM'
-		 */
-		if (walk_iomem_res_desc(IORES_DESC_NONE,
-					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
-					p->res->start, p->res->end, cxlr,
-					is_system_ram) > 0)
+		if (region_is_system_ram(cxlr, p))
 			return 0;
 
 		/*
@@ -3193,6 +3299,17 @@  static int cxl_region_probe(struct device *dev)
 
 		/* HDM-H routes to device-dax */
 		return devm_cxl_add_dax_region(cxlr);
+	case CXL_DECODER_DC0:
+	case CXL_DECODER_DC1:
+	case CXL_DECODER_DC2:
+	case CXL_DECODER_DC3:
+	case CXL_DECODER_DC4:
+	case CXL_DECODER_DC5:
+	case CXL_DECODER_DC6:
+	case CXL_DECODER_DC7:
+		if (region_is_system_ram(cxlr, p))
+			return 0;
+		return devm_cxl_add_dc_region(cxlr);
 	default:
 		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
 			cxlr->mode);
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 8400af85d99f..7ac1237938b7 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -335,6 +335,14 @@  enum cxl_decoder_mode {
 	CXL_DECODER_NONE,
 	CXL_DECODER_RAM,
 	CXL_DECODER_PMEM,
+	CXL_DECODER_DC0,
+	CXL_DECODER_DC1,
+	CXL_DECODER_DC2,
+	CXL_DECODER_DC3,
+	CXL_DECODER_DC4,
+	CXL_DECODER_DC5,
+	CXL_DECODER_DC6,
+	CXL_DECODER_DC7,
 	CXL_DECODER_MIXED,
 	CXL_DECODER_DEAD,
 };
@@ -345,6 +353,14 @@  static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
 		[CXL_DECODER_NONE] = "none",
 		[CXL_DECODER_RAM] = "ram",
 		[CXL_DECODER_PMEM] = "pmem",
+		[CXL_DECODER_DC0] = "dc0",
+		[CXL_DECODER_DC1] = "dc1",
+		[CXL_DECODER_DC2] = "dc2",
+		[CXL_DECODER_DC3] = "dc3",
+		[CXL_DECODER_DC4] = "dc4",
+		[CXL_DECODER_DC5] = "dc5",
+		[CXL_DECODER_DC6] = "dc6",
+		[CXL_DECODER_DC7] = "dc7",
 		[CXL_DECODER_MIXED] = "mixed",
 	};
 
@@ -353,6 +369,11 @@  static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
 	return "mixed";
 }
 
+static inline bool decoder_mode_is_dc(enum cxl_decoder_mode mode)
+{
+	return (mode >= CXL_DECODER_DC0 && mode <= CXL_DECODER_DC7);
+}
+
 /*
  * Track whether this decoder is reserved for region autodiscovery, or
  * free for userspace provisioning.
@@ -375,6 +396,7 @@  struct cxl_endpoint_decoder {
 	struct cxl_decoder cxld;
 	struct resource *dpa_res;
 	resource_size_t skip;
+	struct xarray skip_res;
 	enum cxl_decoder_mode mode;
 	enum cxl_decoder_state state;
 	int pos;
@@ -475,6 +497,11 @@  struct cxl_region_params {
  */
 #define CXL_REGION_F_AUTO 1
 
+struct cxl_dc_region {
+	struct xarray dax_dev_list;
+	struct cxl_dax_region *cxlr_dax;
+};
+
 /**
  * struct cxl_region - CXL region
  * @dev: This region's device
@@ -493,6 +520,7 @@  struct cxl_region {
 	enum cxl_decoder_type type;
 	struct cxl_nvdimm_bridge *cxl_nvb;
 	struct cxl_pmem_region *cxlr_pmem;
+	struct cxl_dc_region *cxlr_dc;
 	unsigned long flags;
 	struct cxl_region_params params;
 };
diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
index ccdf8de85bd5..eb5eb81bfbd7 100644
--- a/drivers/dax/cxl.c
+++ b/drivers/dax/cxl.c
@@ -23,11 +23,15 @@  static int cxl_dax_region_probe(struct device *dev)
 	if (!dax_region)
 		return -ENOMEM;
 
+	if (decoder_mode_is_dc(cxlr->mode))
+		return 0;
+
 	data = (struct dev_dax_data) {
 		.dax_region = dax_region,
 		.id = -1,
 		.size = range_len(&cxlr_dax->hpa_range),
 	};
+
 	dev_dax = devm_create_dev_dax(&data);
 	if (IS_ERR(dev_dax))
 		return PTR_ERR(dev_dax);